In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


In [2]:
spark = SparkSession.builder.appName("SalaryPrediction").getOrCreate()
data = spark.read.csv("dados_caged_2022_parsed.csv", header=True, inferSchema=True)
# Removendo registros com valores nulos
data = data.na.drop()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/16 21:37:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

# Feature Engineering e Transformações

In [4]:
categorical_values = ["uf", "cnae_2_secao", "cnae_2_subclasse", "sexo", "grau_de_instrucao"]

indexers = [StringIndexer(inputCol=column, outputCol=column+"_index") for column in categorical_values]
encoders = [OneHotEncoder(inputCol=column+"_index", outputCol=column+"_vec") for column in categorical_values]
assembler = VectorAssembler(inputCols=["horas_contratuais", "idade"] + [column+"_vec" for column in categorical_values], outputCol="features")

# Pipeline de transformação

In [6]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=indexers + encoders + [assembler])

In [7]:
data = pipeline.fit(data).transform(data)
data = data.select("features", "salario_mensal")


                                                                                

# Treinamento dos modelos

In [8]:
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

lr = LinearRegression(labelCol="salario_mensal", featuresCol="features", regParam=0.1)
rf = RandomForestRegressor(labelCol="salario_mensal", featuresCol="features")

lr_model = lr.fit(train_data)
rf_model = rf.fit(train_data)

24/06/16 21:44:13 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/06/16 21:44:13 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/06/16 21:46:58 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
24/06/16 21:57:44 WARN MemoryStore: Not enough space to cache rdd_94_5 in memory! (computed 30.2 MiB so far)
24/06/16 21:57:44 WARN BlockManager: Persisting block rdd_94_5 to disk instead.
24/06/16 21:57:44 WARN MemoryStore: Not enough space to cache rdd_94_7 in memory! (computed 30.2 MiB so far)
24/06/16 21:57:44 WARN BlockManager: Persisting block rdd_94_7 to disk instead.
24/06/16 21:57:44 WARN MemoryStore: Not enough space to cache rdd_94_0 in memory! (computed 74.2 MiB so far)
24/06/16 21:57:44 WARN BlockManager: Persisting block rdd_94_0 to disk instead.
24/06/16 21:57:44 WARN MemoryStore: Not enough space to cache rdd_94_4 in memory! (computed 49.4 MiB so f

# Avaliação dos modelos

In [9]:
evaluator = RegressionEvaluator(labelCol="salario_mensal", predictionCol="prediction", metricName="rmse")


## Previsões e métricas para Linear Regression

In [10]:
lr_predictions = lr_model.transform(test_data)
lr_rmse = evaluator.evaluate(lr_predictions)
lr_r2 = evaluator.evaluate(lr_predictions, {evaluator.metricName: "r2"})
print(f"Linear Regression - RMSE: {lr_rmse}, R2: {lr_r2}")




Linear Regression - RMSE: 18029.23075821563, R2: 0.009639975007766699


                                                                                

## Previsões e métricas para Random Forest

In [11]:
rf_predictions = rf_model.transform(test_data)
rf_rmse = evaluator.evaluate(rf_predictions)
rf_r2 = evaluator.evaluate(rf_predictions, {evaluator.metricName: "r2"})
print(f"Random Forest - RMSE: {rf_rmse}, R2: {rf_r2}")



Random Forest - RMSE: 17960.568764086354, R2: 0.017168927335120765


                                                                                

In [12]:
# Comparação dos modelos
if lr_rmse < rf_rmse:
    print("Linear Regression performs better.")
else:
    print("Random Forest performs better.")

Random Forest performs better.


In [13]:
# Mostrando as predições do modelo Random Forest
rf_predictions.show()

[Stage 43:>                                                         (0 + 1) / 1]

+--------------------+--------------+-----------------+
|            features|salario_mensal|       prediction|
+--------------------+--------------+-----------------+
|(1397,[0,1,2,3,4,...|       1478.08|2343.661923453783|
|(1397,[0,1,2,3,4,...|       1625.42|2343.661923453783|
|(1397,[0,1,2,3,4,...|        1548.0|2343.661923453783|
|(1397,[0,1,2,3,4,...|        1684.0|2203.640768557899|
|(1397,[0,1,2,3,4,...|        2656.0|2343.661923453783|
|(1397,[0,1,2,3,4,...|        1094.4|2203.640768557899|
|(1397,[0,1,2,3,4,...|         687.7|2203.640768557899|
|(1397,[0,1,2,3,4,...|       1829.26|2203.640768557899|
|(1397,[0,1,2,3,4,...|       1438.77|2203.640768557899|
|(1397,[0,1,2,3,4,...|         826.5|2203.640768557899|
|(1397,[0,1,2,3,4,...|       1024.69|2203.640768557899|
|(1397,[0,1,2,3,4,...|       1736.07|2203.640768557899|
|(1397,[0,1,2,3,4,...|        854.04|2203.640768557899|
|(1397,[0,1,2,3,4,...|        1505.0|2203.640768557899|
|(1397,[0,1,2,3,4,...|       1795.43|2203.640768

                                                                                

In [14]:
lr_predictions.show()

[Stage 44:>                                                         (0 + 1) / 1]

+--------------------+--------------+------------------+
|            features|salario_mensal|        prediction|
+--------------------+--------------+------------------+
|(1397,[0,1,2,3,4,...|       1478.08|2425.7851103078283|
|(1397,[0,1,2,3,4,...|       1625.42| 2245.208198931563|
|(1397,[0,1,2,3,4,...|        1548.0| 2387.002817376342|
|(1397,[0,1,2,3,4,...|        1684.0|2017.3664147403906|
|(1397,[0,1,2,3,4,...|        2656.0| 2442.750270074699|
|(1397,[0,1,2,3,4,...|        1094.4|1852.0998232872516|
|(1397,[0,1,2,3,4,...|         687.7|1705.8106683389342|
|(1397,[0,1,2,3,4,...|       1829.26|2326.8412913057255|
|(1397,[0,1,2,3,4,...|       1438.77|2105.2288837296073|
|(1397,[0,1,2,3,4,...|         826.5|1824.8002702980884|
|(1397,[0,1,2,3,4,...|       1024.69|1955.1121488005156|
|(1397,[0,1,2,3,4,...|       1736.07|2128.8613201370754|
|(1397,[0,1,2,3,4,...|        854.04|1848.4327067055565|
|(1397,[0,1,2,3,4,...|        1505.0|2152.4937565445434|
|(1397,[0,1,2,3,4,...|       17

                                                                                