In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


In [2]:
spark = SparkSession.builder.appName("SalaryPrediction").getOrCreate()
data = spark.read.csv("dados_caged_2022_parsed.csv", header=True, inferSchema=True)
# Removendo registros com valores nulos
data = data.na.drop()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/17 00:48:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

# Feature Engineering e Transformações

In [3]:
categorical_values = ["uf", "cnae_2_secao", "cnae_2_subclasse", "sexo", "grau_de_instrucao"]

indexers = [StringIndexer(inputCol=column, outputCol=column+"_index") for column in categorical_values]
encoders = [OneHotEncoder(inputCol=column+"_index", outputCol=column+"_vec") for column in categorical_values]
assembler = VectorAssembler(inputCols=["horas_contratuais", "idade"] + [column+"_vec" for column in categorical_values], outputCol="features")

# Pipeline de transformação

In [4]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=indexers + encoders + [assembler])

In [5]:
data = pipeline.fit(data).transform(data)
data = data.select("features", "salario_mensal")


                                                                                

# Treinamento dos modelos

In [6]:
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

lr = LinearRegression(labelCol="salario_mensal", featuresCol="features", regParam=0.1)
lr_model = lr.fit(train_data)

24/06/17 00:55:03 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/06/17 00:55:03 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/06/17 00:57:39 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

In [7]:
rf = RandomForestRegressor(labelCol="salario_mensal", featuresCol="features")
rf_model = rf.fit(train_data)

24/06/17 01:08:08 WARN MemoryStore: Not enough space to cache rdd_94_3 in memory! (computed 74.5 MiB so far)
24/06/17 01:08:08 WARN BlockManager: Persisting block rdd_94_3 to disk instead.
24/06/17 01:08:08 WARN MemoryStore: Not enough space to cache rdd_94_7 in memory! (computed 30.3 MiB so far)
24/06/17 01:08:08 WARN BlockManager: Persisting block rdd_94_7 to disk instead.
24/06/17 01:08:08 WARN MemoryStore: Not enough space to cache rdd_94_5 in memory! (computed 111.9 MiB so far)
24/06/17 01:08:08 WARN BlockManager: Persisting block rdd_94_5 to disk instead.
24/06/17 01:08:08 WARN MemoryStore: Not enough space to cache rdd_94_2 in memory! (computed 5.8 MiB so far)
24/06/17 01:08:08 WARN BlockManager: Persisting block rdd_94_2 to disk instead.
24/06/17 01:08:08 WARN MemoryStore: Not enough space to cache rdd_94_0 in memory! (computed 3.7 MiB so far)
24/06/17 01:08:08 WARN BlockManager: Persisting block rdd_94_0 to disk instead.
24/06/17 01:08:08 WARN MemoryStore: Not enough space to 

# Avaliação dos modelos

In [8]:
evaluator = RegressionEvaluator(labelCol="salario_mensal", predictionCol="prediction", metricName="rmse")


## Previsões e métricas para Linear Regression

In [9]:
lr_predictions = lr_model.transform(test_data)
lr_rmse = evaluator.evaluate(lr_predictions)
lr_r2 = evaluator.evaluate(lr_predictions, {evaluator.metricName: "r2"})
print(f"Linear Regression - RMSE: {lr_rmse}, R2: {lr_r2}")




Linear Regression - RMSE: 18712.84732039367, R2: 0.009406203320376894


                                                                                

## Previsões e métricas para Random Forest

In [10]:
rf_predictions = rf_model.transform(test_data)
rf_rmse = evaluator.evaluate(rf_predictions)
rf_r2 = evaluator.evaluate(rf_predictions, {evaluator.metricName: "r2"})
print(f"Random Forest - RMSE: {rf_rmse}, R2: {rf_r2}")



Random Forest - RMSE: 18673.99822915848, R2: 0.013515008444804688


                                                                                

In [11]:
# Comparação dos modelos
if lr_rmse < rf_rmse:
    print("Linear Regression performs better.")
else:
    print("Random Forest performs better.")

Random Forest performs better.


In [12]:
# Mostrando as predições do modelo Random Forest
rf_predictions.show()

[Stage 43:>                                                         (0 + 1) / 1]

+--------------------+--------------+------------------+
|            features|salario_mensal|        prediction|
+--------------------+--------------+------------------+
|(1403,[0,1,2,28,5...|        781.63|2100.8623941186274|
|(1403,[0,1,2,28,5...|       2428.46|2209.8372833936587|
|(1403,[0,1,2,28,5...|         600.0|2100.8623941186274|
|(1403,[0,1,2,28,5...|         897.0|2100.8623941186274|
|(1403,[0,1,2,28,5...|        2057.0|2209.8372833936587|
|(1403,[0,1,2,28,5...|         689.0|2100.8623941186274|
|(1403,[0,1,2,28,5...|        1804.0|2100.8623941186274|
|(1403,[0,1,2,28,5...|         977.6|2100.8623941186274|
|(1403,[0,1,2,28,5...|        1657.0|2209.8372833936587|
|(1403,[0,1,2,28,5...|         888.0|2100.8623941186274|
|(1403,[0,1,2,28,5...|        917.59|2100.8623941186274|
|(1403,[0,1,2,28,5...|        1044.0|2100.8623941186274|
|(1403,[0,1,2,28,5...|        917.59|2100.8623941186274|
|(1403,[0,1,2,28,5...|         826.5|2100.8623941186274|
|(1403,[0,1,2,28,5...|        8

                                                                                

In [13]:
lr_predictions.show()

[Stage 44:>                                                         (0 + 1) / 1]

+--------------------+--------------+------------------+
|            features|salario_mensal|        prediction|
+--------------------+--------------+------------------+
|(1403,[0,1,2,28,5...|        781.63|1512.1442888413585|
|(1403,[0,1,2,28,5...|       2428.46|1890.2131616225333|
|(1403,[0,1,2,28,5...|         600.0|1473.7055085233615|
|(1403,[0,1,2,28,5...|         897.0|1607.2244802754312|
|(1403,[0,1,2,28,5...|        2057.0|2114.8932003841446|
|(1403,[0,1,2,28,5...|         689.0|1635.4630834371164|
|(1403,[0,1,2,28,5...|        1804.0|1741.4417465142442|
|(1403,[0,1,2,28,5...|         977.6|1663.7016865988016|
|(1403,[0,1,2,28,5...|        1657.0| 2411.283949991839|
|(1403,[0,1,2,28,5...|         888.0|1714.9624219295592|
|(1403,[0,1,2,28,5...|        917.59|1714.9624219295592|
|(1403,[0,1,2,28,5...|        1044.0|1735.5269810348873|
|(1403,[0,1,2,28,5...|        917.59|1756.0915401402153|
|(1403,[0,1,2,28,5...|         826.5|1776.6560992455434|
|(1403,[0,1,2,28,5...|        8

                                                                                