In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.stat import Correlation
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import functions as f

from pyspark.ml.evaluation import RegressionEvaluator



# CARREGAR DADOS

### VARIÁVEIS DUMMY

In [4]:
def dummy(df, key, column_name):
    """Build a pivoted indicator table from `df`.

    Groups `df` by `key`, pivots on `column_name`, aggregates every pivoted
    cell with the literal 1 and replaces the resulting nulls with 0.

    Args:
        df: DataFrame to pivot.
        key: column (or columns) passed to `groupBy`.
        column_name: column whose values are passed to `pivot`.

    Returns:
        The pivoted DataFrame with nulls filled with 0.
    """
    pivotado = df.groupBy(key).pivot(column_name)
    indicadores = pivotado.agg(f.lit(1))
    return indicadores.na.fill(0)

In [None]:
# Rename the 'target' column to 'label' (the column name expected by the machine learning functions).
# NOTE(review): the following cells operate on `df` and `dataset_prep`, not `dataset` — confirm how they relate.
dataset = dataset.withColumnRenamed('target','label')

In [None]:
# Create the dummy (indicator) tables for 'regional' and 'BU', both keyed by 'cpf'.
dummy_var1, dummy_var2 = (
    dummy(df, 'cpf', coluna) for coluna in ('regional', 'BU')
)

In [None]:
# Attach the dummy columns to the main dataframe through inner joins on 'cpf'.
# NOTE(review): `df` is rebound to the join result, so re-running this cell
# joins the dummy tables onto the already-joined dataframe again.
df = df.join(dummy_var1, 'cpf', how='inner')
df = df.join(dummy_var2, 'cpf', how='inner')

In [None]:
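# Adjust the dataset so that it contains only the explanatory variables and the response variable.
# NOTE(review): the cells below expect this step to define `dataset_prep` (with a 'features' column)
# and `X` (used as the row/column labels of the correlation matrix) — neither is defined yet.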

### CORRELAÇÃO

In [None]:
# Correlation.corr returns a DataFrame; collect() yields a list of Rows, so the
# matrix itself is the first field of the first Row. Extracting it here lets the
# next cell call `correlacao.toArray()`.
correlacao = Correlation.corr(dataset_prep, 'features').collect()[0][0]

In [None]:
# Wrap the correlation matrix in a labelled pandas DataFrame. The result must be
# bound to `dataframe_correlacao`, which this cell displays and the heatmap cell reads.
dataframe_correlacao = pd.DataFrame(correlacao.toArray(), columns=X, index=X)
dataframe_correlacao

In [None]:
# Annotated heatmap of the correlation matrix, rounded to one decimal place.
paleta = sns.color_palette("light:salmon", as_cmap=True)
plt.figure(figsize=(12,10))
sns.heatmap(
    dataframe_correlacao.round(1),
    annot=True,
    cmap=paleta,
)

# LINEAR REGRESSION

### AJUSTE E PREVISÃO

In [None]:
# Split the prepared dataset into training (weight 0.7) and test (weight 0.3) sets with a fixed seed.
treino, teste = dataset_prep.randomSplit([0.7, 0.3], seed=1112)

In [None]:
# Number of rows in the training split.
treino.count()

In [None]:
# Number of rows in the test split.
teste.count()

In [None]:
# Linear regression estimator with all parameters left at their defaults.
lr = LinearRegression()

In [None]:
# Fit the linear regression on the training split.
modelo_lr = lr.fit(treino)

In [None]:
# Apply the fitted linear model to the training split.
previsoes_lr_treino = modelo_lr.transform(treino)

### MÉTRICAS

#### TREINO

In [6]:
# R² on the training data.
# `summary` holds the training metrics of the fitted model; the R² value is
# displayed here, mirroring the `metrica_teste.r2` cell of the test section.
metrica_treino = modelo_lr.summary
metrica_treino.r2

In [None]:
# RMSE on the training data.
metrica_treino.rootMeanSquaredError

#### TESTE

In [None]:
# Evaluate the fitted linear model on the test split.
metrica_teste = modelo_lr.evaluate(teste)

In [None]:
# R² on the test data.
metrica_teste.r2

In [None]:
# RMSE on the test data.
metrica_teste.rootMeanSquaredError

#### RESULTADOS

In [None]:
# Summary of the linear regression metrics on training and test data.
# Reads `metrica_treino` (modelo_lr.summary) and `metrica_teste`
# (modelo_lr.evaluate(teste)), the names assigned in the metric cells;
# the previous `resumo_treino` / `resumo_teste` names were never defined.
print('Linear Regression')
print("="*30)
print("Dados de Treino")
print("="*30)
print("R²: %f" % metrica_treino.r2)
print("RMSE: %f" % metrica_treino.rootMeanSquaredError)
print("")
print("="*30)
print("Dados de Teste")
print("="*30)
print("R²: %f" % metrica_teste.r2)
print("RMSE: %f" % metrica_teste.rootMeanSquaredError)

# DECISION TREE

### AJUSTE E PREVISÃO

In [None]:
# Baseline decision tree regressor: fixed seed and maximum depth of 7.
dtr = DecisionTreeRegressor(seed=1112, maxDepth=7)

In [None]:
# Fit the decision tree on the training split.
modelo_dtr = dtr.fit(treino)

In [None]:
# Apply the fitted decision tree to the training split.
previsoes_dtr_treino = modelo_dtr.transform(treino)

### MÉTRICAS

#### TREINO

In [None]:
# Evaluator with default parameters; the metric is chosen per call through a param override.
evaluator = RegressionEvaluator()

In [None]:
# R² followed by RMSE of the decision tree on the training predictions.
for nome_metrica in ("r2", "rmse"):
    print(evaluator.evaluate(previsoes_dtr_treino, {evaluator.metricName: nome_metrica}))

#### TESTE

In [None]:
# Apply the fitted decision tree to the test split.
previsoes_dtr_teste = modelo_dtr.transform(teste)

In [None]:
# Preview the test predictions.
previsoes_dtr_teste.show()

#### RESULTADOS

In [None]:
# Report R² and RMSE of the decision tree on the training and test predictions.
print('Decision Tree Regression')
for indice, (titulo, previsoes) in enumerate((
    ("Dados de Treino", previsoes_dtr_treino),
    ("Dados de Teste", previsoes_dtr_teste),
)):
    if indice > 0:
        print("")  # blank line separating the two sections
    print("="*30)
    print(titulo)
    print("="*30)
    for formato, metrica in (("R²: %f", "r2"), ("RMSE: %f", "rmse")):
        print(formato % evaluator.evaluate(previsoes, {evaluator.metricName: metrica}))

# RANDOM FOREST

### AJUSTE E PREVISÃO

In [None]:
# Baseline random forest regressor: fixed seed, maximum depth of 7 and 10 trees.
rfr = RandomForestRegressor(seed=1112, maxDepth=7, numTrees=10)

In [None]:
# Fit the random forest on the training split.
modelo_rfr = rfr.fit(treino)

In [None]:
# Apply the fitted random forest to the training split.
previsoes_rfr_treino = modelo_rfr.transform(treino)

### MÉTRICAS

#### TREINO

In [None]:
# Rebinds `evaluator` to a fresh default evaluator (same construction as in the decision tree section).
evaluator = RegressionEvaluator()

In [None]:
# R² followed by RMSE of the random forest on the training predictions.
for nome_metrica in ("r2", "rmse"):
    print(evaluator.evaluate(previsoes_rfr_treino, {evaluator.metricName: nome_metrica}))

#### TESTE

In [None]:
# Apply the fitted random forest to the test split.
previsoes_rfr_teste = modelo_rfr.transform(teste)

#### RESULTADOS

In [None]:
# Report R² and RMSE of the random forest on the training and test predictions.
print('Random Forest Regression')
for indice, (titulo, previsoes) in enumerate((
    ("Dados de Treino", previsoes_rfr_treino),
    ("Dados de Teste", previsoes_rfr_teste),
)):
    if indice > 0:
        print("")  # blank line separating the two sections
    print("="*30)
    print(titulo)
    print("="*30)
    for formato, metrica in (("R²: %f", "r2"), ("RMSE: %f", "rmse")):
        print(formato % evaluator.evaluate(previsoes, {evaluator.metricName: metrica}))

### ==================================
#### OTIMIZADO
### ==================================

# DECISION TREE

In [None]:
# Decision tree estimator to be tuned by cross-validation (rebinds `dtr`).
# maxDepth / maxBins come from the parameter grid; the seed is fixed to the
# same value as the baseline tree so the tuned model is reproducible across runs.
dtr = DecisionTreeRegressor(seed=1112)

In [None]:
# Hyper-parameter grid for the decision tree: 3 depths x 3 bin counts = 9 combinations.
grid = (
    ParamGridBuilder()
    .addGrid(dtr.maxDepth, [2, 5, 10])
    .addGrid(dtr.maxBins, [10, 32, 45])
    .build()
)

In [None]:
# Default evaluator handed to the cross-validator below.
# NOTE(review): model selection uses the evaluator's default metric (presumably RMSE — confirm in the PySpark docs).
evaluator = RegressionEvaluator()

In [None]:
# 3-fold cross-validation of the decision tree over `grid`.
# The seed fixes the fold assignment so the search is reproducible.
dtr_cv = CrossValidator(
    estimator=dtr,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    numFolds=3,
    seed=101,
)

In [None]:
# Run the cross-validated grid search on the training split.
modelo_dtr_cv = dtr_cv.fit(treino)

In [None]:
# Predictions of the cross-validated decision tree model on the test split.
previsoes_dtr_cv_teste = modelo_dtr_cv.transform(teste)

In [None]:
# Compare the decision tree on the test split: baseline model vs. cross-validated model.
print('Decision Tree Regression')
for indice, (titulo, previsoes) in enumerate((
    ("Sem Cross Validation", previsoes_dtr_teste),
    ("Com Cross Validation", previsoes_dtr_cv_teste),
)):
    if indice > 0:
        print("")  # blank line separating the two sections
    print("="*30)
    print(titulo)
    print("="*30)
    for formato, metrica in (("R²: %f", "r2"), ("RMSE: %f", "rmse")):
        print(formato % evaluator.evaluate(previsoes, {evaluator.metricName: metrica}))

# RANDOM FOREST

In [None]:
# Random forest estimator to be tuned by cross-validation (rebinds `rfr`).
# numTrees / maxDepth / maxBins come from the parameter grid; the seed is fixed
# to the same value as the baseline forest so the tuned model is reproducible.
rfr = RandomForestRegressor(seed=1112)

In [None]:
# Hyper-parameter grid for the random forest:
# 3 tree counts x 2 depths x 3 bin counts = 18 combinations.
grid = (
    ParamGridBuilder()
    .addGrid(rfr.numTrees, [10, 20, 30])
    .addGrid(rfr.maxDepth, [5, 10])
    .addGrid(rfr.maxBins, [10, 32, 45])
    .build()
)

In [None]:
# Default evaluator handed to the random forest cross-validator below.
evaluator = RegressionEvaluator()

In [None]:
# 3-fold cross-validation of the random forest over `grid`.
# Uses the same fold seed as the decision tree cross-validator so the search
# is reproducible and the fold assignment does not change between runs.
rfr_cv = CrossValidator(
    estimator=rfr,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    numFolds=3,
    seed=101,
)

In [None]:
# Run the cross-validated grid search on the training split.
modelo_rfr_cv = rfr_cv.fit(treino)

In [None]:
# Predictions of the cross-validated random forest model on the test split.
previsoes_rfr_cv_teste = modelo_rfr_cv.transform(teste)

In [None]:
# Compare the random forest on the test split: baseline model vs. cross-validated model.
print('Random Forest Regression')
for indice, (titulo, previsoes) in enumerate((
    ("Sem Cross Validation", previsoes_rfr_teste),
    ("Com Cross Validation", previsoes_rfr_cv_teste),
)):
    if indice > 0:
        print("")  # blank line separating the two sections
    print("="*30)
    print(titulo)
    print("="*30)
    for formato, metrica in (("R²: %f", "r2"), ("RMSE: %f", "rmse")):
        print(formato % evaluator.evaluate(previsoes, {evaluator.metricName: metrica}))