In [13]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_validate


# Data: CSV sample downloaded from the alura-cursos GitHub repository.
fonte = 'https://github.com/alura-cursos/imersao-dados-2-2020/blob/master/MICRODADOS_ENEM_2019_SAMPLE_43278.csv?raw=true'
dados = pd.read_csv(fonte)

# Total score: row-wise sum of the five individual exam scores.
# DataFrame.sum skips NaN by default, so a row whose scores are all missing
# ends up with a total of 0.
provas = [
    "NU_NOTA_CN",
    "NU_NOTA_CH",
    "NU_NOTA_MT",
    "NU_NOTA_LC",
    "NU_NOTA_REDACAO",
]
dados["NU_NOTA_TOTAL"] = dados.loc[:, provas].sum(axis="columns")
# From here on `provas` also lists the derived total column.
provas += ["NU_NOTA_TOTAL"]

# Keep only the rows whose total score is different from zero.
nao_zerou = dados.query("NU_NOTA_TOTAL != 0")


In [8]:
# Columns used as model inputs and the single column used as the target.
provas_entrada = [
    "NU_NOTA_CN",
    "NU_NOTA_CH",
    "NU_NOTA_LC",
    "NU_NOTA_REDACAO",
]
prova_saida = "NU_NOTA_MT"

# Restrict the frame to the score columns listed in `provas` and drop every
# row that has a missing value in any of them.
nao_zerou = nao_zerou.loc[:, provas].dropna()

# Feature matrix (DataFrame) and target vector (Series) for the models below.
notas_entrada = nao_zerou.loc[:, provas_entrada]
notas_saida = nao_zerou.loc[:, prova_saida]

In [12]:
# Fixed seed so the hold-out split, the fitted tree and the reported error are
# the same on every "Restart Kernel -> Run All". Without it each run draws a
# different random split and the MSE below cannot be reproduced.
SEED = 1234

# Hold out 25% of the rows for evaluation.
x_treino, x_teste, y_treino, y_teste = train_test_split(
    notas_entrada,
    notas_saida,
    test_size=0.25,
    random_state=SEED,
)

# Shallow regression tree (depth 3); random_state pins the estimator's
# internal randomness as well.
modelo_arvore = DecisionTreeRegressor(max_depth=3, random_state=SEED)
modelo_arvore.fit(x_treino, y_treino)

predicoes_mat_arvore = modelo_arvore.predict(x_teste)

# Last expression of the cell: mean squared error on the held-out rows.
mean_squared_error(y_teste, predicoes_mat_arvore)

6115.26065423655

In [16]:
# Rebind `modelo_arvore` to a new, unfitted tree limited to depth 2.
modelo_arvore = DecisionTreeRegressor(max_depth=2)

# 10-fold cross-validation over the full feature/target set. The scorer is the
# negated mean squared error, so every entry of `test_score` is <= 0.
cross_validate(
    estimator=modelo_arvore,
    X=notas_entrada,
    y=notas_saida,
    cv=10,
    scoring="neg_mean_squared_error",
)

{'fit_time': array([0.06579852, 0.05772471, 0.05155134, 0.05059385, 0.05197954,
        0.05179429, 0.0513742 , 0.0504756 , 0.05140114, 0.05117297]),
 'score_time': array([0.00211406, 0.00217509, 0.00215864, 0.00218034, 0.00215936,
        0.00217104, 0.0021832 , 0.00217867, 0.00216031, 0.00219107]),
 'test_score': array([-6118.6911947 , -6536.09830186, -6338.64260896, -6456.70685686,
        -6234.97203593, -6794.25120283, -6713.45694262, -6693.04880297,
        -6610.58461646, -7085.82408339])}

In [20]:
from sklearn.model_selection import KFold
import numpy as np

# Seed NumPy's global random generator.
SEED = 1234
np.random.seed(SEED)

# Ten shuffled folds. random_state is left at None (the default), so the
# shuffle draws from the global NumPy generator seeded just above.
partes = KFold(n_splits=10, shuffle=True, random_state=None)

# Cross-validate the current `modelo_arvore` on the shuffled folds; scores are
# negated mean squared errors.
cross_validate(
    estimator=modelo_arvore,
    X=notas_entrada,
    y=notas_saida,
    cv=partes,
    scoring="neg_mean_squared_error",
)

{'fit_time': array([0.06174636, 0.05185032, 0.05100846, 0.05210376, 0.0533042 ,
        0.05078626, 0.05173469, 0.05162954, 0.05237865, 0.05147338]),
 'score_time': array([0.00223613, 0.00221539, 0.00220299, 0.0021944 , 0.00219321,
        0.00234485, 0.00216484, 0.00213861, 0.00204587, 0.00214529]),
 'test_score': array([-6415.49542109, -6428.46080207, -6495.00611061, -6654.34052317,
        -6533.59848794, -6755.61908227, -6534.44559998, -6570.42476948,
        -6486.86865141, -6760.715922  ])}