In [1]:
# Instruções basiconas de sempre.

# Lidar com dataframes.
import numpy as np
import pandas as pd

# Lidar com gráficos.
import matplotlib.pyplot as plt
import seaborn as sns
from plotting import (multiple_histograms_plot,
                      bar_plot_with_categorical,
                      plot_confusion_matrix,
                      plot_confusion_matrix_2,
                      plot_roc)

# Lidar com preparação de dados.
from data_prep import data_prep as dp # Eu que fiz esse modulinho ("uuuuuuuuuma bosts!").
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (StandardScaler
                                   , MinMaxScaler)
from sklearn.model_selection import (train_test_split
                                     , cross_val_score
                                     , StratifiedKFold
                                     , TimeSeriesSplit
                                     , KFold)

# Lidar com validação de modelos.
from sklearn.metrics import (confusion_matrix
                             , accuracy_score
                             , classification_report
                             , mean_squared_error)

pd.set_option('display.max_columns', None)  
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 500)

In [2]:
random_state = 42

nomeDaBase = "regressao_Q2.csv"
df = pd.read_csv(nomeDaBase, encoding="utf-8")

print(df.shape)
df.head()

(2500, 13)


Unnamed: 0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,target
0,1.92864,1.48414,0.86814,-0.67666,-0.28747,-1.45108,-0.73662,0.03134,-0.53872,1.30562,0.11557,-0.30478,127.682465
1,0.22185,-0.5532,-0.29845,0.6587,-0.30132,1.49319,-0.43096,0.33835,-0.30827,1.25765,1.88584,-0.57726,50.022972
2,-0.02183,0.13602,-0.37426,-1.29096,0.71912,1.95088,1.99309,-1.24197,-2.15377,-2.01455,-0.84625,0.29845,-24.364369
3,0.86528,1.36937,1.27999,1.18124,-0.72465,-0.02175,0.4034,-0.28272,-0.4439,0.84051,0.03326,-0.9855,96.415408
4,1.41333,-0.0292,-0.67228,1.76116,-1.12178,0.18002,0.48476,-0.76394,-0.16421,-2.42048,0.79727,-0.44033,157.407129


In [3]:
x = df.drop("target", axis=1)
y = df.target

### Usando a validação cruzada aleatória

In [4]:
from sklearn.svm import LinearSVR
from sklearn.model_selection import cross_validate

regressor = LinearSVR(C=0.001
                     , random_state=random_state)

# o método cross_validate é bem mais completo que 
cross = cross_validate(regressor, x.values, y, cv=5
                       , scoring="neg_mean_squared_error"
                       , return_train_score=True
                       , n_jobs=-1)

# print(cross.keys())

for key in cross.keys():
    print(f"""{key}: {round(-cross[key].mean())}""")

fit_time: 0
score_time: 0
test_score: 20203
train_score: 20200


### Usando o TimeSeriesSplit
Para obter uma validação cruzada sequencial.

In [5]:
tscv = TimeSeriesSplit(n_splits=5, max_train_size=2000)

tscv_split = {}
train_score = []
test_score = []
i = 1

for train_index, test_index in tscv.split(x):
    tscv_split[i] = {"TRAIN":train_index, "TEST":test_index}
    i += 1
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    regressor.fit(x_train, y_train)
    y_pred = regressor.predict(x_train)
    train_score.append(mean_squared_error(y_train, y_pred))
    y_pred = regressor.predict(x_test)
    test_score.append(mean_squared_error(y_test, y_pred))

tscv_split = pd.DataFrame.from_dict(tscv_split, orient="index")
    
print(f"""train_score: {round(np.mean(train_score))}\ntest_score: {round(np.mean(test_score))}\n""")
    
for i in tscv_split.index:
    print(f"""- Split {i}: Qtd de train: {len(tscv_split.loc[i,"TRAIN"])} --- Qtd de test: {len(tscv_split.loc[i,"TEST"])}""")

train_score: 19746
test_score: 20560

- Split 1: Qtd de train: 420 --- Qtd de test: 416
- Split 2: Qtd de train: 836 --- Qtd de test: 416
- Split 3: Qtd de train: 1252 --- Qtd de test: 416
- Split 4: Qtd de train: 1668 --- Qtd de test: 416
- Split 5: Qtd de train: 2000 --- Qtd de test: 416


### Usando o KFold

In [6]:
kfold = KFold(n_splits=5, shuffle=False)

kfold_split = {}
train_score = []
test_score = []
i = 1

for train_index, test_index in kfold.split(x):
    kfold_split[i] = {"TRAIN":train_index, "TEST":test_index}
    i += 1
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    regressor.fit(x_train, y_train)
    y_pred = regressor.predict(x_train)
    train_score.append(mean_squared_error(y_train, y_pred))
    y_pred = regressor.predict(x_test)
    test_score.append(mean_squared_error(y_test, y_pred))

kfold_split = pd.DataFrame.from_dict(kfold_split, orient="index")
    
print(f"""train_score: {round(np.mean(train_score))}\ntest_score: {round(np.mean(test_score))}\n""")
    
for i in tscv_split.index:
    print(f"""- Split {i}: Qtd de train: {len(kfold_split.loc[i,"TRAIN"])} --- Qtd de test: {len(kfold_split.loc[i,"TEST"])}""")

train_score: 20200
test_score: 20203

- Split 1: Qtd de train: 2000 --- Qtd de test: 500
- Split 2: Qtd de train: 2000 --- Qtd de test: 500
- Split 3: Qtd de train: 2000 --- Qtd de test: 500
- Split 4: Qtd de train: 2000 --- Qtd de test: 500
- Split 5: Qtd de train: 2000 --- Qtd de test: 500
