<a href="https://colab.research.google.com/github/ericcurtygiorno/Desafio_Tecnico_OnCase/blob/main/Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Default size for every figure drawn in this notebook.
sns.set(rc={"figure.figsize": (6, 4)})

# Silence all warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
import warnings
warnings.simplefilter(action="ignore")

In [None]:
# Read the provided train/test splits from the Colab session storage (/content).
train_csv_path = '/content/regression_train.csv'
test_csv_path = '/content/regression_test.csv'

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [None]:
# Summary statistics of the training split. In the recorded output the `count`
# row is lower for `target` (715) and `X2` (1031) than for the other columns
# (1042), i.e. those two columns contain missing values.
train.describe()

Unnamed: 0,target,X1,X2,X3,X4,X5,X6,X7
count,715.0,1042.0,1031.0,1042.0,1042.0,1042.0,1042.0,1042.0
mean,1349.940229,2.421925,-0.460781,52.094329,2.462871,-65.284233,4.381958,-1.287435
std,1593.201095,3.182976,4.426286,31.374828,2.541632,9.033489,3.290306,11.06638
min,0.062551,-8.699406,-13.65154,38.035456,0.00236,-121.450423,0.0,-69.835905
25%,230.632308,0.17059,-3.307931,47.087106,0.671196,-69.12663,0.0,-7.830007
50%,808.374368,2.508671,-0.46115,49.972867,1.652609,-65.36066,5.0,-1.074191
75%,1849.754221,4.540218,2.406651,52.674305,3.361181,-61.187635,8.0,5.659476
max,10382.46948,12.380693,12.926106,500.0,18.813009,53.919202,8.0,30.376348


In [None]:
# Same summary for the test split, for comparison with the training split.
# In the recorded output only `target` has a reduced count (180 vs. 261), and
# the maximum of X3 is 65.7 here against 500.0 in the training split.
test.describe()

Unnamed: 0,target,X1,X2,X3,X4,X5,X6,X7
count,180.0,261.0,261.0,261.0,261.0,261.0,261.0,261.0
mean,1083.16762,2.118804,-0.729926,50.019371,2.570686,-65.634193,4.153257,-1.896956
std,1266.764414,2.92407,4.06592,4.109684,2.456733,7.827388,3.345416,10.133573
min,0.619759,-7.187993,-12.530861,38.282205,0.017793,-123.280755,0.0,-56.751775
25%,203.306229,0.220241,-3.113019,47.368662,0.798704,-69.361831,0.0,-7.315595
50%,623.905972,2.038765,-0.816475,50.070453,1.824582,-65.517502,5.0,-1.918715
75%,1522.742335,4.008733,1.836272,52.659709,3.587555,-61.681399,8.0,4.315239
max,8270.877273,9.586412,11.610249,65.704951,16.79891,-43.237419,8.0,27.284085


In [None]:
# Number of fully duplicated rows in the training split (0 in the recorded output).
train.duplicated().sum()

0

In [None]:
# Number of fully duplicated rows in the test split (0 in the recorded output).
test.duplicated().sum()

0

In [None]:
# Missing values per column in the training split.
train.isna().sum()

target    327
X1          0
X2         11
X3          0
X4          0
X5          0
X6          0
X7          0
dtype: int64

In [None]:
# Missing values per column in the test split.
test.isna().sum()

target    81
X1         0
X2         0
X3         0
X4         0
X5         0
X6         0
X7         0
dtype: int64

In [None]:
# Keep only fully observed training rows: any row with at least one missing
# value (in the recorded counts, `target` or `X2`) is discarded.
train = train.dropna(axis=0, how='any')

In [None]:
# Keep only fully observed test rows (in the recorded counts only `target` has gaps),
# since rows without a target cannot be scored.
test = test.dropna(axis=0, how='any')

In [None]:
# Separate each split into the target vector and the feature matrix
# (every column except 'target').
y_train, y_test = train['target'], test['target']
X_train = train.drop(columns='target')
X_test = test.drop(columns='target')

## Padronização das variáveis (z-score)



In [None]:
# Standardize the features with StandardScaler. The scaler is fitted on the
# training features only and then applied to both splits, so no statistics from
# the test split enter the transformation.
# NOTE: X_train / X_test are overwritten below, so re-running this cell alone
# would refit the scaler on already-scaled data; re-run from the split cell instead.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames for inspection. The original row
# labels are kept (index=...): dropna() left gaps in them and y_train / y_test
# still carry those labels, so a fresh 0..n-1 index would misalign X and y in
# any label-based pandas operation (concat, join, boolean masks, ...).
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

## Regressão Linear

In [None]:
# Baseline 1: linear regression fitted on the scaled training features and
# evaluated on the test split.
regLinear = LinearRegression().fit(X_train, y_train)
yLinear = regLinear.predict(X_test)

# RMSE is computed as sqrt(MSE) rather than with `squared=False`: that keyword
# is deprecated in newer scikit-learn releases, and sqrt(MSE) is also the form
# this notebook uses for the tuned model, so all RMSE values are computed alike.
RMSELinear = np.sqrt(mean_squared_error(y_test, yLinear))
R2Linear = r2_score(y_test, yLinear)
print("RMSE:", RMSELinear)
print("R2:", R2Linear)

RMSE: 404.20243974379787
R2: 0.8976175245733276


## Support Vector Regression

In [None]:
# Baseline 2: support vector regression with default hyperparameters, fitted on
# the scaled training features and evaluated on the test split.
# NOTE(review): the recorded R2 is close to 0; the defaults were not tuned here.
regSVR = SVR().fit(X_train, y_train)
ySVR = regSVR.predict(X_test)

# RMSE is computed as sqrt(MSE) rather than with `squared=False`: that keyword
# is deprecated in newer scikit-learn releases, and sqrt(MSE) is also the form
# this notebook uses for the tuned model, so all RMSE values are computed alike.
RMSESVR = np.sqrt(mean_squared_error(y_test, ySVR))
R2SVR = r2_score(y_test, ySVR)
print("RMSE:", RMSESVR)
print("R2:", R2SVR)

RMSE: 1252.8082370765107
R2: 0.01644881718107538


## Gradient Boosted Trees Regression (XGBoost) -> O melhor resultado entre os 3 modelos

In [None]:
# Baseline 3: XGBoost regressor with default hyperparameters, fitted on the
# scaled training features and evaluated on the test split.
regXGB = XGBRegressor().fit(X_train, y_train)
yXGB = regXGB.predict(X_test)

# RMSE is computed as sqrt(MSE) rather than with `squared=False`: that keyword
# is deprecated in newer scikit-learn releases, and sqrt(MSE) is also the form
# this notebook uses for the tuned model, so all RMSE values are computed alike.
RMSEXGB = np.sqrt(mean_squared_error(y_test, yXGB))
R2XGB = r2_score(y_test, yXGB)
print("RMSE:", RMSEXGB)
print("R2:", R2XGB)

RMSE: 97.81524854212032
R2: 0.9940042862521397


## Otimização de Hiperparâmetros

In [None]:
# List the hyperparameter names exposed by the baseline XGBRegressor
# (presumably as a reference for choosing the keys of the search grid).
regXGB.get_params().keys()

dict_keys(['base_score', 'booster', 'colsample_bylevel', 'colsample_bynode', 'colsample_bytree', 'gamma', 'importance_type', 'learning_rate', 'max_delta_step', 'max_depth', 'min_child_weight', 'missing', 'n_estimators', 'n_jobs', 'nthread', 'objective', 'random_state', 'reg_alpha', 'reg_lambda', 'scale_pos_weight', 'seed', 'silent', 'subsample', 'verbosity'])

In [None]:
# Search space for the grid search. Single-element lists pin a setting to one
# value; multi-element lists are the dimensions actually searched:
# 3 (max_depth) * 3 (learning_rate) * 2 (gamma) * 2 (min_child_weight)
# * 2 (max_delta_step) * 2 (subsample) = 144 combinations.
parameters = dict(
    max_depth=[5, 6, 7],
    learning_rate=[0.1, 0.2, 0.3],
    objective=['reg:squarederror'],
    booster=['gbtree'],
    n_jobs=[5],
    gamma=[0, 1],
    min_child_weight=[1, 3],
    max_delta_step=[0, 1],
    subsample=[0.5, 1],
)

In [None]:
# Exhaustive grid search over `parameters` (the recorded run used 5 CV folds).
# The metric that ranks the candidates is chosen with `scoring`, not `refit`:
# with a single metric, a string passed to `refit` only acts as a truthy flag
# and the search silently falls back to the estimator's default score (R2).
# Candidates are therefore ranked by negative MSE explicitly, and the best one
# is refitted on the whole training set (refit=True).
xgbGrid = GridSearchCV(
    XGBRegressor(),
    parameters,
    scoring='neg_mean_squared_error',
    refit=True,
    verbose=True,
)

In [None]:
# Run the grid search on the scaled training data. The recorded log reports
# 144 candidates x 5 folds = 720 model fits.
xgbGridModel = xgbGrid.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


In [None]:
# Hyperparameter combination selected by the grid search.
xgbGridModel.best_params_

{'booster': 'gbtree',
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 5,
 'min_child_weight': 1,
 'n_jobs': 5,
 'objective': 'reg:squarederror',
 'subsample': 0.5}

In [None]:
# Predict the test split with the fitted grid-search object (i.e. with the
# estimator it refitted using the selected hyperparameters).
yGrid = xgbGridModel.predict(X_test)

In [None]:
# Test-set error of the tuned model. The MSE is stored as-is; its square root
# (RMSE) is taken where the final results are printed.
MSEGrid = mean_squared_error(y_true=y_test, y_pred=yGrid)
R2Grid = r2_score(y_true=y_test, y_pred=yGrid)

## Resultado Final

### Importância das Features

In [None]:
# Feature importances of the baseline XGBoost model (regXGB), labelled with the
# feature names and sorted in descending order so the ranking can be read
# directly instead of mapping array positions to columns by hand.
# NOTE(review): these come from the default-parameter model, not from the
# estimator tuned by the grid search — confirm that this is intended.
pd.Series(
    regXGB.feature_importances_,
    index=X_train.columns,
    name="importance",
).sort_values(ascending=False)

array([0.03560435, 0.9467944 , 0.00713695, 0.00781668, 0.00107387,
       0.00157388, 0.        ], dtype=float32)

In [None]:
# Show one scaled row, presumably to recover the column order (X1..X7) of the
# feature matrix.
# NOTE(review): in the recorded output X2 and X7 show the same scaled value
# (-1.353772) for this row and X7 has zero importance — check whether X7 is
# redundant with X2.
X_train.head(1)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7
0,-0.103498,-1.353772,-0.170488,-0.82826,-0.166405,-1.414602,-1.353772


In [None]:
# X2 is by far the most important feature in the baseline XGBoost model (regXGB): it accounts for 94.68% of the total feature importance.

In [None]:
# Test-set metrics of the model tuned by the hyperparameter grid search.
rmse_grid = np.sqrt(MSEGrid)
print('RMSE XGB Grid:', rmse_grid)
print('R2 XGB Grid:', R2Grid)

RMSE XGB Grid: 88.39489527175544
R2 XGB Grid: 0.9951035408470853


In [None]:
# Observed vs. predicted target on the test split, one row per sample, keeping
# the original row labels of y_test.
# A DataFrame is used instead of a raw list of tuples so the notebook renders a
# labelled table (truncated by pandas when long) rather than printing every pair.
pd.DataFrame({"observed": y_test, "predicted": yGrid}, index=y_test.index)

[(3.436244142616716, 14.586692),
 (1525.839412077425, 1605.8099),
 (455.6001910204114, 450.68774),
 (0.6197594789212001, 17.918169),
 (10.99647244879627, 26.640768),
 (1538.9970024215677, 1566.6284),
 (1120.697908841679, 1081.8884),
 (1131.694281676317, 1169.6913),
 (575.8971789794581, 594.6796),
 (1762.4206341244096, 1749.2473),
 (5187.44557290687, 5690.058),
 (104.50505365594084, 127.13171),
 (1173.3597654695168, 1154.8363),
 (201.81976823714672, 171.47215),
 (1107.155927628755, 1168.8962),
 (850.3374754695748, 867.9772),
 (1042.8765757196827, 964.7497),
 (945.4616414522768, 1022.97235),
 (1572.8164733712586, 1679.6105),
 (1140.5659935610047, 1146.585),
 (110.41457644487686, 137.11166),
 (348.16429278087406, 327.44608),
 (58.29355649815622, 88.73511),
 (4624.417119726351, 4288.9355),
 (505.4668144608417, 508.97327),
 (1877.327031711008, 1951.3376),
 (222.6322000186324, 239.73485),
 (1427.0208292379489, 1461.583),
 (431.3180746626064, 475.36044),
 (4748.364444906259, 4565.1646),
 (263