In [None]:
pip install catboost

In [None]:
pip install lightgbm

In [None]:
pip install catboost

# Case de Estudos: Santander Value Prediction

Ajude Santander a identificar o valor das transações para cada cliente potencial. Esse é um primeiro passo que o Santander precisa acertar para personalizar seus serviços em grande escala.
De acordo com uma pesquisa da Epsilon, 80% dos clientes tendem a voltar a fazer negócios com a sua empresa se a mesma entregar um serviço personalizado.

<br>
## Link para os dados e o desafio: 

https://www.kaggle.com/c/santander-value-prediction-challenge/data

O case podera ser quebrado nas 6 partes seguintes:

    Identificar o problema
        Qual o tipo de problema(classificação, regressão, clustering)?
    Necessidades de aplicar transformaçoes?
        Ex: imputing de valores null, encoding de colunas string, etc
    Separar os sets de treinamento e teste
    Baseline
        Achar uma baseline, um primeiro modelo para ter uma referencia
    Escolher a metrica
    Melhorar o resultado
        Feature engineering, otimizaçao do modelo, hiperparametros, etc



In [9]:
#Criação e manipulação dos dataframes
import pandas as pd
pd.set_option('display.max_rows', 500)

#Operações matemáticas
import numpy as np

#Separar o dataframe em treino e teste
from sklearn.model_selection import train_test_split, GridSearchCV

#Métricas de avaliação do modelo
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

#Regressores utilizados
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.neural_network import MLPRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

#Normalização dos dados 
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

#Ignorar os avisos do sklearn
import warnings
warnings.filterwarnings('ignore')

#Verificar o tempo de run de cada modelo 
from time import time

#Visualização gráfica dos dados
import seaborn as sns
import matplotlib.pyplot as plt

#df_pred = pd.DataFrame({'Valores Reais': y_test, 'Valores preditos': y_pred})
#df_pred

In [2]:
df = pd.read_csv('train.csv')

display(df)

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4454,ff85154c8,1065000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4455,ffb6b3f4f,48000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,80000.0,0,0,0,0,0,0,0
4456,ffcf61eb6,2800000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4457,ffea67e98,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


## Visualização gráfica do target

In [None]:
plt.figure(figsize=(10,6))
sns.scatterplot(range(df.shape[0]), np.sort(df['target'].values))
plt.xlabel('Index', fontsize=16)
plt.ylabel('Target', fontsize=16)

## Verificar por valores nulos no dataset

In [None]:
def verificar_nulos(df):
    i = 0
    for coluna in df.columns:
        if df[coluna].isnull().sum() > 0:
            print(f'A coluna {coluna} possui {df[coluna].isnull().sum()} valores nulos.')
        
        if df[coluna].isnull().sum() == 0:
            i += 1
    
    if i == df.shape[1]:
        print('O dataframe não possui nenhum valor nulo')

verificar_nulos(df)

In [None]:
#Verificando o uso de memória do dataframe
print(f'Memória usada: {df.memory_usage().sum()/(1024*1024):.2f} MBs')

## Separando o dataset em treino e teste

In [11]:
y = df['target']
X = df.drop(['ID', 'target'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

## Testando os diferentes tipos de escalonamento com alguns modelos

In [None]:
scalers = [
    StandardScaler(),
    MinMaxScaler(),
    RobustScaler()
]

models = [
    LinearRegression(),
    Ridge(),
    MLPRegressor(),
    DecisionTreeRegressor(),
    KNeighborsRegressor()
]

for scaler in scalers:
    scaler = scaler.fit(X_train, X_test)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    print(f'Atual método utilizado: {scaler.__class__.__name__}')
    print(f'')
    
    for model in models:
        regressor = model.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        print(f'O modelo {model.__class__.__name__} teve um MAE de {mean_absolute_error(y_test, y_pred)} e um r2 score de {r2_score(y_test, y_pred)}')

## Normalizando os dados com o Standard Scaler

In [None]:
standardscaler = StandardScaler().fit(X_train, X_test)
X_train = standardscaler.transform(X_train)
X_test = standardscaler.transform(X_test)

## Normalizando os dados com o MinMaxScaler

In [None]:
minmaxscaler = MinMaxScaler().fit(X_train, X_test)
X_train = minmaxscaler.transform(X_train)
X_test = minmaxscaler.transform(X_test)

## Normalizando os dados com o Robust Scaler

In [12]:
robustscaler = RobustScaler().fit(X_train, X_test)
X_train = robustscaler.transform(X_train)
X_test = robustscaler.transform(X_test)

## Testando os modelos

In [None]:
#Criando a lista dos modelos que serão testados
models = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    RandomForestRegressor(),
    MLPRegressor(),
    DecisionTreeRegressor(),
    KNeighborsRegressor(),
    SVR(),
    GaussianProcessRegressor(),
    XGBRegressor(),
    CatBoostRegressor(),
    LGBMRegressor()
]

#Criando as listas vazias das métricas do modelo
MAE = []
RMSE = []
MSE = []
r2 = []
names = []
parameters = []
tempo = []

#Testando modelo por modelo
for model in models:
    #Startando o cronometro para ver o tempo de cada modelo 
    tempo_inicial = time()
    
    #Chamando o regressor e fazendo o predict
    regressor = model.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    
    #Adicionando o valor da métrica de cada modelo para criar o dataframe depois
    MAE.append(mean_absolute_error(y_test, y_pred))
    RMSE.append(np.sqrt(mean_squared_error(y_test, y_pred)))
    MSE.append(mean_squared_error(y_test, y_pred))
    r2.append(r2_score(y_test, y_pred))
               
    #Para pegar o nome do modelo basta pegar o nome da clas
    names.append(model.__class__.__name__)
               
    #Para pegar os parametros utilizados
    parameters.append(model.get_params())
               
    #Parando o cronometro para ver o tempo e adicionando na lista 
    tempo_final = time() - tempo_inicial
    tempo.append(tempo_final)
               
               
#Criando o dataframe com as métricas de todos os modelos 
resultado = pd.DataFrame({'Nome': names,
                         'Parametros': parameters,
                         'MAE': MAE,
                         'RMSE': RMSE,
                         'MSE': MSE,
                         'R2': r2,
                         'Run Time': tempo})


In [None]:
display(resultado)

# Tuning dos hiperparametros 

### Utilizando o SelectKBest para verificar as features mais importantes para o modelo

In [10]:
#regressor = LGBMRegressor().fit(X_train, y_train)
#regressor = XGBRegressor().fit(X_train, y_train)
regressor = CatBoostRegressor().fit(X_train, y_train)
y_pred = regressor.predict(X_test)

print(mean_absolute_error(y_test, y_pred))

Learning rate set to 0.050055
0:	learn: 8332296.3917070	total: 264ms	remaining: 4m 24s
1:	learn: 8287699.4554209	total: 340ms	remaining: 2m 49s
2:	learn: 8232927.2670952	total: 416ms	remaining: 2m 18s
3:	learn: 8188299.1527343	total: 494ms	remaining: 2m 3s
4:	learn: 8138589.2515928	total: 589ms	remaining: 1m 57s
5:	learn: 8093788.2198169	total: 665ms	remaining: 1m 50s
6:	learn: 8047694.6576948	total: 743ms	remaining: 1m 45s
7:	learn: 8008319.4061927	total: 826ms	remaining: 1m 42s
8:	learn: 7973180.0880999	total: 911ms	remaining: 1m 40s
9:	learn: 7944597.2105278	total: 995ms	remaining: 1m 38s
10:	learn: 7909322.7143734	total: 1.07s	remaining: 1m 36s
11:	learn: 7876739.9566002	total: 1.15s	remaining: 1m 34s
12:	learn: 7845274.2416537	total: 1.23s	remaining: 1m 33s
13:	learn: 7814430.7282045	total: 1.32s	remaining: 1m 33s
14:	learn: 7787025.1922510	total: 1.4s	remaining: 1m 32s
15:	learn: 7760200.0486700	total: 1.48s	remaining: 1m 31s
16:	learn: 7738176.3222175	total: 1.56s	remaining: 1m 

141:	learn: 6774477.5738208	total: 11.7s	remaining: 1m 10s
142:	learn: 6772635.1150238	total: 11.8s	remaining: 1m 10s
143:	learn: 6767546.0723769	total: 11.9s	remaining: 1m 10s
144:	learn: 6766109.0583063	total: 11.9s	remaining: 1m 10s
145:	learn: 6764148.9844886	total: 12s	remaining: 1m 10s
146:	learn: 6761493.2150242	total: 12.1s	remaining: 1m 10s
147:	learn: 6756643.4553280	total: 12.2s	remaining: 1m 10s
148:	learn: 6751551.8990158	total: 12.3s	remaining: 1m 10s
149:	learn: 6747619.9302930	total: 12.3s	remaining: 1m 9s
150:	learn: 6745105.2235808	total: 12.4s	remaining: 1m 9s
151:	learn: 6742005.9509514	total: 12.5s	remaining: 1m 9s
152:	learn: 6739801.1757921	total: 12.6s	remaining: 1m 9s
153:	learn: 6737821.3523706	total: 12.7s	remaining: 1m 9s
154:	learn: 6736218.0492364	total: 12.7s	remaining: 1m 9s
155:	learn: 6729895.3336599	total: 12.8s	remaining: 1m 9s
156:	learn: 6722420.5910713	total: 12.9s	remaining: 1m 9s
157:	learn: 6719907.8837818	total: 13s	remaining: 1m 9s
158:	learn

285:	learn: 6086035.2685650	total: 23.4s	remaining: 58.4s
286:	learn: 6084210.5580279	total: 23.5s	remaining: 58.3s
287:	learn: 6079654.0775862	total: 23.5s	remaining: 58.2s
288:	learn: 6071071.4312353	total: 23.6s	remaining: 58.1s
289:	learn: 6069252.8637985	total: 23.7s	remaining: 58s
290:	learn: 6067362.0579557	total: 23.8s	remaining: 57.9s
291:	learn: 6061888.4464368	total: 23.8s	remaining: 57.8s
292:	learn: 6056808.9929263	total: 23.9s	remaining: 57.7s
293:	learn: 6055166.1605054	total: 24s	remaining: 57.7s
294:	learn: 6051605.3192470	total: 24.1s	remaining: 57.6s
295:	learn: 6049953.1781092	total: 24.2s	remaining: 57.5s
296:	learn: 6044347.6545203	total: 24.3s	remaining: 57.4s
297:	learn: 6042015.3521320	total: 24.3s	remaining: 57.3s
298:	learn: 6039609.5478574	total: 24.4s	remaining: 57.2s
299:	learn: 6033287.3701745	total: 24.5s	remaining: 57.1s
300:	learn: 6028109.7999180	total: 24.6s	remaining: 57s
301:	learn: 6024296.8653046	total: 24.6s	remaining: 57s
302:	learn: 6022566.47

429:	learn: 5571657.6318704	total: 34.8s	remaining: 46.2s
430:	learn: 5569961.4932492	total: 34.9s	remaining: 46.1s
431:	learn: 5568811.8125990	total: 35s	remaining: 46s
432:	learn: 5567143.8670057	total: 35.1s	remaining: 45.9s
433:	learn: 5565856.3724944	total: 35.1s	remaining: 45.8s
434:	learn: 5563666.1659030	total: 35.2s	remaining: 45.8s
435:	learn: 5561261.0756085	total: 35.3s	remaining: 45.7s
436:	learn: 5560161.9067973	total: 35.4s	remaining: 45.6s
437:	learn: 5558298.3290946	total: 35.5s	remaining: 45.5s
438:	learn: 5554620.9405022	total: 35.5s	remaining: 45.4s
439:	learn: 5552025.0247584	total: 35.6s	remaining: 45.3s
440:	learn: 5550316.6718029	total: 35.7s	remaining: 45.3s
441:	learn: 5547653.5785354	total: 35.8s	remaining: 45.2s
442:	learn: 5546423.7657464	total: 35.9s	remaining: 45.1s
443:	learn: 5543268.3627989	total: 35.9s	remaining: 45s
444:	learn: 5542138.4702135	total: 36s	remaining: 44.9s
445:	learn: 5540857.4910941	total: 36.1s	remaining: 44.8s
446:	learn: 5534573.06

573:	learn: 5165521.5094238	total: 46.7s	remaining: 34.7s
574:	learn: 5162552.7966594	total: 46.8s	remaining: 34.6s
575:	learn: 5159322.5764659	total: 46.9s	remaining: 34.5s
576:	learn: 5155953.9483172	total: 47s	remaining: 34.4s
577:	learn: 5152221.5318742	total: 47s	remaining: 34.3s
578:	learn: 5151285.9799737	total: 47.1s	remaining: 34.3s
579:	learn: 5147713.4231770	total: 47.2s	remaining: 34.2s
580:	learn: 5142614.7136237	total: 47.3s	remaining: 34.1s
581:	learn: 5141560.5953666	total: 47.4s	remaining: 34s
582:	learn: 5139231.3381892	total: 47.4s	remaining: 33.9s
583:	learn: 5138010.3718483	total: 47.5s	remaining: 33.8s
584:	learn: 5137051.1078836	total: 47.6s	remaining: 33.8s
585:	learn: 5135241.7772144	total: 47.7s	remaining: 33.7s
586:	learn: 5132961.5799172	total: 47.8s	remaining: 33.6s
587:	learn: 5129863.1288365	total: 47.8s	remaining: 33.5s
588:	learn: 5128672.3682800	total: 47.9s	remaining: 33.4s
589:	learn: 5126557.8905205	total: 48s	remaining: 33.4s
590:	learn: 5125667.57

717:	learn: 4834633.9305348	total: 58.2s	remaining: 22.9s
718:	learn: 4833993.3633391	total: 58.3s	remaining: 22.8s
719:	learn: 4832034.1103594	total: 58.4s	remaining: 22.7s
720:	learn: 4828380.9904152	total: 58.5s	remaining: 22.6s
721:	learn: 4827730.6397311	total: 58.5s	remaining: 22.5s
722:	learn: 4826933.3541535	total: 58.6s	remaining: 22.5s
723:	learn: 4825792.3922188	total: 58.7s	remaining: 22.4s
724:	learn: 4821588.9833349	total: 58.8s	remaining: 22.3s
725:	learn: 4819874.0965186	total: 58.9s	remaining: 22.2s
726:	learn: 4815271.2914260	total: 58.9s	remaining: 22.1s
727:	learn: 4813829.2513464	total: 59s	remaining: 22.1s
728:	learn: 4813160.6966158	total: 59.1s	remaining: 22s
729:	learn: 4811868.0009336	total: 59.2s	remaining: 21.9s
730:	learn: 4809680.1703097	total: 59.3s	remaining: 21.8s
731:	learn: 4808802.9308096	total: 59.3s	remaining: 21.7s
732:	learn: 4805241.8683634	total: 59.4s	remaining: 21.6s
733:	learn: 4803468.0551071	total: 59.5s	remaining: 21.6s
734:	learn: 480099

861:	learn: 4574918.1498787	total: 1m 9s	remaining: 11.2s
862:	learn: 4571762.1544362	total: 1m 9s	remaining: 11.1s
863:	learn: 4570271.4904597	total: 1m 9s	remaining: 11s
864:	learn: 4567089.3710240	total: 1m 9s	remaining: 10.9s
865:	learn: 4566565.3707482	total: 1m 9s	remaining: 10.8s
866:	learn: 4563122.6007742	total: 1m 10s	remaining: 10.8s
867:	learn: 4562616.6071045	total: 1m 10s	remaining: 10.7s
868:	learn: 4560144.1357494	total: 1m 10s	remaining: 10.6s
869:	learn: 4558853.1752087	total: 1m 10s	remaining: 10.5s
870:	learn: 4558419.4855032	total: 1m 10s	remaining: 10.4s
871:	learn: 4557360.2091702	total: 1m 10s	remaining: 10.3s
872:	learn: 4556498.7255342	total: 1m 10s	remaining: 10.3s
873:	learn: 4554349.0342169	total: 1m 10s	remaining: 10.2s
874:	learn: 4553625.6860533	total: 1m 10s	remaining: 10.1s
875:	learn: 4551680.1656731	total: 1m 10s	remaining: 10s
876:	learn: 4549745.2714050	total: 1m 10s	remaining: 9.95s
877:	learn: 4546476.3232254	total: 1m 10s	remaining: 9.86s
878:	l

In [None]:
#Melhorar o resultado do SelectKBest utilizando o RandomForest Regressor
sel_kbest = SelectKBest(f_regression, k=45).fit(X_train, y_train)
X_train_sel = sel_kbest.transform(X_train)
X_test_sel = sel_kbest.transform(X_test)

regressor = RandomForestRegressor(max_depth=6, random_state=0)
regressor.fit(X_train_sel, y_train)

y_pred = regr.predict(X_test_sel)

print(mean_absolute_error(y_test, y_pred))

## Utilizando o GridSearchCV para encontrar os melhores parametros para o modelo

In [None]:
#Utilizando o GridSearchCV para encontrar os melhores valores para os parametros
parameters = {'max_depth':[2,6,10,20,30,40],
              'n_estimators':[20,50,100,200,300,500],
              'max_features':['sqrt', 'log2']}

regressor = GridSearchCV(RandomForestRegressor(), parameters, n_jobs=-1, verbose=1)
regressor.fit(X_train_sel, y_train)

y_pred = regressor.predict(X_test_sel)

print('MAE do modelo utilizando gridsearchCV')
print(mean_absolute_error(y_train, y_train_pred))
print(mean_absolute_error(y_test, y_pred))
print(regressor.best_params_)