In [None]:
pip install catboost

In [None]:
pip install lightgbm

In [None]:
pip install catboost

# Case de Estudos: Santander Value Prediction

Ajude Santander a identificar o valor das transações para cada cliente potencial. Esse é um primeiro passo que o Santander precisa acertar para personalizar seus serviços em grande escala.
De acordo com uma pesquisa da Epsilon, 80% dos clientes tendem a voltar a fazer negócios com a sua empresa se a mesma entregar um serviço personalizado.

<br>
## Link para os dados e o desafio: 

https://www.kaggle.com/c/santander-value-prediction-challenge/data

O case podera ser quebrado nas 6 partes seguintes:

    Identificar o problema
        Qual o tipo de problema(classificação, regressão, clustering)?
    Necessidades de aplicar transformaçoes?
        Ex: imputing de valores null, encoding de colunas string, etc
    Separar os sets de treinamento e teste
    Baseline
        Achar uma baseline, um primeiro modelo para ter uma referencia
    Escolher a metrica
    Melhorar o resultado
        Feature engineering, otimizaçao do modelo, hiperparametros, etc



In [1]:
#Criação e manipulação dos dataframes
import pandas as pd
pd.set_option('display.max_rows', 500)

#Operações matemáticas
import numpy as np

#Separar o dataframe em treino e teste
from sklearn.model_selection import train_test_split, GridSearchCV

#Métricas de avaliação do modelo
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

#Regressores utilizados
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.neural_network import MLPRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

#Criação de Data Matrix para o XGBoost
import xgboost as xgb

#Normalização dos dados 
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

#Ignorar os avisos do sklearn
import warnings
warnings.filterwarnings('ignore')

#Verificar o tempo de run de cada modelo 
from time import time

#Visualização gráfica dos dados
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
df_test = pd.read_csv('test.csv')
df_submission = pd.read_csv('sample_submission.csv')
display(df_test)

In [2]:
df = pd.read_csv('train.csv')

display(df)

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4454,ff85154c8,1065000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4455,ffb6b3f4f,48000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,80000.0,0,0,0,0,0,0,0
4456,ffcf61eb6,2800000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4457,ffea67e98,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


## Visualização gráfica do target

In [None]:
plt.figure(figsize=(10,6))
sns.scatterplot(range(df.shape[0]), np.sort(df['target'].values))
plt.xlabel('Index', fontsize=16)
plt.ylabel('Target', fontsize=16)

## Verificar por valores nulos no dataset

In [None]:
def verificar_nulos(df):
    i = 0
    for coluna in df.columns:
        if df[coluna].isnull().sum() > 0:
            print(f'A coluna {coluna} possui {df[coluna].isnull().sum()} valores nulos.')
        
        if df[coluna].isnull().sum() == 0:
            i += 1
    
    if i == df.shape[1]:
        print('O dataframe não possui nenhum valor nulo')
        
verificar_nulos(df)

In [None]:
colunas = []

print(f'Shape do dataframe antes: {df.shape}')

for col in df.columns:
    if col != 'ID' and col != 'target':
        if df[col].std() == 0:
            colunas.append(col)
            
df.drop(columns=colunas, inplace=True)

print(f'Shape do dataframe depois: {df.shape}')

print(f'As colunas {colunas} foram removidas')

In [None]:
#Verificando o uso de memória do dataframe
print(f'Memória usada: {df.memory_usage().sum()/(1024*1024):.2f} MBs')

## Separando o dataset em treino, teste e validação (80/10/10)

In [None]:
y = df['target']
X = df.drop(['ID', 'target'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
X_test, X_validation, y_test, y_validation = train_test_split(X_test, y_test, test_size=0.50, random_state=42)

print(f'Shape X_Train: {X_train.shape}')
print(f'Shape y_Train: {y_train.shape}')
print(f'Shape X_Test: {X_test.shape}')
print(f'Shape y_Test: {y_test.shape}')
print(f'Shape X_Validation: {X_validation.shape}')
print(f'Shape y_Validation: {y_validation.shape}')

## Testando os diferentes tipos de escalonamento com alguns modelos

In [None]:
scalers = [
    StandardScaler(),
    MinMaxScaler(),
    RobustScaler()
]

models = [
    LinearRegression(),
    Ridge(),
    MLPRegressor(),
    DecisionTreeRegressor(),
    KNeighborsRegressor()
]

for scaler in scalers:
    scaler = scaler.fit(X_train, X_test)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    print(f'Atual método utilizado: {scaler.__class__.__name__}')
    print(f'')
    
    for model in models:
        regressor = model.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        print(f'O modelo {model.__class__.__name__} teve um MAE de {mean_absolute_error(y_test, y_pred)} e um r2 score de {r2_score(y_test, y_pred)}')

## Testando os scalers com a divisão de apenas treino/teste de 80-20 no CatBoost

In [None]:
scalers = [
    StandardScaler(),
    MinMaxScaler(),
    RobustScaler()
]

#Separando o conjunto de dados em treino e teste
y = df['target']
X = df.drop(['ID', 'target'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_validation, y_test, y_validation = train_test_split(X_test, y_test, test_size=0.50, random_state=42)

#Testando cada scaler unicamente
for scaler in scalers:
    print(f'Método utilizado: {scaler.__class__.__name__}')
    scaler = scaler.fit(X_train, X_test)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    regressor = CatBoostRegressor(iterations=500, learning_rate=0.01, depth=10, eval_metric='RMSE',
                             random_seed=42, bagging_temperature=0.2, od_type='Iter', 
                             metric_period=50, od_wait=20)

    regressor.fit(X_train, y_train, eval_set=(X_validation, y_validation), use_best_model=True, verbose=50)
    y_pred = regressor.predict(X_test)
    print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
    print(f'R2:{r2_score(y_test, y_pred)}')

Método utilizado: StandardScaler




0:	learn: 8385726.8574788	test: 7652349.2934808	best: 7652349.2934808 (0)	total: 1.61s	remaining: 13m 23s


## Normalizando os dados com o Standard Scaler

In [None]:
standardscaler = StandardScaler().fit(X_train, X_test)
X_train = standardscaler.transform(X_train)
X_test = standardscaler.transform(X_test)

## Normalizando os dados com o MinMaxScaler

In [None]:
minmaxscaler = MinMaxScaler().fit(X_train, X_test)
X_train = minmaxscaler.transform(X_train)
X_test = minmaxscaler.transform(X_test)

## Normalizando os dados com o Robust Scaler

In [None]:
robustscaler = RobustScaler().fit(X_train, X_test)
X_train = robustscaler.transform(X_train)
X_test = robustscaler.transform(X_test)

## Testando os modelos

In [None]:
#Criando a lista dos modelos que serão testados
models = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    RandomForestRegressor(),
    MLPRegressor(),
    DecisionTreeRegressor(),
    KNeighborsRegressor(),
    SVR(),
    GaussianProcessRegressor(),
    XGBRegressor(),
    CatBoostRegressor(),
    LGBMRegressor()
]

#Criando as listas vazias das métricas do modelo
MAE = []
RMSE = []
MSE = []
r2 = []
names = []
parameters = []
tempo = []

#Testando modelo por modelo
for model in models:
    #Startando o cronometro para ver o tempo de cada modelo 
    tempo_inicial = time()
    
    #Chamando o regressor e fazendo o predict
    regressor = model.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    
    #Adicionando o valor da métrica de cada modelo para criar o dataframe depois
    MAE.append(mean_absolute_error(y_test, y_pred))
    RMSE.append(np.sqrt(mean_squared_error(y_test, y_pred)))
    MSE.append(mean_squared_error(y_test, y_pred))
    r2.append(r2_score(y_test, y_pred))
               
    #Para pegar o nome do modelo basta pegar o nome da clas
    names.append(model.__class__.__name__)
               
    #Para pegar os parametros utilizados
    parameters.append(model.get_params())
               
    #Parando o cronometro para ver o tempo e adicionando na lista 
    tempo_final = time() - tempo_inicial
    tempo.append(tempo_final)
               
               
#Criando o dataframe com as métricas de todos os modelos 
resultado = pd.DataFrame({'Nome': names,
                         'Parametros': parameters,
                         'MAE': MAE,
                         'RMSE': RMSE,
                         'MSE': MSE,
                         'R2': r2,
                         'Run Time': tempo})


In [None]:
display(resultado)

# Tuning dos hiperparametros 

### Utilizando o SelectKBest para verificar as features mais importantes para o modelo

In [None]:
#Melhorar o resultado do SelectKBest utilizando o RandomForest Regressor
sel_kbest = SelectKBest(f_regression, k=45).fit(X_train, y_train)
X_train_sel = sel_kbest.transform(X_train)
X_test_sel = sel_kbest.transform(X_test)

regressor = RandomForestRegressor(max_depth=6, random_state=0)
regressor.fit(X_train_sel, y_train)

y_pred = regr.predict(X_test_sel)

print(mean_absolute_error(y_test, y_pred))

## Utilizando o GridSearchCV para encontrar os melhores parametros para o modelo

In [None]:
#Utilizando o GridSearchCV para encontrar os melhores valores para os parametros
parameters = {'max_depth':[2,6,10,20,30,40],
              'n_estimators':[20,50,100,200,300,500],
              'max_features':['sqrt', 'log2']}

regressor = GridSearchCV(RandomForestRegressor(), parameters, n_jobs=-1, verbose=1)
regressor.fit(X_train_sel, y_train)

y_pred = regressor.predict(X_test_sel)

print('MAE do modelo utilizando gridsearchCV')
print(mean_absolute_error(y_train, y_train_pred))
print(mean_absolute_error(y_test, y_pred))
print(regressor.best_params_)

## Utilizando o GridSearch no XGBoost 

In [None]:
parameters = {
    'learning_rate': np.arange(0, 0.35, 0.05),
    'max_depth': [3, 4, 5, 6, 8, 10, 12, 15],
    'min_child_weight': [1, 3, 5, 7],
    'gamma': np.arange(0, 0.5, 0.1),
    'colsample_bytree': [0.3, 0.4, 0.5 , 0.7]
}

regressor = GridSearchCV(XGBRegressor(), parameters, n_jobs=-1, verbose=2)
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

print(mean_absolute_error(y_test, y_pred))
print(regressor.best_params_)

In [None]:
def xgb_regressor(train_x, train_y, validation_x, validation_y, test_x):
    parameters = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'learning_rate': 0.001,
        'max_depth': 10,
        'subsample': 0.6,
        'colsample_bytree': 0.6,
        'alpha': 0.001,
        'random_state': 42,
        'tree_method': 'gpu_hist'
    }
    
    
    #Transformando os dados em uma Data Matrix do XGboost
    training_data = xgb.DMatrix(train_x, train_y)
    validation_data = xgb.DMatrix(validation_x, validation_y)
    testing_data = xgb.DMatrix(test_x)
    
    watchlist = [(training_data, 'train'), (validation_data, 'valid')]
    
    model_xgb = xgb.train(parameters, training_data, 50, watchlist, maxizime=False,
                         early_stopping_rounds=100, verbose_eval=100)
    
    
    predict_test_xgb = np.expm1(model_xgb.predict(data_test, ntree_limit=model_xgb.best_ntree_limit))
    
    return predict_test_xgb, model_xgb


predictions_test_y_xgb, model_xgb = xgb_regressor(X_train, y_train, X_validation, y_validation, X_test)


## Tuning do modelo CatBoost 

In [None]:
regressor = CatBoostRegressor(iterations=1000, learning_rate=0.01, depth=10, eval_metric='RMSE',
                             random_seed=42, bagging_temperature=0.2, od_type='Iter', 
                             metric_period=50, od_wait=20)

regressor.fit(X_train, y_train, eval_set=(X_validation, y_validation), use_best_model=True, verbose=50)
y_pred = regressor.predict(X_test)


In [None]:
r2_score(y_test, y_pred)

## Criando o dataframe para submissão do desafio