# Case de Estudos: Santander Value Prediction

Ajude **Santander** a identificar o valor das transações para cada cliente potencial. Esse é um primeiro passo que o **Santander** precisa acertar para personalizar seus serviços em grande escala.
<br>
De acordo com uma pesquisa da Epsilon, 80% dos clientes tendem a voltar a fazer negócios com a sua empresa se a mesma entregar um serviço personalizado.
<br>
https://www.kaggle.com/c/santander-value-prediction-challenge/data

O case podera ser quebrado nas **6** partes seguintes:  
- **Identificar o problema**
  - Qual o tipo de problema(classificação, regressão, clustering)? 
- **Necessidades de aplicar transformaçoes?**
  - Ex: *imputing* de valores null, *encoding* de colunas *string*, etc 
- **Separar os sets de treinamento e teste**
- **Baseline**
  - Achar uma baseline, um primeiro modelo para ter uma referencia
- **Escolher a metrica**
- **Melhorar o resultado**
  - Feature engineering, otimizaçao do modelo, hiperparametros, etc

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('santander/train.csv')
display(df)

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4454,ff85154c8,1065000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4455,ffb6b3f4f,48000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,80000.0,0,0,0,0,0,0,0
4456,ffcf61eb6,2800000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4457,ffea67e98,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Columns: 4993 entries, ID to 9fc776466
dtypes: float64(1845), int64(3147), object(1)
memory usage: 169.9+ MB


In [None]:
df["target"].describe()

count    4.459000e+03
mean     5.944923e+06
std      8.234312e+06
min      3.000000e+04
25%      6.000000e+05
50%      2.260000e+06
75%      8.000000e+06
max      4.000000e+07
Name: target, dtype: float64

In [None]:
id = df["ID"]
y = df["target"]
X = df.drop(["ID", "target"], axis=1)

## Split train test

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

## Baseline

In [None]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_absolute_error

y_train_pred = reg.predict(X_train)
y_pred = reg.predict(X_test)
print(mean_absolute_error(y_train, y_train_pred))
print(mean_absolute_error(y_test, y_pred))

13365.82028544444
406402827989822.06


In [None]:
from sklearn.feature_selection import SelectKBest, f_regression
sel_kbest = SelectKBest(f_regression, k=45).fit(X_train, y_train)

  corr /= X_norms


In [None]:
X_train_sel = sel_kbest.transform(X_train)
X_test_sel = sel_kbest.transform(X_test)

In [None]:
X_train_sel.shape

(3567, 45)

In [None]:
reg = LinearRegression().fit(X_train_sel, y_train)

y_train_pred = reg.predict(X_train_sel)
y_pred = reg.predict(X_test_sel)

print(mean_absolute_error(y_train, y_train_pred))
print(mean_absolute_error(y_test, y_pred))

5322548.588998503
5230263.099091363


## Escolher a métrica

In [None]:
from sklearn.metrics import mean_squared_log_error

In [None]:
y_train_pred = reg.predict(X_train_sel)
y_pred = reg.predict(X_test_sel)

In [None]:
y_train = y_train[y_train_pred > 0]
y_train_pred = y_train_pred[y_train_pred > 0]
y_test = y_test[y_pred > 0]
y_pred = y_pred[y_pred > 0]

In [None]:
print(mean_squared_log_error(y_train, y_train_pred))
print(mean_squared_log_error(y_test, y_pred))

3.9112778699926625
3.74522707164624


## Melhorar o resultado

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

In [None]:
sel_kbest = SelectKBest(f_regression, k=45).fit(X_train, y_train)
X_train_sel = sel_kbest.transform(X_train)
X_test_sel = sel_kbest.transform(X_test)

  corr /= X_norms


In [None]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=6, random_state=0)
regr.fit(X_train_sel, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=6, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [None]:
y_train_pred = regr.predict(X_train_sel)
y_pred = regr.predict(X_test_sel)


print(mean_squared_log_error(y_train, y_train_pred))
print(mean_squared_log_error(y_test, y_pred))

3.8324753278229275
3.719739927585444


In [None]:
from sklearn.model_selection import GridSearchCV
parameters = {'max_depth':[2, 6, 10, 20], 'n_estimators':[20, 50, 100, 200]}
regr = RandomForestRegressor(random_state=0, n_jobs=-1)

clf = GridSearchCV(regr, parameters)
clf.fit(X_train_sel, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=-1,
                                             oob_score=False, random_state=0,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jobs

In [None]:
y_train_pred = clf.predict(X_train_sel)
y_pred = clf.predict(X_test_sel)


print(mean_squared_log_error(y_train, y_train_pred))
print(mean_squared_log_error(y_test, y_pred))

2.6419218238697284
3.2293333350212725


In [None]:
clf.best_estimator_

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=20, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=-1, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)