# Final Project, Gradient Boosting Model
[Santander Customer Satisfaction](https://www.kaggle.com/c/santander-customer-satisfaction) research

## Load open source libraries

In [23]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier,ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score,roc_curve

## Load datasets

In [16]:
df_train = pd.read_csv("data/clean_train.csv")
df_train.shape

(76020, 308)

In [17]:
df_train.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,39205.17,0
1,3,2,34,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,49278.03,0
2,4,2,23,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,67333.77,0
3,8,2,37,0,195,195,0,0,0,0,...,0,0,0,0,0,0,0,0,64007.97,0
4,10,2,39,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,117310.979016,0


In [18]:
df_test = pd.read_csv("data/clean_test.csv")
df_test.shape

(75818, 307)

In [19]:
df_test.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,2,2,32,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,40532.1
1,5,2,35,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,45486.72
2,6,2,23,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,46993.95
3,7,2,24,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,187898.61
4,9,2,23,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,73649.73


In [20]:
# train dataset
y = df_train['TARGET']
X = df_train.drop(['ID','TARGET'], axis=1)

# test dataset
test_id = df_test['ID']
test = df_test.drop(['ID'], axis=1)

#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
print(X_train.shape, X_test.shape, test.shape)

((60816, 306), (15204, 306), (75818, 306))


In [24]:
#Feature selection
fs_model = ExtraTreesClassifier(n_estimators=50, max_depth=50)
selector = fs_model.fit(X_train, y_train)
fs = SelectFromModel(selector, prefit=True)

X_train = fs.transform(X_train)
X_test = fs.transform(X_test)
test = fs.transform(test)

print(X_train.shape, X_test.shape, test.shape)

((60816, 63), (15204, 63), (75818, 63))


In [25]:
# use GradientBoosting to find important features
model = GradientBoostingClassifier(n_estimators=150, max_depth=5, learning_rate=0.1)
model.fit(X_train, y_train)
scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=10)
print 'CV AUC {}, Average AUC {}'.format(scores, scores.mean())

CV AUC [ 0.81404256  0.83328768  0.83695157  0.84024698  0.80679962  0.84205268
  0.82523841  0.82106297  0.82912656  0.82269527], Average AUC 0.827150429984


In [26]:
roc_auc_score(y_test, model.predict_proba(X_test)[:,1],average='weighted')
# 0.83474214287524717 - simple GBC

0.85459009412282372

In [27]:
y_pred = model.predict_proba(test)

In [28]:
submission = pd.DataFrame({"ID":test_id, "TARGET": y_pred[:,1]})

In [29]:
submission.to_csv("submission_gb_params.csv", index=False)

In [None]:
#Public Leaderboard: 0.832551 that is less than simple GBC
#This leaderboard is calculated on approximately 50% of the test data.