### Sample program for Gradient Boosting Classifier  

#### Import libraries  

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from joblib import dump

#### Parameters  

In [None]:
# Location of the input CSV, given as a path relative to the working directory.
csv_in = '../ai-11/titanic.csv'

#### Read CSV file  

PassengerID: passenger ID  乗客ID  
Survived: survival outcome (1: survived, 0: died)  生存結果 (1: 生存, 0: 死亡)  
Pclass: class (grade) of passenger (1: highest)  乗客の階級 1が最高級  
Name: passenger name  乗客の名前  
Sex: sex  性別  
Age: age  年齢  
SibSp: number of siblings + spouses aboard  一緒に乗船していた兄弟＋配偶者の数  
Parch: number of parents + children aboard  一緒に乗船していた両親＋子供の数  
Ticket: ticket number  チケット番号  
Fare: passenger fare  乗船料金  
Cabin: cabin ID  部屋番号  
Embarked: port of embarkation (Cherbourg, Queenstown, Southampton)  乗船した港名  

In [None]:
# Load the passenger table; the first line of the file supplies the column names.
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
print(df.shape)
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
display(df.head())

#### Drop rows with missing values in the Age and Embarked columns  

In [None]:
# Discard passengers lacking Age or Embarked, then renumber the rows from 0
# so the index is contiguous again.
df = (
    df
    .dropna(subset=['Age', 'Embarked'])
    .reset_index(drop=True)
)

#### Extract data  

In [None]:
# Target: the Survived column.
y = df['Survived']

# Features: the selected columns, with non-numeric ones expanded into dummy
# indicators (the first level of each is dropped).
X = pd.get_dummies(
    df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']],
    drop_first=True,
)

In [None]:
# Sanity-check the feature matrix and the target before modelling.
print(X.shape)
# DataFrame.info() prints its own report and returns None; calling it inside
# print() would emit an extra "None" line.
X.info()
display(X.head())
print(y.shape)
print(y.head())

#### Define the classifier and set parameters for grid search 

In [None]:
# Candidate values explored by the grid search (3 x 3 x 3 = 27 combinations).
param_grid = dict(
    n_estimators=[50, 100, 500],
    learning_rate=[0.01, 0.1, 0.5],
    max_depth=[2, 4, 6],
)

# Base estimator; the fixed seed keeps repeated runs comparable.
gbc = GradientBoostingClassifier(random_state=0)

#### Preparation of objects for cross validation  

In [None]:
# Splitter handed to the grid search (hyperparameter selection).
grid_cv = StratifiedKFold(
    n_splits=4,
    shuffle=True,
    random_state=7,
)

# Splitter handed to cross_val_score (generalization estimate); it uses a
# different seed from the grid-search splitter.
gen_cv = StratifiedKFold(
    n_splits=4,
    shuffle=True,
    random_state=11,
)

#### Define the grid search for hyperparameters  

In [None]:
# Exhaustive search over param_grid, scored by accuracy on the grid_cv folds.
gs = GridSearchCV(
    estimator=gbc,
    param_grid=param_grid,
    cv=grid_cv,
    scoring='accuracy',
)

#### Estimation of generalization performance  

In [None]:
%%time
# Nested cross-validation: the whole grid search is re-run on each training
# split of gen_cv and scored on the matching held-out split.
nested_score = cross_val_score(
    estimator=gs,
    X=X,
    y=y,
    cv=gen_cv,
    scoring='accuracy',
)
# Per-fold accuracies first, then their mean, one per line.
print(nested_score, nested_score.mean(), sep='\n')

**Generalization performance (ave accuracy) / 汎化性能 (平均accuracy): 0.775**  

#### Cross-validation to obtain the model with the best hyperparameter set (best estimator)  
- Note: GridSearchCV refits the best estimator on the whole data (X, y) by default (refit=True), so gs_best is already fitted after gs.fit(X, y)  

In [None]:
%%time
# Run the grid search on all of the data and keep the winning estimator.
gs_best = gs.fit(X, y).best_estimator_

In [None]:
# Print the selected estimator's repr to inspect the chosen hyperparameters.
print(gs_best)

**Optimal hyperparameters / 最適なハイパーパラメータ: max_depth=4, learning_rate=0.01, n_estimators=500**  

#### Show feature importances of the best model  

In [None]:
# Label each importance value with its feature (column) name before printing.
print(
    pd.Series(
        data=gs_best.feature_importances_,
        index=X.columns,
    )
)

In [None]:
# Bar chart of the best model's feature importances, one bar per feature.
plt.ylabel('Importance')
plt.bar(x=X.columns, height=gs_best.feature_importances_)
# Rotate the labels after the bars are drawn so the feature names do not overlap.
plt.xticks(rotation=90)
plt.show()

#### Save the trained model  

In [None]:
# Persist the best estimator with joblib; the tag is embedded in the file name.
tag = 'titanic'
model_file = f'gbc_best_{tag}.joblib'
dump(gs_best, filename=model_file)