# LIBRARY IMPORTS

In [592]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import xgboost

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# FUNCTIONS

In [593]:
def eda(df, is_train=True):
    """Turn a raw passenger frame into the model feature matrix.

    The input frame is copied first, so the caller's frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Pclass``, ``Sex``, ``Age``, ``SibSp``, ``Parch``,
        ``Fare`` and ``Embarked``; ``Survived`` is also required when
        ``is_train`` is true.
    is_train : bool, default True
        When true, the ``Survived`` column is returned as the target.

    Returns
    -------
    X : pandas.DataFrame
        Columns, in order: Pclass, SibSp, Parch, Fare, IsMale, C, Q, S,
        Age, Family.
    y : pandas.Series or None
        ``df['Survived']`` when ``is_train`` is true, otherwise ``None``.
    """
    df = df.copy()

    # Cabin
    # most values are missing

    # Ticket
    # not useful

    # Name
    # possible use

    # Age: fill gaps with the median of this frame, then bucket into three
    # ordinal bands (0: <=18, 1: 18-48, 2: 48-100).
    # include_lowest=True keeps an age of exactly 0 in the first band instead
    # of producing NaN, which would make the int8 cast fail.
    df['Age'] = df.Age.fillna(df.Age.median())
    df['Age'] = pd.cut(df.Age, [0, 18, 48, 100], labels=[0,1,2],
                       include_lowest=True).astype('int8')

    # Fare: fill gaps with the median of this frame
    df['Fare'] = df.Fare.fillna(df.Fare.median())

    # Gender (values other than 'male'/'female' map to NaN)
    df['IsMale'] = df.Sex.map({'male': 1, 'female':0})

    # Embarked: always emit the same three indicator columns, even when a
    # port is absent from this frame; otherwise the column selection below
    # raises KeyError. Rows with a missing port get 0 in all three columns.
    ports = ['C', 'Q', 'S']
    edf = (pd.get_dummies(df.Embarked)
             .reindex(columns=ports, fill_value=0)
             .astype('uint8'))
    for port in ports:
        df[port] = edf[port]

    # Family
    df['Family'] = df.SibSp + df.Parch

    # Columns
    columns = ['Pclass', 'SibSp', 'Parch', 'Fare', 'IsMale', 'C', 'Q', 'S', 'Age', 'Family']
    X = df[columns]
    y = df['Survived'] if is_train else None
    return X, y

In [594]:
def stacking(names, models, features):
    """Collect each model's positive-class probability into one frame.

    ``names`` and ``models`` are paired positionally; every model must expose
    ``predict_proba``. Column 1 of each probability matrix becomes the
    DataFrame column labelled with the matching name.
    """
    columns = {}
    for label, estimator in zip(names, models):
        probabilities = estimator.predict_proba(features)
        columns[label] = probabilities[:, 1]
    return pd.DataFrame(columns)

# LOAD DATA

In [595]:
# Read the labelled training set from the working directory and preview the first two rows.
df = pd.read_csv('train.csv')
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [596]:
# Build the feature matrix X and the Survived target y from the raw training frame.
X, y = eda(df)
X.head(2)

Unnamed: 0,Pclass,SibSp,Parch,Fare,IsMale,C,Q,S,Age,Family
0,3,1,0,7.25,1,0,0,1,1,1
1,1,1,0,71.2833,0,1,0,0,1,1


In [618]:
# baseline accuracy: share of rows with Survived == 0, i.e. the score of
# always predicting "did not survive"
1 - y.mean()

0.6161616161616161

# SCALE DATA

In [597]:
# Standardise every feature column; the fitted `scaler` is reused later for the Kaggle test features.
# After this cell X is the array returned by `transform`, no longer the DataFrame returned by eda().
# NOTE(review): the scaler is fit on the full labelled set before the train/val/test split,
# so validation/test statistics leak into the transform — consider fitting on the training split only.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

# SPLIT DATA

In [619]:
# Two successive splits with the same hold-out fraction and seed:
#   1) hold out X_val / y_val (used to score layer 0 and to fit the layer-1 model),
#   2) carve X_test / y_test out of the remaining training rows (used for the final evaluation).
split_size = 0.30
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=split_size, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=split_size, random_state=42)

In [620]:
# Row/column counts of the train, validation and test partitions, in that order.
tuple(part.shape for part in (X_train, X_val, X_test))

((436, 10), (268, 10), (187, 10))

# MODELING LAYER 0

In [600]:
# Tune the neighbour count (1..9) with 5-fold cross-validation on the training split.
knn_params = {'n_neighbors': range(1, 10)}
knn_grid = GridSearchCV(
    KNeighborsClassifier(),
    knn_params,
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)
# With the default refit=True the search has already fit the best configuration
# on all of X_train / y_train, so that fitted estimator is used directly.
knn = knn_grid.best_estimator_

In [601]:
# Gaussian naive Bayes with fixed class priors of 0.80 / 0.20.
# NOTE(review): the baseline cell reports ~0.62 as the non-survivor share —
# confirm that the hard-coded 0.80 / 0.20 priors are intentional.
gnb = GaussianNB(priors=[.80, .20]).fit(X_train, y_train)

In [602]:
# Random-forest grid: 1000 class-balanced trees, searching split criterion and
# tree depth (1..9) with 5-fold cross-validation.
rfc_params = {'n_estimators': [1000],
              'criterion': ['gini', 'entropy'],
              'max_depth': range(1, 10),
              'class_weight': ['balanced']}
# A fixed random_state makes both the search and the selected forest reproducible across runs.
rfc_grid = GridSearchCV(RandomForestClassifier(random_state=42), rfc_params, cv=5, n_jobs=-1).fit(X_train, y_train)
# GridSearchCV (refit=True by default) has already refit the winning configuration
# on all of X_train / y_train; reuse it instead of training a second 1000-tree forest.
rfc = rfc_grid.best_estimator_

In [603]:
# Gradient-boosting grid: number of stages, learning rate and tree depth,
# evaluated with 5-fold cross-validation.
gbc_params = {'n_estimators': range(10, 600, 100),
              'learning_rate': [0.0001, 0.001, .01, .1],
              'max_depth': range(1, 10)}
# A fixed random_state makes both the search and the selected model reproducible across runs.
gbc_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), gbc_params, cv=5, n_jobs=-1).fit(X_train, y_train)
# GridSearchCV (refit=True by default) has already refit the winning configuration
# on all of X_train / y_train; reuse it instead of fitting the same model again.
gbc = gbc_grid.best_estimator_

In [604]:
# Logistic-regression grid: L1 vs L2 penalty and inverse regularisation strength
# C from 1e-3 to 1e2, evaluated with 5-fold cross-validation.
log_params = {'class_weight': ['balanced'],
              'penalty': ['l1', 'l2'],
              'C': np.logspace(-3, 2, 6)}
# The solver is pinned to 'liblinear' because it supports both penalties in the grid.
# With scikit-learn's current default solver ('lbfgs') every 'l1' candidate fails to
# fit, and since warnings are globally silenced in the imports cell those candidates
# would be dropped from the search without any visible message.
log_grid = GridSearchCV(LogisticRegression(solver='liblinear'), log_params, cv=5, n_jobs=-1).fit(X_train, y_train)
# GridSearchCV (refit=True by default) has already refit the best configuration on
# all of X_train / y_train, so that fitted estimator is used directly.
log = log_grid.best_estimator_

In [605]:
# TODO: xgboost model is not wired in yet — `dtrain` and `num_round` are not defined
# in the visible cells, so the lines below cannot simply be uncommented as they stand.
# xgb_param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
# xgb = xgboost.train(xgb_param, dtrain, num_round)

In [606]:
# Validation accuracy of every layer-0 model, one "<name> <score>" line each.
for label, estimator in (('knn', knn), ('gnb', gnb), ('rfc', rfc), ('gbc', gbc), ('log', log)):
    print(label, estimator.score(X_val, y_val))

knn 0.8071748878923767
gnb 0.757847533632287
rfc 0.7982062780269058
gbc 0.8071748878923767
log 0.7847533632286996


# MODELING LAYER 1

In [607]:
# Layer-1 training frame: each base model's positive-class probability on the
# validation split. Names and models are written as pairs and unzipped into the
# two positional sequences that stacking() expects.
df_layer1 = stacking(
    *zip(('knn', knn), ('gnb', gnb), ('rfc', rfc), ('gbc', gbc), ('log', log)),
    X_val
)
df_layer1.head()

Unnamed: 0,knn,gnb,rfc,gbc,log
0,0.125,0.210608,0.520323,0.25492,0.446378
1,0.125,0.027277,0.264134,0.173297,0.446378
2,0.375,0.012027,0.202766,0.295815,0.446378
3,0.625,0.767363,0.904982,0.950287,0.597645
4,0.625,0.942627,0.623743,0.682912,0.597645


In [608]:
# Layer-1 (meta) model: default logistic regression fit on the stacked
# validation-split probabilities.
model = LogisticRegression()
model.fit(df_layer1, y_val);  # trailing ';' keeps the cell output empty, as before

# EVALUATE FINAL MODEL

In [609]:
# Accuracy of the stacked model on the held-out test split.
accuracy_score(
    y_test,
    model.predict(
        stacking(['knn', 'gnb', 'rfc', 'gbc', 'log'], [knn, gnb, rfc, gbc, log], X_test)
    ),
)

0.8023952095808383

In [610]:
# Comparison point: average the five base-model probabilities per row and
# predict "survived" when the mean exceeds 0.50 (no layer-1 model involved).
accuracy_score(
    y_test,
    stacking(['knn', 'gnb', 'rfc', 'gbc', 'log'], [knn, gnb, rfc, gbc, log], X_test).mean(axis=1) > 0.50,
)

0.8083832335329342

# PREDICTIONS

In [611]:
# Read the unlabelled Kaggle test set and build its features; is_train=False
# means eda() returns None as the target, which is discarded here.
df_kaggle = pd.read_csv('test.csv')
X_kaggle, _ = eda(df_kaggle, False)

In [612]:
# Apply the scaler fitted on the training features. The features are rebuilt
# from df_kaggle on every run so that re-executing this cell can never scale
# already-scaled data a second time.
X_kaggle = scaler.transform(eda(df_kaggle, is_train=False)[0])

In [613]:
# Layer-0 probabilities for the Kaggle features. The columns must match
# df_layer1 (the frame `model` was fit on), so all five base models are
# stacked here, including `log`.
X_kaggle_stacked = stacking(['knn', 'gnb', 'rfc', 'gbc', 'log'], [knn, gnb, rfc, gbc, log], X_kaggle)

In [None]:
# Pair every PassengerId from the Kaggle file with the layer-1 model's predicted class.
dict_kaggle = {'PassengerId' : df_kaggle['PassengerId'], 'Survived': model.predict(X_kaggle_stacked)}

In [None]:
# Write the two-column result (PassengerId, Survived) to predictions.csv without the index column.
pd.DataFrame(dict_kaggle).to_csv('predictions.csv', index=False)