# LIBRARY IMPORTS

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import xgboost
from catboost import CatBoostClassifier, Pool

import warnings
# Blanket-suppress every Python warning for the rest of the session.
# NOTE(review): this also hides scikit-learn convergence / failed-fit
# warnings raised during the grid searches further down.
warnings.filterwarnings('ignore')

%matplotlib inline

# FUNCTIONS

In [64]:
def eda(df, is_train=True):
    """Build the model feature matrix (and optional target) from a raw frame.

    The input is copied first, so the caller's DataFrame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data. Must provide the columns ``Pclass``, ``Sex``,
        ``Age``, ``SibSp``, ``Parch``, ``Fare`` and ``Embarked``; ``Survived``
        is additionally required when ``is_train`` is true.
    is_train : bool, default True
        When true, the ``Survived`` column is returned as the target;
        otherwise the target is ``None``.

    Returns
    -------
    X : pandas.DataFrame
        Columns, in order: ``Pclass, SibSp, Parch, Fare, IsMale, C, Q, S,
        Age, Family``.
    y : pandas.Series or None
        ``df['Survived']`` when ``is_train`` is true, else ``None``.
    """
    df = df.copy()

    # Columns deliberately left out of the feature set:
    #   Cabin  - most values are missing
    #   Ticket - not useful
    #   Name   - possible future use

    # Age / Fare: impute missing values with the median of the frame passed in.
    # Age stays continuous (a binned variant with edges 0/18/48/100 is disabled).
    df['Age'] = df.Age.fillna(df.Age.median())
    df['Fare'] = df.Fare.fillna(df.Fare.median())

    # Gender: 1 for 'male', 0 for 'female' (any other value maps to NaN).
    df['IsMale'] = df.Sex.map({'male': 1, 'female': 0})

    # Embarked: one 0/1 indicator per port. The indicators are built explicitly
    # rather than with pd.get_dummies so that all three columns exist even when
    # a port is absent from this frame (get_dummies would omit it and the
    # column selection below would raise KeyError). Rows with a missing
    # Embarked value get 0 in all three columns.
    for port in ('C', 'Q', 'S'):
        df[port] = (df.Embarked == port).astype('uint8')

    # Family: number of relatives aboard (siblings/spouses + parents/children).
    df['Family'] = df.SibSp + df.Parch

    columns = ['Pclass', 'SibSp', 'Parch', 'Fare', 'IsMale', 'C', 'Q', 'S', 'Age', 'Family']
    X = df[columns]
    y = df['Survived'] if is_train else None
    return X, y

# LOAD DATA

In [65]:
# Read the training CSV from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# BASELINE ACCURACY

In [66]:
# Baseline accuracy: one minus the mean of Survived, i.e. the score obtained
# by always predicting 0 (assumes Survived is coded 0/1, as in the preview).
n_passengers = df.Survived.shape[0]
n_survivors = df.Survived.sum()
1 - (n_survivors / n_passengers)

0.6161616161616161

# EDA

In [67]:
# Turn the raw training frame into the feature matrix X and target y,
# then preview the engineered features.
X, y = eda(df, is_train=True)
X.head(n=5)

Unnamed: 0,Pclass,SibSp,Parch,Fare,IsMale,C,Q,S,Age,Family
0,3,1,0,7.25,1,0,0,1,22.0,1
1,1,1,0,71.2833,0,1,0,0,38.0,1
2,3,0,0,7.925,0,0,0,1,26.0,0
3,1,1,0,53.1,0,0,0,1,35.0,1
4,3,0,0,8.05,1,0,0,1,35.0,0


# SCALE DATA

In [68]:
# Rebuild the unscaled features from `df` before fitting, so this cell is
# idempotent: it rebinds X to the scaled array, and fitting on X directly would
# refit the scaler on already-scaled data whenever the cell is re-run, which
# would make the later scaler.transform() of the Kaggle features wrong.
# NOTE(review): the scaler is fit on all rows before the train/validation
# split, so validation rows influence the scaling statistics.
X_raw, y = eda(df)
scaler = StandardScaler().fit(X_raw)
X = scaler.transform(X_raw)

# SPLIT DATA

In [69]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split
# identical across runs.
split_size = 0.30
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=split_size,
    random_state=42,
)

In [70]:
# Sanity check: (rows, features) of the training and validation parts.
(X_train.shape, X_val.shape)

((623, 10), (268, 10))

# MODELING

In [71]:
# k-nearest neighbours: pick k in 1..9 by 5-fold cross-validation on the
# training part, then fit a fresh classifier with the winning setting.
knn_params = {'n_neighbors': range(1, 10)}
knn_grid = GridSearchCV(
    KNeighborsClassifier(),
    knn_params,
    cv=5,
    n_jobs=-1,
)
knn_grid = knn_grid.fit(X_train, y_train)
knn = KNeighborsClassifier(**knn_grid.best_params_)
knn = knn.fit(X_train, y_train)

In [72]:
# Gaussian naive Bayes with the class priors fixed at 0.60 / 0.40 instead of
# being estimated from y_train.
gnb = GaussianNB(priors=[.60, .40])
gnb = gnb.fit(X_train, y_train)

In [73]:
# Random forest: 5-fold grid search over split criterion and tree depth with
# 1000 balanced-class-weight trees, then a final fit with the best setting.
# random_state is pinned on both the searched and the final estimator so the
# selected hyper-parameters and the validation score are reproducible.
rfc_params = {'n_estimators': [1000],
              'criterion': ['gini', 'entropy'],
              'max_depth': range(1, 10),
              'class_weight': ['balanced']}
rfc_grid = GridSearchCV(RandomForestClassifier(random_state=42), rfc_params, cv=5, n_jobs=-1).fit(X_train, y_train)
rfc = RandomForestClassifier(random_state=42, **rfc_grid.best_params_).fit(X_train, y_train)

In [74]:
# Gradient boosting: 5-fold grid search over the number of trees
# (10, 110, ..., 510), the learning rate and the tree depth, then a final fit
# with the best setting. random_state is pinned on both estimators so the
# search result and the validation score are reproducible.
gbc_params = {'n_estimators': range(10, 600, 100),
              'learning_rate': [0.0001, 0.001, .01, .1],
              'max_depth': range(1, 10)}
gbc_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), gbc_params, cv=5, n_jobs=-1).fit(X_train, y_train)
gbc = GradientBoostingClassifier(random_state=42, **gbc_grid.best_params_).fit(X_train, y_train)

In [75]:
# Logistic regression: 5-fold grid search over the penalty type and the
# inverse regularisation strength C (1e-3 ... 1e2), then a final fit with the
# best setting.
# The solver is pinned to 'liblinear' because it supports both 'l1' and 'l2';
# the default solver in current scikit-learn releases (lbfgs) rejects 'l1', so
# half of the grid would fail to fit (silently here, since warnings are
# suppressed for the session).
log_params = {'class_weight': ['balanced'],
              'penalty': ['l1', 'l2'],
              'C': np.logspace(-3, 2, 6)}
log_grid = GridSearchCV(LogisticRegression(solver='liblinear'), log_params, cv=5, n_jobs=-1).fit(X_train, y_train)
log = LogisticRegression(solver='liblinear', **log_grid.best_params_).fit(X_train, y_train)

In [76]:
# CatBoost with fixed hyper-parameters. A 5-fold GridSearchCV over
#   iterations:    range(10, 450, 100)
#   learning_rate: np.logspace(-4, 1, 6)
#   depth:         range(1, 5)
# (CatBoostClassifier(verbose=False, loss_function='Logloss'), n_jobs=-1) is
# disabled here; the values below presumably come from that search.
cbc = CatBoostClassifier(
    depth=4,
    iterations=310,
    learning_rate=0.1,
    verbose=False,
)
cbc = cbc.fit(X_train, y_train)

In [77]:
# Report each fitted model's score on the held-out validation split,
# one "<label> <score>" line per model.
fitted_models = [
    ('knn', knn),
    ('gnb', gnb),
    ('rfc', rfc),
    ('gbc', gbc),
    ('log', log),
    ('cbc', cbc),
]
for label, estimator in fitted_models:
    print(label, estimator.score(X_val, y_val))

knn 0.7910447761194029
gnb 0.7649253731343284
rfc 0.8022388059701493
gbc 0.8097014925373134
log 0.7910447761194029
cbc 0.8283582089552238


# FINAL MODEL

In [78]:
# Final model: same CatBoost settings as `cbc`, but trained on every labelled
# row (training + validation parts together).
model = CatBoostClassifier(
    depth=4,
    iterations=310,
    learning_rate=0.1,
    verbose=False,
)
model = model.fit(X, y)

# REMOVE OLD PREDICTIONS

In [79]:
! rm predictions.csv

# GENERATE NEW PREDICTIONS

In [80]:
# Load the unlabeled test set, run it through the same feature engineering
# (no target is returned) and scale it with the scaler fitted earlier.
df_kaggle = pd.read_csv(filepath_or_buffer='test.csv')
X_kaggle, _ = eda(df_kaggle, is_train=False)
X_kaggle = scaler.transform(X_kaggle)

In [81]:
# Pair every test PassengerId with the final model's predicted label (cast to
# int8) and write the two-column table to predictions.csv without the index.
kaggle_labels = model.predict(X_kaggle).astype('int8')
dict_kaggle = {
    'PassengerId': df_kaggle['PassengerId'],
    'Survived': kaggle_labels,
}
submission = pd.DataFrame(dict_kaggle)
submission.to_csv('predictions.csv', index=False)

# FIN