# Applying baseline models to the prepared data

In [1]:
import warnings
# Suppress every warning category for the rest of the notebook session.
# NOTE(review): this blanket filter also hides any convergence or deprecation
# warnings the estimators further down may emit — consider narrowing it to
# specific categories once the baselines are stable.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import xgboost
import lightgbm
import catboost

from os import path
import os

Load the data...

In [3]:
# Folder that holds the preprocessed CSV files (relative to the notebook).
data_folder = './source_data'

# Read the training table indexed by passenger id, then lower-case all
# column names so later cells can refer to them uniformly.
train_csv = path.join(data_folder, 'prepr_train.csv')
data = pd.read_csv(train_csv, index_col='passengerid')
data = data.rename(columns=str.lower)
data

Unnamed: 0_level_0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,...,c,d,e,f,g,t,cherbourg,queenstown,southampton,survived
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,U0,S,...,0,0,0,0,0,0,0,0,1,0
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C,...,1,0,0,0,0,0,1,0,0,1
3,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,U0,S,...,0,0,0,0,0,0,0,0,1,1
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S,...,1,0,0,0,0,0,0,0,1,1
5,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,U0,S,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,U0,S,...,0,0,0,0,0,0,0,0,1,0
888,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S,...,0,0,0,0,0,0,0,0,1,1
889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,21.774238,1,2,W./C. 6607,23.4500,U0,S,...,0,0,0,0,0,0,0,0,1,0
890,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C,...,1,0,0,0,0,0,1,0,0,1


In [4]:
# Columns used by every model whose class name has no entry in `col_mapper`.
# The order is significant for the estimators and is kept exactly as listed.
# NOTE(review): the loaded frame also shows 'cherbourg', 'queenstown' and
# 'southampton' columns that are not part of this list — confirm the omission
# is intentional.
num_model_features = [
    'age', 'sibsp', 'parch', 'fare',
    '1cl', '2cl', '3cl',
    'capt.', 'col.', 'countess.', 'don.', 'dona.', 'dr.', 'jonkheer.',
    'lady.', 'major.', 'master.', 'miss.', 'mlle.', 'mme.', 'mr.', 'mrs.',
    'ms.', 'rev.', 'sir.',
    'female', 'male',
    'room', 'ticket_num',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 't',
]

# Alternative column set with the non-encoded columns; it is defined here but
# not referenced by any other cell visible in this notebook.
tree_model_features = [
    'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked',
    'title', 'room', 'deck', 'ticket_srs', 'ticket_num',
]

# Maps an estimator class name to the list of columns it should receive.
# Empty for now, so every model falls back to `num_model_features`.
col_mapper = {}

Now let's try several specific algorithms. Details can be found here:

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html  
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html  
https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html  
https://scikit-learn.org/stable/modules/naive_bayes.html  
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html  
https://xgboost.readthedocs.io/en/latest/get_started.html  
https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html  
https://catboost.ai/en/docs/concepts/python-usages-examples  

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [6]:
def check_model_kfold(model, data, mapper, n_splits=5):
    """Score ``model`` with stratified k-fold cross-validation.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    data : DataFrame holding the feature columns and a ``'survived'`` column.
    mapper : dict mapping an estimator class name to the list of feature
        columns to use; classes without an entry use ``num_model_features``.
    n_splits : number of folds passed to ``StratifiedKFold`` (default 5).

    Returns
    -------
    list of float
        Accuracy on the held-out part of each fold, in fold order.

    Notes
    -----
    ``model`` is fitted in place on every fold. When the function returns,
    the estimator therefore holds the fit from the *last* fold's training
    split only, not a fit on the whole of ``data``.
    """
    model_name = model.__class__.__name__
    columns = mapper[model_name] if model_name in mapper else num_model_features
    X, y = data[columns], data['survived']

    kf = StratifiedKFold(n_splits=n_splits)
    acc_score = []
    for train_idx, test_idx in kf.split(X, y):
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        y_pred = model.predict(X.iloc[test_idx])
        # accuracy_score is documented as (y_true, y_pred); keep that order.
        acc_score.append(accuracy_score(y.iloc[test_idx], y_pred))
    return acc_score

In [11]:
def check_model_100(model, data, mapper):
    """Fit ``model`` on all rows of ``data`` and report its training accuracy.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    data : DataFrame holding the feature columns and a ``'survived'`` column.
    mapper : dict mapping an estimator class name to the list of feature
        columns to use; classes without an entry use ``num_model_features``.

    Returns
    -------
    list of float
        A one-element list with the accuracy measured on the same rows the
        model was fitted on (kept as a list so callers can treat it like the
        k-fold result).

    Notes
    -----
    The score is computed on the training rows themselves, so it is not an
    estimate of performance on unseen data. ``model`` is fitted in place.
    """
    model_name = model.__class__.__name__
    columns = mapper[model_name] if model_name in mapper else num_model_features
    X, y = data[columns], data['survived']

    model.fit(X, y)
    y_pred = model.predict(X)
    # accuracy_score is documented as (y_true, y_pred); keep that order.
    return [accuracy_score(y, y_pred)]

In [7]:
# Baseline estimators, one per line, all with library-default hyperparameters
# apart from the seeds and verbosity switches shown.
models = [
    LogisticRegression(penalty='l1', solver='saga', random_state=0),
    SVC(kernel='rbf'),
    KNeighborsClassifier(),
    GaussianNB(),
    MultinomialNB(),
    ComplementNB(),
    BernoulliNB(),
    RandomForestClassifier(random_state=0),
    ExtraTreesClassifier(random_state=0),
    DecisionTreeClassifier(random_state=0),
    XGBClassifier(random_state=0, verbosity=0),
    LGBMClassifier(silent=True, verbose=-100),
    CatBoostClassifier(random_state=0, silent=True),
]

# Cross-validate every estimator and print the per-fold and mean accuracy.
# NOTE(review): in the recorded output LogisticRegression and SVC report the
# same fold scores (mean ~0.616); this looks like both predicting a single
# class, possibly because the features are not scaled — TODO confirm.
for model in models:
    acc_score = check_model_kfold(model, data, col_mapper)
    print(f'\n{model.__class__.__name__}:')
    print(f'mean_score = {np.mean(acc_score)}; acc_score = {acc_score}')


LogisticRegression:
mean_score = 0.6161634548992531; acc_score = [0.6145251396648045, 0.6179775280898876, 0.6179775280898876, 0.6179775280898876, 0.6123595505617978]

SVC:
mean_score = 0.6161634548992531; acc_score = [0.6145251396648045, 0.6179775280898876, 0.6179775280898876, 0.6179775280898876, 0.6123595505617978]

KNeighborsClassifier:
mean_score = 0.6846525641830393; acc_score = [0.659217877094972, 0.6573033707865169, 0.6797752808988764, 0.7134831460674157, 0.7134831460674157]

GaussianNB:
mean_score = 0.66784884815768; acc_score = [0.6145251396648045, 0.6910112359550562, 0.6629213483146067, 0.6629213483146067, 0.7078651685393258]

MultinomialNB:
mean_score = 0.5859581947147071; acc_score = [0.4972067039106145, 0.5561797752808989, 0.5955056179775281, 0.6348314606741573, 0.6460674157303371]

ComplementNB:
mean_score = 0.5870817902203251; acc_score = [0.4972067039106145, 0.5561797752808989, 0.601123595505618, 0.6348314606741573, 0.6460674157303371]

BernoulliNB:
mean_score = 0.79012

Now let's make predictions on the test set.

In [8]:
# Read the preprocessed test table the same way as the training table:
# indexed by passenger id, with lower-cased column names.
test_csv = path.join(data_folder, 'prepr_test.csv')
test = pd.read_csv(test_csv, index_col='passengerid')
test = test.rename(columns=str.lower)
test

Unnamed: 0_level_0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,...,b,c,d,e,f,g,t,cherbourg,queenstown,southampton
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
892,3,"Kelly, Mr. James",male,34.500000,0,0,330911,7.8292,U0,Q,...,0,0,0,0,0,0,0,0,1,0
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.000000,1,0,363272,7.0000,U0,S,...,0,0,0,0,0,0,0,0,0,1
894,2,"Myles, Mr. Thomas Francis",male,62.000000,0,0,240276,9.6875,U0,Q,...,0,0,0,0,0,0,0,0,1,0
895,3,"Wirz, Mr. Albert",male,27.000000,0,0,315154,8.6625,U0,S,...,0,0,0,0,0,0,0,0,0,1
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.000000,1,1,3101298,12.2875,U0,S,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,3,"Spector, Mr. Woolf",male,32.252151,0,0,A.5. 3236,8.0500,U0,S,...,0,0,0,0,0,0,0,0,0,1
1306,1,"Oliva y Ocana, Dona. Fermina",female,39.000000,0,0,PC 17758,108.9000,C105,C,...,0,1,0,0,0,0,0,1,0,0
1307,3,"Saether, Mr. Simon Sivertsen",male,38.500000,0,0,SOTON/O.Q. 3101262,7.2500,U0,S,...,0,0,0,0,0,0,0,0,0,1
1308,3,"Ware, Mr. Frederick",male,32.252151,0,0,359309,8.0500,U0,S,...,0,0,0,0,0,0,0,0,0,1


In [9]:
def apply_model(model, data, mapper):
    """Predict with an already fitted ``model`` and wrap the result in a frame.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)``.
    data : DataFrame holding the feature columns.
    mapper : dict mapping an estimator class name to the list of feature
        columns to use; classes without an entry use ``num_model_features``.

    Returns
    -------
    DataFrame
        One column ``'Survived'`` with the predictions, indexed like ``data``
        with the index named ``'PassengerId'``.
    """
    model_name = model.__class__.__name__
    columns = mapper[model_name] if model_name in mapper else num_model_features
    X = data[columns]

    # Index.rename returns a renamed copy. Assigning to `.index.names` on the
    # result would instead rename the Index object taken from `data`, which
    # can be shared with the caller's frame.
    result_index = X.index.rename('PassengerId')
    return pd.DataFrame({'Survived': model.predict(X)}, index=result_index)

In [10]:
result_folder = './predictions'
# Make sure the output folder exists; files already in it are kept and are
# overwritten only when a model writes a file of the same name.
os.makedirs(result_folder, exist_ok=True)

# Write one CSV per estimator, named after its class.
# NOTE(review): at this point each estimator carries whatever fit the
# preceding cross-validation cell left on it (the last fold's training
# split), not a fit on the full training table — confirm this is intended.
for model in models:
    prediction = apply_model(model, test, col_mapper)
    prediction.to_csv(path.join(result_folder, f'{model.__class__.__name__}_prediction.csv'))

In [13]:
# Fresh, unfitted copies of the same baseline estimators; `models` is
# rebound here, so the previously fitted instances are discarded.
models = [
    LogisticRegression(penalty='l1', solver='saga', random_state=0),
    SVC(kernel='rbf'),
    KNeighborsClassifier(),
    GaussianNB(),
    MultinomialNB(),
    ComplementNB(),
    BernoulliNB(),
    RandomForestClassifier(random_state=0),
    ExtraTreesClassifier(random_state=0),
    DecisionTreeClassifier(random_state=0),
    XGBClassifier(random_state=0, verbosity=0),
    LGBMClassifier(silent=True, verbose=-100),
    CatBoostClassifier(random_state=0, silent=True),
]

# Fit each estimator on the whole training table and print its accuracy on
# those same rows (a training score, not a hold-out estimate).
for model in models:
    acc_score = check_model_100(model, data, col_mapper)
    print(f'\n{model.__class__.__name__}:')
    print(f'mean_score = {np.mean(acc_score)}; acc_score = {acc_score}')


LogisticRegression:
mean_score = 0.6161616161616161; acc_score = [0.6161616161616161]

SVC:
mean_score = 0.6161616161616161; acc_score = [0.6161616161616161]

KNeighborsClassifier:
mean_score = 0.7833894500561167; acc_score = [0.7833894500561167]

GaussianNB:
mean_score = 0.6689113355780022; acc_score = [0.6689113355780022]

MultinomialNB:
mean_score = 0.5858585858585859; acc_score = [0.5858585858585859]

ComplementNB:
mean_score = 0.5858585858585859; acc_score = [0.5858585858585859]

BernoulliNB:
mean_score = 0.7946127946127947; acc_score = [0.7946127946127947]

RandomForestClassifier:
mean_score = 0.9988776655443322; acc_score = [0.9988776655443322]

ExtraTreesClassifier:
mean_score = 0.9988776655443322; acc_score = [0.9988776655443322]

DecisionTreeClassifier:
mean_score = 0.9988776655443322; acc_score = [0.9988776655443322]

XGBClassifier:
mean_score = 0.9943883277216611; acc_score = [0.9943883277216611]

LGBMClassifier:
mean_score = 0.9921436588103255; acc_score = [0.992143658810

In [14]:
result_folder = './predictions'
# Make sure the output folder exists; files already in it are kept and are
# overwritten only when a model writes a file of the same name.
os.makedirs(result_folder, exist_ok=True)

# Write one CSV per estimator; the `_100` suffix marks predictions from the
# estimators fitted on the whole training table.
for model in models:
    prediction = apply_model(model, test, col_mapper)
    prediction.to_csv(path.join(result_folder, f'{model.__class__.__name__}_prediction_100.csv'))