In [1]:
import numpy as np
import os
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from ipynb.fs.full.n01preprocessing import save_obj

> Trained imputers.
> Applied imputers.
> Trained encoders.
> Applied encoders.
> Applied imputers.
> Applied encoders.
X_train saved in: /home/walter/Documents/personal_projects/new-titan/data/processed/X_train.csv


In [2]:
# Project
workdir = '/home/walter/Documents/personal_projects/new-titan'
exp_prefix = 'notebooks/experiments/exp_04'
data_prefix = 'data'
chk_prefix = 'checkpoint'

# Params
target = 'Survived'
features = ['Sex', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Name', 'Ticket']
idx = 'Passengerid'

# Paths 
data_train_path = os.path.join(workdir, data_prefix, 'raw/train.csv')

dict_path = os.path.join(workdir, exp_prefix, chk_prefix, 'train_dict.pkl')

# Text Replacement
app_origin = ['Mr',
 'Mrs',
 'Miss',
 'Master',
 'Don',
 'Rev',
 'Dr',
 'Mme',
 'Ms',
 'Major',
 'Mrs. Martin (Elizabeth L',
 'Lady',
 'Sir',
 'Mlle',
 'Col',
 'Capt',
 'the Countess',
 'Jonkheer',
 'other']

replacements = ['Mr', 'Mrs', 'Miss', 'Master', 'Mr', 'Rev', 'Dr', 'Mrs', 'Mrs', 'other',
 'Mrs', 'Miss', 'Mr', 'Miss', 'other', 
 'other', 'other', 'other', 'other']

replace_app = dict(zip(app_origin, replacements))

In [3]:
# models to test
models = [
    DecisionTreeClassifier(random_state = 2),
    LogisticRegression(random_state=2, max_iter=1000),
    RandomForestClassifier(random_state=0),
    GradientBoostingClassifier()
]

In [4]:
def load_data(prefix):
    X_train = np.genfromtxt(os.path.join(prefix, 'data_train', 'X_train.csv'), delimiter=',')
    y_train = np.genfromtxt(os.path.join(prefix, 'data_train', 'y_train.csv'), delimiter=',').astype('int')
    label_train = np.genfromtxt(os.path.join(prefix, 'data_train', 'label_train.csv'), delimiter=',')
   
    return X_train, y_train, label_train

def eval_model(X, y, model):
    k = 5
    kf = KFold(n_splits=k, shuffle=True, random_state=2)

    acc_score = []

    for train_index, valid_index in kf.split(X):

        # load
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        # model
        model.fit(X_train, y_train)
        y_predict = model.predict(X_valid)

        # eval
        acc = accuracy_score(y_predict, y_valid)
        acc_score.append(acc)

    avg_acc_score = sum(acc_score)/k
    return avg_acc_score  

In [5]:
X, y, label = load_data(os.path.join(workdir, data_prefix, 'processed'))
print(os.path.join(workdir, data_prefix, 'processed'))
X.shape

/home/walter/Documents/personal_projects/new-titan/data/processed


(801, 20)

In [6]:
# Load
X, y, label = load_data(os.path.join(workdir, data_prefix, 'processed'))

# Eval
res_eval = [eval_model(X, y, model) for model in models]

# save selected model
selected_model = models[res_eval.index(max(res_eval))]
save_obj(selected_model, os.path.join(workdir, exp_prefix, 'artifacts', 'selected_model.pkl'))

print(res_eval)
print(selected_model)

[0.7953027950310559, 0.8227484472049689, 0.8052717391304348, 0.8264751552795031]
GradientBoostingClassifier()
