# Now a bit of Reverse Engineering

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier

from os import path
import os

First, let's restore the cheat answers for the test set and save them.

In [2]:
# Rebuild the labels of the test split from the external "cheat" table and
# write them to the predictions folder.
data_folder = './source_data'
raw_train = pd.read_csv(path.join(data_folder, 'train.csv'), index_col='PassengerId')
raw_test = pd.read_csv(path.join(data_folder, 'test.csv'), index_col='PassengerId')

# Stack both splits and order them by passenger name so the rows can be paired
# positionally with the cheat table, which is sorted by name as well.
train_test = pd.concat([raw_train, raw_test])
train_test = train_test.sort_values(by='Name')

cheat = pd.read_csv(path.join('./special_data', 'cheat.csv')).rename(str.lower, axis='columns')
cheat = cheat.sort_values(by='name')

# NOTE(review): the pairing below is purely positional. It assumes both tables
# hold the same passengers and sort into the same order by name; duplicate or
# differently spelled names would silently misalign labels -- TODO confirm.
result = pd.DataFrame(index=train_test.index, data={'Survived': cheat['survived'].values})

# Keep only the test passengers, selected by their ids rather than by a
# hard-coded row offset, and return them in ascending PassengerId order.
result = result.loc[raw_test.index].sort_index()

result_folder = './predictions'
# Create the output folder on a fresh checkout instead of failing in to_csv.
os.makedirs(result_folder, exist_ok=True)
result.to_csv(path.join(result_folder, 'cheat_answers.csv'))

Now let's reproduce our best non-cheating classification and save its predictions as well.

In [3]:
# Columns passed to the model as features. The grouping comments below are
# inferred from the column names only -- TODO confirm against the
# preprocessing step that produces these columns.
num_model_features = [
    # numeric passenger attributes
    'age', 'sibsp', 'parch', 'fare',
    # presumably passenger-class indicators
    '1cl', '2cl', '3cl',
    # presumably title indicators
    'capt.', 'col.', 'countess.', 'don.', 'dona.', 'dr.', 'jonkheer.',
    'lady.', 'major.', 'master.', 'miss.', 'mlle.', 'mme.', 'mr.', 'mrs.',
    'ms.', 'rev.', 'sir.',
    # presumably sex indicators
    'female', 'male',
    # remaining columns
    'room', 'ticket_num',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 't',
]

def check_model_kfold(model, data, n_splits=5):
    """Cross-validate ``model`` with stratified folds and return the fold accuracies.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. The same object is
        refitted on every fold, so it is left fitted on the last training fold.
    data : pandas.DataFrame
        Must contain every column listed in ``num_model_features`` plus the
        ``'survived'`` target column.
    n_splits : int, default 5
        Number of folds passed to ``StratifiedKFold``.

    Returns
    -------
    list
        One accuracy value per fold, in fold order.
    """
    X, y = data[num_model_features], data['survived']
    splitter = StratifiedKFold(n_splits=n_splits)
    fold_scores = []
    for train_idx, test_idx in splitter.split(X, y):
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        y_pred = model.predict(X.iloc[test_idx])
        # accuracy_score is documented as (y_true, y_pred); keep that order.
        fold_scores.append(accuracy_score(y.iloc[test_idx], y_pred))
    return fold_scores

def check_model_100(model, data):
    """Fit ``model`` on all of ``data`` and score it on the same rows.

    The returned value is therefore a training accuracy, not a held-out
    estimate. It is wrapped in a one-element list so the result has the same
    shape as the per-fold list from ``check_model_kfold``.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is left fitted on
        the full data set.
    data : pandas.DataFrame
        Must contain every column listed in ``num_model_features`` plus the
        ``'survived'`` target column.

    Returns
    -------
    list
        ``[accuracy]`` computed on the rows the model was fitted on.
    """
    X, y = data[num_model_features], data['survived']
    model.fit(X, y)
    y_pred = model.predict(X)
    # accuracy_score is documented as (y_true, y_pred); keep that order.
    return [accuracy_score(y, y_pred)]

# Load the preprocessed training table; column names are lower-cased so they
# match the entries of num_model_features.
data_folder = './source_data'
data = (
    pd.read_csv(path.join(data_folder, 'prepr_train.csv'), index_col='passengerid')
    .rename(str.lower, axis='columns')
)

# Candidate models; each one is fitted on the full training table below and
# stays fitted for the prediction step later in the notebook.
models = [CatBoostClassifier(random_state=0, silent=True)]

for model in models:
    # check_model_100 fits on all rows and scores on those same rows.
    acc_score = check_model_100(model, data)
    print(f'\n{model.__class__.__name__}:')
    print(f'mean_score = {np.mean(acc_score)}; acc_score = {acc_score}')
    
def apply_model(model, data):
    """Predict with a fitted model and return the labels as a one-column frame.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict(X)``.
    data : pandas.DataFrame
        Must contain every column listed in ``num_model_features``. It is not
        modified by this function.

    Returns
    -------
    pandas.DataFrame
        A single ``'Survived'`` column holding ``model.predict`` output, with
        the row labels of ``data`` and the index named ``'PassengerId'``.
    """
    X = data[num_model_features]
    predictions = pd.DataFrame({'Survived': model.predict(X)}, index=X.index)
    # rename_axis returns a frame carrying a renamed copy of the index.
    # Assigning to `.index.names` instead renames the Index object in place,
    # and that object can be shared with the caller's `data` frame.
    return predictions.rename_axis('PassengerId')

# Predict on the preprocessed test table with every fitted model and write one
# CSV per model, named after the model class.
result_folder = './predictions'
test = (
    pd.read_csv(path.join(data_folder, 'prepr_test.csv'), index_col='passengerid')
    .rename(str.lower, axis='columns')
)
for model in models:
    # y_test stays bound to the last model's predictions after the loop and is
    # compared against the cheat answers further down.
    y_test = apply_model(model, test)
    y_test.to_csv(path.join(result_folder, f'{model.__class__.__name__}_prediction_100.csv'))


CatBoostClassifier:
mean_score = 0.9248035914702581; acc_score = [0.9248035914702581]


Now let's extract the wrong answers: the test passengers for which the model's prediction differs from the cheat answer.

In [9]:
# Put the cheat labels and the model's predictions side by side (joined on the
# index) and keep only the rows where the two disagree.
wrong_answers = (
    result
    .join(y_test, lsuffix='_cheat', rsuffix='_guess')
    .loc[lambda both: both['Survived_cheat'] != both['Survived_guess']]
)
wrong_answers

Unnamed: 0_level_0,Survived_cheat,Survived_guess
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
893,1,0
897,1,0
899,1,0
910,0,1
911,1,0
...,...,...
1286,1,0
1297,1,0
1301,0,1
1302,0,1


In [11]:
# Store the misclassified passengers next to the other special-purpose data.
wrong_answers_path = path.join('./special_data', 'wrong_answers.csv')
wrong_answers.to_csv(wrong_answers_path)