In [None]:
import pandas as pd
import numpy as np
%pylab inline
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import StandardScaler

# Leitura e Tratamento dos Dados

In [None]:
# Read the training and test CSVs from the local dataset folder, then report their shapes.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/train.csv', './dataset/test.csv')
)
print(f'Train set shape: {train.shape}\nTest set shape: {test.shape}')

In [None]:
# Show the first five rows of the training frame as a quick look at the raw columns.
train.head(n=5)

In [None]:
# Frequency of each 'Embarked' value in the training set (missing values are not counted).
train['Embarked'].value_counts(dropna=True)

In [None]:
# Frequency of each 'Sex' value in the training set (missing values are not counted).
train['Sex'].value_counts(dropna=True)

In [None]:
# Binary sex indicator: 1 where 'Sex' equals 'female', 0 for anything else
# (including missing values), as a 64-bit integer column on both frames.
train['Sex_b'] = train['Sex'].eq('female').astype('int64')
test['Sex_b'] = test['Sex'].eq('female').astype('int64')

In [None]:
# Sanity check: the counts of the encoded column should mirror the counts of 'Sex'.
train['Sex_b'].value_counts(dropna=True)

In [None]:
# Feature engineering for the training frame. Every statement adds new columns
# to `train`; the raw columns are left untouched.

# Embarked: indicator columns for 'S' and 'C'. Any other value, including a
# missing one, is 0 in both.
train['Embarked_S'] = (train['Embarked'] == 'S').astype(int)
train['Embarked_C'] = (train['Embarked'] == 'C').astype(int)

# Cabin: a missing-value flag plus, for each letter, the number of times that
# letter occurs in the cabin string (a count, not a 0/1 flag).
train['Cabin_null'] = train['Cabin'].isnull().astype(int)
train_cabin = train['Cabin'].fillna('')
for deck in 'CEGDABFT':
    train[f'Cabin_{deck}'] = train_cabin.str.count(deck).astype(int)

# Name: one flag per honorific. regex=False makes the trailing '.' a literal
# dot. Interpreted as a regex it matched any character, so 'Mr.' also flagged
# every 'Mrs.' and patterns such as 'Col.' or 'Don.' matched ordinary surnames.
# na=False keeps a missing name from producing NaN before the integer cast.
for title in ('Miss', 'Mrs', 'Master', 'Col', 'Major', 'Mr', 'Dr', 'Don', 'Sir'):
    train[f'Name_{title}'] = (
        train['Name'].str.contains(f'{title}.', regex=False, na=False).astype(int)
    )

# Ticket_cat: integer code per distinct ticket string, numbered in order of
# first appearance. pd.factorize yields the same codes as looking each value up
# in list(unique()), without rebuilding that list for every row.
train['Ticket_cat'] = pd.factorize(train['Ticket'])[0]

# Ticket_num: the second whitespace-separated token when it is numeric,
# otherwise the first token when it is numeric, otherwise 0. pd.to_numeric
# turns the mix of digit strings and the integer 0 into a numeric column.
train['Ticket_num'] = pd.to_numeric(
    train['Ticket'].str.split().map(
        lambda parts: parts[1] if len(parts) > 1 and parts[1].isnumeric()
        else parts[0] if parts[0].isnumeric()
        else 0
    )
)

In [None]:
# Feature engineering for the test frame, producing the same column names as
# the training frame so that one feature list can select from both.

# Embarked: indicator columns for 'S' and 'C'. Any other value, including a
# missing one, is 0 in both.
test['Embarked_S'] = (test['Embarked'] == 'S').astype(int)
test['Embarked_C'] = (test['Embarked'] == 'C').astype(int)

# Cabin: a missing-value flag plus, for each letter, the number of times that
# letter occurs in the cabin string (a count, not a 0/1 flag).
test['Cabin_null'] = test['Cabin'].isnull().astype(int)
test_cabin = test['Cabin'].fillna('')
for deck in 'CEGDABFT':
    test[f'Cabin_{deck}'] = test_cabin.str.count(deck).astype(int)

# Name: one flag per honorific. regex=False makes the trailing '.' a literal
# dot. Interpreted as a regex it matched any character, so 'Mr.' also flagged
# every 'Mrs.' and patterns such as 'Col.' or 'Don.' matched ordinary surnames.
# na=False keeps a missing name from producing NaN before the integer cast.
for title in ('Miss', 'Mrs', 'Master', 'Col', 'Major', 'Mr', 'Dr', 'Don', 'Sir'):
    test[f'Name_{title}'] = (
        test['Name'].str.contains(f'{title}.', regex=False, na=False).astype(int)
    )

# Ticket_cat: reuse the training vocabulary (codes numbered by order of first
# appearance in train['Ticket']) so a given code means the same ticket in both
# frames. Encoding the test tickets on their own produced codes unrelated to
# the ones the model is fitted on. Tickets absent from the training set get -1.
train_ticket_codes = {
    ticket: code for code, ticket in enumerate(train['Ticket'].unique())
}
test['Ticket_cat'] = test['Ticket'].map(train_ticket_codes).fillna(-1).astype(int)

# Ticket_num: the second whitespace-separated token when it is numeric,
# otherwise the first token when it is numeric, otherwise 0. pd.to_numeric
# turns the mix of digit strings and the integer 0 into a numeric column.
test['Ticket_num'] = pd.to_numeric(
    test['Ticket'].str.split().map(
        lambda parts: parts[1] if len(parts) > 1 and parts[1].isnumeric()
        else parts[0] if parts[0].isnumeric()
        else 0
    )
)

In [None]:
# Names of the model input columns, assembled group by group. The resulting
# order is: passenger basics, cabin columns, name-title flags, ticket columns.
variaveis = (
    ['Sex_b', 'Age', 'Pclass', 'Embarked_S', 'Embarked_C', 'SibSp', 'Parch', 'Fare', 'Cabin_null']
    + [f'Cabin_{letter}' for letter in 'CEGDABFT']
    + [f'Name_{honorific}'
       for honorific in ('Miss', 'Mrs', 'Master', 'Col', 'Major', 'Mr', 'Dr', 'Don', 'Sir')]
    + ['Ticket_num', 'Ticket_cat']
)

In [None]:
# Target vector, then the two design matrices: the selected feature columns of
# each frame with every missing value replaced by the sentinel -1.
y = train['Survived']
X, X_test = (frame[variaveis].fillna(-1) for frame in (train, test))

In [None]:
# Standardise 'Fare': the scaler's statistics are learned from the training
# column only and then applied unchanged to both the training and test columns.
scaler = StandardScaler().fit(X[['Fare']])
X['Fare'] = scaler.transform(X[['Fare']])
X_test['Fare'] = scaler.transform(X_test[['Fare']])

In [None]:
# Show the first five rows of the prepared feature matrix.
X.head(n=5)

# Decision Tree

In [None]:
# Repeated 2-fold cross-validation (10 repeats, so 20 fits) of a depth-limited
# decision tree. Training and validation accuracy are recorded for every fold.
kf = RepeatedKFold(n_splits=2, n_repeats=10, random_state=42)

step_train = []  # training accuracy of each fold
step = []        # validation accuracy of each fold

for linhas_train, linhas_cv in kf.split(X):
    X_train, X_cv = X.iloc[linhas_train], X.iloc[linhas_cv]
    y_train, y_cv = y.iloc[linhas_train], y.iloc[linhas_cv]

    # random_state pins the tree's internal feature shuffling. Without it the
    # scores change from run to run even though the splitter itself is seeded.
    model = DecisionTreeClassifier(max_depth=15, random_state=42)
    model.fit(X_train, y_train)

    yhat_train = model.predict(X_train)
    yhat = model.predict(X_cv)

    # Accuracy = share of rows whose predicted label equals the true label.
    acc_train = np.mean(y_train == yhat_train)
    acc = np.mean(y_cv == yhat)
    print(f'acc_train: {acc_train}, acc_cv: {acc}\n')

    step_train.append(acc_train)
    step.append(acc)

In [None]:
# Average the per-fold accuracies collected by the cross-validation loop.
train_mean, cv_mean = np.mean(step_train), np.mean(step)
print(f'Train mean: {train_mean}, CV mean: {cv_mean}')

In [None]:
# Distribution of the per-fold validation accuracies. Uses the explicitly
# imported pyplot module rather than the `pylab` name injected by the %pylab
# magic, and labels the axes so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(step)
ax.set(
    title='Cross-validation accuracy per fold',
    xlabel='Validation accuracy',
    ylabel='Number of folds',
)
plt.show()

In [None]:
# Baseline that predicts survival exactly when Sex_b == 1, scored on X_cv / y_cv
# as left by the final iteration of the cross-validation loop (one fold only).
gender_submission = X_cv['Sex_b'].eq(1).astype(np.int64)
(y_cv == gender_submission).mean()

# Criando o Resultado das Previsões para Importar ao Kaggle

In [None]:
# Fit a final tree on the complete training data and predict the test set.
# The earlier `yhat` holds predictions for the last cross-validation fold: its
# length does not match the test frame and it does not describe the test
# passengers, so it cannot be used for the submission.
final_model = DecisionTreeClassifier(max_depth=15, random_state=42)
final_model.fit(X, y)

# One prediction per test row, indexed by PassengerId and named 'Survived'.
result = pd.Series(final_model.predict(X_test), index=test['PassengerId'], name='Survived')
result

In [None]:
# Write the predictions to CSV with a header row. The series index is written
# as the first column. The ./yhat directory is expected to exist already.
result.to_csv(path_or_buf='./yhat/first_decision_tree_model.csv', header=True)