In [3]:
# import  
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# read training data to a DataFrame df
df = pd.read_csv('train_titanic.csv')


# Preprocessing


# drop misssing 'Embarked' values
df = df.dropna(subset=['Embarked'])

# fill in missing 'Age values'
median_value = df['Age'].median()
df['Age'] = df['Age'].fillna(median_value)

# convert 'Sex' column type to category
df['Sex'] = df['Sex'].astype('category')

# Dummy variable encoding
dummies = pd.get_dummies(df[['Sex']], prefix_sep = '_')

# add new binary 'Female' column to df for numerical analysis
#df = pd.concat([df, dummies], axis = 1)
df['Female'] = dummies['Sex_female']

# convert 'Embarked' column type to category
df['Embarked'] = df['Embarked'].astype('category')

# Dummy variable encoding
dummies = pd.get_dummies(df[['Embarked']], prefix_sep = '_')

# add new binary 'Embarked' columns to df for numerical analysis
df['Embarked_C'] = dummies['Embarked_C']
df['Embarked_S'] = dummies['Embarked_S']



# read numerical columns of df into Numpy arrays in order to feed into model

X = df.drop(['Survived', 'Name', 'Ticket', 'Cabin', 'Embarked', 'Sex'], axis = 1).values
y = df['Survived'].values

X = scale(X) # same result without scaling

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, \
                                                    random_state = 1, \
                                                    stratify = y)

# Fitting a DT model

dt = DecisionTreeClassifier()

params_dt = { 
            'max_depth' : [3, 4, 5, 6, 7, 8],
            'min_samples_leaf' : [0.04, 0.06, 0.08, 0.10],
            'max_features' : [0.2, 0.4, 0.6, 0.8]}

grid_dt = GridSearchCV(estimator = dt,
                        param_grid = params_dt,
                        scoring = 'accuracy',
                       cv = 10, n_jobs = -1)

grid_dt.fit(X_train, y_train)

best_hyperparams = grid_dt.best_params_
print('best_hyperparams =', best_hyperparams)

best_cv_score = grid_dt.best_score_
print('best_cv_score =', best_cv_score)

best_model = grid_dt.best_estimator_
print('best_model =', best_model)

test_acc = best_model.score(X_test, y_test)
print('test set accuracy of best model =', test_acc)

# test_acc is the accuracy of the best model:
# y_pred = best_model.predict(X_test)
# print('accuracy=', accuracy_score(y_test, y_pred))


best_hyperparams = {'max_depth': 6, 'max_features': 0.6, 'min_samples_leaf': 0.08}
best_cv_score = 0.8022508038585209
best_model = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
                       max_features=0.6, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=0.08, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
test set accuracy of best model = 0.7827715355805244


