In [16]:
# import  
import pandas as pd
import numpy as np

# Import functions to scale, compute accuracy and split data

from sklearn.preprocessing import scale

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.model_selection import train_test_split

# Import models, including VotingClassifier meta-model

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import VotingClassifier

# Set seed for reproducibility
SEED =1


# read training data to a DataFrame df
df = pd.read_csv('train_titanic.csv')


# Preprocessing


# drop misssing 'Embarked' values
df = df.dropna(subset=['Embarked'])

# fill in missing 'Age values'
median_value = df['Age'].median()
df['Age'] = df['Age'].fillna(median_value)

# convert 'Sex' column type to category
df['Sex'] = df['Sex'].astype('category')

# Dummy variable encoding
dummies = pd.get_dummies(df[['Sex']], prefix_sep = '_')

# add new binary 'Female' column to df for numerical analysis
#df = pd.concat([df, dummies], axis = 1)
df['Female'] = dummies['Sex_female']

# convert 'Embarked' column type to category
df['Embarked'] = df['Embarked'].astype('category')

# Dummy variable encoding
dummies = pd.get_dummies(df[['Embarked']], prefix_sep = '_')

# add new binary 'Embarked' columns to df for numerical analysis
df['Embarked_C'] = dummies['Embarked_C']
df['Embarked_S'] = dummies['Embarked_S']



# read numerical columns of df into Numpy arrays in order to feed into model

X = df.drop(['Survived', 'Name', 'Ticket', 'Cabin', 'Embarked', 'Sex'], axis = 1).values
y = df['Survived'].values


X = scale(X) 

# Split data into 70% train and 30% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, 
                                                    random_state = SEED, 
                                                    stratify = y)

# Instantiate individual classifiers

lr = LogisticRegression(random_state=SEED)
knn = KNN()
dt = DecisionTreeClassifier(max_depth= 3, max_features= 0.6, 
                            min_samples_leaf= 0.04, random_state=SEED)


# Define a list called classifier that contains the tuples (classifier_name, classifier)

classifiers = [('Logistic Regression', lr),
               ('K Nearest Neighbours', knn),
               ('Classification Tree', dt)]

# Iterate over the defined list of tuples containing the classifiers

for clf_name, clf in classifiers:
    
    # fit clf to the training set
    clf.fit(X_train, y_train)
    
    # Predict the labels of the test set
    y_pred = clf.predict(X_test)
    
    # Evaluate the accuracy of clf on the test set
    print('{:s} : {:.3f}'.format(clf_name, accuracy_score(y_test, y_pred)))
    

# Instantiate a VotingClassifier 'vc'

vc = VotingClassifier(estimators=classifiers)

# Fit 'vc' to the traing set and predict test set labels
vc.fit(X_train, y_train)
y_pred = vc.predict(X_test)

# Evaluate the test-set accuracy of 'vc'
print('Voting Classifier test set accuracy: {:.3f}'.format(accuracy_score(y_test, y_pred)))
    
print('5 fold cross validation scores =', cross_val_score(vc, X, y, cv = 5))

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



Logistic Regression : 0.787
K Nearest Neighbours : 0.813
Classification Tree : 0.801
Voting Classifier test set accuracy: 0.798




5 fold cross validation scores = [0.79213483 0.80337079 0.7752809  0.79775281 0.81920904]
[[146  19]
 [ 35  67]]
              precision    recall  f1-score   support

           0       0.81      0.88      0.84       165
           1       0.78      0.66      0.71       102

    accuracy                           0.80       267
   macro avg       0.79      0.77      0.78       267
weighted avg       0.80      0.80      0.79       267

