In [18]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

import matplotlib.pyplot as plt
import seaborn as sns

In [19]:
titanic_df = pd.read_csv('c:/projects/datasets/titanic_processed.csv')

titanic_df.head()
# categorical values have been either label encoded or one-hot encoded

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,28.0,0,0,7.8958,0,0,1
1,0,3,1,26.0,1,2,20.575,0,0,1
2,1,2,0,25.0,1,1,30.0,0,0,1
3,0,3,1,28.0,0,0,7.8958,0,0,1
4,0,3,1,29.0,1,0,7.0458,0,0,1


In [20]:
# the trage values we want to predict are those in the survived column
# extract the feature from the data frame
FEATURES = list(titanic_df.columns[1:])
FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [21]:
# where we will store the results of the different models
result_dict = {}

In [22]:
# create some helper functions that will be reused in this model
def summarize_classification(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)
    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    return {'accuracy':acc, 'precision':prec, 'recall':recall, 'accuracy_count':num_acc}

In [23]:
def build_model(classifier_fn, name_of_y_col, names_of_x_cols, dataset, test_frac=0.2):
    
    # extract the X variables, that is the features we'll use for training, and the Y values from the database
    X = dataset[names_of_x_cols]
    Y = dataset[name_of_y_col]
    
    # split the dataset to training data and test data
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_frac)
    
    # because this is a helper function, we call and instantiate the right estimator object and train the model
    model = classifier_fn(x_train, y_train)
    
    # the classifier with then return an instance of the fully trained model whicl will then be used to predict no the test data
    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)
    
    # summarize the data by calling the summarize_classification helper function defined earlier
    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)
    
    # dataframe with actual and predicted values on the test data
    pred_results = pd.DataFrame({'y_test': y_test, 'y_pred':y_pred})
    
    # calculate the confusion matrix
    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)
    
    # finally return all the information as a dictionary
    return {'training': train_summary, 'test':test_summary, 'confusion_matrix': model_crosstab }

In [24]:
# helper function to quickly compare the results of the different classification models
def compare_results():
    for key in result_dict:
        print('Classification: ', key)
        
        print()
        print('Training data:')
        for score in result_dict[key]['training']:
            print(score, result_dict[key]['training'][score])
            
        print()
        print('Test data')
        for score in result_dict[key]['test']:
            print(score, result_dict[key]['test'][score])
            
        print()

In [25]:
# let's use the helper functions to build and train a logistic regression model
# this function takes in training data and training models
def logistic_fn(x_train, y_train):
    
    # instantiate logstic regression estimator and calls regression estimator
    model = LogisticRegression(solver='liblinear')
    # call fit on teh estimator to start training
    model.fit(x_train, y_train)
    
    # return the fully trained model
    return model

In [26]:
result_dict['survived ~ logistic'] = build_model(logistic_fn, 'Survived', FEATURES, titanic_df)
compare_results()

Classification:  survived ~ logistic

Training data:
accuracy 0.7873462214411248
precision 0.7681818181818182
recall 0.7071129707112971
accuracy_count 448

Test data
accuracy 0.8181818181818182
precision 0.7346938775510204
recall 0.7346938775510204
accuracy_count 117



In [29]:
# linear discriminant find axes to best separate the classes such that all instances of a class are in the same quadrant
# the best axes here refers to those axes that best separate the data into different classes
def linear_discriminant_fn(x_train, y_train, solver='svd'):
    # svd - singular value decomposition solver
    # svd estimator finds axes without calculating the covariance matrix of features
    # useful when we have many features or many rows in dataset
    
    model = LinearDiscriminantAnalysis(solver=solver)
    model.fit(x_train, y_train)
    
    return model

In [33]:
result_dict['survived ~ linear_discriminant_analysis'] = build_model(linear_discriminant_fn, 'Survived', FEATURES, titanic_df)

compare_results()

Classification:  survived ~ logistic

Training data:
accuracy 0.7873462214411248
precision 0.7681818181818182
recall 0.7071129707112971
accuracy_count 448

Test data
accuracy 0.8181818181818182
precision 0.7346938775510204
recall 0.7346938775510204
accuracy_count 117

Classification:  survived ~ linear_discriminant_analysis

Training data:
accuracy 0.7996485061511424
precision 0.7767441860465116
recall 0.7167381974248928
accuracy_count 455

Test data
accuracy 0.7552447552447552
precision 0.6851851851851852
recall 0.6727272727272727
accuracy_count 108



In [32]:
# drop one of the last one-hot encoding feature
result_dict['survived ~ linear_discriminant_analysis'] = build_model(linear_discriminant_fn, 'Survived', FEATURES[0:-1], titanic_df)

compare_results()

Classification:  survived ~ logistic

Training data:
accuracy 0.7873462214411248
precision 0.7681818181818182
recall 0.7071129707112971
accuracy_count 448

Test data
accuracy 0.8181818181818182
precision 0.7346938775510204
recall 0.7346938775510204
accuracy_count 117

Classification:  survived ~ linear_discriminant_analysis

Training data:
accuracy 0.7978910369068541
precision 0.7767441860465116
recall 0.7136752136752137
accuracy_count 454

Test data
accuracy 0.7832167832167832
precision 0.7254901960784313
recall 0.6851851851851852
accuracy_count 112



In [43]:
# finds axes to best separate the calsses such that all instances of a class
# are in the same quadrant but the decision boundary is quadratic
def quadriatic_discriminant_fn(x_train, y_train):
    
    model = QuadraticDiscriminantAnalysis()
    model.fit(x_train, y_train)
    
    return model

In [44]:
result_dict['survived ~ quadriatic_discriminant_analysis'] = build_model(quadriatic_discriminant_fn, 'Survived', FEATURES[0:-1],titanic_df)

compare_results()

Classification:  survived ~ logistic

Training data:
accuracy 0.7873462214411248
precision 0.7681818181818182
recall 0.7071129707112971
accuracy_count 448

Test data
accuracy 0.8181818181818182
precision 0.7346938775510204
recall 0.7346938775510204
accuracy_count 117

Classification:  survived ~ linear_discriminant_analysis

Training data:
accuracy 0.7996485061511424
precision 0.7767441860465116
recall 0.7167381974248928
accuracy_count 455

Test data
accuracy 0.7552447552447552
precision 0.6851851851851852
recall 0.6727272727272727
accuracy_count 108

Classification:  survived ~ quadriatic_discriminant_analysis

Training data:
accuracy 0.7926186291739895
precision 0.7534246575342466
recall 0.7205240174672489
accuracy_count 451

Test data
accuracy 0.8111888111888111
precision 0.8076923076923077
recall 0.711864406779661
accuracy_count 116



In [52]:
# Stochastic Gradient Descent (SGD) performs numerical optimization - one training instance at a time - to find the best model parameters
def sgd_fn(x_train, y_train, max_iter=10000, tol=1e-3):
    
    model = SGDClassifier(max_iter=max_iter, tol=tol)
    model.fit(x_train, y_train)
    
    return model

In [53]:
result_dict['survived ~ sgd'] = build_model(sgd_fn, 'Survived', FEATURES, titanic_df)

compare_results()

Classification:  survived ~ logistic

Training data:
accuracy 0.7873462214411248
precision 0.7681818181818182
recall 0.7071129707112971
accuracy_count 448

Test data
accuracy 0.8181818181818182
precision 0.7346938775510204
recall 0.7346938775510204
accuracy_count 117

Classification:  survived ~ linear_discriminant_analysis

Training data:
accuracy 0.7996485061511424
precision 0.7767441860465116
recall 0.7167381974248928
accuracy_count 455

Test data
accuracy 0.7552447552447552
precision 0.6851851851851852
recall 0.6727272727272727
accuracy_count 108

Classification:  survived ~ quadriatic_discriminant_analysis

Training data:
accuracy 0.7926186291739895
precision 0.7534246575342466
recall 0.7205240174672489
accuracy_count 451

Test data
accuracy 0.8111888111888111
precision 0.8076923076923077
recall 0.711864406779661
accuracy_count 116

Classification:  survived ~ sgd

Training data:
accuracy 0.7258347978910369
precision 0.6862745098039216
recall 0.603448275862069
accuracy_count 413



In [54]:
# find a hyperplace that separates points so all points on the same side below to the same class
# this plans should separate titanic survivors on one side with those who did not on the other side
def linear_svc_fn(x_train, y_train, C=1.0, max_iter=1000, tol=1e-3):
    # C - inverse pf regularization, smaller values indicate stronger regularization - penalize points on the wrong side of the margin
    

    model = LinearSVC(C=C, max_iter=max_iter, tol=tol, dual=False)
    model.fit(x_train, y_train)
    
    return model

In [58]:
result_dict['survived ~ linear_svc'] = build_model(linear_svc_fn, 'Survived', FEATURES, titanic_df)

compare_results()

Classification:  survived ~ logistic

Training data:
accuracy 0.7873462214411248
precision 0.7681818181818182
recall 0.7071129707112971
accuracy_count 448

Test data
accuracy 0.8181818181818182
precision 0.7346938775510204
recall 0.7346938775510204
accuracy_count 117

Classification:  survived ~ linear_discriminant_analysis

Training data:
accuracy 0.7996485061511424
precision 0.7767441860465116
recall 0.7167381974248928
accuracy_count 455

Test data
accuracy 0.7552447552447552
precision 0.6851851851851852
recall 0.6727272727272727
accuracy_count 108

Classification:  survived ~ quadriatic_discriminant_analysis

Training data:
accuracy 0.7926186291739895
precision 0.7534246575342466
recall 0.7205240174672489
accuracy_count 451

Test data
accuracy 0.8111888111888111
precision 0.8076923076923077
recall 0.711864406779661
accuracy_count 116

Classification:  survived ~ sgd

Training data:
accuracy 0.7258347978910369
precision 0.6862745098039216
recall 0.603448275862069
accuracy_count 413



In [None]:
# Nearest Neighbors Classification
# uses training data to find what is most similar to the current sample
# uses the entire training dataset as a model
# each element in training data has an associated label
