In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the preprocessed Titanic data. The path is relative, so the notebook
# expects a `datasets/` directory in its working directory.
titanic_df = pd.read_csv(filepath_or_buffer='datasets/titanic_processed.csv')
# Preview the first five rows as a quick sanity check of the columns.
titanic_df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,3,0,36.0,1,0,17.4,0,0,1
1,1,2,1,1.0,2,1,39.0,0,0,1
2,0,2,1,51.0,0,0,12.525,0,0,1
3,1,2,0,23.0,0,0,13.7917,1,0,0
4,0,2,1,27.0,0,0,26.0,0,0,1


In [3]:
# Feature columns: every column of the frame except the 'Survived' target.
# Selecting by name instead of by position (columns[1:]) keeps the target out
# of the feature list even if the column order of the CSV changes, and
# Index.drop raises KeyError when 'Survived' is absent, so a wrong input file
# fails here rather than silently training on the wrong columns.
FEATURES = titanic_df.columns.drop('Survived').tolist()
FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [4]:
# Registry of evaluation results for every model tried in this notebook,
# keyed by a human-readable model label.
result_dict = dict()

In [5]:
def summarize_classification(y_test, y_pred):
    """Score a set of predictions against the true labels.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels, in the same order as `y_test`.

    Returns
    -------
    dict with the keys, in this order:
        'accuracy'       - fraction of correctly classified samples
        'precision'      - precision_score with its default settings
        'recall'         - recall_score with its default settings
        'accuracy_count' - number (not fraction) of correctly classified samples
    """
    return {
        # normalize=True -> fraction of correct predictions
        'accuracy': accuracy_score(y_test, y_pred, normalize=True),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        # normalize=False -> raw count of correct predictions
        'accuracy_count': accuracy_score(y_test, y_pred, normalize=False),
    }

In [6]:
def build_model(classifier_fn,
                name_of_y_col,
                names_of_x_cols,
                dataset,
                test_frac=0.2,
                random_state=None):
    """Train one classifier on a random train/test split and score it.

    Parameters
    ----------
    classifier_fn : callable
        Called as ``classifier_fn(x_train, y_train)``; must return a fitted
        object exposing ``predict``.
    name_of_y_col : str
        Name of the target column in `dataset`.
    names_of_x_cols : list of str
        Names of the feature columns in `dataset`.
    dataset : pandas.DataFrame
        Frame holding both the features and the target.
    test_frac : float, default 0.2
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int or None, default None
        Passed to ``train_test_split``. None keeps the previous behaviour
        (a different split on every call). Pass a fixed integer to make the
        split reproducible and to score several models on the same
        held-out rows, so their metrics are directly comparable.

    Returns
    -------
    dict with keys
        'training'         - summarize_classification() on the training rows
        'test'             - summarize_classification() on the held-out rows
        'confusion_matrix' - crosstab of predicted (rows) vs. actual (columns)
                             labels on the held-out rows
    """
    features = dataset[names_of_x_cols]
    target = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=test_frac, random_state=random_state)
    model = classifier_fn(x_train, y_train)

    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    # Scoring the training rows as well makes over-fitting visible as a gap
    # between the 'training' and 'test' summaries.
    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)

    pred_results = pd.DataFrame({'y_test': y_test,
                                 'y_pred': y_pred})

    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)
    return {'training': train_summary,
            'test': test_summary,
            'confusion_matrix': model_crosstab}

In [7]:
def compare_results():
    """Print the training and test metrics of every model in `result_dict`.

    For each entry, prints the model label followed by a 'Training Data' and
    a 'Test Data' section, each listing every metric name and value stored
    under the 'training' / 'test' keys. Any other keys of an entry are not
    printed.
    """
    sections = (('Training Data', 'training'), ('Test Data', 'test'))
    for model_label, summaries in result_dict.items():
        print('Classification: ', model_label)
        print()
        for heading, split_key in sections:
            print(heading)
            for metric_name, metric_value in summaries[split_key].items():
                print(metric_name, metric_value)
            print()

In [8]:
def logistic_fn(x_train, y_train):
    """Fit and return a logistic-regression classifier (liblinear solver)."""
    # scikit-learn's fit() returns the estimator itself, so it can be chained.
    return LogisticRegression(solver='liblinear').fit(x_train, y_train)

In [9]:
# Baseline: logistic regression on all features. Store its scores in the
# shared registry, then print the metrics of every model evaluated so far.
result_dict['survived ~ logistic'] = build_model(
    classifier_fn=logistic_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic

Training Data
accuracy 0.8014059753954306
precision 0.7934272300469484
recall 0.7100840336134454
accuracy_count 456

Test Data
accuracy 0.8251748251748252
precision 0.7906976744186046
recall 0.68
accuracy_count 118



In [10]:
def linear_discriminant_fn(x_train, y_train, solver='svd'):
    """Fit and return a Linear Discriminant Analysis classifier.

    LDA looks for the axes that best separate the data into its classes.

    Parameters
    ----------
    x_train, y_train : training features and labels.
    solver : str, default 'svd'
        Passed to LinearDiscriminantAnalysis. 'svd' (singular value
        decomposition) finds those axes without computing the covariance
        matrix, which is useful when there are many features.
    """
    lda = LinearDiscriminantAnalysis(solver=solver)
    lda.fit(x_train, y_train)
    return lda

In [11]:
# LDA is trained on all features except the last one ('Embarked_S').
# NOTE(review): presumably this avoids perfectly collinear one-hot columns
# (the three Embarked_* dummies always sum to 1) - confirm intent.
result_dict['survived ~ linear_disciminant_analysis'] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)
compare_results()

Classification:  survived ~ logistic

Training Data
accuracy 0.8014059753954306
precision 0.7934272300469484
recall 0.7100840336134454
accuracy_count 456

Test Data
accuracy 0.8251748251748252
precision 0.7906976744186046
recall 0.68
accuracy_count 118

Classification:  survived ~ linear_disciminant_analysis

Training Data
accuracy 0.7996485061511424
precision 0.7822222222222223
recall 0.7302904564315352
accuracy_count 455

Test Data
accuracy 0.7762237762237763
precision 0.6744186046511628
recall 0.6170212765957447
accuracy_count 111



In [12]:
def quadratic_discriminant_fn(x_train, y_train):
    """Fit and return a Quadratic Discriminant Analysis classifier.

    QDA finds a boundary that best separates the classes; unlike LDA the
    boundary does not have to be a straight line. It is the better choice
    when the covariance of the features differs from class to class.
    """
    qda = QuadraticDiscriminantAnalysis()
    qda.fit(x_train, y_train)
    return qda

In [13]:
# QDA, like LDA above, is trained on all features except the last one
# ('Embarked_S').
result_dict['survived ~ quadratic_discriminant_analysis'] = build_model(
    classifier_fn=quadratic_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)
compare_results()

Classification:  survived ~ logistic

Training Data
accuracy 0.8014059753954306
precision 0.7934272300469484
recall 0.7100840336134454
accuracy_count 456

Test Data
accuracy 0.8251748251748252
precision 0.7906976744186046
recall 0.68
accuracy_count 118

Classification:  survived ~ linear_disciminant_analysis

Training Data
accuracy 0.7996485061511424
precision 0.7822222222222223
recall 0.7302904564315352
accuracy_count 455

Test Data
accuracy 0.7762237762237763
precision 0.6744186046511628
recall 0.6170212765957447
accuracy_count 111

Classification:  survived ~ quadratic_discriminant_analysis

Training Data
accuracy 0.8154657293497364
precision 0.7887323943661971
recall 0.7368421052631579
accuracy_count 464

Test Data
accuracy 0.7202797202797203
precision 0.6923076923076923
recall 0.6
accuracy_count 103



In [14]:
def sgd_fn(x_train, y_train, max_iter=10000, tol=1e-3):
    """Fit and return a linear classifier trained by stochastic gradient descent.

    SGD is a numerical optimisation scheme that updates the model parameters
    from one training instance at a time, over repeated passes through the
    training data.

    Parameters
    ----------
    x_train, y_train : training features and labels.
    max_iter : int, default 10000
        Upper bound on the number of passes over the training data.
    tol : float, default 1e-3
        Stopping tolerance: training ends early once the loss no longer
        improves on the previous iterations by at least this amount.
    """
    options = {'max_iter': max_iter, 'tol': tol}
    sgd = SGDClassifier(**options)
    sgd.fit(x_train, y_train)
    return sgd

In [15]:
result_dict['survived ~ sgd'] = build_model(
    classifier_fn=sgd_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()
# Open question: why is the SGD accuracy lower than the other linear models?
# max_iter was raised to 10k (the sgd_fn default) in response.
# NOTE(review): the features are fed in unscaled (e.g. Age and Fare next to
# 0/1 dummies) and SGD is sensitive to feature scale, so standardising the
# inputs is worth trying - to be confirmed.

Classification:  survived ~ logistic

Training Data
accuracy 0.8014059753954306
precision 0.7934272300469484
recall 0.7100840336134454
accuracy_count 456

Test Data
accuracy 0.8251748251748252
precision 0.7906976744186046
recall 0.68
accuracy_count 118

Classification:  survived ~ linear_disciminant_analysis

Training Data
accuracy 0.7996485061511424
precision 0.7822222222222223
recall 0.7302904564315352
accuracy_count 455

Test Data
accuracy 0.7762237762237763
precision 0.6744186046511628
recall 0.6170212765957447
accuracy_count 111

Classification:  survived ~ quadratic_discriminant_analysis

Training Data
accuracy 0.8154657293497364
precision 0.7887323943661971
recall 0.7368421052631579
accuracy_count 464

Test Data
accuracy 0.7202797202797203
precision 0.6923076923076923
recall 0.6
accuracy_count 103

Classification:  survived ~ sgd

Training Data
accuracy 0.7627416520210897
precision 0.7360406091370558
recall 0.6359649122807017
accuracy_count 434

Test Data
accuracy 0.755244755244

In [16]:
def linear_svc_fn(x_train, y_train, C=1.0, max_iter=1000, tol=1e-3):
    """Fit and return a linear support-vector classifier.

    The model looks for a hyperplane in feature space that separates the
    points so that points on the same side belong to the same class (here:
    Survived = 0 vs. 1). LinearSVC is a linear-kernel SVM, comparable to
    SVC(kernel='linear').

    Parameters
    ----------
    x_train, y_train : training features and labels.
    C : float, default 1.0
        Inverse regularisation strength; a small C means strong
        regularisation.
    max_iter : int, default 1000
        Maximum number of solver iterations.
    tol : float, default 1e-3
        Stopping tolerance of the solver.
    """
    svc_options = {
        'C': C,
        'max_iter': max_iter,
        'tol': tol,
        # dual=False is the recommended setting when n_samples > n_features.
        'dual': False,
    }
    return LinearSVC(**svc_options).fit(x_train, y_train)

In [17]:
# Linear SVC on all features; results are added to the shared registry.
result_dict['survived ~ linear_svc'] = build_model(
    classifier_fn=linear_svc_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification:  survived ~ logistic

Training Data
accuracy 0.8014059753954306
precision 0.7934272300469484
recall 0.7100840336134454
accuracy_count 456

Test Data
accuracy 0.8251748251748252
precision 0.7906976744186046
recall 0.68
accuracy_count 118

Classification:  survived ~ linear_disciminant_analysis

Training Data
accuracy 0.7996485061511424
precision 0.7822222222222223
recall 0.7302904564315352
accuracy_count 455

Test Data
accuracy 0.7762237762237763
precision 0.6744186046511628
recall 0.6170212765957447
accuracy_count 111

Classification:  survived ~ quadratic_discriminant_analysis

Training Data
accuracy 0.8154657293497364
precision 0.7887323943661971
recall 0.7368421052631579
accuracy_count 464

Test Data
accuracy 0.7202797202797203
precision 0.6923076923076923
recall 0.6
accuracy_count 103

Classification:  survived ~ sgd

Training Data
accuracy 0.7627416520210897
precision 0.7360406091370558
recall 0.6359649122807017
accuracy_count 434

Test Data
accuracy 0.755244755244

In [22]:
def radius_neighbor_fn(x_train, y_train, radius=40.0, outlier_label=None):
    """Fit and return a radius-based nearest-neighbours classifier.

    Every training point within `radius` of a sample counts as a neighbour,
    and the neighbours vote on the sample's class.

    Parameters
    ----------
    x_train, y_train : training features and labels.
    radius : float, default 40.0
        Neighbourhood radius (a hyperparameter, in the units of the raw
        feature space).
    outlier_label : label, 'most_frequent' or None, default None
        Passed to RadiusNeighborsClassifier. It is the label given to
        samples that have no training neighbour within `radius`. With None
        (the scikit-learn default, and the previous behaviour of this
        function) ``predict`` raises ValueError for such samples; pass a
        label to get a prediction instead.
    """
    model = RadiusNeighborsClassifier(radius=radius, outlier_label=outlier_label)
    model.fit(x_train, y_train)
    return model

In [23]:
# Radius-neighbours classifier on all features, using the default radius of
# radius_neighbor_fn.
result_dict['survived ~ radius neighbors'] = build_model(
    classifier_fn=radius_neighbor_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification:  survived ~ logistic

Training Data
accuracy 0.8014059753954306
precision 0.7934272300469484
recall 0.7100840336134454
accuracy_count 456

Test Data
accuracy 0.8251748251748252
precision 0.7906976744186046
recall 0.68
accuracy_count 118

Classification:  survived ~ linear_disciminant_analysis

Training Data
accuracy 0.7996485061511424
precision 0.7822222222222223
recall 0.7302904564315352
accuracy_count 455

Test Data
accuracy 0.7762237762237763
precision 0.6744186046511628
recall 0.6170212765957447
accuracy_count 111

Classification:  survived ~ quadratic_discriminant_analysis

Training Data
accuracy 0.8154657293497364
precision 0.7887323943661971
recall 0.7368421052631579
accuracy_count 464

Test Data
accuracy 0.7202797202797203
precision 0.6923076923076923
recall 0.6
accuracy_count 103

Classification:  survived ~ sgd

Training Data
accuracy 0.7627416520210897
precision 0.7360406091370558
recall 0.6359649122807017
accuracy_count 434

Test Data
accuracy 0.755244755244

In [24]:
def decision_tree_fn(x_train, y_train, max_depth=None, max_features=None):
    """Fit and return a decision-tree classifier.

    Parameters
    ----------
    x_train, y_train : training features and labels.
    max_depth : int or None, default None
        Passed to DecisionTreeClassifier; None sets no explicit depth limit.
    max_features : int, float, str or None, default None
        Passed to DecisionTreeClassifier; None sets no explicit limit on the
        number of features considered per split.
    """
    tree = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features)
    return tree.fit(x_train, y_train)

# Decision tree on all features with the defaults of decision_tree_fn
# (no explicit depth or feature limit).
result_dict['survived ~ decision tree'] = build_model(
    classifier_fn=decision_tree_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification:  survived ~ logistic

Training Data
accuracy 0.8014059753954306
precision 0.7934272300469484
recall 0.7100840336134454
accuracy_count 456

Test Data
accuracy 0.8251748251748252
precision 0.7906976744186046
recall 0.68
accuracy_count 118

Classification:  survived ~ linear_disciminant_analysis

Training Data
accuracy 0.7996485061511424
precision 0.7822222222222223
recall 0.7302904564315352
accuracy_count 455

Test Data
accuracy 0.7762237762237763
precision 0.6744186046511628
recall 0.6170212765957447
accuracy_count 111

Classification:  survived ~ quadratic_discriminant_analysis

Training Data
accuracy 0.8154657293497364
precision 0.7887323943661971
recall 0.7368421052631579
accuracy_count 464

Test Data
accuracy 0.7202797202797203
precision 0.6923076923076923
recall 0.6
accuracy_count 103

Classification:  survived ~ sgd

Training Data
accuracy 0.7627416520210897
precision 0.7360406091370558
recall 0.6359649122807017
accuracy_count 434

Test Data
accuracy 0.755244755244

In [25]:
def naive_bayes_fn(x_train, y_train, priors=None):
    """Fit and return a Gaussian naive-Bayes classifier.

    Naive Bayes asks which label is most likely given the attribute values
    observed in the feature vector and given how often each label occurs in
    the data.

    Parameters
    ----------
    x_train, y_train : training features and labels.
    priors : array-like or None, default None
        Known class prior probabilities, passed to GaussianNB. Leave as None
        when the priors are not known in advance.
    """
    nb = GaussianNB(priors=priors)
    nb.fit(x_train, y_train)
    return nb

In [26]:
# Gaussian naive Bayes on all features, with the priors left unset.
result_dict['survived ~ naive_bayes'] = build_model(
    classifier_fn=naive_bayes_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification:  survived ~ logistic

Training Data
accuracy 0.8014059753954306
precision 0.7934272300469484
recall 0.7100840336134454
accuracy_count 456

Test Data
accuracy 0.8251748251748252
precision 0.7906976744186046
recall 0.68
accuracy_count 118

Classification:  survived ~ linear_disciminant_analysis

Training Data
accuracy 0.7996485061511424
precision 0.7822222222222223
recall 0.7302904564315352
accuracy_count 455

Test Data
accuracy 0.7762237762237763
precision 0.6744186046511628
recall 0.6170212765957447
accuracy_count 111

Classification:  survived ~ quadratic_discriminant_analysis

Training Data
accuracy 0.8154657293497364
precision 0.7887323943661971
recall 0.7368421052631579
accuracy_count 464

Test Data
accuracy 0.7202797202797203
precision 0.6923076923076923
recall 0.6
accuracy_count 103

Classification:  survived ~ sgd

Training Data
accuracy 0.7627416520210897
precision 0.7360406091370558
recall 0.6359649122807017
accuracy_count 434

Test Data
accuracy 0.755244755244