In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [5]:
#read the processed Titanic CSV (path is relative to the working directory) and preview the first rows
titanic_df = pd.read_csv('datasets/titanic_processed.csv')
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,3,0,36.0,1,0,17.4,0,0,1
1,1,2,1,1.0,2,1,39.0,0,0,1
2,0,2,1,51.0,0,0,12.525,0,0,1
3,1,2,0,23.0,0,0,13.7917,1,0,0
4,0,2,1,27.0,0,0,26.0,0,0,1


In [6]:
#group the features: every column except the target, selected by name so that
#a change in column order can never leak 'Survived' into the model inputs
FEATURES = [col for col in titanic_df.columns if col != 'Survived']
FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [7]:
#collects the evaluation summary of every model, keyed by a descriptive label
result_dict = dict()

In [10]:
def summarize_classification(y_test, y_pred):
    """Score a set of predictions against the true labels.

    Returns a dict with four entries:
      'accuracy'       - accuracy as a fraction (normalize=True)
      'precision'      - precision_score with its default settings
      'recall'         - recall_score with its default settings
      'accuracy_count' - accuracy as a number of entries (normalize=False)
    """
    return {
        'accuracy': accuracy_score(y_test, y_pred, normalize=True),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'accuracy_count': accuracy_score(y_test, y_pred, normalize=False),
    }

In [11]:
#shared helper: split the data, fit one classifier and score it on both splits
def build_model(classifier_fn,
                name_of_y_col,
                names_of_x_cols,
                dataset,
                test_frac=0.2,
                random_state=None):
    """Fit a classifier on a random train/test split and summarize its scores.

    Parameters
    ----------
    classifier_fn : callable
        Called as ``classifier_fn(x_train, y_train)``; must return a fitted
        object that exposes ``predict``.
    name_of_y_col : column label
        Target column of ``dataset``.
    names_of_x_cols : list of column labels
        Feature columns of ``dataset``.
    dataset : pandas.DataFrame
        Frame holding both the features and the target.
    test_frac : float, default 0.2
        Passed to ``train_test_split`` as ``test_size``.
    random_state : optional, default None
        Forwarded to ``train_test_split``. Pass a fixed value to make the
        split, and therefore the reported scores, repeatable across runs.
        The default ``None`` keeps the original unseeded behavior.

    Returns
    -------
    dict
        ``'training'`` and ``'test'`` hold the ``summarize_classification``
        dicts for the respective split; ``'confusion_matrix'`` is a crosstab
        of the test split with predicted labels as rows and true labels as
        columns.
    """
    X = dataset[names_of_x_cols]
    Y = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_frac, random_state=random_state)
    model = classifier_fn(x_train, y_train)

    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    # Scoring both splits lets the caller compare training and test performance.
    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)

    pred_results = pd.DataFrame({'y_test': y_test,
                                 'y_pred': y_pred})

    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)
    return {'training': train_summary,
            'test': test_summary,
            'confusion_matrix': model_crosstab}

In [12]:
#print the stored training and test scores of every model, one model after another
def compare_results():
    """Print the training and test scores of every entry in ``result_dict``.

    For each model label the output is a header line, then a 'Training Data'
    section and a 'Test Data' section, each listing one ``name value`` line
    per stored score and followed by a blank line.
    """
    sections = (('Training Data', 'training'), ('Test Data', 'test'))
    for model_label, summary in result_dict.items():
        print('Classification: ', model_label)
        print()
        for heading, split in sections:
            print(heading)
            for metric, value in summary[split].items():
                print(metric, value)
            print()

In [13]:
#Logistic regression model
def logistic_fn(x_train, y_train):
    """Return a LogisticRegression (liblinear solver) fitted on the training data."""
    # fit() hands back the estimator itself, so the fitted model can be returned directly
    return LogisticRegression(solver='liblinear').fit(x_train, y_train)

In [14]:
#main program: fit the logistic regression model, store its summary, then print all stored results
result_dict['survived ~ logistic'] = build_model(
    classifier_fn=logistic_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic

Training Data
accuracy 0.789103690685413
precision 0.7777777777777778
recall 0.6851063829787234
accuracy_count 449

Test Data
accuracy 0.8181818181818182
precision 0.7454545454545455
recall 0.7735849056603774
accuracy_count 117

