In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import linear_model
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib as joblib
from sklearn import ensemble

from mylib import preprocessing as prep
from mylib import training as tr

In [2]:
def load_data(data_filename, label):
    """Read a CSV file and separate the target column from the features.

    Parameters
    ----------
    data_filename : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.
    label : str
        Name of the column that holds the target values.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a NumPy array built from every column
        except ``label`` and ``y`` is a NumPy array of the ``label`` column.
    """
    print("=== Loading Data ===")
    frame = pd.read_csv(data_filename)
    # pop() returns the target column and removes it from the frame in one
    # step, so whatever is left afterwards is the feature matrix.
    targets = frame.pop(label).to_numpy()
    features = frame.to_numpy()
    return features, targets

def setup_data_and_model(X, y, test_size=0.3, random_state=0):
    """Split the data into train/test sets and build an unfitted classifier.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target values aligned row-for-row with ``X``.
    test_size : float, default 0.3
        Fraction of the rows held out as the test set; forwarded to
        ``train_test_split``.
    random_state : int, default 0
        Seed forwarded to both ``train_test_split`` and the classifier so
        that repeated runs produce the same split and the same model.

    Returns
    -------
    tuple
        ``(model, X_train, X_test, y_train, y_test)``. The model is only
        constructed here; fitting is left to the caller.
    """
    print("=== Setting up data ===")
    # Hold out `test_size` of the rows for testing (30% by default).
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )

    print("=== Setting up model ===")
    # Build (but do not fit) the classifier. Other candidates that were being
    # tried here: linear_model.LogisticRegression(solver='lbfgs'),
    # ensemble.RandomForestClassifier, ensemble.AdaBoostClassifier.
    # The estimator is seeded so results do not drift between runs.
    model = ensemble.GradientBoostingClassifier(random_state=random_state)
    return model, X_train, X_test, y_train, y_test

In [3]:
    # NOTE(review): this cell carries a leading indent, as if lifted from a
    # function body; presumably IPython tolerates it - confirm before moving
    # this code into a plain .py script.
    # Input CSV and the name of its target column.
    data_filename = 'train_formatted.csv'
    label = 'Survived'
    # Load the feature/target arrays, then split them and build the model.
    X, y = load_data(data_filename, label)
    model, X_train, X_test, y_train, y_test = setup_data_and_model(X, y)

=== Loading Data ===
=== Setting up data ===
=== Setting up model ===


In [4]:
# Sanity check: show the number of training rows and training labels,
# one per line, so a mismatch between them is easy to spot.
print(len(X_train), len(y_train), sep="\n")

623
623


In [5]:
# Run the classifier comparison helper from mylib.training on the split.
# Per the recorded output of this cell, it reports accuracy plus train/test
# mean absolute error for each classifier it runs.
tr.test_classifier(X_train, X_test, y_train, y_test)

Running : RandomForestClassifier
Accuracy: 82.8358%
Mean Abs Error train : 0.1413
Mean Abs Error test  : 0.1716
Running : AdaBoostClassifier
Accuracy: 81.7164%
Mean Abs Error train : 0.1782
Mean Abs Error test  : 0.1828
Running : GradientBoostingClassifier
Accuracy: 81.3433%
Mean Abs Error train : 0.1525
Mean Abs Error test  : 0.1866
Running : SVC
Accuracy: 78.7313%
Mean Abs Error train : 0.2135
Mean Abs Error test  : 0.2127
Running : LinearSVC
Accuracy: 80.9701%
Mean Abs Error train : 0.1734
Mean Abs Error test  : 0.1903
Running : NuSVC
Accuracy: 78.7313%
Mean Abs Error train : 0.2135
Mean Abs Error test  : 0.2127
Running : KNeighborsClassifier
Accuracy: 81.3433%
Mean Abs Error train : 0.1685
Mean Abs Error test  : 0.1866
Running : LinearDiscriminantAnalysis
Accuracy: 80.2239%
Mean Abs Error train : 0.1734
Mean Abs Error test  : 0.1978
Running : QuadraticDiscriminantAnalysis
Accuracy: 37.3134%
Mean Abs Error train : 0.6116
Mean Abs Error test  : 0.6269
Running : GaussianNB
Accuracy: 4