In [62]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import fbeta_score, accuracy_score
from sklearn.model_selection import train_test_split

# TODO:
# - Clean functions
# - Training
# - Predicting

# Functions to clean data

In [63]:
def remove_columns(data, columns):
    data.drop(columns, axis=1, inplace=True)
    
    return data

In [64]:
# set_median(data, 'Age')
def set_median(data, column):
    age_median = np.median(data[data[column].notnull()][column])
    data[column] = data[column].fillna(age_median)
    
    return data

In [65]:
def fill_nan_values(data, column, value = -1):
    data[column] = data[column].fillna(value)
    
    return data

In [66]:
def factorize_column(data, column):
    labels, uniques = pd.factorize(data[column])
    data[column] = labels
    
    return data

In [67]:
def normalize_columns(data, columns):
    scaler = MinMaxScaler()
    data[columns] = scaler.fit_transform(data[columns])
    
    return data

# Prepare training data

In [68]:
data = pd.read_csv('./data/train.csv')

In [69]:
# Remove some columns
data = remove_columns(data, ['PassengerId', 'Name', 'Ticket'])

In [70]:
data = set_median(data, 'Age')

In [71]:
data = fill_nan_values(data, 'Cabin')
data = factorize_column(data, 'Cabin')

In [72]:
data = fill_nan_values(data, 'Embarked')
data = factorize_column(data, 'Embarked')

In [73]:
data = normalize_columns(data, ['Age', 'Fare'])

In [74]:
# One-hot the data
data = pd.get_dummies(data)

# Training and validate training data

In [75]:
# Split data

output = data['Survived']
data = remove_columns(data, 'Survived')
X_train, X_dev, y_train, y_dev = train_test_split(data, output, test_size = 0.3, random_state = 0)

In [77]:
def show_scores(y, predicted):
    acc = accuracy_score(y, predicted)
    fbeta = fbeta_score(y, predicted, beta=0.5)
    
    print("Accuracy: {:.4f}".format(acc))
    print("f-beta: {:.4f}".format(fbeta))    

In [78]:
# Gaussian model
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)

pred_train_nb = clf.predict(X_train)
pred_dev_nb = clf.predict(X_dev)

print("TEST")
show_scores(y_train, pred_train_nb)
print("DEV")
show_scores(y_dev, pred_dev_nb)

TEST
Accuracy: 0.7881
f-beta: 0.7240
DEV
Accuracy: 0.8022
f-beta: 0.7292


In [79]:
# AdaBoost model
from sklearn.ensemble import AdaBoostClassifier
clf_ada = AdaBoostClassifier(random_state = 0)
clf_ada.fit(X_train, y_train)

pred_train_ada = clf_ada.predict(X_train)
pred_dev_ada = clf_ada.predict(X_dev)

print("TEST")
show_scores(y_train, pred_train_ada)
print("DEV")
show_scores(y_dev, pred_dev_ada)

TEST
Accuracy: 0.8459
f-beta: 0.8077
DEV
Accuracy: 0.8134
f-beta: 0.7563


# Prepare test data

In [49]:
# Load test data
test_data = pd.read_csv('./data/test.csv')

In [50]:
# Remove some columns
test_data = remove_columns(test_data, ['PassengerId', 'Name', 'Ticket'])

In [51]:
test_data = set_median(test_data, 'Age')

In [52]:
test_data = fill_nan_values(test_data, 'Cabin')
test_data = factorize_column(test_data, 'Cabin')

In [53]:
test_data = fill_nan_values(test_data, 'Embarked')
test_data = factorize_column(test_data, 'Embarked')

In [56]:
test_data = fill_nan_values(test_data, 'Fare')

In [58]:
test_data = normalize_columns(test_data, ['Age', 'Fare'])

In [60]:
# One-hot the data
test_data = pd.get_dummies(test_data)

# Predict

In [80]:
p_gaussian = clf.predict(test_data)
print(p_gaussian)

[0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0
 1 1 0 1 0 1 0 1 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 1 1 0 1
 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 1 1 0 0 1 0 1 0 0 0 0 1 0 0 1 1 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 1 0 1 0 1 1 1 1 1 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 1 0 0
 1 0 0 0 0 0 0 0 1 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0 1 1 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 1 0 1 0 0 0 1 1 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 0 1 0 0
 1 1 1 1 1 1 0 1 0 0 0]


In [81]:
p_ada = clf_ada.predict(test_data)
print(p_ada)

[0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 0 1 0 0 0 0 0 1 1 1 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 1 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 0 0 0 1 1 1 1 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 0 0 1 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 1 0 1 1 0 1 0 0 0 1 1 0 0 1 0 0 0 1 0
 1 0 1 1 0 1 0 0 1 1 0 0 1 0 0 0 1 1 1 1 1 0 0 1 1 0 1 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 1 1 1 0 0 1 0 0 0 1 0 0 0 0
 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0
 1 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 1 0 1 0 1 0 0 1 0 1 1 0 1 1 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 0 0 1 0
 0 1 1 1 1 1 0 1 0 0 0]


# Create submission file for Kaggle

In [83]:
passengers_id = pd.read_csv('./data/test.csv')['PassengerId']
submission = pd.DataFrame({'PassengerId':passengers_id,'Survived':p_ada})

filename = 'titanic_predictions_1.csv'

submission.to_csv(filename,index=False)

print(filename)

titanic_predictions_1.csv
