In [21]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score, accuracy_score
from sklearn.metrics import classification_report

# TODO:
# - Clean functions
# - Training
# - Predicting

# Functions to clean data

In [22]:
def remove_columns(data, columns):
    """Drop one or more columns from `data` in place.

    Args:
        data: DataFrame to modify; it is mutated, not copied.
        columns: a single column label or a list of labels to drop.

    Returns:
        The same DataFrame object that was passed in, minus `columns`.
    """
    # The caller's frame itself is changed; the return value is only a
    # convenience so the call can be written as `data = remove_columns(...)`.
    data.drop(columns=columns, inplace=True)
    return data

In [23]:
# set_median(data, 'Age')
def set_median(data, column):
    """Replace missing values in `column` with the median of its non-null values.

    Args:
        data: DataFrame to modify; the column is reassigned on this object.
        column: label of the column to fill.

    Returns:
        The same DataFrame object, with `column` filled.
    """
    # Missing entries are excluded before taking the median.
    observed = data[column].dropna()
    median_value = np.median(observed)
    data[column] = data[column].fillna(median_value)
    return data

In [24]:
def fill_nan_values(data, column, value=-1):
    """Replace missing values in `column` with a fixed placeholder.

    Args:
        data: DataFrame to modify; the column is reassigned on this object.
        column: label of the column to fill.
        value: replacement for missing entries (defaults to -1).

    Returns:
        The same DataFrame object, with `column` filled.
    """
    filled = data[column].fillna(value)
    data[column] = filled
    return data

In [25]:
def factorize_column(data, column):
    """Replace the values of `column` with integer codes from `pd.factorize`.

    Args:
        data: DataFrame to modify; the column is reassigned on this object.
        column: label of the column to encode.

    Returns:
        The same DataFrame object, with `column` holding the integer codes.
    """
    # pd.factorize returns (codes, uniques); only the codes are kept.
    data[column] = pd.factorize(data[column])[0]
    return data

In [26]:
def normalize_columns(data, columns):
    """Rescale `columns` with a freshly fitted `MinMaxScaler`.

    A new scaler (default settings) is created and fitted on every call, so
    the scaling depends only on the values present in `data[columns]`.

    Args:
        data: DataFrame to modify; the columns are reassigned on this object.
        columns: list of column labels to rescale together.

    Returns:
        The same DataFrame object, with `columns` replaced by the scaled values.
    """
    scaled = MinMaxScaler().fit_transform(data[columns])
    data[columns] = scaled
    return data

# Prepare training data

In [27]:
# Load the training CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('./data/train.csv')

In [28]:
# Remove some columns
# PassengerId, Name and Ticket are dropped before any further processing.
# remove_columns drops in place and returns the same frame.
data = remove_columns(data, ['PassengerId', 'Name', 'Ticket'])

In [29]:
# Fill missing Age values with the median of the non-null ages in this frame.
data = set_median(data, 'Age')

In [30]:
# Missing Cabin values first get the placeholder -1 (fill_nan_values' default);
# every distinct value, placeholder included, is then replaced by an integer code.
data = fill_nan_values(data, 'Cabin')
data = factorize_column(data, 'Cabin')

In [31]:
# Same treatment for Embarked: placeholder -1 for missing values, then integer codes.
data = fill_nan_values(data, 'Embarked')
data = factorize_column(data, 'Embarked')

In [32]:
# Rescale Age and Fare with a MinMaxScaler fitted on this frame.
data = normalize_columns(data, ['Age', 'Fare'])

In [33]:
# One-hot the data
# NOTE(review): with no `columns` argument, get_dummies encodes only
# object/string/category columns, so the integer codes produced above for Cabin
# and Embarked presumably stay as single numeric columns — confirm that is intended.
data = pd.get_dummies(data)

# Train and validate on the training data

In [34]:
# Split data

# Keep the Survived target aside, drop it from the feature frame (in place),
# then hold out 30% of the rows as a dev set; random_state = 0 keeps the split
# reproducible.
output = data['Survived']
data = remove_columns(data, 'Survived')
X_train, X_dev, y_train, y_dev = train_test_split(data, output, test_size = 0.3, random_state = 0)

In [37]:
def show_scores(y, predicted, beta=0.5, summary=False):
    """Print evaluation metrics for a set of predictions.

    By default only scikit-learn's classification report is printed. The
    accuracy and F-beta scores are computed and printed only when `summary`
    is true; previously they were computed on every call and then discarded.

    Args:
        y: true labels.
        predicted: predicted labels, aligned with `y`.
        beta: beta passed to `fbeta_score` when `summary` is true
            (defaults to 0.5, the value that used to be hard-coded).
        summary: when true, print "Accuracy" and "f-beta" lines before the
            classification report.

    Returns:
        None. All results are written to stdout.
    """
    if summary:
        # Only pay for the extra metrics when they are actually displayed.
        acc = accuracy_score(y, predicted)
        fbeta = fbeta_score(y, predicted, beta=beta)
        print("Accuracy: {:.4f}".format(acc))
        print("f-beta: {:.4f}".format(fbeta))

    print(classification_report(y, predicted))

In [38]:
# Gaussian model
# Fit a Gaussian naive Bayes classifier on the training split and score it on
# both the training and the held-out dev split.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)

pred_train_nb = clf.predict(X_train)
pred_dev_nb = clf.predict(X_dev)

# NOTE(review): the "TEST" label is printed above the scores for the training
# split (y_train / pred_train_nb); "DEV" covers the held-out split.
print("TEST")
show_scores(y_train, pred_train_nb)
print("DEV")
show_scores(y_dev, pred_dev_nb)

TEST
             precision    recall  f1-score   support

          0       0.84      0.81      0.82       381
          1       0.72      0.75      0.73       242

avg / total       0.79      0.79      0.79       623

DEV
             precision    recall  f1-score   support

          0       0.86      0.82      0.84       168
          1       0.72      0.77      0.74       100

avg / total       0.81      0.80      0.80       268



In [39]:
# AdaBoost model
# Fit an AdaBoost classifier (fixed random_state) on the training split and
# score it on both the training and the held-out dev split.
from sklearn.ensemble import AdaBoostClassifier
clf_ada = AdaBoostClassifier(random_state = 0)
clf_ada.fit(X_train, y_train)

pred_train_ada = clf_ada.predict(X_train)
pred_dev_ada = clf_ada.predict(X_dev)

# NOTE(review): the "TEST" label is printed above the scores for the training
# split (y_train / pred_train_ada); "DEV" covers the held-out split.
print("TEST")
show_scores(y_train, pred_train_ada)
print("DEV")
show_scores(y_dev, pred_dev_ada)

TEST
             precision    recall  f1-score   support

          0       0.86      0.89      0.88       381
          1       0.81      0.78      0.80       242

avg / total       0.85      0.85      0.85       623

DEV
             precision    recall  f1-score   support

          0       0.84      0.87      0.85       168
          1       0.77      0.72      0.74       100

avg / total       0.81      0.81      0.81       268



# Prepare test data

In [49]:
# Load test data
# The path is relative to the notebook's working directory.
test_data = pd.read_csv('./data/test.csv')

In [50]:
# Remove some columns
# Same three columns that were dropped from the training frame. PassengerId is
# read again from the CSV later, when the submission file is built.
test_data = remove_columns(test_data, ['PassengerId', 'Name', 'Ticket'])

In [51]:
# NOTE(review): the median is recomputed from test_data here rather than reused
# from the training frame — confirm this is intended.
test_data = set_median(test_data, 'Age')

In [52]:
# Placeholder -1 for missing Cabin values, then integer codes.
# NOTE(review): factorize_column runs pd.factorize on this frame alone, and
# pd.factorize numbers values in order of first appearance, so a given Cabin
# value need not receive the same code it got in the training frame — verify.
test_data = fill_nan_values(test_data, 'Cabin')
test_data = factorize_column(test_data, 'Cabin')

In [53]:
# Placeholder -1 for missing Embarked values, then integer codes.
# NOTE(review): the codes are derived from this frame alone (order of first
# appearance), so they need not match the codes used in the training frame — verify.
test_data = fill_nan_values(test_data, 'Embarked')
test_data = factorize_column(test_data, 'Embarked')

In [56]:
# Missing Fare values are replaced with the default placeholder -1.
# NOTE(review): the training cells have no equivalent Fare step, and the
# placeholder then takes part in the min-max scaling of Fare below — confirm
# that -1 is an acceptable stand-in.
test_data = fill_nan_values(test_data, 'Fare')

In [58]:
# NOTE(review): normalize_columns fits a new MinMaxScaler on test_data, so Age
# and Fare are scaled by this frame's own min/max rather than by the scaler
# fitted on the training frame — confirm this is intended.
test_data = normalize_columns(test_data, ['Age', 'Fare'])

In [60]:
# One-hot the data
# NOTE(review): the dummies are generated from test_data independently of the
# training frame; the resulting columns have to match the ones the models were
# fitted on — verify.
test_data = pd.get_dummies(test_data)

# Predict

In [80]:
# Predict with the Gaussian naive Bayes model fitted earlier and print every
# predicted label.
p_gaussian = clf.predict(test_data)
print(p_gaussian)

[0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0
 1 1 0 1 0 1 0 1 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 1 1 0 1
 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 1 1 0 0 1 0 1 0 0 0 0 1 0 0 1 1 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 1 0 1 0 1 1 1 1 1 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 1 0 0
 1 0 0 0 0 0 0 0 1 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0 1 1 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 1 0 1 0 0 0 1 1 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 0 1 0 0
 1 1 1 1 1 1 0 1 0 0 0]


In [81]:
# Predict with the AdaBoost model fitted earlier and print every predicted
# label; these are the predictions written to the submission file.
p_ada = clf_ada.predict(test_data)
print(p_ada)

[0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 0 1 0 0 0 0 0 1 1 1 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 1 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 0 0 0 1 1 1 1 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 0 0 1 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 1 0 1 1 0 1 0 0 0 1 1 0 0 1 0 0 0 1 0
 1 0 1 1 0 1 0 0 1 1 0 0 1 0 0 0 1 1 1 1 1 0 0 1 1 0 1 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 1 1 1 0 0 1 0 0 0 1 0 0 0 0
 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0
 1 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 1 0 1 0 1 0 0 1 0 1 1 0 1 1 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 0 0 1 0
 0 1 1 1 1 1 0 1 0 0 0]


# Create submission file for Kaggle

In [83]:
# PassengerId was dropped from test_data during preparation, so it is read
# again from the original CSV; this relies on the row order being unchanged.
passengers_id = pd.read_csv('./data/test.csv')['PassengerId']
# The submission pairs each passenger id with the AdaBoost prediction.
submission = pd.DataFrame({'PassengerId':passengers_id,'Survived':p_ada})

filename = 'titanic_predictions_1.csv'

# Written to the current working directory, without the index column.
submission.to_csv(filename,index=False)

print(filename)

titanic_predictions_1.csv
