Training a support vector machine to predict which passengers survived the Titanic shipwreck.


In [1]:
from sklearn.preprocessing import StandardScaler
import random
import pandas as pd
import numpy as np
import os 


def preprocess_data(data, isTestData):
    """Clean, one-hot encode and scale a raw passenger table.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table. Must contain the columns 'Name', 'Ticket', 'Cabin',
        'PassengerId', 'Age', 'Fare' and 'Embarked'.
    isTestData : bool
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped identifier columns, with missing
        numeric values interpolated, categorical columns one-hot encoded
        and 'Age' / 'Fare' standardized.
    """
    data = data.drop(['Name', 'Ticket', 'Cabin', 'PassengerId'], axis=1)

    # Fill numeric NaNs (e.g. Age) by linear interpolation along the rows.
    # Only numeric columns are interpolated: calling interpolate() on a frame
    # that still holds object columns is deprecated in recent pandas.
    # limit_direction='both' also fills NaNs in the very first rows, which a
    # forward-only interpolation would leave in place.
    numeric_columns = data.select_dtypes(include='number').columns
    data[numeric_columns] = data[numeric_columns].interpolate(
        method='linear', axis=0, limit_direction='both'
    )

    # One port letter is drawn at random per call and used for every
    # missing 'Embarked' entry.
    fallback_port = ['S', 'C', 'Q'][random.randint(0, 2)]
    data['Embarked'] = data['Embarked'].replace(np.nan, fallback_port)

    data = pd.get_dummies(data)

    # Standardize the continuous features and write them back in place.
    # NOTE: the scaler is fitted on whatever frame is passed in, so separate
    # calls (e.g. train and test) are each scaled with their own statistics.
    continuous = ['Age', 'Fare']
    data[continuous] = StandardScaler().fit_transform(data[continuous].values)
    return data

def get_target_variable(data):
    """Split a preprocessed frame into features and the 'Survived' target.

    No train/validation split is performed here: the whole frame is used
    for fitting the final model.

    Returns
    -------
    tuple
        ``(train_X, train_y)`` where ``train_X`` is ``data`` without the
        'Survived' column and ``train_y`` is a one-column DataFrame holding
        only 'Survived'.
    """
    target_column = 'Survived'
    train_y = data[[target_column]]
    train_X = data.drop(columns=[target_column])
    return train_X, train_y

def convert_to_numpy(data):
    """Convert a pandas object (or any array-like) into a numpy array.

    A 2-D input with exactly one column (such as a one-column target
    DataFrame) is flattened to a 1-D array. Every other input keeps its
    shape, so a feature matrix that happens to contain a single row stays
    2-D instead of being collapsed into a 1-D vector.
    """
    array = np.array(data)
    # Only drop the column axis; a blanket squeeze() would also remove a
    # length-1 row axis and turn a (1, k) feature matrix into shape (k,).
    if array.ndim == 2 and array.shape[1] == 1:
        return array[:, 0]
    return array
        

def train_model(X_train, y_train):
    """Fit a support vector classifier with default settings.

    Returns the fitted ``SVC`` instance.
    """
    from sklearn.svm import SVC

    # fit() hands back the estimator itself, so the fitted model can be
    # returned directly.
    return SVC().fit(X_train, y_train)

def make_predictions(X_test, model):
    """Return whatever ``model.predict`` produces for ``X_test``."""
    return model.predict(X_test)

In [2]:
def main():
    """Train on ./train.csv, predict ./test.csv and write the submission.

    Reads both CSV files from the current directory, fits the model on the
    preprocessed training data, predicts 'Survived' for the test rows and
    saves the result to 'my_submission.csv'.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'PassengerId' (from the test file) and the predicted
        'Survived' value.
    """
    ## training portion
    training_data = preprocess_data(pd.read_csv('./train.csv'), False)
    train_X, train_y = get_target_variable(training_data)
    fitted_model = train_model(convert_to_numpy(train_X),
                               convert_to_numpy(train_y))

    ## making predictions on test data
    test_data = pd.read_csv('./test.csv')
    # Take an explicit copy: adding a column to a bare column selection of
    # test_data triggers pandas' SettingWithCopyWarning.
    output = test_data[['PassengerId']].copy()
    test_features = convert_to_numpy(preprocess_data(test_data, True))
    output['Survived'] = make_predictions(test_features, fitted_model)

    output.to_csv('my_submission.csv', index=False)
    print('output saved as my_submission.csv')
    return output

In [3]:
# Run the whole train/predict pipeline; main() returns the submission DataFrame.
main()

output saved as my_submission.csv


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
