In [5]:
import pandas as pd
import numpy as np
import sys 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 
# from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.metrics import confusion_matrix 

In [6]:

    
def get_marital_status(row):
    """Return True when a passenger row carries a married-woman title.

    Parameters
    ----------
    row : object with ``Sex`` and ``Name`` attributes
        Typically one row of the passenger frame, as passed by
        ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    bool
        True only when ``row.Sex`` is ``'female'`` and the name contains the
        title ``'Mrs.'`` or ``'Mme.'``; False in every other case, including
        all non-female rows.
    """
    if row.Sex != 'female':
        # Always answer with a real boolean rather than falling through to None.
        return False
    # str() keeps a missing (NaN) name from raising on the substring test.
    name = str(row.Name)
    return 'Mrs.' in name or 'Mme.' in name
    
def get_deck_from_cabin(cabin):
    """Map a cabin identifier to a numeric deck code.

    The deck is taken from the first character of ``cabin`` and looked up in a
    fixed letter-to-integer table.

    Parameters
    ----------
    cabin : str or missing value
        Cabin label such as ``'C85'``.

    Returns
    -------
    int or None
        The deck code, or None when ``cabin`` is empty, is not a string
        (e.g. NaN/None for a missing cabin), or starts with a letter that is
        not in the table.
    """
    decks = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "O": 8, "T": 9, "U": 10}
    # Missing cabins arrive from pandas as float NaN, which has no len().
    if not isinstance(cabin, str) or not cabin:
        return None
    # .get() turns an unlisted deck letter into None instead of a KeyError.
    return decks.get(cabin[0])

def get_dataframe(file_name):
    """Load a passenger CSV and add the engineered feature columns.

    Parameters
    ----------
    file_name : str or path-like
        CSV file to read. It must provide the ``Name``, ``Sex``, ``Parch`` and
        ``SibSp`` columns.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with these changes:

        * ``Name`` cast to ``str``.
        * ``Married`` (bool): True for female rows whose name contains
          ``'Mrs.'`` or ``'Mme.'``.
        * ``LastName``: the part of ``Name`` before the first ``', '``.
        * ``Sex`` recoded to ``0`` (male) / ``1`` (female); any other value
          becomes NaN.
        * ``Relatives``: ``Parch + SibSp``.
    """
    print('Filename: ', file_name)
    df = pd.read_csv(file_name)

    # Assign the cast back; calling apply(str) without assignment is a no-op.
    df['Name'] = df['Name'].astype(str)

    # Vectorised title check (literal substring match, not a regex).
    has_married_title = (
        df['Name'].str.contains('Mrs.', regex=False, na=False)
        | df['Name'].str.contains('Mme.', regex=False, na=False)
    )
    df['Married'] = ((df['Sex'] == 'female') & has_married_title).astype('bool')

    df['LastName'] = df['Name'].str.split(', ').str[0]

    # Deck (derived from Cabin) was tried and dropped: too many unknown values
    # to be useful.

    genders = {'male': 0, 'female': 1}
    # Assign the recoded column explicitly: an in-place replace on `df.Sex`
    # acts on a temporary object and is not guaranteed to reach `df`.
    df['Sex'] = df['Sex'].map(genders)

    df['Relatives'] = df['Parch'] + df['SibSp']
    return df

def encode_columns(df):
    """Return a model-ready copy of ``df`` with categorical columns one-hot encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Age``, ``Pclass`` and ``LastName``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which missing ``Age`` values are filled with the
        column mean and cast to ``int``, and ``Pclass`` / ``LastName`` are
        replaced by one indicator column per observed category, labelled
        ``'<column>_<category>'``. The original index is preserved.
    """
    object_cols = ['Pclass', 'LastName']
    df_copy = df.copy()
    df_copy['Age'] = df_copy['Age'].fillna(df_copy['Age'].mean()).astype(int)

    # The encoder's default output is a SciPy sparse matrix. Densify it
    # explicitly instead of passing `sparse=False`, a keyword that newer
    # scikit-learn releases no longer accept.
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoded = encoder.fit_transform(df_copy[object_cols]).toarray()

    # Use string labels for the indicator columns: newer scikit-learn rejects
    # frames whose column labels mix str and int.
    indicator_names = [
        f'{col}_{category}'
        for col, categories in zip(object_cols, encoder.categories_)
        for category in categories
    ]
    # Re-attach the original index so the concat below aligns row for row.
    cols_df = pd.DataFrame(encoded, index=df_copy.index, columns=indicator_names)

    # Drop the raw categorical columns; the indicators replace them.
    num_X_df = df_copy.drop(object_cols, axis=1)
    return pd.concat([num_X_df, cols_df], axis=1)

def get_model(df, test_size=0.3, random_state=None):
    """Fit a random forest on a hold-out split, print its accuracy, and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared passenger frame containing ``Pclass``, ``Sex``, ``LastName``,
        ``Age`` and the ``Survived`` target.
    test_size : float, default 0.3
        Fraction of rows held out for scoring.
    random_state : int or None, default None
        Seed passed to both the split and the forest. Leave as None for the
        unseeded behaviour; pass an int to make the printed accuracy
        reproducible.

    Returns
    -------
    sklearn.ensemble.RandomForestClassifier
        The classifier fitted on the training portion of the split.
    """
    columns = ['Pclass', 'Sex', 'LastName', 'Age']
    X = df[columns]
    y = df['Survived']
    # NOTE: the encoding is fitted on all rows before the split, so the
    # held-out rows' categories are already known to the encoder.
    OH_X = encode_columns(X)

    X_train, X_test, y_train, y_test = train_test_split(
        OH_X, y, test_size=test_size, random_state=random_state
    )

    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    # Return the fitted model so the caller can reuse it.
    return clf
    
def get_predictions(df_train, df_valid):
    """Train on ``df_train`` and predict survival for every row of ``df_valid``.

    Features are ``Pclass``, ``Sex``, ``LastName`` and ``Age``. ``Married``,
    ``Cabin``, ``Relatives`` and ``Deck`` were tried and did not improve
    accuracy.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Prepared frame with the feature columns and ``Survived``.
    df_valid : pandas.DataFrame
        Prepared frame with the feature columns and ``PassengerId``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``PassengerId`` (taken from ``df_valid``) and
        ``Survived`` (predicted label as ``int`` 0/1).

    Neither input frame is modified.
    """
    columns = ['Pclass', 'Sex', 'LastName', 'Age']
    object_cols = ['Pclass', 'LastName']

    # Work on copies of the feature subset so the caller's frames stay intact.
    X_train = df_train[columns].copy()
    X_valid = df_valid[columns].copy()
    # Each frame's missing ages are filled with that frame's own mean.
    X_train['Age'] = X_train['Age'].fillna(X_train['Age'].mean()).astype(int)
    X_valid['Age'] = X_valid['Age'].fillna(X_valid['Age'].mean()).astype(int)
    y_train = df_train['Survived'].astype('bool')

    # Fit the encoder on the training rows only. With handle_unknown='ignore',
    # categories seen only in df_valid encode as all-zero indicators.
    # The default output is a SciPy sparse matrix, so densify it explicitly
    # instead of passing `sparse=False`, a keyword that newer scikit-learn
    # releases no longer accept.
    OH_encoder = OneHotEncoder(handle_unknown='ignore')
    train_encoded = OH_encoder.fit_transform(X_train[object_cols]).toarray()
    valid_encoded = OH_encoder.transform(X_valid[object_cols]).toarray()

    # Use string labels for the indicator columns: newer scikit-learn rejects
    # frames whose column labels mix str and int.
    indicator_names = [
        f'{col}_{category}'
        for col, categories in zip(object_cols, OH_encoder.categories_)
        for category in categories
    ]
    # Re-attach each frame's index so the concat below aligns row for row.
    OH_cols_train = pd.DataFrame(train_encoded, index=X_train.index, columns=indicator_names)
    OH_cols_valid = pd.DataFrame(valid_encoded, index=X_valid.index, columns=indicator_names)

    # Swap the raw categorical columns for their indicator columns.
    OH_X_train = pd.concat([X_train.drop(object_cols, axis=1), OH_cols_train], axis=1)
    OH_X_valid = pd.concat([X_valid.drop(object_cols, axis=1), OH_cols_valid], axis=1)

    clf = RandomForestClassifier()
    clf.fit(OH_X_train, y_train)
    y_pred = clf.predict(OH_X_valid)

    # Take the ids from the `df_valid` argument, not from a notebook global.
    output = pd.DataFrame({'PassengerId': df_valid['PassengerId'], 'Survived': y_pred})
    output['Survived'] = output['Survived'].astype(int)
    return output

In [8]:
# Fit and score a model on the labelled training file.
get_model(df=get_dataframe(file_name='train.csv'))
print('Done')

Filename:  train.csv
Accuracy: 0.8097014925373134
Done




In [9]:
# Build the submission: train on the labelled file, predict for the test file.
df_train = get_dataframe('train.csv')
df_test = get_dataframe('test.csv')
# to_csv() returns None when given a path, so keep the frame in `predictions`
# and write it out as a separate step.
predictions = get_predictions(df_train, df_test)
predictions.to_csv('submission.csv', index=False)
print('Done')

Filename:  train.csv
Filename:  test.csv
Done


