In [142]:
%pylab inline
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
pd.options.display.max_rows = None

Populating the interactive namespace from numpy and matplotlib


In [143]:
# Quick look at the schema of each downloaded CSV (columns, dtypes, null counts)
csv_paths = ['downloaded/train.csv', 'downloaded/test.csv']
for csv_path in csv_paths:
    print('\nsummary for {}'.format(csv_path))
    pd.read_csv(csv_path).info()
    



summary for downloaded/train.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB

summary for downloaded/test.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
T

In [144]:
def do_preprocessing(df):
    ''' Put DataFrame df in a nice format ready for scikit-learn.

    The work is done on a copy, so the frame passed in is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Sex', 'Fare' and 'Embarked'. Any other
        columns are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index in which:
        * 'Sex' is mapped to 0 (female) / 1 (male); other values become NaN.
        * missing 'Fare' values are replaced by the mean of the known fares
          of this same frame.
        * 'Embarked' is replaced by an integer code (Q=0, S=1, C=2,
          missing=3); values outside that set become NaN.
        * four float indicator columns 'embarked_q', 'embarked_s',
          'embarked_c' and 'embarked_missing' are appended, in that order.
    '''

    # Note I don't do feature scaling as not needed for a decision tree but
    # may want to add in later

    sex_mapping = {
        'female': 0,
        'male': 1,
    }
    # (raw label, integer code, indicator column name); a tuple keeps the
    # column order fixed.
    embarked_ports = (
        ('Q', 0, 'embarked_q'),
        ('S', 1, 'embarked_s'),
        ('C', 2, 'embarked_c'),
        ('Missing', 3, 'embarked_missing'),
    )

    # reset_index returns a new frame, so nothing below touches the caller's
    # data, and the positional 0..n-1 index is in place before any column is
    # assigned.
    out = df.reset_index(drop=True)

    out['Sex'] = out['Sex'].map(sex_mapping)

    # for missing Fare put in the mean
    print(out['Fare'].values.dtype)
    out['Fare'] = out['Fare'].fillna(out['Fare'].mean())

    # Mean-imputing the missing Age values was tried and made the score worse,
    # so Age is left as it is.
    # An idea that might be better is to fill in the mean based on similar
    # rows, for example the mean Age per (Pclass, Sex) group.

    # Turn Embarked Q, S, C into binary columns
    embarked_mapping = dict((label, code) for label, code, _ in embarked_ports)
    out['Embarked'] = out['Embarked'].fillna('Missing').map(embarked_mapping)
    for _, code, column in embarked_ports:
        # An unmapped (NaN) code compares False everywhere, giving all zeros.
        out[column] = (out['Embarked'] == code).astype(float)

    return out


In [145]:
# These are the actual features the algorithm uses. The other columns
# are ignored for various reasons.
training_columns = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare']

df_train_raw = pd.read_csv('downloaded/train.csv')
df_train = do_preprocessing(df_train_raw)

# Hold out 30% of the labelled rows for evaluation; the seed keeps the split fixed.
x_train, x_test, y_train, y_test = train_test_split(
    df_train[training_columns],
    df_train['Survived'],
    test_size=0.3,
    random_state=0
)


float64


In [146]:
# Not using the Decision Tree, but kept it commented out as it could be useful.

# tree = DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=0)
# tree.fit(x_train.as_matrix(), y_train.as_matrix())
# y_pred = tree.predict(x_test)
# Accuracy: 0.806
# print('Accuracy for 70/30 split: %.3f' % accuracy_score(y_test, y_pred))

In [147]:
# TODO should try a grid search for the Decision Tree parameters
# Seed the forest so the reported accuracy is reproducible between runs.
forest = RandomForestClassifier(random_state=0)

# Fit on the 70% training split and score on the held-out 30% split.
# NOTE(review): an earlier, truncated comment here called this score
# "bogus (too high)"; as written the score is computed on rows the forest
# was not fitted on.
# Plain arrays are used for both fit and predict so the two calls see the
# same kind of input (.values replaces the removed DataFrame.as_matrix()).
forest.fit(x_train.values, y_train.values)
y_pred = forest.predict(x_test.values)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Accuracy: 0.832


# Last step: generate the file to send to Kaggle

In [148]:
# I'm going to fit my random forest against everything for the final prediction file for
# kaggle. Although there might be some overfitting it may increase the score.
x_everything = do_preprocessing(pd.read_csv('downloaded/train.csv'))
y_everything = x_everything['Survived']
# .values replaces DataFrame.as_matrix(), which newer pandas no longer provides.
forest.fit(x_everything[training_columns].values, y_everything.values)

# Finally do prediction for test.csv
df_test_raw = pd.read_csv('downloaded/test.csv')
df_test = do_preprocessing(df_test_raw)
# Use a separate name so the held-out x_test from the train/test split is not
# overwritten; the accuracy cell can then still be re-run after this one.
x_kaggle = df_test[training_columns]
result = forest.predict(x_kaggle.values)

# Last step is to output the results to a csv with the columns Kaggle expects.
# assign() builds a new frame, so there is no write into a slice of df_test and
# no need to switch off pandas' chained-assignment warning globally.
df_for_file = df_test[['PassengerId']].assign(Survived=result)
df_for_file.to_csv("out.csv", header=['PassengerId', 'Survived'], index=False)

float64
float64
