In [1]:
import pandas as pd
import numpy as np

# Directory that holds the Kaggle Titanic CSV files.
wdir = '/Users/dwhitehead/Documents/github/kaggle/titanic/'

# Row 0 of both files is the header row, so header=0 is passed explicitly.
# The generator reads train.csv first and test.csv second.
df, df_test = (
    pd.read_csv(wdir + csv_name, header=0)
    for csv_name in ('train.csv', 'test.csv')
)

# Quick look at the raw frames (uncomment while exploring):
# print 'train'
# print df.sample(5)
# print 'test'
# print df_test.sample(5)

In [2]:
def category_to_factor(df, category_col, categories=None):
    """Add an integer-coded copy of a categorical column to ``df``.

    A new column named ``category_col + '_int'`` is written into ``df``
    (the frame is modified in place) and the same frame is returned.

    Codes are assigned from the *sorted* distinct non-missing values, so the
    coding no longer depends on the order in which values happen to appear in
    the rows. Two frames that contain the same set of categories therefore
    receive identical codes. Pass ``categories`` explicitly to pin the coding
    when the frames may not contain the same set of values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``category_col``.
    category_col : str
        Name of the column to encode.
    categories : iterable, optional
        Ordered category values; the position in this sequence is the code.
        Defaults to the sorted distinct non-missing values of the column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the extra ``<category_col>_int`` integer column.
        Missing values, and values not listed in ``categories``, all share
        the single code ``len(categories)``.
    """
    column = df[category_col]
    if categories is None:
        # Sorting makes the value -> code mapping independent of row order.
        categories = sorted(column.dropna().unique())
    else:
        categories = list(categories)

    mapper = dict((value, code) for code, value in enumerate(categories))

    # Unmapped entries come back as NaN from .map(); give them one extra code
    # placed after the real categories so the result can be cast to int.
    codes = column.map(mapper).fillna(len(categories)).astype(int)
    df[category_col + '_int'] = codes

    return df

In [3]:
# clean data:
def clean_data(df):
    """Turn a raw Titanic frame into an all-numeric feature frame.

    Reads the columns ``Sex``, ``Age``, ``Pclass``, ``Fare``, ``SibSp``,
    ``Parch`` and ``Embarked`` and returns a new frame in which:

    * ``Gender`` is ``Sex`` coded as female=0 / male=1,
    * ``AgeFill`` is ``Age`` with missing values replaced by the median age
      of the passenger's (Gender, Pclass) group,
    * ``AgeIsNull`` is 1 where ``Age`` was missing, else 0,
    * missing ``Fare`` values are replaced by the median fare of the
      passenger's ``Pclass``,
    * ``FamilySize`` is ``SibSp + Parch`` and ``Age*Class`` is
      ``AgeFill * Pclass``,
    * ``Embarked_int`` is added by ``category_to_factor``,
    * every object-dtype column and the original ``Age`` column are dropped.

    ``AgeFill`` and ``AgeIsNull`` are always created, even when no age is
    missing, so every cleaned frame has the same columns in the same order.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with the columns listed above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Notes
    -----
    The ``astype(int)`` on ``Gender`` fails if ``Sex`` holds anything other
    than 'female' or 'male', because unmapped values become NaN.
    """
    # Work on a copy so the caller's frame is not left half-transformed.
    df = df.copy()

    # gender to binary
    df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)

    ### null handling ###
    ## Age
    age_missing = df['Age'].isnull()
    # Median age of each row's (Gender, Pclass) group, aligned to the rows.
    # Grouping on the actual values avoids assuming codes 0..n-1 / 1..n.
    group_median_age = df.groupby(['Gender', 'Pclass'])['Age'].transform('median')
    df['AgeFill'] = df['Age'].fillna(group_median_age)
    # null age as binary
    df['AgeIsNull'] = age_missing.astype(int)

    ## Fare
    # Median fare per passenger class (not per distinct fare value).
    class_median_fare = df.groupby('Pclass')['Fare'].transform('median')
    df['Fare'] = df['Fare'].fillna(class_median_fare)

    # feature engineering
    df['FamilySize'] = df['SibSp'] + df['Parch']
    df['Age*Class'] = df['AgeFill'] * df['Pclass']
    df = category_to_factor(df, 'Embarked')

    # Drop everything that is still non-numeric text, then the raw Age column.
    object_cols = [col for col, dtype in zip(df.columns, df.dtypes) if dtype == 'object']
    df = df.drop(object_cols, axis=1)
    df = df.drop(['Age'], axis=1)

    return df

In [4]:
# reorder dependent variable:
def reorder_depvar(df, depvar):
    """Return ``df`` with the column ``depvar`` moved to the first position.

    All other columns keep their relative order. The input frame is not
    modified; a re-indexed frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be reordered.
    depvar : str
        Name of the dependent-variable column to put first.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``[depvar, <remaining columns in original order>]``.
    """
    ordered_cols = [depvar] + [col for col in df.columns if col != depvar]
    # DataFrame.reindex_axis is deprecated and absent from current pandas;
    # reindex(columns=...) is the supported equivalent.
    df = df.reindex(columns=ordered_cols)

    return df

In [5]:
# Training set: clean it, move the label column 'Survived' to position 0,
# then take the plain ndarray used for fitting.
df = reorder_depvar(clean_data(df), 'Survived')
train_data = df.values

# Test set: same cleaning; keep the passenger ids alongside the feature array
# so predictions can be paired with them when writing the output file.
df_test = clean_data(df_test)
PassengerIds = df_test['PassengerId'].values
test_data = df_test.values

In [6]:
# print df.Gender.unique()
# print df.describe()
# print df_test.describe()

In [7]:
# Import the random forest package
from sklearn.ensemble import RandomForestClassifier 

# Number of independently fitted forests whose predictions are collected in `l`
n_forests = 500

l = []
for seed in range(0, n_forests):
    # Create the random forest object which will include all the parameters
    # for the fit. Each forest gets its own fixed seed so that the forests
    # still differ from one another but a re-run reproduces the same
    # predictions.
    forest = RandomForestClassifier(n_estimators = 100, random_state = seed)

    # Fit the training data to the Survived labels and create the decision trees
    # (column 0 of train_data is the label, the remaining columns are features)
    forest = forest.fit(train_data[:, 1:], train_data[:, 0])

    # Take the same decision trees and run it on the test data
    output = forest.predict(test_data)
    l.append(output)

In [8]:
# Element-wise count of "survived" votes over all fitted forests
sum_output = np.sum(l, axis=0)

# Share of forests voting "survived" for each passenger. The divisor is the
# number of prediction arrays actually collected rather than a hard-coded 500,
# and it is a float so the share is not floored by Python 2 integer division
# when the predictions are integer-typed.
avg_output = sum_output / float(len(l))

# Majority vote; a share of exactly 0.5 counts as survived
final_output = (avg_output >= .5).astype(int)

# print avg_output
print('%d %s %s' % (len(final_output), type(final_output), final_output))
# print test_data[0,0::]
print('%d %s %s' % (len(PassengerIds), type(PassengerIds), PassengerIds))

import csv
# "wb" is the mode the Python 2 csv module expects; the with-block closes the
# file even if one of the writes raises.
with open("randomforrest.csv", "wb") as predictions_file:
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId","Survived"])
    open_file_object.writerows(zip(PassengerIds, final_output))
print('Done.')


# One row per passenger: [PassengerId, Survived]. Concatenating along axis 0
# would instead append the predictions after the ids in a single flat vector.
final_array = np.array([PassengerIds, final_output]).T

print(final_array)

# print l[0]==l[1]
# print l[0]
# print l[1]
# print l[0] + l[1]

418 <type 'numpy.ndarray'> [0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1 1 1 1 1 0 1 0 1 0 0 0 0 0 1 0 0
 0 0 0 0 1 0 1 1 0 1 0 1 0 1 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 1 0 0 0 1
 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 0 1 1 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 1 0 1 0 0 0 0 0 1 0 1 1 0 1 1 0 0 0 1 0 1 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0
 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 1 1 0 1 0 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 0
 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 1 0 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 1 0
 0 1 0 0 1 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 1 0 0
 0 0 1 0 1 0 0 1 0 0 0]
418 <type 'numpy.ndarray'> [ 892  893  894  895  896  897  898  899  900  901  902  903  904  905  906
  907  908  909  910 