In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pylab as P
from random import sample
from sklearn.ensemble import RandomForestClassifier

In [2]:
# IPython magic: draw matplotlib figures inline, directly below the cell that produces them
%matplotlib inline

In [3]:
## Load the Kaggle Titanic training set
# NOTE(review): `mypath` is an absolute, machine-specific directory. It is also used when
# the test set is loaded, so its name and value are left unchanged here.
mypath = '/Users/dominicdebiaso/Development/datasets/'
df = pd.read_csv(mypath + 'kaggle_titanic_train.csv', header=0)

## Lower-case every column name
# `.str.lower()` returns an Index directly. `map(str.lower, ...)` is a lazy iterator on
# Python 3, which is not accepted as column labels by every pandas version.
df.columns = df.columns.str.lower()

#### EDA

In [4]:
## Referencing and Filtering
# df.info()
# df.describe()
# df['Age'][0:10]
# df.Age[0:10]
# df.Age.mean()
# df.sort(['age'])

## Subsetting df
# df[['sex', 'pclass', 'age']]
# df[df.age > 60]

## Combine subsetting to filter data
# df[df.age > 60][['sex', 'pclass', 'age']]

## Determine the values that have nulls
# df[df.age.isnull()][['sex', 'pclass', 'age']]

## Determine values based on 'sex' and each 'pclass'
# Count the number of times 'male' appears in the column
# len(df[(df.sex == 'male')])
# for i in range(1,4):
#     print i, len(df[(df.sex == 'male') & (df.pclass == i)])
# Produces same output as above
# for i in range(1,4):
#     i = len(df[(df.sex == 'male') & (df.pclass == i)])
#     print i

## Create an 'age' distribution 
# df.age.hist()
# P.show()
# df.age.dropna().hist(bins=16, range=(0,80), alpha = 0.5)

## Frequency tables
# df.groupby(['sex', 'pclass']).size()
# df.groupby(['sex', 'pclass']).describe()

## Create a random sampling of data
# df.ix[np.array(sample(xrange(len(df)),5))]

#### Data Cleansing

In [5]:
## Create a binary 'gender' column from 'sex': female -> 0, male -> 1
# Built in a single step: an intermediate first-letter encoding ('F'/'M') would be
# overwritten by this mapping immediately, so it is not computed.
# `.astype(int)` raises if any 'sex' value is missing or not one of the two keys.
df['gender'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)

In [6]:
## Impute into a separate column so the raw 'age' values stay available
df['age_fill'] = df['age']

## Peek at rows without a recorded age (result is not stored)
age_missing = df['age'].isnull()
df.loc[age_missing, ['gender', 'pclass', 'age', 'age_fill']].head()

## median_ages[g, c] = median recorded age for gender code g and pclass c+1
median_ages = np.zeros((2, 3))
for gender_code in (0, 1):
    for class_idx, pclass in enumerate((1, 2, 3)):
        in_group = (df['gender'] == gender_code) & (df['pclass'] == pclass)
        median_ages[gender_code, class_idx] = df.loc[in_group, 'age'].dropna().median()
        # Medians come from 'age' while the fill writes to 'age_fill', so filling
        # one group cannot influence the median of another.
        df.loc[age_missing & in_group, 'age_fill'] = median_ages[gender_code, class_idx]

## Indicator column: 1 where the age was missing in the raw data, else 0
df['age_is_null'] = age_missing.astype(int)

In [7]:
## Using fare to fill in gaps on embarked
# Encode the port of embarkation: S -> 0, C -> 1, Q -> 2. Missing or unmapped ports stay
# NaN in 'embarked_num' (the column is float whenever any are missing); the imputed
# values go into 'embarked_fill' below.
df['embarked_num'] = df['embarked'].map({'S': 0, 'C': 1, 'Q': 2})

## Based on 'embarked' location, get the average fare value
# avgfare_embark[0, j] is the mean fare of passengers with port code j.
avgfare_embark = np.zeros((1, 3))
for j in range(0, 3):
    # Series.mean() skips NaN fares; a port with no passengers yields NaN.
    avgfare_embark[0, j] = df.loc[df['embarked_num'] == j, 'fare'].mean()

df['embarked_fill'] = df['embarked_num']

## Rule 1 (unchanged): unknown port and fare above 50 -> port code 1 ('C')
df.loc[df['embarked_num'].isnull() & (df['fare'] > 50), 'embarked_fill'] = 1

## Rule 2: any row still unresolved gets the port whose mean fare is closest to its fare
# Without this, a passenger with an unknown port and fare <= 50 keeps NaN in
# 'embarked_fill', and that NaN ends up in the array handed to the model.
# Rows whose fare is itself missing cannot be matched and are left as NaN.
unresolved = df['embarked_fill'].isnull() & df['fare'].notnull()
known_ports = np.flatnonzero(~np.isnan(avgfare_embark[0]))
if unresolved.any() and known_ports.size:
    # (n_rows, n_known_ports) matrix of |fare - mean fare of port|
    fare_gap = np.abs(df.loc[unresolved, 'fare'].values[:, None] - avgfare_embark[0, known_ports])
    df.loc[unresolved, 'embarked_fill'] = known_ports[fare_gap.argmin(axis=1)]

# len(df[(df['embarked'] == 'C') & (df['fare'] > 70) & (df['sibsp'] == 0) & (df['parch'] == 0) & (df['pclass'] == 1)])

#### Feature Engineering

In [8]:
## Create a family-size feature by combining two variables:
## siblings/spouses aboard ('sibsp') plus parents/children aboard ('parch')
df['family_size'] = df['sibsp'].add(df['parch'])

## Age and pclass are treated as predictors of survival, so their product is added
## as an extra feature to amplify them
df['age*class'] = df['age_fill'].mul(df['pclass'])

#### Model Prep

In [9]:
## The frame is about to become a plain array, so check which columns hold strings
## (two equivalent spellings of the same check; neither result is stored)
is_object = df.dtypes == 'object'
df.dtypes[is_object]
df.dtypes[df.dtypes.map(lambda dtype: dtype == 'object')]

## Drop the string columns and the helper/raw columns not used by the model;
## 'axis=1' means the labels name columns, not rows
unused_cols = ['passengerid', 'name', 'sex', 'age', 'age_is_null', 'ticket', 'cabin', 'embarked', 'embarked_num']
df_array = df.drop(unused_cols, axis=1)

## Convert the remaining columns to an array
train_data = df_array.values

#### Test Dataset Preprocessing

In [10]:
## Load test dataset (read from the `mypath` directory)
df_test = pd.read_csv(mypath + 'kaggle_titanic_test.csv', header=0)

# Lower-case the column names. `.str.lower()` returns an Index directly, whereas
# `map(str.lower, ...)` is a lazy iterator on Python 3, which is not accepted as
# column labels by every pandas version.
df_test.columns = df_test.columns.str.lower()

## Determine null values
# df_test.isnull().any()
# df_test.isnull().sum()
# df_test[df_test['age'].isnull()].head()

## Gender
# Binary encoding of 'sex': female -> 0, male -> 1. Built in a single step: an
# intermediate first-letter encoding would be overwritten by this mapping immediately.
# `.astype(int)` raises if any 'sex' value is missing or not one of the two keys.
df_test['gender'] = df_test['sex'].map({'female': 0, 'male': 1}).astype(int)

## Age
# Impute into 'age_fill' so the raw 'age' column is left untouched.
df_test['age_fill'] = df_test['age']

# median_ages_test[i, j] = median recorded age in the test set for gender code i, pclass j+1
median_ages_test = np.zeros((2,3))
for i in range(0, 2):
    for j in range(0, 3):
        in_group = (df_test['gender'] == i) & (df_test['pclass'] == j+1)
        median_ages_test[i,j] = df_test.loc[in_group, 'age'].dropna().median()
        # Fill from the test-set table built here. The fill used to read `median_ages`,
        # the table from the training cell, which left `median_ages_test` unused.
        df_test.loc[df_test['age'].isnull() & in_group, 'age_fill'] = median_ages_test[i,j]

## Embarked
# Encode the port of embarkation: S -> 0, C -> 1, Q -> 2.
# A `.dropna().astype(int)` chain is deliberately not applied: assigning the shortened
# result back to the frame re-aligns on the index and restores NaN for the dropped rows.
# NOTE(review): a missing or unmapped port stays NaN here and is not imputed.
df_test['embarked_fill'] = df_test['embarked'].map({'S': 0,'C': 1,'Q': 2})

## Fare
# 'fare_fill' is a snapshot of the raw fares. It supplies the per-class means and marks
# which rows were missing, while the imputed value is written into 'fare' itself.
df_test['fare_fill'] = df_test['fare']
fare_missing = df_test['fare_fill'].isnull()

# avgfare[0, c] = mean recorded fare for pclass c+1
avgfare = np.zeros((1, 3))
for class_idx in range(3):
    in_class = df_test['pclass'] == class_idx + 1
    avgfare[0, class_idx] = df_test.loc[in_class, 'fare_fill'].dropna().mean()
    # Means are read from 'fare_fill', which is never modified, so filling 'fare'
    # inside the same loop does not affect later means.
    df_test.loc[fare_missing & in_class, 'fare'] = avgfare[0, class_idx]

## Feature Engineering
# Family size: siblings/spouses aboard plus parents/children aboard
df_test['family_size'] = df_test['sibsp'].add(df_test['parch'])

# Product of the imputed age and the passenger class
df_test['age*class'] = df_test['age_fill'].mul(df_test['pclass'])
        
## Check which columns hold strings
## (two equivalent spellings of the same check; neither result is stored)
is_object_test = df_test.dtypes == 'object'
df_test.dtypes[is_object_test]
df_test.dtypes[df_test.dtypes.map(lambda dtype: dtype == 'object')]

## Drop the string columns and the helper/raw columns not used by the model;
## 'axis=1' means the labels name columns, not rows
unused_test_cols = ['passengerid', 'name', 'sex', 'age', 'ticket', 'cabin', 'embarked', 'fare_fill']
df_test_array = df_test.drop(unused_test_cols, axis=1)

## Convert the remaining columns to an array
test_data = df_test_array.values

#### Random Forests

In [11]:
## Initialize the random forest: 500 trees
# `random_state` is pinned so that re-running the notebook produces the same forest and
# the same predictions; without it every run draws a different random seed.
forest = RandomForestClassifier(n_estimators=500, random_state=0)

## Fit training data to survived labels and create trees
# Column 0 of train_data is used as the label, every remaining column as a feature.
train_features = train_data[:, 1:]
train_labels = train_data[:, 0]
forest = forest.fit(train_features, train_labels)

## Score trained set
# forest.score(train_features, train_labels)

## Take same forest and run on test data
output = forest.predict(test_data).astype(int)

In [12]:
# Write the predictions to csv: one row per test passenger, id plus predicted label
# NOTE(review): the destination is an absolute, machine-specific path.
submission_columns = {'PassengerId': df_test['passengerid'], 'Survived': output}
output_results = pd.DataFrame(submission_columns)
output_results.to_csv("/Users/dominicdebiaso/Desktop/kaggle_titanic_model.csv", index=False)

In [13]:
# https://www.kaggle.com/c/titanic/details/getting-started-with-python-ii
# https://www.kaggle.com/c/titanic/details/getting-started-with-random-forests