In [1]:
# Load the required libraries
import pandas as pd
import numpy as np
import csv as csv
from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


In [44]:
# read the training and test data
train_df = pd.read_csv("../data/train.csv", header=0)

In [54]:
test_df = pd.read_csv("../data/test.csv", header=0)

##### Exploratory data analysis

In [14]:
# check the data dimensions
# in R its dim(), in Python use pd.shape()
print (" data dimension for train data: ",train_df.shape)
print ("\n data dimension for test data: ",test_df.shape)

 data dimesnion for train data:  (891, 12)

 data dimesnion for test data:  (418, 11)


In [15]:
# check the data structure
# in R its str(), in Python its pd.info()
print (" training data structure: ",train_df.info)
print ("\n test data structure: ",test_df.info)

 training data structure:  <bound method DataFrame.info of      PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
5              6         0       3   
6              7         0       1   
7              8         0       3   
8              9         1       3   
9             10         1       2   
10            11         1       3   
11            12         1       1   
12            13         0       3   
13            14         0       3   
14            15         0       3   
15            16         1       2   
16            17         0       3   
17            18         1       2   
18            19         0       3   
19            20         1       3   
20            21         0       2   
21            22         1       2   
22            23         1       3   
23            24         1   

In [22]:
# check for missing values
# in R its colSums(is.na(dtaframe_name))
# in Python its isna() and if you want to get a total count use, isna().sum()
print ("Count of Missing values in training data\n", train_df.isna().sum() )
print ("\nCount of Missing values in test data\n", test_df.isna().sum() )

Count of Missing values in training data
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Count of Missing values in test data
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [26]:
# Show the column names 
# in R its colnames(dataframe name)
# in Python when using pandas its pandas_dataframe_name.dtypes.index
print("\n Column names for training data\n\n", train_df.dtypes.index)
print("\n Column names for test data\n\n", test_df.dtypes.index)


 Column names for training data

 Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

 Column names for test data

 Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


- As we can see, the column `Survived` is absent in the test data. 

##### Feature Engineering
- Add a new variable `gender` to train data. It will be 0 for male and 1 for female. It's derived from variable `sex`. 

In [46]:
train_df['gender'] = train_df['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

- Similarly, recode the other categorical variables like `Embarked` to numeric

In [47]:
if len(train_df.Embarked[ train_df.Embarked.isnull() ]) > 0:
    train_df.Embarked[ train_df.Embarked.isnull() ] = train_df.Embarked.dropna().mode().values

Ports = list(enumerate(np.unique(train_df['Embarked'])))    # determine all values of Embarked,
Ports_dict = { name : i for i, name in Ports }              # set up a dictionary in the form  Ports : index
train_df.Embarked = train_df.Embarked.map( lambda x: Ports_dict[x]).astype(int)     # Convert all Embark strings to int

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


##### Missing value imputation

In [48]:
# All the ages with no data -> make the median of all Ages
median_age = train_df['Age'].dropna().median()
if len(train_df.Age[ train_df.Age.isnull() ]) > 0:
    train_df.loc[ (train_df.Age.isnull()), 'Age'] = median_age

In [49]:
# Remove the Name column, Cabin, Ticket and Sex 
train_df = train_df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId'], axis=1) 

I need to do the same with the test data now, so that the columns are the same as the training data. I need to convert all strings to integer classifiers: # female = 0, Male = 1 

In [55]:
test_df['gender'] = test_df['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

In [56]:
if len(test_df.Embarked[ test_df.Embarked.isnull() ]) > 0:
    test_df.Embarked[ test_df.Embarked.isnull() ] = test_df.Embarked.dropna().mode().values

test_df.Embarked = test_df.Embarked.map( lambda x: Ports_dict[x]).astype(int)     # Convert all Embark strings to int

In [67]:
# All the ages with no data -> make the median of all Ages
median_age = test_df['Age'].dropna().median()
if len(test_df.Age[ test_df.Age.isnull() ]) > 0:
    test_df.loc[ (test_df.Age.isnull()), 'Age'] = median_age

In [58]:
# All the missing Fares -> assume median of their respective class
if len(test_df.Fare[ test_df.Fare.isnull() ]) > 0:
    median_fare = np.zeros(3)
    for f in range(0,3):                                              # loop 0 to 2
        median_fare[f] = test_df[ test_df.Pclass == f+1 ]['Fare'].dropna().median()
    for f in range(0,3):                                              # loop 0 to 2
        test_df.loc[ (test_df.Fare.isnull()) & (test_df.Pclass == f+1 ), 'Fare'] = median_fare[f]

# Collect the test data's PassengerIds before dropping it
ids = test_df['PassengerId'].values

In [59]:
# Remove the Name column, Cabin, Ticket and Sex 
test_df = test_df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId'], axis=1) 

In [68]:
# check for missing data again
print ("Count of Missing values in training data\n", train_df.isna().sum() )
print ("\nCount of Missing values in test data\n", test_df.isna().sum() )

Count of Missing values in training data
 Survived    0
Pclass      0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
gender      0
dtype: int64

Count of Missing values in test data
 Pclass      0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
gender      0
dtype: int64


In [69]:
train_df.values

array([[ 0.    ,  3.    , 22.    , ...,  7.25  ,  2.    ,  1.    ],
       [ 1.    ,  1.    , 38.    , ..., 71.2833,  0.    ,  0.    ],
       [ 1.    ,  3.    , 26.    , ...,  7.925 ,  2.    ,  0.    ],
       ...,
       [ 0.    ,  3.    , 28.    , ..., 23.45  ,  2.    ,  0.    ],
       [ 1.    ,  1.    , 26.    , ..., 30.    ,  0.    ,  1.    ],
       [ 0.    ,  3.    , 32.    , ...,  7.75  ,  1.    ,  1.    ]])

In [70]:
# The data is now ready to apply machine learning models. So lets fit to the train, then predict to the test!
# Convert back to a numpy array
train_data = train_df.values
test_data = test_df.values

In [71]:
print ('Training...\n')
forest = RandomForestClassifier(n_estimators=100)
forest = forest.fit( train_data[0::,1::], train_data[0::,0] )

print ('Predicting...\n')
output = forest.predict(test_data).astype(int)

Training...

Predicting...



##### Write the predictions to file

In [76]:
predictions_file = open('../data/MLmodel_random_forest.csv', 'w')
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(['PassengerId','Survived'])
open_file_object.writerows(zip(ids, output))
predictions_file.close()
print ('Done.')

Done.


#### Kaggle Score: 0.73684

#### Areas of further improvement

- Rather than dropping the columns, apply feature engineering to generate new columns
- try missing data imputation using ML algorithm strategies