In [1]:
# Loading data

import numpy as np
import pandas as pd

# Read the training table first, then the test table, from the local
# input/ directory.
train, test = (pd.read_csv(csv_path)
               for csv_path in ('input/train.csv', 'input/test.csv'))

In [2]:
# Exploring data

# Target vector: the 'Survived' column of the training table.
y_train = train.Survived

# Columns left out of both feature tables; the target column is additionally
# removed from the training table.
dropped_cols = ['PassengerId', 'Ticket', 'Parch']

# `axis=1` is passed by keyword: the positional form `drop(labels, 1)` was
# deprecated in pandas 1.3 and raises a TypeError since pandas 2.0.
X_train = train.drop(['Survived'] + dropped_cols, axis=1)
X_test = test.drop(dropped_cols, axis=1)

# Column dtypes of the training features
print(X_train.dtypes,'\n')

# Fraction (0..1) of survivors in the training set
print('Survived', float(sum(y_train))/float(len(y_train)), '\n')

print('X_train.shape = ', X_train.shape)

Pclass        int64
Name         object
Sex          object
Age         float64
SibSp         int64
Fare        float64
Cabin        object
Embarked     object
dtype: object 

Survived 0.3838383838383838 

X_train.shape =  (891, 8)


In [3]:
# Decoding names

def decodeTitle(name, sex):
    """Reduce a passenger to one of the titles 'Mr.', 'Mrs.' or 'Miss.'.

    ``name`` is searched for the literal substrings 'Mr.', 'Mrs.' and
    'Miss.', in that order, and the first one present is returned.  When
    none occurs the title is derived from ``sex`` instead: 'male' gives
    'Mr.' and 'female' gives 'Miss.'.  Any other ``sex`` value yields
    ``None``.

    Note kept from the original code: "in test data 125 Mrs., 182 Miss,
    517 Mr." -- these counts cannot be confirmed from this notebook alone.
    """
    # Order matters: 'Mr.' is tested before 'Mrs.' and 'Miss.'.
    for title in ('Mr.', 'Mrs.', 'Miss.'):
        if title in name:
            return title

    # No explicit title in the name: fall back on the passenger's sex.
    if sex == 'male':
        return 'Mr.'
    return 'Miss.' if sex == 'female' else None

# One decoded title per passenger, in row order, for each table.  Rows are
# looked up by label 0..n-1, exactly as many labels as the table has rows.
title_train = [decodeTitle(X_train.Name[row], X_train.Sex[row])
               for row in range(X_train.shape[0])]
title_test = [decodeTitle(X_test.Name[row], X_test.Sex[row])
              for row in range(X_test.shape[0])]

In [4]:
# Decoding cabins

def decodeCabin(name):
    """Split a raw ``Cabin`` value into a deck symbol and a cabin number.

    Parameters
    ----------
    name : str or null
        Whitespace-separated cabin codes such as ``'C23 C25 C27'``.  The
        first character of each code is taken as the deck symbol and the
        remainder as the cabin number.

    Returns
    -------
    tuple
        ``(symbol, number)``.  ``symbol`` is the first character of the
        first code.  ``number`` is the mean of the numeric parts of all
        codes, or ``np.nan`` if any code has no numeric part (for example
        ``'F G73'``).  Null, empty or whitespace-only input yields
        ``('X', np.nan)``.

    Raises
    ------
    ValueError
        If the text after a code's first character is not a valid number.
    """
    if pd.isnull(name):
        return 'X', np.nan

    # split() without an argument discards leading, trailing and repeated
    # whitespace, so no empty token can reach the indexing below (splitting
    # on a single ' ' raised IndexError for '' or ' C85' and turned
    # 'C22  C26' into NaN).
    cabins = name.split()
    if not cabins:
        return 'X', np.nan

    symbol = cabins[0][0]
    averageCabin = 0.0
    for cabin in cabins:
        digits = cabin[1:]
        if digits == '':
            # A bare deck letter makes the average undefined for the whole
            # entry, whatever the other codes contain.
            return symbol, np.nan
        # Accumulate each code's share of the mean.
        averageCabin = averageCabin + float(digits) / float(len(cabins))
    return symbol, averageCabin

# Decode every Cabin entry once per table, then separate the resulting
# (symbol, number) pairs into two parallel lists.
cabin_pairs_train = [decodeCabin(X_train.Cabin[row])
                     for row in range(X_train.shape[0])]
cabinSym_train = [pair[0] for pair in cabin_pairs_train]
cabinNum_train = [pair[1] for pair in cabin_pairs_train]

cabin_pairs_test = [decodeCabin(X_test.Cabin[row])
                    for row in range(X_test.shape[0])]
cabinSym_test = [pair[0] for pair in cabin_pairs_test]
cabinNum_test = [pair[1] for pair in cabin_pairs_test]

In [5]:
# Putting decoded data into the tables

# Append the decoded columns to each table and drop the raw text columns they
# were derived from.  The new frames get the default 0..n-1 index, which is
# also the index of the tables read from CSV, so the join is row-by-row.
# `columns=` pins the column order regardless of dict ordering.
decoded_cols = ['CabinSymbol', 'CabinNumber', 'Title']

extra_train = pd.DataFrame({'CabinSymbol': cabinSym_train,
                            'CabinNumber': cabinNum_train,
                            'Title': title_train},
                           columns=decoded_cols)
# `axis=1` is passed by keyword: the positional form `drop(labels, 1)` raises
# a TypeError since pandas 2.0.
X_train = X_train.join(extra_train).drop(['Name', 'Cabin'], axis=1)

extra_test = pd.DataFrame({'CabinSymbol': cabinSym_test,
                           'CabinNumber': cabinNum_test,
                           'Title': title_test},
                          columns=decoded_cols)
X_test = X_test.join(extra_test).drop(['Name', 'Cabin'], axis=1)

In [6]:
# Transforming data

numeric_cols = ['Age', 'Fare', 'CabinNumber']
# Every other column is treated as categorical.  The comprehension keeps the
# table's column order, whereas a set difference gives an arbitrary order.
categorial_cols = [col for col in X_train.columns.values.tolist()
                   if col not in numeric_cols]

# `axis=1` is passed by keyword: the positional form `drop(labels, 1)` raises
# a TypeError since pandas 2.0.
X_num = X_train.drop(categorial_cols, axis=1)
X_cat = X_train.drop(numeric_cols, axis=1)
X_test_num = X_test.drop(categorial_cols, axis=1)
X_test_cat = X_test.drop(numeric_cols, axis=1)

print('X_cat.shape', X_cat.shape)
print('X_num.shape', X_num.shape)

# Report which of the four tables still contain missing values.
print('X_cat.isnull =', X_cat.isnull().values.any())
print('X_num.isnull =', X_num.isnull().values.any())
print('X_test_cat.isnull =', X_test_cat.isnull().values.any())
print('X_test_num.isnull =', X_test_num.isnull().values.any())

X_cat.shape (891, 6)
X_num.shape (891, 3)
X_cat.isnull = True
X_num.isnull = True
X_test_cat.isnull = False
X_test_num.isnull = True


In [7]:
# Getting rid of missing values in numeric columns

# Imputation constants are the training-set column means.  Series.mean()
# skips NaN entries explicitly instead of relying on np.mean dispatching to
# it.  The same constants are reused for the test table.
missingAge = X_num['Age'].mean()
missingFare = X_num['Fare'].mean()
missingCabinNumber = X_num['CabinNumber'].mean()

fill_values = {'Age': missingAge,
               'Fare': missingFare,
               'CabinNumber': missingCabinNumber}

# fillna with a dict fills each listed column with its own value and returns
# a new frame, so no column is assigned on a possibly shared object.
X_num = X_num.fillna(fill_values)
X_test_num = X_test_num.fillna(fill_values)

print('Missing Age Value =', missingAge)
print('Missing Fare Value =', missingFare)
print('Missing Cabin Number Value =', missingCabinNumber)

# Getting rid of missing values in categorical data.  The test table gets the
# same placeholder so both tables go through identical preprocessing even if
# the test data contains gaps.
X_cat = X_cat.fillna('NA')
X_test_cat = X_test_cat.fillna('NA')

print('X_num.isnull =', X_num.isnull().values.any())
print('X_cat.isnull =', X_cat.isnull().values.any())
print('X_test_cat.isnull =', X_test_cat.isnull().values.any())
print('X_test_num.isnull =', X_test_num.isnull().values.any())

Missing Age Value = 29.69911764705882
Missing Fare Value = 32.2042079685746
Missing Cabin Number Value = 50.28316326530612
X_num.isnull = False
X_cat.isnull = False
X_test_cat.isnull = False
X_test_num.isnull = False


In [8]:
# Reshaping categorical data according to feature selection (see L1_FeaturesSelection.ipynb)

def cabinSymbolValue(s):
    """Return the integer code of a cabin deck symbol.

    'X' -> 1 ('X' is what decodeCabin emits for a missing cabin),
    'C' -> 2, 'D' -> 3, 'E' -> 4.  Every other value maps to 0.
    """
    codes = dict(X=1, C=2, D=3, E=4)
    if s in codes:
        return codes[s]
    return 0
    
def _reduceCategorical(frame):
    """Return a copy of ``frame`` with its three text columns reduced.

    Adds 'Married' (True where Title == 'Mrs.'), 'EmbarkedC' (True where
    Embarked == 'C') and 'CabinSymbolReduced' (cabinSymbolValue applied to
    CabinSymbol), then drops 'CabinSymbol', 'Embarked' and 'Title'.
    """
    reduced = frame.copy()
    reduced['Married'] = (frame['Title'] == 'Mrs.').values
    reduced['EmbarkedC'] = (frame['Embarked'] == 'C').values
    reduced['CabinSymbolReduced'] = [cabinSymbolValue(symbol)
                                     for symbol in frame['CabinSymbol']]
    # `axis=1` is passed by keyword: the positional form `drop(labels, 1)`
    # raises a TypeError since pandas 2.0.
    return reduced.drop(['CabinSymbol', 'Embarked', 'Title'], axis=1)

# The same reduction is applied to the test and the training table.
X_test_cat = _reduceCategorical(X_test_cat)
X_cat = _reduceCategorical(X_cat)

X_cat.head(10)

Unnamed: 0,Pclass,Sex,SibSp,Married,EmbarkedC,CabinSymbolReduced
0,3,male,1,False,False,1
1,1,female,1,True,True,2
2,3,female,0,False,False,1
3,1,female,1,True,False,2
4,3,male,0,False,False,1
5,3,male,0,False,False,1
6,1,male,0,False,False,4
7,3,male,3,False,False,1
8,3,female,0,True,False,1
9,2,female,1,True,True,1


In [9]:
# Reshaping numerical data (according to experiment and common sense)

# CabinNumber is removed from the numeric features of both tables.
# `axis=1` is passed by keyword: the positional form `drop(labels, 1)` raises
# a TypeError since pandas 2.0.
X_num = X_num.drop(['CabinNumber'], axis=1)
X_test_num = X_test_num.drop(['CabinNumber'], axis=1)

X_num.head()

Unnamed: 0,Age,Fare
0,22.0,7.25
1,38.0,71.2833
2,26.0,7.925
3,35.0,53.1
4,35.0,8.05


In [10]:
# Encoding categorical data

from sklearn.feature_extraction import DictVectorizer

# Turn each row into a {column: value} dict (transpose, then to_dict keyed by
# row label) and feed those dicts to a dense DictVectorizer.  The encoder is
# fitted on the training rows only and then reused for the test rows.
train_row_dicts = X_cat.T.to_dict().values()
test_row_dicts = X_test_cat.T.to_dict().values()

encoder = DictVectorizer(sparse = False)
X_cat_oh = encoder.fit_transform(train_row_dicts)
X_test_cat_oh = encoder.transform(test_row_dicts)

print (X_cat_oh.shape, '\n')
print (X_cat_oh, '\n')

(891, 7) 

[[ 1.  0.  0. ...,  0.  1.  1.]
 [ 2.  1.  1. ...,  1.  0.  1.]
 [ 1.  0.  0. ...,  1.  0.  0.]
 ..., 
 [ 1.  0.  0. ...,  1.  0.  1.]
 [ 2.  1.  0. ...,  0.  1.  0.]
 [ 1.  0.  0. ...,  0.  1.  0.]] 



In [11]:
#Shuffling data
from sklearn.utils import shuffle
# Stack the numeric and the encoded categorical features column-wise, then
# shuffle the feature matrix and the labels together with a fixed seed.
X_all = np.hstack([X_num, X_cat_oh])
X_shuffled, y_shuffled = shuffle(X_all, y_train, random_state = 8921)
print('X_shuffled.shape =', X_shuffled.shape)

X_shuffled.shape = (891, 9)


In [12]:
# Random Forest Classifier parameter search
from sklearn.ensemble import RandomForestClassifier

In [13]:
from sklearn.ensemble import RandomForestClassifier
# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18; the
# old sklearn.grid_search module was removed in 0.20.  The fallback only
# serves installations that predate model_selection.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# Hyper-parameter grid: 4 * 8 * 9 = 288 candidate settings, each scored with
# 15-fold cross-validation.
param_grid = {'max_features' : [0.4, 0.6, 0.8, 1.0],
              'max_depth' : [5, 10, 15, 25, 35, 45, 55, 65],
              'n_estimators' : [1, 5, 10, 15, 30, 50, 70, 90, 100]}

# A fixed random_state makes the forests, and therefore best_score_ and
# best_params_, repeatable from run to run.
optimizer = GridSearchCV(RandomForestClassifier(random_state=8921),
                         param_grid,
                         cv=15,
                         n_jobs=-1)
optimizer.fit(X_shuffled, y_shuffled)
print('Random Forest Best Result', optimizer.best_score_)
print('Best Parameters', optimizer.best_params_)

Random Forest Best Result 0.841750841751
Best Parameters {'max_features': 0.4, 'max_depth': 10, 'n_estimators': 50}


In [14]:
def writeAnswer(filename, optimizer):
    """Predict on the test features and save the result as a CSV file.

    Reads the notebook-level ``X_test_num``, ``X_test_cat_oh`` and ``test``.
    The numeric and encoded categorical test features are stacked
    column-wise and passed to ``optimizer.predict``; the predictions are
    written to ``filename`` next to the test 'PassengerId' column, without
    the row index.
    """
    test_features = np.hstack([X_test_num, X_test_cat_oh])
    predicted = optimizer.predict(test_features)
    columns = {"PassengerId": test["PassengerId"], "Survived": predicted}
    pd.DataFrame(columns).to_csv(filename, index=False)
    
# Write the predictions of the fitted grid search to the submission file.
writeAnswer(filename='RandomForestClassifier.csv', optimizer=optimizer)