In [None]:
# Loading data

import numpy as np
import pandas as pd

# Read the training and the test table from the local `input` directory.
train, test = [
    pd.read_csv(csv_path)
    for csv_path in ('input/train.csv', 'input/test.csv')
]

In [None]:
# Exploring data

# Target vector: the `Survived` column of the training table.
y_train = train.Survived

# Feature tables. Columns are removed through the `columns=` keyword: passing
# the axis positionally (`drop(labels, 1)`) is rejected by pandas >= 2.0.
X_train = train.drop(columns=['Survived', 'PassengerId', 'Ticket', 'Parch'])
X_test = test.drop(columns=['PassengerId', 'Ticket', 'Parch'])

# Types of data
print(X_train.dtypes,'\n')

# Percentage of survived people in train dataset
print('Survived',y_train.mean(), '\n')

print('X_train.shape = ', X_train.shape)

In [None]:
# Decoding names

def decodeTitle(row):
    """Return the honorific used as the `Title` feature for one passenger.

    Parameters
    ----------
    row : pandas.Series or sequence
        One passenger record. A Series that carries 'Name' and 'Sex' labels is
        read by label; anything else is read positionally, with the name at
        position 1 and the sex at position 2.

    Returns
    -------
    str
        The title found in the name when it is one of 'Mr.', 'Miss.', 'Mrs.'
        or 'Master.'; otherwise 'Miss.' for sex 'female' and 'Mr.' for 'male'.

    Raises
    ------
    KeyError
        If the title is not significant and the sex is neither 'female' nor
        'male'.
    """
    signif_titles = ('Mr.', 'Miss.', 'Mrs.', 'Master.')
    change_sex = {'female': 'Miss.', 'male': 'Mr.'}

    if isinstance(row, pd.Series):
        if 'Name' in row.index and 'Sex' in row.index:
            # Label access does not depend on the column order of the frame.
            name, sex = row['Name'], row['Sex']
        else:
            # Integer keys on a Series are labels, not positions; use iloc to
            # keep the positional contract explicit.
            name, sex = row.iloc[1], row.iloc[2]
    else:
        name, sex = row[1], row[2]

    # The title is the text between ", " and the first "." of the name.
    title = name[name.find(',') + 2:name.find('.') + 1]

    # Keep significant titles as they are.
    if title in signif_titles:
        return title

    # Any other title is replaced by the default title for the sex.
    return change_sex[sex]

# decodeTitle needs the whole passenger record (name and sex), so it is applied
# row-wise to the frame. Applying it to the 'Name' column alone hands it a bare
# string, whose [1] and [2] are single characters.
X_train['Title'] = X_train.apply(decodeTitle, axis=1)
X_test['Title'] = X_test.apply(decodeTitle, axis=1)

In [None]:
import re
# Decoding cabins
def number(string):
    """Return the mean of all integers embedded in a cabin string.

    Parameters
    ----------
    string : str or null
        Raw cabin value, e.g. 'C85' or 'C23 C25 C27'.

    Returns
    -------
    float or None
        Mean of every run of digits found in `string`; NaN when the string
        holds no digits at all; None when `string` is null (None / NaN).
    """
    if pd.isnull(string):
        return None

    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    numbers = [int(i) for i in re.findall(r'(\d+)', string)]

    # np.mean([]) also yields NaN but emits a RuntimeWarning; return it directly.
    if not numbers:
        return np.nan
    return np.mean(numbers)

# CabinSymbol: leading character of the cabin string, with 'X' standing in for
# a missing cabin. CabinNumber: whatever `number` extracts from the string.
X_train['CabinSymbol'] = X_train['Cabin'].str[0].fillna('X')
X_train['CabinNumber'] = X_train['Cabin'].apply(number)
X_test['CabinSymbol'] = X_test['Cabin'].str[0].fillna('X')
X_test['CabinNumber'] = X_test['Cabin'].apply(number)

# The raw text columns are no longer needed once the features are extracted.
# `columns=` is used because pandas >= 2.0 rejects a positional axis argument.
X_train = X_train.drop(columns=['Name', 'Cabin'])
X_test = X_test.drop(columns=['Name', 'Cabin'])

In [None]:
# Transforming data

numeric_cols = ['Age', 'Fare', 'CabinNumber']
# Every remaining column is treated as categorial. Built in frame order so the
# list is the same on every run (a set difference has no defined order).
categorial_cols = [col for col in X_train.columns if col not in numeric_cols]

# Split both data sets into a numeric and a categorial part.
# `columns=` is used because pandas >= 2.0 rejects a positional axis argument.
X_num = X_train.drop(columns=categorial_cols)
X_cat = X_train.drop(columns=numeric_cols)
X_test_num = X_test.drop(columns=categorial_cols)
X_test_cat = X_test.drop(columns=numeric_cols)

print('X_cat.shape', X_cat.shape)
print('X_num.shape', X_num.shape)

# Report which parts still contain missing values.
print('X_cat.isnull =', X_cat.isnull().values.any())
print('X_num.isnull =', X_num.isnull().values.any())
print('X_test_cat.isnull =', X_test_cat.isnull().values.any())
print('X_test_num.isnull =', X_test_num.isnull().values.any())

In [None]:
# Getting rid of missing values in numeric columns

# Fill values are the means of the *training* columns (NaN entries are skipped
# by Series.mean); the same values are reused for the test set.
missingAge = X_num['Age'].mean()
missingFare = X_num['Fare'].mean()
missingCabinNumber = X_num['CabinNumber'].mean()

numeric_fill = {
    'Age': missingAge,
    'Fare': missingFare,
    'CabinNumber': missingCabinNumber,
}
X_num = X_num.fillna(numeric_fill)
X_test_num = X_test_num.fillna(numeric_fill)

print('Missing Age Value =', missingAge)
print('Missing Fare Value =', missingFare)
print('Missing Cabin Number Value =', missingCabinNumber)

# Getting rid of missing values in categorial data.
# The test frame is filled as well, so that a missing category is represented
# by the same 'NA' token in both data sets.
X_cat = X_cat.fillna('NA')
X_test_cat = X_test_cat.fillna('NA')

print('X_num.isnull =', X_num.isnull().values.any())
print('X_cat.isnull =', X_cat.isnull().values.any())
print('X_test_cat.isnull =', X_test_cat.isnull().values.any())
print('X_test_num.isnull =', X_test_num.isnull().values.any())

In [None]:
# Reshaping categorical data according to feature selection (see L1_FeaturesSelection.ipynb)

def cabinSymbolValue(s):
    """Map a cabin symbol to its integer code.

    'X', 'C', 'D' and 'E' map to 1, 2, 3 and 4 respectively; every other
    value maps to 0.
    """
    codes = {
        'X': 1,
        'C': 2,
        'D': 3,
        'E': 4,
    }
    return codes[s] if s in codes else 0
    
def _reshapeCategorial(frame):
    """Return `frame` with its text categories replaced by reduced features.

    'Title', 'Embarked' and 'CabinSymbol' are removed and replaced by:
      * Married            - True where Title is 'Mrs.'
      * EmbarkedC          - True where Embarked is 'C'
      * CabinSymbolReduced - integer code given by cabinSymbolValue
    The input frame is left untouched.
    """
    # `columns=` is used because pandas >= 2.0 rejects a positional axis.
    reshaped = frame.drop(columns=['CabinSymbol', 'Embarked', 'Title'])
    reshaped['Married'] = (frame['Title'] == 'Mrs.').values
    reshaped['EmbarkedC'] = (frame['Embarked'] == 'C').values
    # Element-wise map instead of frame['CabinSymbol'][i]: that lookup is
    # label-based and only works while the index is exactly 0..n-1.
    reshaped['CabinSymbolReduced'] = frame['CabinSymbol'].map(cabinSymbolValue).values
    return reshaped


# The same transformation is applied to the test and the training data.
X_test_cat = _reshapeCategorial(X_test_cat)
X_cat = _reshapeCategorial(X_cat)

X_cat.head(10)

In [None]:
# Reshaping numerical data (according to experiment and common sense)

# CabinNumber is removed from both numeric tables.
# `columns=` is used because pandas >= 2.0 rejects a positional axis argument.
X_num = X_num.drop(columns=['CabinNumber'])
X_test_num = X_test_num.drop(columns=['CabinNumber'])

X_num.head()

In [None]:
# Encoding categorial data

from sklearn.feature_extraction import DictVectorizer

# Every row is turned into a {column: value} dict (transpose, then to_dict)
# and fed to a dense DictVectorizer. The encoder is fitted on the training
# rows only and then reused, unchanged, for the test rows.
encoder = DictVectorizer(sparse=False)
X_cat_oh, X_test_cat_oh = (
    encoder.fit_transform(X_cat.T.to_dict().values()),
    encoder.transform(X_test_cat.T.to_dict().values()),
)

print(X_cat_oh.shape, '\n')
print(X_cat_oh, '\n')

In [None]:
#Shuffling data
from sklearn.utils import shuffle
# Numeric and encoded categorial columns are stacked side by side; rows and
# labels are then permuted together with a fixed seed.
X_shuffled, y_shuffled = shuffle(
    np.hstack((X_num, X_cat_oh)),
    y_train,
    random_state=8921,
)
print('X_shuffled.shape =', np.shape(X_shuffled))

In [None]:
# Random Forest Classifier parameter search
from sklearn.ensemble import RandomForestClassifier

In [None]:
# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18; the
# old sklearn.grid_search module was removed in 0.20.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:  # scikit-learn < 0.18
    from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Exhaustive search over forest size, tree depth and the fraction of features
# considered per split, scored with 15-fold cross-validation on all cores.
param_grid = {
    'max_features': [0.4, 0.6, 0.8, 1.0],
    'max_depth': [5, 10, 15, 25, 35, 45, 55, 65],
    'n_estimators': [1, 5, 10, 15, 30, 50, 70, 90, 100],
}

# The forest is seeded so that the best score and parameters are reproducible
# from one run to the next.
optimizer = GridSearchCV(
    RandomForestClassifier(random_state=8921),
    param_grid,
    cv=15,
    n_jobs=-1,
)
optimizer.fit(X_shuffled, y_shuffled)
print('Random Forest Best Result', optimizer.best_score_)
print('Best Parameters', optimizer.best_params_)

In [None]:
def writeAnswer(filename, optimizer):
    """Predict on the test features and save the result as a CSV file.

    The feature matrix is built from the module-level `X_test_num` and
    `X_test_cat_oh`; passenger ids come from the module-level `test` frame.

    Parameters
    ----------
    filename : path of the CSV file to write (written without the index).
    optimizer : fitted object exposing `predict`.
    """
    test_features = np.hstack([X_test_num, X_test_cat_oh])
    predicted = optimizer.predict(test_features)
    columns = {
        "PassengerId": test["PassengerId"],
        "Survived": predicted,
    }
    pd.DataFrame(columns).to_csv(filename, index=False)
    
# Save the predictions of the fitted grid search.
writeAnswer(filename='RandomForestClassifier.csv', optimizer=optimizer)