# Imports

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')
sns.set(style="darkgrid")
%config InlineBackend.figure_format = 'retina'

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the train and test CSV files from the ../input directory.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# Overview

In [None]:
# Report the shape tuple of each dataset.
print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')

In [None]:
# ``DataFrame.info()`` writes its report to stdout and returns None, so it is
# called directly; wrapping it in ``print`` would also print a stray "None".
train.info()
print('_'*40)
test.info()

# EDA, data cleaning and feature engineering

In [None]:
# Count of passengers per value of the target column.
fig, ax = plt.subplots()
# The column is passed as ``x=``: seaborn 0.12+ accepts only ``data`` positionally,
# so a positional column name clashes with the ``data=`` keyword.
sns.countplot(x='Survived', data=train, ax=ax)
ax.set_title('Passengers survived')

In [None]:
# Passenger counts per Pclass, split by Survived.
f, ax = plt.subplots()
# The column is passed as ``x=``: seaborn 0.12+ accepts only ``data`` positionally.
sns.countplot(x='Pclass', hue='Survived', data=train, ax=ax)
ax.set_title('Survived over Pclass')
# Place the legend outside the axes, to the right of the plot.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# Mean of Survived and row count per Pclass.
# A plain list of aggregations is used: dict-based renaming on a single column
# raises SpecificationError ("nested renamer is not supported") in pandas >= 1.0.
train.groupby('Pclass')['Survived'].agg(['mean', 'count']).reset_index()

In [None]:
# Passenger counts per Sex, split by Survived.
f, ax = plt.subplots()
# The column is passed as ``x=``: seaborn 0.12+ accepts only ``data`` positionally.
sns.countplot(x='Sex', hue='Survived', data=train, ax=ax)
ax.set_title('Survived over sex')
# Place the legend outside the axes, to the right of the plot.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# Mean of Survived and row count per Sex.
# A plain list of aggregations is used: dict-based renaming on a single column
# raises SpecificationError ("nested renamer is not supported") in pandas >= 1.0.
train.groupby('Sex')['Survived'].agg(['mean', 'count']).reset_index()

In [None]:
# Kernel density estimate of Age, drawn separately for each Survived value.
fig, ax = plt.subplots()
for survived_value, color, label in [(0, 'gray', 'Not survived'), (1, 'g', 'Survived')]:
    # Age still contains missing values at this point; drop them before estimating.
    ages = train.loc[train['Survived'] == survived_value, 'Age'].dropna()
    # NOTE(review): newer seaborn deprecates ``shade`` in favour of ``fill`` --
    # switch once the minimum supported seaborn version allows it.
    sns.kdeplot(ages, color=color, shade=True, label=label, ax=ax)

ax.set_xlabel("Age")
# A KDE shows probability density, not a survival percentage.
ax.set_ylabel('Density')
ax.set_title('Survived over age')
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# Count plot of Sex coloured by Pclass, with one facet column per Survived value.
sns.catplot(x='Sex', hue='Pclass', col="Survived", data=train, kind='count', height=5, aspect=.8)

In [None]:
# Passenger counts per SibSp value, split by Survived.
f, ax = plt.subplots()
# The column is passed as ``x=``: seaborn 0.12+ accepts only ``data`` positionally.
sns.countplot(x='SibSp', hue='Survived', data=train, ax=ax)
ax.set_title('Survived over SibSp')
# Place the legend outside the axes, to the right of the plot.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# Mean of Survived and row count per SibSp value.
# A plain list of aggregations is used: dict-based renaming on a single column
# raises SpecificationError ("nested renamer is not supported") in pandas >= 1.0.
train.groupby('SibSp')['Survived'].agg(['mean', 'count']).reset_index()

In [None]:
# Passenger counts per Parch value, split by Survived.
f, ax = plt.subplots()
# The column is passed as ``x=``: seaborn 0.12+ accepts only ``data`` positionally.
sns.countplot(x='Parch', hue='Survived', data=train, ax=ax)
ax.set_title('Survived over Parch')
# Place the legend outside the axes, to the right of the plot.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# Mean of Survived and row count per Parch value.
# A plain list of aggregations is used: dict-based renaming on a single column
# raises SpecificationError ("nested renamer is not supported") in pandas >= 1.0.
train.groupby('Parch')['Survived'].agg(['mean', 'count']).reset_index()

In [None]:
#adding family features
def getFamilySize(row):
    """Return ``row['SibSp'] + row['Parch'] + 1`` for a single row."""
    sib_sp = row["SibSp"]
    parch = row["Parch"]
    return sib_sp + parch + 1

def getIsAlone(row):
    """Return 1 when the row's ``FamilySize`` equals 1, otherwise 0."""
    return 1 if row['FamilySize'] == 1 else 0

def fam_size(train, test):
    """Add family-derived features to both frames and drop the raw counts.

    For each frame this adds:

    * ``FamilySize`` -- ``SibSp + Parch + 1``
    * ``IsAlone``    -- 1 when ``FamilySize`` is 1, else 0
    * ``FamilyType`` -- 'Solo' (no relatives), 'Nuclear' (1-3) or 'Big' (more)

    and then deletes ``SibSp`` and ``Parch``. Both frames are modified in
    place and also returned as ``(train, test)``.
    """
    for frame in (train, test):
        relatives = frame['SibSp'] + frame['Parch']
        # Column arithmetic replaces the row-wise ``apply(axis=1)`` calls:
        # same values, without a Python-level call per row.
        frame['FamilySize'] = relatives + 1
        frame['IsAlone'] = (frame['FamilySize'] == 1).astype(int)
        frame['FamilyType'] = np.where(relatives == 0, 'Solo',
                                       np.where(relatives <= 3, 'Nuclear', 'Big'))
        del frame['SibSp']
        del frame['Parch']
    return train, test

# Add the family features to both frames, then preview the first training rows.
train, test = fam_size(train, test)
train.head()

In [None]:
# Passenger counts per FamilySize, split by Survived.
f, ax = plt.subplots()
# The column is passed as ``x=``: seaborn 0.12+ accepts only ``data`` positionally.
sns.countplot(x='FamilySize', hue='Survived', data=train, ax=ax)
ax.set_title('Survived over FamilySize')
# Place the legend outside the axes, to the right of the plot.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# Mean of Survived and row count per FamilySize.
# A plain list of aggregations is used: dict-based renaming on a single column
# raises SpecificationError ("nested renamer is not supported") in pandas >= 1.0.
train.groupby('FamilySize')['Survived'].agg(['mean', 'count']).reset_index()

In [None]:
# Passenger counts per IsAlone flag, split by Survived.
f, ax = plt.subplots()
# The column is passed as ``x=``: seaborn 0.12+ accepts only ``data`` positionally.
sns.countplot(x='IsAlone', hue='Survived', data=train, ax=ax)
ax.set_title('Survived over IsAlone')
# Place the legend outside the axes, to the right of the plot.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# Mean of Survived and row count per IsAlone flag.
# A plain list of aggregations is used: dict-based renaming on a single column
# raises SpecificationError ("nested renamer is not supported") in pandas >= 1.0.
train.groupby('IsAlone')['Survived'].agg(['mean', 'count']).reset_index()

In [None]:
#adding name features
def names(train, test):
    """Derive ``NameLen`` and ``NameTitle`` from ``Name`` in both frames.

    ``NameLen`` is the character length of the name. ``NameTitle`` is the
    first whitespace-separated token of the text following the first comma.
    The ``Name`` column is deleted afterwards. Both frames are modified in
    place and returned as ``(train, test)``.
    """
    def _title_of(full_name):
        # Text after the first comma, then its first whitespace-delimited token.
        after_comma = full_name.split(',')[1]
        return after_comma.split()[0]

    for frame in (train, test):
        raw_names = frame['Name']
        frame['NameLen'] = raw_names.apply(len)
        frame['NameTitle'] = raw_names.apply(_title_of)
        del frame['Name']
    return train, test


# Add the name features to both frames.
train, test = names(train, test)
# Cross-tabulate the extracted titles against Sex.
pd.crosstab(train['NameTitle'], train['Sex'])

In [None]:
# Mean Age for each extracted title.
train.loc[:, ["NameTitle", "Age"]].groupby("NameTitle", as_index=False).mean()

In [None]:
#filling missing age data
def age_impute(train, test):
    """Flag and fill missing ``Age`` values in both frames.

    ``Age_Flag`` is set to 1 where ``Age`` was missing and 0 elsewhere.
    Missing ages are then filled with the mean training age of the row's
    ``(NameTitle, Pclass)`` group. Rows whose group has no known age in the
    training data fall back to the overall training mean.

    The group means are computed once from ``train`` before anything is
    filled. Each frame only has its own missing entries filled; the previous
    implementation assigned the transformed *training* ages to ``test`` by
    index, overwriting every test age.

    Both frames are modified in place and returned as ``(train, test)``.
    """
    keys = ['NameTitle', 'Pclass']
    group_means = train.groupby(keys)['Age'].mean().rename('_group_mean')
    overall_mean = train['Age'].mean()

    for frame in (train, test):
        frame['Age_Flag'] = frame['Age'].isnull().astype(int)
        # Look up each row's group mean; the join keeps the frame's own index.
        fill_values = frame[keys].join(group_means, on=keys)['_group_mean']
        frame['Age'] = frame['Age'].fillna(fill_values).fillna(overall_mean)
    return train, test

# Impute ages in both frames, then show the column summaries of each.
train, test = age_impute(train, test)

train.info()
print('_'*40)
test.info()

In [None]:
#adding cabin features
def cabin(train, test):
    """Replace ``Cabin`` with two derived columns in both frames.

    * ``ExistCabin``  -- 1 when ``Cabin`` is present, 0 when it is missing
    * ``CabinLetter`` -- first character of ``str(Cabin)`` (``'n'`` for NaN,
      because ``str(nan)`` is ``'nan'``)

    The ``Cabin`` column is deleted afterwards. Both frames are modified in
    place and returned as ``(train, test)``.
    """
    for frame in (train, test):
        cabins = frame['Cabin']
        # Vectorised null check instead of a row-wise ``apply(axis=1)``.
        frame['ExistCabin'] = cabins.notnull().astype(int)
        frame['CabinLetter'] = cabins.apply(lambda value: str(value)[0])
        del frame['Cabin']
    return train, test


def _cabin_number(cabin):
    """Return the number of the last cabin listed in ``cabin``, or NaN if there is none."""
    if pd.isnull(cabin):
        return np.nan
    # Last space-separated cabin, with its leading letter removed.
    digits = str(cabin).split(' ')[-1][1:]
    return int(digits) if digits.isdigit() else np.nan


def cabin_num(train, test):
    """Append one-hot tertile indicators of the cabin number to both frames.

    The cabin number is parsed from the last cabin listed in ``Cabin``.
    Tertile edges are computed from the training numbers only and then applied
    to each frame's own numbers. The previous implementation assigned the
    *training* bins to ``test`` by index. Numbers outside the training range
    are clipped into the first or last bin, and rows without a cabin number
    get zeros in every indicator column.

    Returns new ``(train, test)`` frames with the ``Cabin_num_*`` columns
    appended; ``Cabin`` itself is left in place.
    """
    train_numbers = train['Cabin'].apply(_cabin_number)
    test_numbers = test['Cabin'].apply(_cabin_number)

    # Bin edges come from the training data so both frames share identical bins.
    _, edges = pd.qcut(train_numbers, 3, retbins=True)

    def _bin_indicators(numbers):
        clipped = numbers.clip(lower=edges[0], upper=edges[-1])
        binned = pd.cut(clipped, edges, include_lowest=True)
        return pd.get_dummies(binned, prefix='Cabin_num')

    train = pd.concat((train, _bin_indicators(train_numbers)), axis=1)
    test = pd.concat((test, _bin_indicators(test_numbers)), axis=1)
    return train, test

def getCabinKind(row):
    """Return 1 when the row's ``Cabin`` value equals itself, otherwise 0.

    NaN is the value that fails a self-equality check, so a missing cabin
    stored as NaN yields 0.
    """
    value = row['Cabin']
    return 1 if value == value else 0
    
# cabin_num() must run before cabin(): cabin() deletes the 'Cabin' column
# that cabin_num() reads.
train, test = cabin_num(train, test)
train, test = cabin(train, test)

# Passenger counts per ExistCabin flag, split by Survived.
f, ax = plt.subplots()
# The column is passed as ``x=``: seaborn 0.12+ accepts only ``data`` positionally.
sns.countplot(x='ExistCabin', hue='Survived', data=train, ax=ax)
ax.set_title('Survived over ExistCabin')
# Place the legend outside the axes, to the right of the plot.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
#filling missing embarked data
def embarked_impute(train, test):
    """Replace missing ``Embarked`` values with ``'S'`` in both frames.

    Both frames are modified in place and returned as ``(train, test)``.
    """
    for frame in (train, test):
        embarked = frame['Embarked']
        frame['Embarked'] = embarked.mask(embarked.isnull(), 'S')
    return train, test

train, test = embarked_impute(train, test)
# Fill missing test fares with the mean training fare. The result is assigned
# back to the column: ``fillna(..., inplace=True)`` on a column selection acts
# on an intermediate object and has no effect under pandas copy-on-write.
test['Fare'] = test['Fare'].fillna(train['Fare'].mean())

train.info()
print('_'*40)
test.info()

In [None]:
#adding ticket features
def ticket_grouped(train, test):
    """Replace ``Ticket`` with ``TicketLetter`` and ``TicketLen`` in both frames.

    ``TicketLetter`` is the first character of the ticket when it is one of
    ``1 2 3 S P C A``; ``'LowTicket'`` when it is one of ``W 4 7 6 L 5 8``;
    and ``'OtherTicket'`` otherwise. ``TicketLen`` is the ticket's length.
    The ``Ticket`` column is deleted afterwards. Both frames are modified in
    place and returned as ``(train, test)``.
    """
    kept_as_is = ['1', '2', '3', 'S', 'P', 'C', 'A']
    low_group = ['W', '4', '7', '6', 'L', '5', '8']

    for frame in (train, test):
        tickets = frame['Ticket']
        first_char = tickets.apply(lambda ticket: str(str(ticket)[0]))
        # Label for every ticket whose first character is not kept as-is.
        grouped_label = np.where(first_char.isin(low_group), 'LowTicket', 'OtherTicket')
        frame['TicketLetter'] = np.where(first_char.isin(kept_as_is), first_char, grouped_label)
        frame['TicketLen'] = tickets.apply(len)
        del frame['Ticket']
    return train, test

# Add the ticket features to both frames, then preview the first training rows.
train, test = ticket_grouped(train, test)
train.head()

In [None]:
def dummies(train, test, columns=('Pclass', 'Sex', 'Embarked', 'TicketLetter',
                                  'CabinLetter', 'NameTitle', 'FamilyType')):
    """One-hot encode ``columns`` in both frames, keeping only shared levels.

    Each listed column is converted to strings and expanded into indicator
    columns named ``<column>_<level>``. Only levels that occur in both frames
    are kept, so ``train`` and ``test`` end up with the same indicator columns
    in the same order. The original column is removed.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing every column named in ``columns``.
    columns : iterable of str
        Columns to encode. The default lists the categorical columns this
        notebook builds. The earlier default named columns that never exist
        here, and was a mutable list.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        New frames with the indicator columns appended.
    """
    for column in columns:
        train[column] = train[column].apply(str)
        test[column] = test[column].apply(str)
        # Computed once per column instead of once per training level.
        test_levels = set(test[column].unique())
        good_cols = [column + '_' + level for level in train[column].unique()
                     if level in test_levels]
        train = pd.concat((train, pd.get_dummies(train[column], prefix=column)[good_cols]), axis=1)
        test = pd.concat((test, pd.get_dummies(test[column], prefix=column)[good_cols]), axis=1)
        del train[column]
        del test[column]
    return train, test

# Keep the test PassengerId column: drop() deletes it from the frame, and it is
# needed again when the prediction file is written.
PassengerId_copy = test['PassengerId']

def drop(train, test, bye = ['PassengerId', 'FamilySize', 'IsAlone', 'ExistCabin']):
    """Delete every column named in ``bye`` from both frames.

    Both frames are modified in place and returned as ``(train, test)``.
    A ``KeyError`` is raised if a named column is missing.
    """
    for frame in (train, test):
        for column in bye:
            # pop() removes the column in place; the returned Series is discarded.
            frame.pop(column)
    return train, test

# One-hot encode the categorical columns, then remove the columns listed in drop()'s default.
train, test = dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'TicketLetter', 'CabinLetter', 'NameTitle', 'FamilyType'])
train, test = drop(train, test)

# Summary of the final training columns.
train.info()

# Model

# Baseline random forest with hand-picked hyper-parameters.
# ``min_impurity_split`` is no longer passed: it was removed from scikit-learn
# and passing it raises a TypeError on current releases.
rf_model = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=10,
                                  max_features=3, max_leaf_nodes=None,
                                  min_impurity_decrease=0.0,
                                  min_samples_leaf=1, min_samples_split=4,
                                  min_weight_fraction_leaf=0.0, n_estimators=1500,
                                  n_jobs=None, oob_score=True, random_state=42, verbose=0,
                                  warm_start=False)

# The first column is used as the target and the remaining columns as features.
rf_model.fit(train.iloc[:, 1:], train.iloc[:, 0])
# ``oob_score_`` (trailing underscore) is the fitted out-of-bag accuracy;
# ``oob_score`` is only the constructor flag and would always print 1.0000.
print("Random forest score: " + "%.4f" % rf_model.oob_score_)

# Top 20 features by importance (fitted attribute ``feature_importances_``).
pd.concat((pd.DataFrame(train.iloc[:, 1:].columns, columns = ['variable']),
           pd.DataFrame(rf_model.feature_importances_,
                        columns = ['importance'])), axis = 1).sort_values(by='importance', ascending = False)[:20]

#random forest model
# Hyper-parameter search with 3-fold cross-validated accuracy.
# ``max_features='sqrt'`` is what 'auto' meant for classifiers; 'auto' has been
# removed from scikit-learn and is rejected by current releases.
rf = RandomForestClassifier(max_features='sqrt', oob_score=True, random_state=1, n_jobs=-1)

param_grid = { "criterion" : ["gini", "entropy"], "min_samples_leaf" : [1, 5, 10],
              "min_samples_split" : [2, 4, 10, 12, 16], "n_estimators": [50, 100, 400, 700, 1000]}
gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)
gs = gs.fit(train.iloc[:, 1:], train.iloc[:, 0])

# Show the outcome of the search; otherwise its result is never inspected.
print(gs.best_score_)
print(gs.best_params_)

In [None]:
#fit the model
# ``max_features='sqrt'`` is what 'auto' meant for classifiers; 'auto' has been
# removed from scikit-learn and is rejected by current releases.
rf = RandomForestClassifier(criterion='gini', n_estimators=1500, max_depth=10, min_samples_split=4 ,
                             min_samples_leaf=1, max_features='sqrt', oob_score=True, random_state=1, n_jobs=-1)
# The first column is used as the target and the remaining columns as features.
rf.fit(train.iloc[:, 1:], train.iloc[:, 0])
# Out-of-bag accuracy of the fitted forest.
print("%.4f" % rf.oob_score_)

In [None]:
#variable importance
# Pair each feature column (everything after the first column) with its
# importance in the fitted forest and show the 20 largest.
importance_table = pd.DataFrame({'variable': train.columns[1:],
                                 'importance': rf.feature_importances_})
importance_table.sort_values(by='importance', ascending=False).head(20)

In [None]:
#prediction
# Predict on the processed test frame, attach the saved passenger ids and
# write the result to CSV.
predicted_survival = pd.DataFrame(rf.predict(test), columns=['Survived'])
predictions = pd.concat((PassengerId_copy, predicted_survival), axis=1)
predictions.to_csv('random_forest_model.csv', sep=",", index=False)