# Titanic survival prediction


In [132]:
import numpy as np
import pandas as pd
import re  # String manipulation
# For Data visualization
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
py.init_notebook_mode(connected=True)
#Missing values
import missingno as msno
# Feature Scaling
from sklearn.preprocessing import MinMaxScaler
# Train Test split
from sklearn.model_selection import train_test_split
# Cross validation
from sklearn.model_selection import cross_val_score
# To build models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
# To evaluate models
from sklearn.metrics import roc_auc_score
from sklearn import metrics

%matplotlib inline

In [133]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [134]:
# Keep untouched snapshots of the raw train and test sets.
# `.copy()` is required: a plain assignment only creates a second name for the
# same DataFrame, so the in-place feature engineering applied to `train`/`test`
# in later cells would also change the "original" data.
original_data_train = train.copy()
original_data_test = test.copy()


In [235]:
print(train.columns.tolist())

In [236]:
train.head()

In [237]:
test.head()

In [4]:
train.info()
print('_'*40)
test.info()

In [43]:
print('No of rows and columns in training set ',train.shape)
print('No of rows and columns in testing set ',test.shape)


In [8]:
# Numerical variables description
train.select_dtypes(['number']).describe()

*   **There are 3 discrete variables: Pclass, SibSp, Parch.**
*   **There are 2 continuous variables: Age, Fare.**
*   **There is one binary variable : Survived.(Target variable).**

In [9]:
# Categorical variables description
train.select_dtypes(['object']).describe()

* **There are 2 mixed categorical variables: Cabin, Ticket.**


### Types of variables summary:
* **5 categorical variables: 2 of them are mixed-type variables (numbers and strings)**
* **7 numerical variables: 3 discrete, 2 continuous, 1 Id, and 1 binary target**

# Data Visualization

In [11]:
#g=sns.FacetGrid(train,col='Survived')
#g.map(plt.hist,'Age',bins=20);

fig=ff.create_facet_grid(
                        train,
                        x='Age',
                        #y='Survived',
                        trace_type='histogram',
                        facet_col='Survived',
                        facet_col_labels='name',
                        width=750,
                        height=750,
                        binsize=4
                       )

py.iplot(fig)

* **Infants survived**
* **A large number of young people in the 15-25 age group did not survive**
* **Passengers of age 80 survived**
* **Most passengers are in the 15-40 age group (young and middle-aged)**

In [12]:
fig=ff.create_facet_grid(
                        train,
                        x='Age',
                        #y='Survived',
                        trace_type='histogram',
                        facet_col='Survived',
                        facet_row='Pclass',
                        #scales='free',
                        width=1000,
                        height=1000,
                        facet_col_labels='name',
                        facet_row_labels='name',
                        histnorm='probability',
                        xaxis=dict(title='Age'),
                        
                        #facet_col_labels='name'
                       )

py.iplot(fig)

*  **Most passengers in Pclass=1 survived. Most passengers in Pclass=3 did not survive**
*  **Age distribution is different among the classes**
* **Infants in Pclass=2 & 3 mostly survived**

In [21]:
g=sns.FacetGrid(train,row='Embarked',hue='Sex')
g.map(sns.pointplot,'Pclass','Survived')
g.add_legend();

*  **Females have better survival in ports S,Q.**
*  **Males have better survival in port C.**
*  **Pclass=3 has lower survival among females in ports S,Q**
*  **Survival increases with Pclass among males in port Q, but shows a decreasing trend with Pclass in the other ports.**

In [22]:
grid = sns.FacetGrid(train, row='Embarked', col='Survived')
grid.map(sns.barplot, 'Sex', 'Fare')
grid.add_legend();

*  **Higher fare passengers have better survival**
*  **Survival rates change with port of embarkation.**

In [23]:
# Heat maps/ correlation map
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps this cell working on pandas >= 2.0, where DataFrame.corr() no longer
# silently drops object columns (Name, Sex, Ticket, ...) and raises instead.
numeric_train = train.select_dtypes(['number'])

plt.figure(figsize=(20,20))
plt.title("Correlation of features")
sns.heatmap(numeric_train.corr(),linewidths=0.1,vmax=1.0,linecolor='black',annot=True,square=True);


* **SibSp and Parch have a strong correlation, so either only one of the two features should be used or an engineered feature combining them should be created.**

In [24]:
# Age density per sex (one row each), coloured by survival outcome.
# sns.distplot is deprecated, and FacetGrid.map passed the second column
# ('Survived') to it positionally as `bins`, which was ignored because
# hist=False. A univariate KDE of Age is what was actually being drawn.
grid = sns.FacetGrid(train, row='Sex',hue='Survived')
grid.map(sns.kdeplot, 'Age')
grid.add_legend();

In [25]:
grid=sns.FacetGrid(train)
grid.map(sns.barplot,'Embarked','Survived');


In [26]:
trace=go.Histogram(
             x=train.SibSp,
             y=train.Survived
             
)
layout=go.Layout(
                 title='SibSp Vs Survived',
                xaxis=dict(title='SibSp'),
                yaxis=dict(title='Survived')
)
data=[trace]
fig=go.Figure(data=data,layout=layout)
py.iplot(fig)


In [27]:
trace=go.Histogram(
             x=train.Parch,
             y=train.Survived
             
)
layout=go.Layout(
                 title='Parch Vs Survived',
                xaxis=dict(title='Parch'),
                yaxis=dict(title='Survived')
)
data=[trace]
fig=go.Figure(data=data,layout=layout)
py.iplot(fig)

In [28]:
fig=ff.create_facet_grid(
                        train,
                        x='Embarked',
                        y='Survived',
                        trace_type='histogram',
                        color_name='Sex',
                        #scales='free',
                        #width=1000,
                        #height=1000,
                        histfunc='avg'
                        #xaxis=dict(title='Age'),
 
                        #facet_col_labels='name'
                       )

py.iplot(fig)

In [None]:
# Work on an independent copy: the following cells recode Sex and Embarked to
# integers for plotting, and without `.copy()` those assignments would also
# overwrite the columns of `train` that the feature engineering below expects
# to still hold the original string labels.
train_transformed = train.copy()

In [None]:
train_transformed['Sex']=train_transformed['Sex'].map({'female':0,'male':1})

In [None]:
train_transformed['Embarked']=train_transformed['Embarked'].fillna('S')

In [None]:
train_transformed['Embarked'].isnull().sum()

In [None]:
train_transformed['Embarked']=train_transformed['Embarked'].map({'C':0,'S':1,'Q':2})

In [None]:
train_transformed=train_transformed.drop(['PassengerId','Ticket','Cabin','Name'],axis=1)

In [None]:
train_transformed.head()

In [None]:
train_transformed['New_survived']=train_transformed['Survived'].astype(object).map({0:'Not Survived',1:'Survived'})

fig=ff.create_scatterplotmatrix(train_transformed,diag='histogram',index='New_survived',colormap_type='cat',height=1200, width=1200)
py.iplot(fig)


## Missing values imputation

In [31]:
msno.matrix(train,figsize=(10,10))

In [30]:
msno.matrix(test,figsize=(10,10))

In [239]:
# Training data missing values percentage
train.isnull().mean().sort_values(ascending=False)


* **Three of the variables contain missing data, Age (~20%), Cabin (~77%) and Embarked (< 1%)**

In [240]:
# Testing data missing values percentage
test.isnull().mean().sort_values(ascending=False)

* **Three of the variables contain missing data, Age (~20%), Cabin (~78%) and Fare (< 1%)**

## Outliers

### Outliers in continuous variables

In [6]:
# Visualizing outliers in continuous variables Fare and Age

plt.figure(figsize=(15,6))
plt.subplot(1, 2, 1)
fig =train.boxplot(column='Age')
fig.set_title('')
fig.set_ylabel('Age')

plt.subplot(1, 2, 2)
fig = train.boxplot(column='Fare')
fig.set_title('')
fig.set_ylabel('Fare')

In [7]:
# Plotting the distributions of the variables Fare and Age


plt.figure(figsize=(15,6))
plt.subplot(1, 2, 1)
fig = train.Age.hist(bins=20)
fig.set_ylabel('Number of passengers')
fig.set_xlabel('Age')

plt.subplot(1, 2, 2)
fig = train.Fare.hist(bins=20)
fig.set_ylabel('Number of passengers')
fig.set_xlabel('Fare')

* **The Age variable is near Gaussian, but the Fare variable is right-skewed.**
* **So we will use the normal assumption (mean ± 3 standard deviations) to detect outliers in the Age variable, and the interquartile range to find outliers in the Fare variable.**

In [241]:
# find outliers

# Age
Upper_boundary = train.Age.mean() + 3* train.Age.std()
Lower_boundary = train.Age.mean() - 3* train.Age.std()
print('Age outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_boundary, upperboundary=Upper_boundary))

# Fare
IQR = train.Fare.quantile(0.75) - train.Fare.quantile(0.25)
Lower_bound = train.Fare.quantile(0.25) - (IQR * 3)
Upper_bound = train.Fare.quantile(0.75) + (IQR * 3)
print('Fare outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_bound, upperboundary=Upper_bound))


### Outliers in discrete variables

In [242]:
# outliers in discrete variables: share of passengers holding each value
# (`np.float` was removed in NumPy 1.24; plain division by len() gives the same result)
for var in ['Pclass',  'SibSp', 'Parch']:
    print(train[var].value_counts() / len(train))
    print()

* **We will take percentage values less than 1% as outliers in the above discrete variables.**
* **Pclass doesn't have outliers**
* **SibSp values greater than 4(values 5,8) are outliers**
* **Parch values greater than 2 are outliers**

In [243]:
# Cardinality of Categorical variables
categorical = [var for var in train.columns if train[var].dtype=='O']
for var in categorical:
    print(var, ' contains ', len(train[var].unique()), ' labels')

* **Name, Ticket, Cabin variables should be engineered to extract relevant useful information. They have high cardinality and should be dealt with.**

In [135]:
# Cabin: split the raw cabin code into a numeric part and a letter part.
# - The pattern is a raw string so `\d` is not parsed as a string escape.
# - expand=False makes str.extract return a Series for the single capture
#   group, independent of the pandas version's default for `expand`.
# The same transformation is applied to the train and the test set.
for df in [train, test]:
    # first run of digits in the string, parsed to float (NaN when absent)
    df['Cabin_numerical'] = df['Cabin'].str.extract(r'(\d+)', expand=False).astype('float')
    # first character of the string (the letter of the cabin)
    df['Cabin_categorical'] = df['Cabin'].str[0]

train[['Cabin', 'Cabin_numerical', 'Cabin_categorical']].head()

In [136]:
# drop the original variable
train.drop(labels='Cabin', inplace=True, axis=1)
test.drop(labels='Cabin', inplace=True, axis=1)

In [137]:
#  Ticket
# extract the last bit of ticket as number
train['Ticket_numerical'] = train.Ticket.apply(lambda s: s.split()[-1])
train['Ticket_numerical'] = np.where(train.Ticket_numerical.str.isdigit(), train.Ticket_numerical, np.nan)
train['Ticket_numerical'] = train['Ticket_numerical'].astype('float')

# extract the first part of ticket as category
train['Ticket_categorical'] = train.Ticket.apply(lambda s: s.split()[0])
train['Ticket_categorical'] = np.where(train.Ticket_categorical.str.isdigit(), np.nan, train.Ticket_categorical)

# Same for Test set
test['Ticket_numerical'] = test.Ticket.apply(lambda s: s.split()[-1])
test['Ticket_numerical'] = np.where(test.Ticket_numerical.str.isdigit(), test.Ticket_numerical, np.nan)
test['Ticket_numerical'] = test['Ticket_numerical'].astype('float')

# extract the first part of ticket as category
test['Ticket_categorical'] = test.Ticket.apply(lambda s: s.split()[0])
test['Ticket_categorical'] = np.where(test.Ticket_categorical.str.isdigit(), np.nan, test.Ticket_categorical)

train[['Ticket', 'Ticket_numerical', 'Ticket_categorical']].head()

In [138]:
train.Ticket_categorical.unique()

In [139]:
print("No of labels in Ticket_Categorical variable is ",len(train.Ticket_categorical.unique()))

In [140]:
# remove non letter characters from string as the above labels are similar apart from punctuation marks
text = train.Ticket_categorical.apply(lambda x: re.sub("[^a-zA-Z]", '', str(x)))

# to visualise the output and compare with input
pd.concat([text, train.Ticket_categorical], axis=1).head(100)


In [141]:
text=text.str.upper()
text.unique()

In [143]:
print("No of labels in Ticket_Categorical variable is ",len(text.unique()))

In [144]:
train["Ticket_categorical"]= text


In [145]:
test['Ticket_categorical'] = test.Ticket_categorical.apply(lambda x: re.sub("[^a-zA-Z]", '', str(x)))
test['Ticket_categorical'] = test['Ticket_categorical'].str.upper()

In [146]:
# drop the original Ticket variable
train.drop(labels='Ticket',inplace=True,axis=1)
test.drop(labels='Ticket', inplace=True, axis=1)

In [147]:
#Title=pd.DataFrame()
Title=train.Name.map( lambda name: name.split( ',' )[1].split( '.' )[0].strip() )


In [148]:
Title_Dictionary = {
                    "Capt":       "Officer",
                    "Col":        "Officer",
                    "Major":      "Officer",
                    "Jonkheer":   "Royalty",
                    "Don":        "Royalty",
                    "Sir" :       "Royalty",
                    "Dr":         "Officer",
                    "Rev":        "Officer",
                    "the Countess":"Royalty",
                    "Dona":       "Royalty",
                    "Mme":        "Mrs",
                    "Mlle":       "Miss",
                    "Ms":         "Mrs",
                    "Mr" :        "Mr",
                    "Mrs" :       "Mrs",
                    "Miss" :      "Miss",
                    "Master" :    "Master",
                    "Lady" :      "Royalty"

                    }

In [149]:
Title= Title.map( Title_Dictionary )

In [150]:
Title2=test.Name.map(lambda name: name.split( ',' )[1].split( '.' )[0].strip())

In [151]:
Title2= Title2.map( Title_Dictionary )

In [152]:
# Title column that contains the labels taken from the Name variable
train['Title']=Title
test['Title']=Title2

In [153]:
train.Title.unique()

In [154]:
test.Title.unique()

In [155]:
# dropping the Name variable
train.drop(labels='Name',inplace=True,axis=1)
test.drop(labels='Name',inplace=True,axis=1)

In [156]:
# creating a variable indicating family size (including the passenger)
# sums siblings and parents, plus one for the passenger

train['Family_size'] = train['SibSp']+train['Parch']+1
test['Family_size'] = test['SibSp']+test['Parch']+1

# share of passengers per family size
# (`np.float` was removed in NumPy 1.24; dividing by len() is equivalent)
print(train.Family_size.value_counts() / len(train))

(train.Family_size.value_counts() / len(train)).plot.bar()

* **Family size greater than 7 is considered a rare value.**

In [157]:
train.Age.isnull().sum()

In [366]:
# variable indicating if passenger was a mother

#train['is_mother'] = np.where((train.Sex =='female')&(train.Parch>=1)&(train.Age.fillna(0)>18),1,0)


In [268]:
#test['is_mother'] = np.where((test.Sex =='female')&(test.Parch>=1)&(test.Age>18),1,0)



In [269]:
#train[['Sex', 'Parch', 'Age', 'is_mother']].head()

In [270]:
#print('there were {} mothers in the Titanic'.format(train.is_mother.sum()))

In [271]:
#train.is_mother.head(10)

### New Numerical variables missing values

In [272]:
#train[['Cabin_numerical', 'Ticket_numerical', 'is_mother', 'Family_size']].isnull().mean()

### **New Numerical variables outliers**

In [158]:
# first we plot the distributions to find out if they are Gaussian or skewed.
# Depending on the distribution, we will use the normal assummption or the interquantile
# range to find outliers

plt.figure(figsize=(15,6))
plt.subplot(1, 2, 1)
fig = train.Cabin_numerical.hist(bins=50)
fig.set_ylabel('Number of passengers')
fig.set_xlabel('Cabin number')

plt.subplot(1, 2, 2)
fig = train.Ticket_numerical.hist(bins=50)
fig.set_ylabel('Number of passengers')
fig.set_xlabel('Ticket number')

In [159]:
# let's visualise outliers with the boxplot and whiskers
plt.figure(figsize=(15,6))
plt.subplot(1, 2, 1)
fig = train.boxplot(column='Cabin_numerical')
fig.set_title('')
fig.set_ylabel('Cabin number')

plt.subplot(1, 2, 2)
fig = train.boxplot(column='Ticket_numerical')
fig.set_title('')
fig.set_ylabel('Ticket number')

* **Cabin_numerical doesn't contain outliers.**

In [160]:
# Ticket numerical
# Outlier boundaries for Ticket_numerical: quartiles -/+ 3 * interquartile range
IQR = train.Ticket_numerical.quantile(0.75) - train.Ticket_numerical.quantile(0.25)
Lower_bound = train.Ticket_numerical.quantile(0.25) - (IQR * 3)
Upper_bound = train.Ticket_numerical.quantile(0.75) + (IQR * 3)
print('Ticket number outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_bound, upperboundary=Upper_bound))
# Fraction of all training rows above the upper boundary (missing ticket numbers
# compare as False, so they count in the denominator only). `np.float` was
# removed in NumPy 1.24, hence the plain division.
passengers = len(train[train.Ticket_numerical>Upper_bound]) / len(train)
# The value is a fraction of passengers, not a count, so the message says so.
print('Proportion of passengers with ticket values higher than {upperboundary}: {passengers}'.format(upperboundary=Upper_bound,passengers=passengers))
                                                                                                 

### New categorical variable missing values

In [161]:
train[['Cabin_categorical', 'Ticket_categorical', 'Title']].isnull().mean()

* **The Ticket_categorical and Title variables don't contain missing values.**

### New categorical variable cardinality and rare labels

In [162]:
for var in ['Cabin_categorical', 'Ticket_categorical', 'Title']:
    print(var, ' contains ', len(train[var].unique()), ' labels')

In [163]:
# rare / infrequent labels (less than 1% of passengers)
# (`np.float` was removed in NumPy 1.24; dividing by len() is equivalent)
for var in ['Cabin_categorical', 'Ticket_categorical', 'Title']:
    print(train[var].value_counts() / len(train))
    print()

* **Cabin contains the rare labels G and T: replace by most frequent category**
* **Ticket contains a lot of unfrequent labels: replace by rare**
* **Title does not contain rare labels**

**The number of passengers in the rare cabins is so small that grouping them into a new category called 'Rare' would produce a category that is itself rare. For cabin, I will therefore replace rare labels with the most frequent category.
In Ticket_categorical, on the other hand, the number of infrequent labels is high, so they are grouped into a new label.**

In [164]:
# Let's split the train data into train and validation data

X_train, X_val, y_train, y_val = train_test_split(train, train.Survived, test_size=0.2,
                                                    random_state=0)
X_train.shape, X_val.shape

In [165]:
# let's group again the variables into categorical or numerical
# now considering the newly created variables

def find_categorical_and_numerical_variables(dataframe):
    """Split the columns of ``dataframe`` into categorical and numerical ones.

    A column is treated as categorical when its dtype is ``object`` ('O');
    every other column is treated as numerical.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified. (The previous implementation
        ignored this argument and always inspected the global ``train``.)

    Returns
    -------
    (cat_vars, num_vars) : tuple of list
        Column names in their original order.
    """
    cat_vars = [col for col in dataframe.columns if dataframe[col].dtypes == 'O']
    num_vars = [col for col in dataframe.columns if dataframe[col].dtypes != 'O']
    return cat_vars, num_vars
                 
categorical, numerical = find_categorical_and_numerical_variables(train)

In [166]:
categorical

In [167]:
numerical = [var for var in numerical if var not in ['Survived','PassengerId']]
numerical

## Engineering Missing values in  numerical variables

In [168]:
# print variables with missing data
for col in numerical:
    if X_train[col].isnull().mean()>0:
        print(col, X_train[col].isnull().mean())

* **Age and Ticket_numerical contain < 50% NA: create an additional variable flagging NA + random sample imputation**
* **Cabin_numerical contains > 50% NA: impute NA by value far in the distribution**

In [169]:
# Function to perform random imputation of mising values
def impute_na(X_train, df, variable):
    """Random-sample imputation of the missing values in ``df[variable]``.

    Missing entries are filled with values drawn at random (fixed seed, so the
    result is reproducible) from the observed values of ``X_train[variable]``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame providing the donor values; only non-null values are used.
    df : pandas.DataFrame
        Frame to impute. It is not modified; a copy is made internally.
    variable : str
        Name of the column to impute.

    Returns
    -------
    pandas.Series
        The imputed column, carrying the same index as ``df``.
    """
    # work on a copy so the caller's frame is left untouched
    temp = df.copy()

    missing = temp[variable].isnull()
    n_missing = int(missing.sum())
    if n_missing == 0:
        return temp[variable]

    donors = X_train[variable].dropna()
    # Sampling without replacement fails when more values are missing than
    # donors are available; fall back to sampling with replacement only then,
    # so the usual case draws exactly the same values as before.
    random_sample = donors.sample(n_missing, replace=n_missing > len(donors), random_state=0)

    # pandas aligns on the index when assigning, so the sample must carry the
    # index of the rows it is going to fill
    random_sample.index = temp.index[missing.values]
    temp.loc[missing, variable] = random_sample
    return temp[variable]

In [170]:
# Age and ticket
# add variable indicating missingness (must happen before the values are filled)
for df in [X_train, X_val, test]:
    for var in ['Age', 'Ticket_numerical']:
        df[var+'_NA'] = np.where(df[var].isnull(), 1, 0)

# replace by random sampling from the observed training values
for df in [X_train, X_val, test]:
    for var in ['Age', 'Ticket_numerical']:
        df[var] = impute_na(X_train, df, var)


# Cabin numerical: fill NA with a value far out in the right tail (mean + 3 std
# of the training set).
extreme = X_train.Cabin_numerical.mean() + X_train.Cabin_numerical.std()*3
for df in [X_train, X_val, test]:
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column accessor: the in-place form acts on an intermediate object and is
    # not guaranteed to update `df` (pandas copy-on-write / chained assignment).
    df['Cabin_numerical'] = df['Cabin_numerical'].fillna(extreme)

In [171]:
test.isnull().sum()

In [172]:

# Flag passengers that are female, travel with at least one parent/child
# (Parch >= 1) and are older than 18.
# The comparison result is a boolean Series that shares df's index, so the
# assignment is row-aligned without any manual index handling; this replaces
# the previous row-by-row Python loop and yields the same 0/1 values.
for df in [X_train, X_val, test]:
    is_mother = (df['Sex'] == 'female') & (df['Parch'] >= 1) & (df['Age'] > 18)
    df['is_mother'] = is_mother.astype(int)

## Engineering Missing values in categorical variables

In [173]:
# print variables with missing data
for col in categorical:
    if X_train[col].isnull().mean()>0:
        print(col, X_train[col].isnull().mean())

* **Embarked NA imputed by most frequent category, because NA is low**
* **Cabin_categorical imputed by 'Missing', because NA is high**

In [174]:
# add label indicating 'Missing' to Cabin categorical
# or replace by most frequent label in Embarked

# most frequent port in the training split, computed once and reused for all sets
embarked_mode = X_train['Embarked'].mode()[0]

for df in [X_train, X_val, test]:
    # Assign the filled column back rather than using fillna(inplace=True) on
    # df[...]: the in-place form works on an intermediate object and is not
    # guaranteed to update `df` (pandas copy-on-write / chained assignment).
    df['Embarked'] = df['Embarked'].fillna(embarked_mode)
    df['Cabin_categorical'] = df['Cabin_categorical'].fillna('Missing')

In [175]:
# checking for null values
X_train.isnull().sum()

In [176]:
X_val.isnull().sum()

In [177]:
test.isnull().sum()

In [178]:
# Fare missing value in test set is replaced by median value of the train set X_train
# (assigned back explicitly; fillna(inplace=True) on `test.Fare` acts on an
# intermediate object and is not guaranteed to update `test`)
test['Fare'] = test['Fare'].fillna(X_train.Fare.median())

In [108]:
test.isnull().sum()

## Outliers in Numerical variables 

### From the above analysis of the outliers the following measure are taken to correct the outliers
* **Age: top-coding (73)**
* **Fare: equal frequency binning**
* **Sibsp: top-coding (4)**
* **Parch: top-coding (2)**
* **Family Size: top-coding (7)**
* **Ticket_number: equal frequency binning**

In [179]:
def top_code(df, variable, top):
    """Cap the values of ``df[variable]`` at ``top`` (top-coding).

    Values greater than ``top`` are replaced by ``top``; all other values are
    returned unchanged. The result is a NumPy array, not a Series.
    """
    column = df[variable]
    above_cap = column > top
    return np.where(above_cap, top, column)

for df in [X_train, X_val,test]:
    df['Age'] = top_code(df, 'Age', 73)
    df['SibSp'] = top_code(df, 'SibSp', 4)
    df['Parch'] = top_code(df, 'Parch', 2)
    df['Family_size'] = top_code(df, 'Family_size', 7)

In [180]:
# let's check that it worked
for var in ['Age',  'SibSp', 'Parch', 'Family_size']:
    print(var, ' max value: ', X_train[var].max())

In [181]:
# let's check that it worked
for var in ['Age',  'SibSp', 'Parch', 'Family_size']:
    print(var, ' max value: ', test[var].max())

In [182]:
# Discretizing Fare variables to correct outliers by equal frequency discretization(quantiles)

# find quantiles and discretise train set
X_train['Fare'], bins = pd.qcut(x=X_train['Fare'], q=8, retbins=True, precision=3, duplicates='raise')
X_val['Fare'] = pd.cut(x = X_val['Fare'], bins=bins, include_lowest=True)
test['Fare'] = pd.cut(x = test['Fare'], bins=bins, include_lowest=True)

In [183]:
test.Fare.isnull().sum()

In [184]:
# Share of rows falling into each Fare bin, per data set, to check that the
# bins learnt on the training split are similarly populated in all three sets.
# (`np.float` was removed in NumPy 1.24; dividing by len() is equivalent)
t1 = X_train.groupby(['Fare'])['Fare'].count() / len(X_train)
t2 = X_val.groupby(['Fare'])['Fare'].count() / len(X_val)
t3 = test.groupby(['Fare'])['Fare'].count() / len(test)

temp = pd.concat([t1,t2,t3], axis=1)
temp.columns = ['train', 'val', 'test']
temp.plot.bar(figsize=(12,6))

In [185]:
# Discretizing Ticket_numerical variable to correct outliers by equal frequency discretization(quantiles)
# find quantiles and discretise train set
X_train['Ticket_numerical'], bins = pd.qcut(x=X_train['Ticket_numerical'], q=8, retbins=True, precision=3, duplicates='raise')
X_val['Ticket_numerical'] = pd.cut(x = X_val['Ticket_numerical'], bins=bins, include_lowest=True)
test['Ticket_numerical_temp'] = pd.cut(x = test['Ticket_numerical'], bins=bins, include_lowest=True)

In [186]:
X_val.Ticket_numerical.isnull().sum()

In [187]:
test.Ticket_numerical_temp.isnull().sum()

In [188]:
test[test.Ticket_numerical_temp.isnull()][['Ticket_numerical', 'Ticket_numerical_temp']]

* The value 2 shown above is smaller than the lowest bin edge, which is 2.99, so we will place the above two rows into the bin that starts at 2.99.

In [189]:
# Put the test rows that fell below the lowest bin edge into the lowest bin.
# `.cat.categories[0]` is the first (lowest) interval of the bins learnt on
# the training split; `.unique()[0]` would instead return whichever bin
# happens to appear first in X_train, which is not necessarily the lowest one.
lowest_bin = X_train.Ticket_numerical.cat.categories[0]
test.loc[test.Ticket_numerical_temp.isnull(), 'Ticket_numerical_temp'] = lowest_bin
test.Ticket_numerical_temp.isnull().sum()

In [190]:
test['Ticket_numerical'] = test['Ticket_numerical_temp']
test.drop(labels=['Ticket_numerical_temp'], inplace=True, axis=1)
test.head()

## Engineering rare labels in categorical variables 

In [191]:
# find percent of labels in categorical variables
# (`np.float` was removed in NumPy 1.24; dividing by len() is equivalent)
for var in categorical:
    print(var, X_train[var].value_counts() / len(X_train))
    print()

* Labels with (<1%) are taken as rare labels
* Cabin contains the rare labels G and T: replace by most frequent category
* Ticket contains a lot of unfrequent labels: replace by rare
* The number of passengers in the rare cabins is so small that a new category called 'Rare' would itself be rare. Thus, in cabin, I will replace rare labels by the most frequent category. In Ticket_categorical, the infrequent labels are grouped into a new label.

In [192]:
def rare_imputation(variable, which='rare'):
    """Replace infrequent labels of a categorical column in all three data sets.

    A label is "frequent" when it covers more than 1% of the rows of the
    global ``X_train``. Every other value in ``X_train``, ``X_val`` and
    ``test`` (including labels never seen in training) is replaced in place.

    Parameters
    ----------
    variable : str
        Name of the categorical column to process.
    which : str, default 'rare'
        'frequent' replaces infrequent labels by the most frequent training
        label; any other value replaces them by the new label 'Rare'.

    Returns
    -------
    None
        The global frames ``X_train``, ``X_val`` and ``test`` are modified.
    """
    # share of training rows per label (`np.float` was removed in NumPy 1.24,
    # so the count is divided by len() directly)
    label_share = X_train.groupby([variable])[variable].count() / len(X_train)
    frequent_cat = [x for x in label_share.loc[label_share > 0.01].index.values]

    # Decide the replacement once, before any frame is modified, so the most
    # frequent label is always derived from the untouched training column.
    if which == 'frequent':
        replacement = X_train.groupby(variable)[variable].count().sort_values().tail(1).index.values[0]
    else:
        replacement = 'Rare'

    # same rule for every data set: keep frequent labels, overwrite the rest
    for df in (X_train, X_val, test):
        df[variable] = np.where(df[variable].isin(frequent_cat), df[variable], replacement)

In [193]:
rare_imputation('Cabin_categorical', 'frequent')
rare_imputation('Ticket_categorical', 'rare')

In [194]:
# checking for train data
# (`np.float` was removed in NumPy 1.24; dividing by len() is equivalent)
for var in categorical:
    print(var, X_train[var].value_counts() / len(X_train))
    print()

In [195]:
# checking for test data
# (`np.float` was removed in NumPy 1.24; dividing by len() is equivalent)
for var in categorical:
    print(var, test[var].value_counts() / len(test))
    print()

## Encode categorical variables

* Sex variable is one hot encoded
* Remaining variables are encoded by risk probability

In [196]:
categorical

In [197]:
# One-hot encode Sex, keeping a single indicator column (the first level is
# dropped). get_dummies returns a DataFrame whose dtype depends on the pandas
# version (uint8 before 2.0, bool afterwards), so the single remaining column
# is taken explicitly and cast to int to give a plain 0/1 Series.
for df in [X_train, X_val, test]:
    df['Sex'] = pd.get_dummies(df.Sex, drop_first=True).iloc[:, 0].astype(int)

In [198]:
X_train.Sex.unique()

In [69]:
test.Sex.unique()

In [199]:
X_train.groupby(['Embarked'])['Survived'].mean()


In [200]:
def encode_categorical_variables(var, target):
    """Replace each label of column ``var`` by the mean of ``target`` for that label.

    The label -> mean mapping is computed on the global ``X_train`` only and
    then applied, in place, to ``X_train``, ``X_val`` and ``test``. Labels
    without an entry in the mapping become NaN (``Series.map`` behaviour).
    """
    # label -> mean target value, learnt on the training split
    label_to_mean = X_train.groupby([var])[target].mean().to_dict()

    # apply the same mapping to every data set
    for frame in (X_train, X_val, test):
        frame[var] = frame[var].map(label_to_mean)

# enccode labels in categorical vars
for var in categorical:
    encode_categorical_variables(var, 'Survived')

In [201]:
test.Embarked.head()

In [202]:
X_val.Embarked.isnull().sum()

In [203]:
# parse discretised variables to object before encoding
for df in [X_train, X_val, test]:
    df.Fare = df.Fare.astype('O')
    df.Ticket_numerical = df.Ticket_numerical.astype('O')

In [204]:
# encode labels
for var in ['Fare', 'Ticket_numerical']:
    print(var)
    encode_categorical_variables(var, 'Survived')

In [205]:
test.head()

In [207]:
# Converting the float64 dtypes to float32 for the xgboost inputs.
# The xg_* frames are real copies: a plain assignment (xg_test=test) only
# creates another name for the same DataFrame, so the cast below would also
# silently change the dtypes of X_train / X_val / test.
xg_test = test.copy()
xg_X_train = X_train.copy()
xg_X_val = X_val.copy()
for df in [xg_X_train,xg_X_val,xg_test]:
    for c in df.select_dtypes(include=['float64']).columns:
        df[c] = df[c].astype(np.float32)

In [216]:
xg_test.info()

In [226]:
X_val.info()

## Feature Scaling

In [217]:
# Numerical features on different scales. Names must match the DataFrame
# columns exactly: the column is 'SibSp', not 'Sibsp'.
variables_that_need_scaling = ['Pclass', 'Age', 'SibSp', 'Parch', 'Cabin_numerical', 'Family_size']

In [218]:
training_vars = [var for var in X_train.columns if var not in ['PassengerId', 'Survived']]
training_vars

In [236]:
# fit scaler
scaler = MinMaxScaler() # create an instance
 #  fit  the scaler to the train set and then transform it
scaler.fit(X_train[training_vars])

In [222]:
X_train.Cabin_numerical.head()

In [231]:
# xgBoost model
# `eval_metric` is configured on the estimator: passing it to fit() was
# deprecated in xgboost 1.6 and removed in 2.0.
xgb_model = xgb.XGBClassifier(eval_metric="auc")

# the validation split is monitored during training
eval_set = [(xg_X_val[training_vars], y_val)]
xgb_model.fit(xg_X_train[training_vars], y_train, eval_set=eval_set, verbose=False)

# ROC-AUC on the training split and on the held-out validation split
pred = xgb_model.predict_proba(xg_X_train[training_vars])
print('xgb train roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
pred = xgb_model.predict_proba(xg_X_val[training_vars])
print('xgb test roc-auc: {}'.format(roc_auc_score(y_val, pred[:,1])))

In [232]:
pred.dtype

In [233]:
# Random forests model
# A fixed random_state makes the forest, and therefore the scores and the
# submission built from it, reproducible across kernel restarts.

rf_model = RandomForestClassifier(random_state=0)
rf_model.fit(X_train[training_vars], y_train)

# ROC-AUC on the training split and on the held-out validation split
pred = rf_model.predict_proba(X_train[training_vars])
print('RF train roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
pred = rf_model.predict_proba(X_val[training_vars])
print('RF test roc-auc: {}'.format(roc_auc_score(y_val, pred[:,1])))

In [234]:
# AdaBoost model (fixed random_state so results are reproducible across runs)
ada_model = AdaBoostClassifier(random_state=0)
ada_model.fit(X_train[training_vars], y_train)

# ROC-AUC on the training split and on the held-out validation split
pred = ada_model.predict_proba(X_train[training_vars])
print('Adaboost train roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
pred = ada_model.predict_proba(X_val[training_vars])
print('Adaboost test roc-auc: {}'.format(roc_auc_score(y_val, pred[:,1])))

In [237]:
logit_model = LogisticRegression()
logit_model.fit(scaler.transform(X_train[training_vars]), y_train)

pred = logit_model.predict_proba(scaler.transform(X_train[training_vars]))
print('Logit train roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
pred =logit_model.predict_proba(scaler.transform(X_val[training_vars]))
print('Logit test roc-auc: {}'.format(roc_auc_score(y_val, pred[:,1])))

In [238]:
# Average the validation-set probabilities of the four models.
# Each model has to receive its inputs prepared the same way as at fit time:
# xgboost was fit on the xg_* frames, and the logistic regression was fit on
# MinMax-scaled features, so feeding it the raw columns (as before) produced
# meaningless probabilities that distorted the ensemble.
val_inputs = [
    (xgb_model, xg_X_val[training_vars]),
    (rf_model, X_val[training_vars]),
    (ada_model, X_val[training_vars]),
    (logit_model, scaler.transform(X_val[training_vars])),
]
pred_list = [pd.Series(model.predict_proba(features)[:,1]) for model, features in val_inputs]

final_pred = pd.concat(pred_list, axis=1).mean(axis=1)
print('Ensemble test roc-auc: {}'.format(roc_auc_score(y_val,final_pred)))

In [239]:
fpr, tpr, thresholds = metrics.roc_curve(y_val, final_pred)
thresholds

In [240]:
accuracy_list = []
for thres in thresholds:
    y_pred = np.where(final_pred>thres,1,0)
    accuracy_list.append(metrics.accuracy_score(y_val, y_pred, normalize=True))
    
accuracy_list = pd.concat([pd.Series(thresholds), pd.Series(accuracy_list)],
                        axis=1)
accuracy_list.columns = ['thresholds', 'accuracy']
accuracy_list.sort_values(by='accuracy', ascending=False, inplace=True)


In [241]:
accuracy_list.head()

# Submission to Kaggle

In [244]:
pred_list = []


In [512]:
pred_list.append(pd.Series(xgb_model.predict_proba(xg_test[training_vars])[:,1]))

In [258]:
xg_test.head()

In [264]:
xg_test.info()

In [261]:
#xgb_model.predict_proba(xg_test[training_vars])[:,1]

In [None]:
 pred_list.append(pd.Series(rf_model.predict_proba(test[training_vars])[:,1]))

In [None]:
 pred_list.append(pd.Series(ada_model.predict_proba(test[training_vars])[:,1]))

In [None]:
 # the logistic regression was fit on MinMax-scaled features, so the test
 # features must go through the same fitted scaler before predicting
 pred_list.append(pd.Series(logit_model.predict_proba(scaler.transform(test[training_vars]))[:,1]))

In [246]:
#for model in [rf_model, ada_model, logit_model]:
    #pred_list.append(pd.Series(model.predict_proba(test[training_vars])[:,1]))

final_pred = pd.concat(pred_list, axis=1).mean(axis=1)

In [247]:
accuracy_list.iloc[0,0]

In [248]:
# Turn the averaged probabilities into 0/1 labels: 1 when the probability is
# strictly above the threshold in the first row of `accuracy_list`, else 0.
list_pred = [
    1 if probability > accuracy_list.iloc[0,0] else 0
    for probability in final_pred
]


In [249]:
list_pred=pd.Series(list_pred)


In [250]:
list_pred.index=final_pred.index


In [251]:
final_pred = list_pred

In [252]:
final_pred.head()

In [253]:
temp = pd.concat([test.PassengerId, final_pred], axis=1)
temp.columns = ['PassengerId', 'Survived']
temp.head()

In [254]:
temp.to_csv('submission.csv', index=False)

## Feature Importance 

In [255]:
importance = pd.Series(rf_model.feature_importances_)
importance.index = training_vars
importance.sort_values(inplace=True, ascending=False)
importance.plot.bar(figsize=(12,6))

In [256]:
importance = pd.Series(xgb_model.feature_importances_)
importance.index = training_vars
importance.sort_values(inplace=True, ascending=False)
importance.plot.bar(figsize=(12,6))

In [257]:
importance = pd.Series(np.abs(logit_model.coef_.ravel()))
importance.index = training_vars
importance.sort_values(inplace=True, ascending=False)
importance.plot.bar(figsize=(12,6))

##  **Second level of ensemble models should be built based on the above feature importances**