In [15]:
import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features
print("pandas version: {}". format(pd.__version__))

import matplotlib #collection of functions for scientific and publication-ready visualization
print("matplotlib version: {}". format(matplotlib.__version__))

import numpy as np #foundational package for scientific computing
print("NumPy version: {}". format(np.__version__))

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD,Adam
import keras.backend as K




pandas version: 0.23.0
matplotlib version: 2.2.2
NumPy version: 1.14.3


# Load Data Modelling Libraries

We will use the popular *scikit-learn* library to develop our machine learning algorithms. In *sklearn,* algorithms are called Estimators and implemented in their own classes. For data visualization, we will use the *matplotlib* and *seaborn* library. Below are common classes to load.

In [16]:
#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.tools.plotting import scatter_matrix

#Configure Visualization Defaults
# %matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8

# Cleaning the data .... 


In [17]:
#import data from file: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
df_train = pd.read_csv('train.csv')


#a dataset should be broken into 3 splits: train, test, and (final) validation
#the test file provided is the validation file for competition submission
#we will split the train set into train and test data in future sections
df_test  = pd.read_csv('test.csv')



#however passing by reference is convenient, because we can clean both datasets at once
data = pd.concat([df_train, df_test],ignore_index=True)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  del sys.path[0]


In [18]:
print(df_train.isnull().sum(),df_test.isnull().sum(),data.isnull().sum())


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64 Age             263
Cabin          1014
Embarked          2
Fare              1
Name              0
Parch             0
PassengerId       0
Pclass            0
Sex               0
SibSp             0
Survived        418
Ticket            0
dtype: int64


# Adding title column and digitizing Embarked, Pclass

In [32]:
#quick and dirty code split title from name: http://www.pythonforbeginners.com/dictionary/python-split


# Title bussiness
data.loc[1305,"Name"] = "Oliva y Ocana, Mrs. Dona. Fermina" # Missing ttitle for Mrs Oliva
data['Title'] = data['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]


# Fill age based on title
for title in  data.groupby(['Title']).groups.keys():
    age_to_impute = data.groupby('Title')['Age'].median()[title]
    data.loc[(data['Age'].isnull()) & (data['Title'] == title), 'Age'] = age_to_impute

# #Unify common titles. 
# data["Title"] = data["Title"].replace('Mlle', 'Miss')
# data["Title"] = data["Title"].replace('Master', 'Master')
# data["Title"] = data["Title"].replace(['Mme', 'Dona', 'Ms'], 'Mrs')
# data["Title"] = data["Title"].replace(['Jonkheer','Don'],'Mr')
# data["Title"] = data["Title"].replace(['Capt','Major', 'Col','Rev'], 'Millitary')
# data["Title"] = data["Title"].replace(['Lady', 'the Countess', 'Countess','Sir'], 'Honor')

# # convert Title categories to Columns
# titledummies=pd.get_dummies(data[['Title']], prefix_sep='_') #Title
# data = pd.concat([data, titledummies], axis=1) 

# print('Title categories added')

# Repalcing titles by mean of surviving
order_list = data[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()
title  = order_list.Title.values
surv_mean = order_list.Survived.values
title_mapping = dict(zip(title,surv_mean))
data.loc[:,"Title"] = data["Title"].map(title_mapping)
print('Title categories added')


# Repalcing Embarked by mean of surviving

order_list = data[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()
title  = order_list.Embarked.values
surv_mean = order_list.Survived.values
title_mapping = dict(zip(title,surv_mean))
data.loc[:,"Embarked"] = data["Embarked"].map(title_mapping)
data.loc[data['Embarked'].isnull()==True,"Embarked"] = data["Embarked"].median()

print('Embarked categories Digitized')

# Sex digitalize
data.loc[:,"Sex"] = data["Sex"].replace(['male'], 1)
data.loc[:,"Sex"] = data["Sex"].replace(['female'], 0)

#Discrete variables
data['Family_Size'] = data ['SibSp'] + data['Parch'] + 1

data['IsAlone'] = 1 #initialize to yes/1 is alone
data.loc[data['Family_Size'] > 1,"IsAlone"] = 0 # now update to no/b0 if family size is greater than 1






# Repalcing Pclass by mean of surviving
order_list = data[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean()
title  = order_list.Pclass.values
surv_mean = order_list.Survived.values
title_mapping = dict(zip(title,surv_mean))
data.loc[:,"Pclass"] = data["Pclass"].map(title_mapping)
print('Pclass categories digitized')


data.loc[data['Fare'].isnull()==True,"Fare"] = data["Fare"].median()

# # convert Pclass categories to Columns
# titledummies =pd.get_dummies(data['Pclass'])
# titledummies.rename(columns={1:'Pclass_1',2:'Pclass_2',3:'Pclass_3'}, inplace=True)
# data = pd.concat([data, titledummies], axis=1) 




# # data = data.drop(["Cabin","Name","Ticket","Title"],axis =1)



Title categories added
Embarked categories Digitized
Pclass categories digitized


In [20]:
# Finding columns with missing values
data.isnull().sum()

Age               0
Cabin          1014
Embarked          0
Fare              0
Name              0
Parch             0
PassengerId       0
Pclass            0
Sex               0
SibSp             0
Survived        418
Ticket            0
Title             0
Family_Size       0
IsAlone           0
dtype: int64

## Family Survival

This is based on code taken from from https://www.kaggle.com/shunjiangxu/blood-is-thicker-than-water-friendship-forever

In [22]:
# get last name
data["Last_Name"] = data['Name'].apply(lambda x: str.split(x, ",")[0])
# Set survival value
DEFAULT_SURVIVAL_VALUE = 0.5
data["Family_Survival"] = DEFAULT_SURVIVAL_VALUE

# Find Family groups by Fare
for grp, grp_df in data[['Survived','Name', 'Last_Name', 'Fare', 'Ticket', 'PassengerId',
                           'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Last_Name', 'Fare']):
    
    if (len(grp_df) != 1):
        # A Family group is found.
        for ind, row in grp_df.iterrows():
            smax = grp_df.drop(ind)['Survived'].max()
            smin = grp_df.drop(ind)['Survived'].min()
            passID = row['PassengerId']
            if (smax == 1.0):
                data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin==0.0):
                data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 0

print("Number of passengers with family survival information:", 
      data.loc[data['Family_Survival']!=0.5].shape[0])

# Find Family groups by Ticket
for _, grp_df in data.groupby('Ticket'):
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            if (row['Family_Survival'] == 0) | (row['Family_Survival']== 0.5):
                smax = grp_df.drop(ind)['Survived'].max()
                smin = grp_df.drop(ind)['Survived'].min()
                passID = row['PassengerId']
                if (smax == 1.0):
                    data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin==0.0):
                    data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 0
                        
print("Number of passenger with family/group survival information: " 
      +str(data[data['Family_Survival']!=0.5].shape[0]))

# # Family_Survival in df_train and df_test:
# df_train["Family_Survival"] = data['Family_Survival'][:891]
# df_test["Family_Survival"] = data['Family_Survival'][891:]

Number of passengers with family survival information: 420
Number of passenger with family/group survival information: 546


In [24]:
# Survival expectation value by sex and pclass
DEFAULT_SURVIVAL_VALUE = 0.5
data["Sx_Pa_Survival"] = DEFAULT_SURVIVAL_VALUE

df_train = data[:891]

list_groups = df_train.groupby(["Parch","Sex"])["Sx_Pa_Survival"].count().values


surv_list_groups = df_train.groupby(["Parch","Sex","Survived"])["Sx_Pa_Survival"].count().values


surv = surv_list_groups[1:14:2]

surv_rate1 = surv/list_groups[:7]



for grp in data.groupby(["Parch","Sex"]): 
#     print(grp[0])
    if grp[0] == (1,0):  data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = surv_rate1[0]
    if grp[0] == (0,1):  data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = surv_rate1[1]
    if grp[0] == (1,0):  data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = surv_rate1[2]
    if grp[0] == (1,1):  data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = surv_rate1[3]
    if grp[0] == (2,0):  data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = surv_rate1[4]
    if grp[0] == (2,1):  data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = surv_rate1[5]  
    if grp[0] == (3,0):  data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = surv_rate1[6] 
    if grp[0] == (3,1):  data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = 0
    if grp[0] == (4,0):  data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = 0
    if grp[0] == (4,1):  data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = 0    
    if grp[0] == (5,0):  data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = .25
    if grp[0] == (5,1):  data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = 0
    if grp[0] == (6,0):  data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = 0
    if grp[0] == (6,1):  data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = .5
        


In [25]:
# Survival expectation value by sex and pclass
DEFAULT_SURVIVAL_VALUE = 0.5
data["Sx_Si_Survival"] = DEFAULT_SURVIVAL_VALUE

df_train = data[:891]

list_groups = df_train.groupby(["SibSp","Sex"])["Sx_Si_Survival"].count().values



surv_list_groups = df_train.groupby(["SibSp","Sex","Survived"])["Sx_Pa_Survival"].count().values



surv = surv_list_groups[14::2]


for grp in data.groupby(["SibSp","Sex"]): 
#     print(grp[0])
    if grp[0] == (1,0):  data.loc[grp[1].index.values,["Sx_Si_Survival"]] = surv_rate1[0]
    if grp[0] == (0,1):  data.loc[grp[1].index.values,["Sx_Si_Survival"]] = surv_rate1[1]
    if grp[0] == (1,0):  data.loc[grp[1].index.values,["Sx_Si_Survival"]] = surv_rate1[2]
    if grp[0] == (1,1):  data.loc[grp[1].index.values,["Sx_Si_Survival"]] = surv_rate1[3]
    if grp[0] == (2,0):  data.loc[grp[1].index.values,["Sx_Si_Survival"]] = surv_rate1[4]
    if grp[0] == (2,1):  data.loc[grp[1].index.values,["Sx_Si_Survival"]] = surv_rate1[5]  
    if grp[0] == (3,0):  data.loc[grp[1].index.values,["Sx_Si_Survival"]] = surv_rate1[6] 
    if grp[0] == (3,1):  data.loc[grp[1].index.values,["Sx_Si_Survival"]] = 0
    if grp[0] == (4,0):  data.loc[grp[1].index.values,["Sx_Si_Survival"]] =       2/6
    if grp[0] == (4,1):  data.loc[grp[1].index.values,["Sx_Si_Survival"]] =    1/12
    if grp[0] == (5,0):  data.loc[grp[1].index.values,["Sx_Si_Survival"]] = 0
    if grp[0] == (5,1):  data.loc[grp[1].index.values,["Sx_Si_Survival"]] = 0
    if grp[0] == (8,0):  data.loc[grp[1].index.values,["Sx_Si_Survival"]] = .5
    if grp[0] == (8,1):  data.loc[grp[1].index.values,["Sx_Si_Survival"]] = .5

In [26]:

NUMERIC_COLUMNS=['Alone','Family Size','Sex','Pclass','Fare','FareBand','Age','TitleCat','Embarked'] #72
ORIGINAL_NUMERIC_COLUMNS=['Pclass','Age','SibSp','Parch','Sex','Title_Master', 'Title_Miss','Title_Mr', 'Title_Mrs', 'Title_Millitary','Embarked'] #83
REVISED_NUMERIC_COLUMNS=[ 'Fare','Sex','Pclass','Age','Parch','Title','IsAlone','Embarked','Family_Size'] #84
# 'Sx_Cl_Survival','Sx_Em_Survival',"Sx_Si_Survival","Sx_Pa_Survival",'Title_Dr','Title_Master', 'Title_Miss','Title_Mr', 'Title_Mrs', 'Title_Millitary',
#'Family_Size', 

data[REVISED_NUMERIC_COLUMNS] = data[REVISED_NUMERIC_COLUMNS].values / np.max(data[REVISED_NUMERIC_COLUMNS].values,axis=0)
# data[REVISED_NUMERIC_COLUMNS] = normalize(data[REVISED_NUMERIC_COLUMNS].values)
# for x in REVISED_NUMERIC_COLUMNS:
#     data[x] = data[x]/np.max(data[x].values)


#'
df_train = data[:891]
df_test = data[891:]


# create test and training data
data_to_train = df_train[REVISED_NUMERIC_COLUMNS].fillna(-1000)
y=df_train['Survived'].values
X=data_to_train.values
X_train, X_test, Y_train, Y_test =  model_selection.train_test_split(X, y, test_size=0.3,random_state=1, stratify=y)


In [27]:
data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title,Family_Size,IsAlone,Last_Name,Family_Survival,Sx_Cl_Survival,Sx_Pa_Survival,Sx_Si_Survival
0,0.275,,0.608696,0.014151,"Braund, Mr. Owen Harris",0.0,1,0.384929,1.0,1,0.0,A/5 21171,0.156673,0.181818,0.0,Braund,0.5,0.5,0.165289,0.327586
1,0.475,C85,1.0,0.139136,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0.0,2,1.0,0.0,1,1.0,PC 17599,0.792,0.181818,0.0,Cumings,0.5,0.5,0.5,0.766667
2,0.325,,0.608696,0.015469,"Heikkinen, Miss. Laina",0.0,3,0.384929,0.0,0,1.0,STON/O2. 3101282,0.697802,0.090909,1.0,Heikkinen,0.5,0.5,0.5,0.5
3,0.4375,C123,0.608696,0.103644,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0.0,4,1.0,0.0,1,1.0,113803,0.792,0.181818,0.0,Futrelle,0.0,0.5,0.5,0.766667
4,0.4375,,0.608696,0.015713,"Allen, Mr. William Henry",0.0,5,0.384929,1.0,0,0.0,373450,0.156673,0.090909,1.0,Allen,0.5,0.5,0.165289,0.165289


In [31]:
df_train.to_csv("train_new.csv")
df_test.to_csv("test_new.csv")