In [None]:
# by Grossmend, 2018

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## <b><font color='#3C89F9'>1. Data preparation</font></b>

In [None]:
# Read the train and test passenger tables.
data = pd.read_csv('/kaggle/input/train.csv')
test_data = pd.read_csv('/kaggle/input/test.csv')

# Stack both tables row-wise under a fresh 0..n-1 index, so that every
# transformation applied to all_data covers train and test rows alike.
all_data = pd.concat(
    [data, test_data],
    axis=0,
    sort=False,
    ignore_index=True,
)

# Preview the first 10 rows of the combined table.
all_data.head(10)

In [None]:
# description of columns:

# Survived - Survival (0 = No; 1 = Yes)
# Pclass - Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
# Name - Name
# Sex - Sex
# Age - Age
# SibSp - Number of Siblings/Spouses Aboard
# Parch - Number of Parents/Children Aboard
# Ticket - Ticket Number
# Fare - Passenger Fare
# Cabin - Cabin (Number)
# Embarked - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)

In [None]:
# Show column dtypes and non-null counts of the combined table, then the
# pandas version the notebook is running on.
all_data.info()
print(pd.__version__)

In [None]:
# Count missing (null) values per column.
print('Empty values:')
all_data.isnull().sum()

In [None]:
# Add a constant "Counter" column (always 1); summing it inside a groupby
# gives the number of rows per group (see the Cabin/Embarked/Ticket tables).
all_data['Counter'] = 1

### <b><font color='green'>"Name"</font> field processing</b>

In [None]:
# select 'Title' from field 'Name'

def title_parser(name):
    """Reduce a raw passenger name to a coarse title category.

    The title is the text between the first comma and the following period,
    e.g. ``'Braund, Mr. Owen Harris'`` -> ``'Mr'``.

    Parameters
    ----------
    name : object
        Raw value of the "Name" column.

    Returns
    -------
    object
        ``'Miss'``, ``'Mrs'``, ``'Master'``, ``'Mr'`` or ``'Other'`` for a
        parsable name; ``'error_parse'`` when the name contains no comma;
        the input itself when it is not a string or consists of one token.
    """
    # Missing values (NaN) and any other non-string are passed through.
    if not isinstance(name, str):
        return name

    # A one-token value is returned as is (presumably so that re-running the
    # cell on already-parsed titles leaves them untouched).
    if len(name.split()) == 1:
        return name

    try:
        raw_title = name.split(',')[1].split('.')[0].strip()
    except IndexError:
        # No comma in the name: there is nothing after it to read a title from.
        return 'error_parse'

    title_groups = {
        'Mlle': 'Miss',
        'Miss': 'Miss',
        'Mme': 'Mrs',
        'Lady': 'Mrs',
        'Ms': 'Mrs',
        'Mrs': 'Mrs',
        'Master': 'Master',
        'Mr': 'Mr',
        'error_parse': 'error_parse',
    }
    # Every title that is not listed above falls into the catch-all group.
    return title_groups.get(raw_title, 'Other')


# Collapse every raw name into its title category.
all_data['Name'] = all_data['Name'].map(title_parser)

# Frequency of each title category after parsing.
all_data['Name'].value_counts()

In [None]:
# normalize between 0 and 1 field 'Name'

def name_to_number(name):
    """Encode a title category as a float code.

    Parameters
    ----------
    name : object
        Title category such as ``'Mr'`` or ``'Miss'``.

    Returns
    -------
    object
        ``0.0`` (Mr), ``1.0`` (Miss), ``2.0`` (Mrs), ``3.0`` (Master),
        ``4.0`` (Other), ``-1.0`` for any other string; a non-string input
        is returned unchanged.
    """
    if not isinstance(name, str):
        return name

    title_codes = {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Other': 4}
    # Unknown strings get the sentinel code -1.
    return float(title_codes.get(name, -1))

# Replace each title category with its numeric code.
all_data['Name'] = all_data['Name'].map(name_to_number)

# Rescale the codes to the [0, 1] range and show the resulting frequencies.
scaler = MinMaxScaler().fit(all_data[['Name']])
all_data['Name'] = scaler.transform(all_data[['Name']])
all_data['Name'].value_counts()

### <b><font color='green'>"Sex"</font> field processing</b>

In [None]:
# Mean of "Survived" for each value of "Sex".
print(all_data.groupby('Sex', as_index=False)['Survived'].mean())

In [None]:
# Count how often each "Survived" value occurs within every (Pclass, Sex) group.

all_data.groupby(["Pclass", "Sex"])["Survived"].value_counts()

In [None]:
# normalize between 0 and 1 field 'Sex'

def sex_to_number(name):
    """Encode a sex label as a float code.

    Parameters
    ----------
    name : object
        Value of the "Sex" column.

    Returns
    -------
    object
        ``0.0`` for ``'male'``, ``1.0`` for ``'female'``, ``-1.0`` for any
        other string; a non-string input is returned unchanged.
    """
    if not isinstance(name, str):
        return name

    sex_codes = {'male': 0, 'female': 1}
    # Unknown strings get the sentinel code -1.
    return float(sex_codes.get(name, -1))

# Replace each sex label with its numeric code.
all_data['Sex'] = all_data['Sex'].map(sex_to_number)

# Rescale the codes to the [0, 1] range and show the resulting frequencies.
scaler = MinMaxScaler().fit(all_data[['Sex']])
all_data['Sex'] = scaler.transform(all_data[['Sex']])
all_data['Sex'].value_counts()

### <b><font color='green'>"Family"</font> create field and processing</b>

In [None]:
# Show the dtypes of the two columns the family size is built from.
print(all_data['SibSp'].dtype)
print(all_data['Parch'].dtype)

# Family size = siblings/spouses + parents/children + the passenger.
# The column is intentionally kept as an integer dtype: family_agr() only
# groups integer values and returns anything else unchanged.
all_data['Family'] = all_data['SibSp'].astype(int) + all_data['Parch'] + 1

# Survival rate and passenger count for each family size.
df_family = all_data[['Family', 'Survived', 'PassengerId']].copy()
df_family.groupby('Family', as_index=False).agg({'Survived': 'mean', 'PassengerId': 'count'}).rename(columns={'PassengerId': 'Count'})

In [None]:
def family_agr(family_count):
    """Bucket a family size into a coarse group code.

    Parameters
    ----------
    family_count : object
        Family size as a Python ``int`` or a NumPy integer scalar.

    Returns
    -------
    object
        ``1.0`` for size 1, ``2.0`` for sizes 2-3, ``3.0`` for size 4,
        ``4.0`` for sizes 5-7, ``5.0`` for sizes 8 and 11, ``6.0`` for any
        other integer. A non-integer input (float, NaN, string, ...) is
        returned unchanged.
    """
    # NumPy integer scalars are not instances of Python int, so they are
    # listed explicitly; otherwise they would slip through ungrouped.
    if not isinstance(family_count, (int, np.integer)):
        return family_count

    if family_count == 1:
        family_group = 1
    elif family_count in (2, 3):
        family_group = 2
    elif family_count == 4:
        family_group = 3
    elif family_count in (5, 6, 7):
        family_group = 4
    elif family_count in (8, 11):
        family_group = 5
    else:
        family_group = 6

    return float(family_group)
    
    
# Replace each family size with its group code, then show the survival rate
# and the passenger count per group.
all_data['Family'] = all_data['Family'].map(family_agr)
(
    all_data
    .groupby('Family', as_index=False)
    .agg({'Survived': 'mean', 'PassengerId': 'count'})
    .rename(columns={'PassengerId': 'Count'})
)

In [None]:
# Rescale the family group codes to the [0, 1] range and show the
# resulting frequencies.
scaler = MinMaxScaler()
all_data['Family'] = scaler.fit_transform(all_data[['Family']])
all_data['Family'].value_counts()

### <b><font color='green'>"Fare"</font> field processing</b>

In [None]:
# Count missing values in "Fare".
print('Count empty "Fare":', all_data['Fare'].isnull().sum())

In [None]:
# Fill missing fares with the mean fare of the passenger's "Pclass" group,
# then re-count the missing values.
all_data['Fare'] = all_data['Fare'].fillna(
    all_data['Fare'].groupby(all_data['Pclass']).transform('mean')
)
print('Count empty "Fare":', all_data['Fare'].isnull().sum())

In [None]:
# Some rows have a fare of exactly 0.
print('Zero counts in field "Fare"', all_data[all_data['Fare'] == 0].shape[0])

# Replace zero fares with the mean fare of the passenger's "Pclass" group.
# Series.replace(0, <Series>) is not a supported row-aligned substitution
# (newer pandas releases raise on it); mask() puts the index-aligned
# replacement wherever the condition holds and leaves other rows as they are.
class_mean_fare = all_data.groupby('Pclass')['Fare'].transform('mean')
all_data['Fare'] = all_data['Fare'].mask(all_data['Fare'] == 0, class_mean_fare)

print('Count zero "Fare":', all_data['Fare'][all_data['Fare']==0].count())

In [None]:
# Plot the "Fare" values in ascending order (x axis = rank after sorting).
plt.plot(all_data['Fare'].sort_values().reset_index(drop=True));
plt.title('"Fare" field sort values')

In [None]:
# Convert "Fare" to a categorical code: cut it into 10 bins and keep the
# integer bin index (labels=False), stored as float.
all_data['Fare'] = pd.cut(all_data['Fare'], bins=10, labels=False).astype('float')

In [None]:
# Rescale the "Fare" bin index to the [0, 1] range and show the
# resulting frequencies.
scaler = MinMaxScaler()
all_data['Fare'] = scaler.fit_transform(all_data[['Fare']])
all_data['Fare'].value_counts()

### <b><font color='green'>"Age"</font> field processing</b>

In [None]:
# Number of missing "Age" values, and their share of all rows in percent.
print('Count empty "Age" field:', all_data['Age'].isnull().sum())
print('Percentage empty "Age" field:', round(all_data['Age'].isnull().sum() / all_data.shape[0] * 100, 2), '%')

In [None]:
# Kernel-density estimate of "Age", one curve per passenger class.
for _pclass in (1, 2, 3):
    all_data.loc[all_data['Pclass'] == _pclass, 'Age'].plot(kind='kde')
plt.title("Distribution 'Age' by 'Class'");
plt.legend(('1st class', '2nd class','3rd class'),loc='best');

In [None]:
# Fill missing "Age" values with the mean age of the rows that share the
# same "Name" value (the title code produced by the "Name" processing above).
all_data['Age'] = all_data['Age'].fillna(all_data.groupby('Name')['Age'].transform('mean'))

In [None]:
# Convert "Age" to a categorical code: cut it into 10 bins and keep the
# integer bin index (labels=False), stored as float.
all_data['Age'] = pd.cut(all_data['Age'], bins=10, labels=False).astype('float')

In [None]:
# Rescale the "Age" bin index to the [0, 1] range and show the
# resulting frequencies.
scaler = MinMaxScaler()
all_data['Age'] = scaler.fit_transform(all_data[['Age']])
all_data['Age'].value_counts()

### <b><font color='green'>"Cabin"</font> field processing</b>

In [None]:
# Mark rows without a "Cabin" value with the placeholder 'Z'.
# The result is assigned back to the column: fillna(inplace=True) on a
# column selection acts on an intermediate object, which pandas warns about
# and which does not update all_data under Copy-on-Write.
all_data['Cabin'] = all_data['Cabin'].fillna('Z')

In [None]:
# Keep only the first character of each "Cabin" string.
# The dtype guard skips this step when the column is already numeric
# (it is converted to numbers a few cells further down).
if not np.issubdtype(all_data['Cabin'].dtype, np.number):
    all_data['Cabin'] = all_data['Cabin'].map(lambda x : x[0])

In [None]:
# Survival rate and row count per "Cabin" letter, most frequent first.
# After reset_index() the row position (0 = most frequent) is what the next
# cells use as the numeric code of each letter.
# The columns are selected with a list: indexing a groupby with a bare tuple
# of names is rejected by current pandas.
cabin = (
    all_data
    .groupby('Cabin')[['Survived', 'Counter']]
    .agg({'Survived': 'mean', 'Counter': 'sum'})
    .sort_values(by='Counter', ascending=False)
    .reset_index()
)
cabin

In [None]:
# Replace each "Cabin" letter with its row position in the `cabin` table
# (as float). Skipped when the column is already numeric.
if not np.issubdtype(all_data['Cabin'].dtype, np.number):
    all_data['Cabin'] = all_data['Cabin'].map(dict(zip(cabin['Cabin'].values, cabin.index.values))).astype(float)

In [None]:
# Rescale the "Cabin" codes to the [0, 1] range and show the
# resulting frequencies.
scaler = MinMaxScaler()
all_data['Cabin'] = scaler.fit_transform(all_data[['Cabin']])
all_data['Cabin'].value_counts()

### <b><font color='green'>"isAlone"</font> add field</b>

In [None]:
# Flag passengers travelling with no relatives aboard (1 = alone, 0 = not).
# The flag is derived from the raw SibSp/Parch counts: at this point
# "Family" holds min-max scaled group codes, so comparing it with 1 would
# select the highest family-size group instead of solo travellers.
all_data['isAlone'] = ((all_data['SibSp'] + all_data['Parch']) == 0).astype(int)

In [None]:
# Preview the first 10 rows after the transformations applied so far.
all_data.head(10)

### <b><font color='green'>"Embarked"</font> field processing</b>

In [None]:
# Frequency of each "Embarked" value (value_counts() leaves out missing
# values by default).
all_data['Embarked'].value_counts()

In [None]:
# Mark rows without an "Embarked" value with the placeholder 'N'.
# The result is assigned back to the column: fillna(inplace=True) on a
# column selection acts on an intermediate object, which pandas warns about
# and which does not update all_data under Copy-on-Write.
all_data['Embarked'] = all_data['Embarked'].fillna('N')

In [None]:
# Survival rate and row count per "Embarked" value, most frequent first.
# After reset_index() the row position (0 = most frequent) is what the next
# cell uses as the numeric code of each port.
# The columns are selected with a list: indexing a groupby with a bare tuple
# of names is rejected by current pandas.
embarked = (
    all_data
    .groupby('Embarked')[['Survived', 'Counter']]
    .agg({'Survived': 'mean', 'Counter': 'sum'})
    .sort_values(by='Counter', ascending=False)
    .reset_index()
)
embarked

In [None]:
# Replace each "Embarked" value with its row position in the `embarked`
# table (as float). Skipped when the column is already numeric.
if not np.issubdtype(all_data['Embarked'].dtype, np.number):
    all_data['Embarked'] = all_data['Embarked'].map(dict(zip(embarked['Embarked'].values, embarked.index.values))).astype(float)

In [None]:
# Rescale the "Embarked" codes to the [0, 1] range and show the
# resulting frequencies.
scaler = MinMaxScaler()
all_data['Embarked'] = scaler.fit_transform(all_data[['Embarked']])
all_data['Embarked'].value_counts()

### <b><font color='green'>"Pclass"</font> field processing</b>

In [None]:
# Rescale "Pclass" (cast to float first) to the [0, 1] range and show the
# resulting frequencies.
scaler = MinMaxScaler()
all_data['Pclass'] = scaler.fit_transform(all_data[['Pclass']].astype(float))
all_data['Pclass'].value_counts()

### <b><font color='green'>"Ticket"</font> field processing</b>

In [None]:

# Encode each ticket by the length of its string (non-strings are kept as is).
all_data['Ticket'] = all_data['Ticket'].apply(lambda x: len(x) if isinstance(x, str) else x)

# Cut the ticket length into 10 bins and keep the bin index as float.
# The source column has to be "Ticket": binning "Fare" here would throw
# away the length computed on the previous line.
all_data['Ticket'] = pd.cut(all_data['Ticket'], bins=10, labels=False).astype('float')

# Rescale the bin index to the [0, 1] range.
scaler = MinMaxScaler()
all_data['Ticket'] = scaler.fit_transform(all_data[['Ticket']].astype(float))

# Survival rate and row count per ticket bin. The columns are selected with
# a list because current pandas rejects a bare tuple of names here.
all_data.groupby('Ticket')[['Survived', 'Counter']].agg({'Survived': 'mean', 'Counter': 'sum'})


### <b>Select fields for machine learning</b>

In [None]:
# Preview the first 10 rows of the fully processed "all_data" table.
all_data.head(10)

In [None]:
# Keep the target ("Survived") and the engineered feature columns for ML.
data_for_ml = all_data[[
    'Survived',
    'Pclass',
    'Name',
    'Sex',
    'Age',
    'Fare',
    'Cabin',
    'Embarked',
    'Family',
    'isAlone',
    'Ticket',
]]
data_for_ml.head(10)

In [None]:
# Rows with a "Survived" value form the training set; the target is split
# off into its own Series with a fresh 0..n-1 index.
train_data = data_for_ml.dropna(subset=['Survived'])
target_data = train_data['Survived'].reset_index(drop=True)
train_data = train_data.drop(columns=['Survived'])
print('train data length:', len(train_data))

# Rows without a "Survived" value are the ones to predict.
test_data = data_for_ml.loc[data_for_ml['Survived'].isnull()].drop(columns=['Survived'])
print('test data length:', len(test_data))

## <b><font color='#3C89F9'>2. Machine Learning</font></b> <i><font color='black'>(simple deep Neural Network)</font></i>

In [None]:
import sklearn.datasets
from sklearn.model_selection import train_test_split

In [None]:
# Wrap features and targets in Bunch containers (fields are reachable both
# as attributes and as keys). Bunch is imported from sklearn.utils because
# the sklearn.datasets.base module path was removed from scikit-learn.
from sklearn.utils import Bunch

train_brunch = Bunch(data=train_data, target=target_data)
test_brunch = Bunch(data=test_data, target=0)

In [None]:
# 5-fold cross-validation of a small multi-layer perceptron.

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

# Both hidden layers get as many units as there are input features.
nn = train_brunch['data'].shape[1]

# random_state pins the weight initialisation so repeated runs give the
# same scores. NOTE: per the scikit-learn docs, batch_size and
# learning_rate_init only take effect with the 'sgd'/'adam' solvers; they
# are kept so the values survive a later change of solver.
model = MLPClassifier(solver='lbfgs',
                      max_iter=300,
                      batch_size=16,
                      alpha=0.01,
                      hidden_layer_sizes=(nn, nn),
                      activation='logistic',
                      learning_rate_init=0.002,
                      random_state=42)
scores = cross_val_score(model, train_brunch['data'], train_brunch['target'], cv=5)

# One point per fold.
plt.plot(scores)
plt.title('Cross-validation score per fold')
plt.xlabel('fold')
plt.ylabel('score')
print('mean scores:', np.mean(scores))


In [None]:
# Fit on the whole training set, predict the unlabelled rows and write the
# result to 'submission.csv'.

model.fit(train_brunch['data'], train_brunch['target'])

out = model.predict(test_brunch.data).astype('int')

# Read the ids from the "PassengerId" column (aligned on the row index)
# instead of rebuilding them as "row index + 1", which is only right while
# the ids happen to be contiguous and in file order.
passenger_ids = all_data.loc[test_brunch.data.index, 'PassengerId'].values
df_out = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': out},
                      columns=['PassengerId', 'Survived'])
df_out.to_csv('submission.csv', index=False, sep=',')