# Libraries

In [329]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix

# Setup

In [290]:
pd.set_option('display.max_rows', 50)

In [291]:
data = pd.read_csv('./data/train.csv')
data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S
1,1,0,3,"Bryan, Drew",male,,0,0,27323,13.35,,S
2,2,0,3,"Owens, Kenneth",male,0.33,1,2,CA 457703,71.29,,S
3,3,0,3,"Kramer, James",male,19.0,0,0,A. 10866,13.04,,S
4,4,1,3,"Bond, Michael",male,25.0,0,0,427635,7.76,,S
5,5,0,2,"Sassano, Jonathan",male,35.0,0,0,13363,6.71,,S
6,6,0,3,"Conway, Jose",male,7.0,0,0,A/5,9.77,,S
7,7,1,3,"Werner, Linda",female,6.0,1,2,434426,31.5,,S
8,8,0,1,"Wardlaw, Michael",male,27.0,2,1,474849,73.02,A7253,S
9,9,0,2,"Greigo, Rudy",male,66.0,0,0,6981,9.14,D2969,C


### Description of the attributes

* PassengerId - Unique passenger id
* Survived - Flag whether passenger survived or not
* Pclass - Ticket class
* Name - Name of the passenger
* Sex - Gender of the passenger
* Age - Age of the passenger
* SibSp - # of siblings / spouses aboard the Titanic:
    * Siblings: brother, sister, stepbrother, stepsister
    * Spouse: husband, wife (mistresses and fiancés were ignored)
* Parch - # of parents / children aboard the Titanic:
    * Parent: mother, father
    * Child: daughter, son, stepdaughter, stepson
* Ticket - Ticket number
* Fare - Passenger fare
* Cabin - Cabin number
* Embarked - Port of embarkation:
    * C = Cherbourg
    * Q = Queenstown
    * S = Southampton

### Summary statistics

In [292]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,100000.0,100000.0,100000.0,96708.0,100000.0,100000.0,99866.0
mean,49999.5,0.42774,2.10691,38.355472,0.39769,0.45456,43.92933
std,28867.657797,0.494753,0.837727,18.313556,0.862566,0.950076,69.58882
min,0.0,0.0,1.0,0.08,0.0,0.0,0.68
25%,24999.75,0.0,1.0,25.0,0.0,0.0,10.04
50%,49999.5,0.0,2.0,39.0,0.0,0.0,24.46
75%,74999.25,1.0,3.0,53.0,1.0,1.0,33.5
max,99999.0,1.0,3.0,87.0,8.0,9.0,744.66


### Null values

In [293]:
data.isna().sum()

PassengerId        0
Survived           0
Pclass             0
Name               0
Sex                0
Age             3292
SibSp              0
Parch              0
Ticket          4623
Fare             134
Cabin          67866
Embarked         250
dtype: int64

### Unique values

In [294]:
data.nunique()

PassengerId    100000
Survived            2
Pclass              3
Name            92144
Sex                 2
Age               173
SibSp               7
Parch               8
Ticket          75331
Fare            15935
Cabin           26992
Embarked            3
dtype: int64

# Feature Engineering

### Count number of all family members

In [295]:
data['Family_Members'] = data['SibSp'] + data['Parch']

### Separate Name into Surname and Firstname

In [296]:
data[['Surname', 'Firstname']] = data.Name.str.split(", ", expand=True)
data.drop(['Name', 'Firstname'], axis=1, inplace=True)
data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_Members,Surname
0,0,1,1,male,,2,0,209245,27.14,C12239,S,2,Oconnor
1,1,0,3,male,,0,0,27323,13.35,,S,0,Bryan
2,2,0,3,male,0.33,1,2,CA 457703,71.29,,S,3,Owens
3,3,0,3,male,19.0,0,0,A. 10866,13.04,,S,0,Kramer
4,4,1,3,male,25.0,0,0,427635,7.76,,S,0,Bond
5,5,0,2,male,35.0,0,0,13363,6.71,,S,0,Sassano
6,6,0,3,male,7.0,0,0,A/5,9.77,,S,0,Conway
7,7,1,3,female,6.0,1,2,434426,31.5,,S,3,Werner
8,8,0,1,male,27.0,2,1,474849,73.02,A7253,S,3,Wardlaw
9,9,0,2,male,66.0,0,0,6981,9.14,D2969,C,0,Greigo


### Separate Cabin into Letter and Number

In [297]:
data['Cabin_Letter'] = data.Cabin.str.slice(0, 1)
data['Cabin_Number'] = data.Cabin.str.slice(1)
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_Members,Surname,Cabin_Letter,Cabin_Number
0,0,1,1,male,,2,0,209245,27.14,C12239,S,2,Oconnor,C,12239.0
1,1,0,3,male,,0,0,27323,13.35,,S,0,Bryan,,
2,2,0,3,male,0.33,1,2,CA 457703,71.29,,S,3,Owens,,
3,3,0,3,male,19.0,0,0,A. 10866,13.04,,S,0,Kramer,,
4,4,1,3,male,25.0,0,0,427635,7.76,,S,0,Bond,,


# Filling Nulls

In [298]:
predicted_fare = data.groupby(['Pclass', 'Embarked', 'Sex']).Fare.median().reset_index()
data = pd.merge(data, predicted_fare, how='left', on=['Pclass', 'Embarked', 'Sex'])
data['Fare'] = np.where(data['Fare_x'].isnull(), data['Fare_y'], data['Fare_x'])
data.drop(['Fare_x', 'Fare_y'], axis=1, inplace=True)
data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Cabin,Embarked,Family_Members,Surname,Cabin_Letter,Cabin_Number,Fare
0,0,1,1,male,,2,0,209245,C12239,S,2,Oconnor,C,12239.0,27.14
1,1,0,3,male,,0,0,27323,,S,0,Bryan,,,13.35
2,2,0,3,male,0.33,1,2,CA 457703,,S,3,Owens,,,71.29
3,3,0,3,male,19.0,0,0,A. 10866,,S,0,Kramer,,,13.04
4,4,1,3,male,25.0,0,0,427635,,S,0,Bond,,,7.76
5,5,0,2,male,35.0,0,0,13363,,S,0,Sassano,,,6.71
6,6,0,3,male,7.0,0,0,A/5,,S,0,Conway,,,9.77
7,7,1,3,female,6.0,1,2,434426,,S,3,Werner,,,31.5
8,8,0,1,male,27.0,2,1,474849,A7253,S,3,Wardlaw,A,7253.0,73.02
9,9,0,2,male,66.0,0,0,6981,D2969,C,0,Greigo,D,2969.0,9.14


In [299]:
predicted_age = data.groupby(['Sex', 'Pclass', 'Embarked']).Age.median().reset_index()
data = pd.merge(data, predicted_age, how='left', on=['Sex', 'Pclass', 'Embarked'])
data['Age'] = np.where(data['Age_x'].isnull(), data['Age_y'], data['Age_x'])
data.drop(['Age_x', 'Age_y'], axis=1, inplace=True)
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Ticket,Cabin,Embarked,Family_Members,Surname,Cabin_Letter,Cabin_Number,Fare,Age
0,0,1,1,male,2,0,209245,C12239,S,2,Oconnor,C,12239.0,27.14,42.5
1,1,0,3,male,0,0,27323,,S,0,Bryan,,,13.35,32.0
2,2,0,3,male,1,2,CA 457703,,S,3,Owens,,,71.29,0.33
3,3,0,3,male,0,0,A. 10866,,S,0,Kramer,,,13.04,19.0
4,4,1,3,male,0,0,427635,,S,0,Bond,,,7.76,25.0


In [300]:
data.isnull().sum()

PassengerId           0
Survived              0
Pclass                0
Sex                   0
SibSp                 0
Parch                 0
Ticket             4623
Cabin             67866
Embarked            250
Family_Members        0
Surname               0
Cabin_Letter      67866
Cabin_Number      67866
Fare                  1
Age                   2
dtype: int64

# Variables encoding

### Sex -> Gender

In [301]:
sex_encoder = OrdinalEncoder()
data['Gender'] = sex_encoder.fit_transform(data[['Sex']])
data.drop(['Sex'], axis=1, inplace=True)
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,SibSp,Parch,Ticket,Cabin,Embarked,Family_Members,Surname,Cabin_Letter,Cabin_Number,Fare,Age,Gender
0,0,1,1,2,0,209245,C12239,S,2,Oconnor,C,12239.0,27.14,42.5,1.0
1,1,0,3,0,0,27323,,S,0,Bryan,,,13.35,32.0,1.0
2,2,0,3,1,2,CA 457703,,S,3,Owens,,,71.29,0.33,1.0
3,3,0,3,0,0,A. 10866,,S,0,Kramer,,,13.04,19.0,1.0
4,4,1,3,0,0,427635,,S,0,Bond,,,7.76,25.0,1.0


### Embarked -> Origin

In [302]:
# ord_encoder = OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = np.nan)
# data['Origin'] = ord_encoder.fit_transform(data[['Embarked']])
# data.drop(['Embarked'], axis=1, inplace=True)
# data.head()

# Model

In [303]:
data_ = data[['Survived', 'PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Family_Members', 'Gender']]
data_.dropna(inplace=True)

X = data_[data_.columns[~data_.columns.isin(['Survived'])]]
y = data_['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y)
clf = RandomForestClassifier(n_estimators=500, max_depth=10, criterion='entropy', \
                            random_state=42).fit(X_train, y_train)
clf.score(X_test, y_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


0.76244

## Model Accuracy

In [328]:
pred_score_train = clf.predict(X_test)
pred_score_train

array([1, 0, 1, ..., 1, 0, 1], dtype=int64)

In [330]:
confusion_matrix(y_test, pred_score_train)

array([[11386,  2875],
       [ 3064,  7675]])

# Predict test

In [304]:
test_data = pd.read_csv('./data/test.csv')
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,100000,3,"Holliday, Daniel",male,19.0,0,0,24745,63.01,,S
1,100001,3,"Nguyen, Lorraine",female,53.0,0,0,13264,5.81,,S
2,100002,1,"Harris, Heather",female,19.0,0,0,25990,38.91,B15315,C
3,100003,2,"Larsen, Eric",male,25.0,0,0,314011,12.93,,S
4,100004,1,"Cleary, Sarah",female,17.0,0,2,26203,26.89,B22515,C


In [310]:
test_data['Family_Members'] = test_data['SibSp'] + test_data['Parch']

In [306]:
test_data = pd.merge(test_data, predicted_fare, how='left', on=['Pclass', 'Embarked', 'Sex'])
test_data['Fare'] = np.where(test_data['Fare_x'].isnull(), test_data['Fare_y'], test_data['Fare_x'])
test_data.drop(['Fare_x', 'Fare_y'], axis=1, inplace=True)
test_data.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin,Embarked,Fare
0,100000,3,"Holliday, Daniel",male,19.0,0,0,24745,,S,63.01
1,100001,3,"Nguyen, Lorraine",female,53.0,0,0,13264,,S,5.81
2,100002,1,"Harris, Heather",female,19.0,0,0,25990,B15315,C,38.91
3,100003,2,"Larsen, Eric",male,25.0,0,0,314011,,S,12.93
4,100004,1,"Cleary, Sarah",female,17.0,0,2,26203,B22515,C,26.89
5,100005,3,"Orabuena, John",male,34.0,0,0,,,C,10.4
6,100006,1,"Guerrero, Angel",female,48.0,1,2,PC 28237,B21146,S,221.94
7,100007,3,"Payton, Brian",male,13.0,0,0,42031,,S,9.41
8,100008,3,"Murray, Barbara",female,16.0,1,0,C.A. 219645,,S,12.31
9,100009,3,"Mickle, Johnny",male,20.0,0,0,39023,,S,6.17


In [307]:
test_data = pd.merge(test_data, predicted_age, how='left', on=['Sex', 'Pclass', 'Embarked'])
test_data['Age'] = np.where(test_data['Age_x'].isnull(), test_data['Age_y'], test_data['Age_x'])
test_data.drop(['Age_x', 'Age_y'], axis=1, inplace=True)
test_data.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,SibSp,Parch,Ticket,Cabin,Embarked,Fare,Age
0,100000,3,"Holliday, Daniel",male,0,0,24745,,S,63.01,19.0
1,100001,3,"Nguyen, Lorraine",female,0,0,13264,,S,5.81,53.0
2,100002,1,"Harris, Heather",female,0,0,25990,B15315,C,38.91,19.0
3,100003,2,"Larsen, Eric",male,0,0,314011,,S,12.93,25.0
4,100004,1,"Cleary, Sarah",female,0,2,26203,B22515,C,26.89,17.0


In [308]:
sex_encoder_ = OrdinalEncoder()
test_data['Gender'] = sex_encoder_.fit_transform(test_data[['Sex']])
test_data.drop(['Sex'], axis=1, inplace=True)
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,SibSp,Parch,Ticket,Cabin,Embarked,Fare,Age,Gender
0,100000,3,"Holliday, Daniel",0,0,24745,,S,63.01,19.0,1.0
1,100001,3,"Nguyen, Lorraine",0,0,13264,,S,5.81,53.0,0.0
2,100002,1,"Harris, Heather",0,0,25990,B15315,C,38.91,19.0,0.0
3,100003,2,"Larsen, Eric",0,0,314011,,S,12.93,25.0,1.0
4,100004,1,"Cleary, Sarah",0,2,26203,B22515,C,26.89,17.0,0.0


## Fill values that were not filled before

In [317]:
imputer_age = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
test_data['Age'] = imputer_age.fit_transform(test_data[['Age']])

imputer_fare = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
test_data['Fare'] = imputer_fare.fit_transform(test_data[['Fare']])

In [320]:
test_data.isnull().sum()

PassengerId           0
Pclass                0
Name                  0
SibSp                 0
Parch                 0
Ticket             5181
Cabin             70831
Embarked            277
Fare                  0
Age                   0
Gender                0
Family_Members        0
dtype: int64

In [334]:
test_data['Survived'] = clf.predict(test_data[['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Family_Members', 'Gender']])

In [337]:
test_data[['PassengerId', 'Survived']].to_csv('./submission1.csv', index = False)