In [1117]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [1118]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
combine = [train, test]

# Data preparation

### Get 'child' from age and sex

In [1119]:
train['Person'] = train['Sex']
train.loc[train['Age'] < 16, 'Person'] = 'child'

test['Person'] = test['Sex']
test.loc[test['Age'] < 16, 'Person'] = 'child'

### Person (Sex)

In [1120]:
'''for df in combine:
    df['Person'] = df['Person'].map({'male': 0, 'female': 1, 'child': 3}).astype('int')'''

"for df in combine:\n    df['Person'] = df['Person'].map({'male': 0, 'female': 1, 'child': 3}).astype('int')"

# Person get_dummies (and drop male)

In [1121]:
# create dummy variables for Person column, & drop Male as it has the lowest average of survived passengers
person_dummies_titanic  = pd.get_dummies(train['Person'])
person_dummies_titanic.columns = ['Child','Female','Male']
person_dummies_titanic.drop(['Male'], axis=1, inplace=True)

person_dummies_test  = pd.get_dummies(test['Person'])
person_dummies_test.columns = ['Child','Female','Male']
person_dummies_test.drop(['Male'], axis=1, inplace=True)

train = train.join(person_dummies_titanic)
test  = test.join(person_dummies_test)

train.drop(['Person'],axis=1,inplace=True)
test.drop(['Person'],axis=1,inplace=True)

### SibSp and Parch ===> IsAlone?

In [1122]:
train['IsAlone'] = 0
train.loc[(df['Parch'] == 0) & (train['SibSp'] == 0), 'IsAlone'] = 1

test['IsAlone'] = 0
test.loc[(df['Parch'] == 0) & (test['SibSp'] == 0), 'IsAlone'] = 1

### Embarked

In [1123]:
freq_port_train = train.Embarked.dropna().mode()[0]
#freq_port_test = test.Embarked.dropna().mode()
train.Embarked = train.Embarked.fillna(freq_port_train)
test.Embarked = test.Embarked.fillna(freq_port_train)
#df.Embarked = df.Embarked.map({'S': 0, 'C': 1, 'Q': 2}).fillna(4).astype(int)

# Embarked get_dummies (and drop S)

In [1124]:
embark_dummies_titanic  = pd.get_dummies(train['Embarked'])
embark_dummies_titanic.drop(['S'], axis=1, inplace=True)

embark_dummies_test  = pd.get_dummies(test['Embarked'])
embark_dummies_test.drop(['S'], axis=1, inplace=True)

train = train.join(embark_dummies_titanic)
test  = test.join(embark_dummies_test)

train.drop(['Embarked'], axis=1,inplace=True)
test.drop(['Embarked'], axis=1,inplace=True)

### Age

In [1125]:
'''for df in combine:
    age_nan = np.zeros((2,3))
    for sex in range(0,2):
        for pcl in range(1,4):
            df_SexPcl = df[(df['Sex'] == sex) & (df['Pclass'] == pcl)]['Age'].dropna()
            age_nan[sex,pcl-1] = int(df_SexPcl.median()/0.5 + 0.5) * 0.5

    for sex in range(0,2):
        for pcl in range(1,4):
            df.loc[((df['Age'].isnull()) & (df['Sex'] == sex) & (df['Pclass'] == pcl)), 'Age'] = \
                age_nan[sex, pcl-1]
    df['Age'] = df['Age'].astype(int)'''

"for df in combine:\n    age_nan = np.zeros((2,3))\n    for sex in range(0,2):\n        for pcl in range(1,4):\n            df_SexPcl = df[(df['Sex'] == sex) & (df['Pclass'] == pcl)]['Age'].dropna()\n            age_nan[sex,pcl-1] = int(df_SexPcl.median()/0.5 + 0.5) * 0.5\n\n    for sex in range(0,2):\n        for pcl in range(1,4):\n            df.loc[((df['Age'].isnull()) & (df['Sex'] == sex) & (df['Pclass'] == pcl)), 'Age'] =                 age_nan[sex, pcl-1]\n    df['Age'] = df['Age'].astype(int)"

In [1126]:
'''# pd.cut(train.Age, 5)

for df in combine:
    df.loc[ df.Age <= 16, 'Age'] = 0
    df.loc[(df.Age > 16) & (df.Age <= 32), 'Age'] = 1
    df.loc[(df.Age > 32) & (df.Age <= 48), 'Age'] = 2
    df.loc[(df.Age > 48) & (df.Age <= 64), 'Age'] = 3
    df.loc[ df.Age > 64, 'Age'] = 4
    df.Age = df.Age.astype(float)'''

"# pd.cut(train.Age, 5)\n\nfor df in combine:\n    df.loc[ df.Age <= 16, 'Age'] = 0\n    df.loc[(df.Age > 16) & (df.Age <= 32), 'Age'] = 1\n    df.loc[(df.Age > 32) & (df.Age <= 48), 'Age'] = 2\n    df.loc[(df.Age > 48) & (df.Age <= 64), 'Age'] = 3\n    df.loc[ df.Age > 64, 'Age'] = 4\n    df.Age = df.Age.astype(float)"

### Random Age Nan

In [1127]:
# get average, std, and number of NaN values in titanic_df
average_age_titanic   = train["Age"].mean()
std_age_titanic       = train["Age"].std()
count_nan_age_titanic = train["Age"].isnull().sum()

# get average, std, and number of NaN values in test_df
average_age_test   = test["Age"].mean()
std_age_test       = test["Age"].std()
count_nan_age_test = test["Age"].isnull().sum()

# generate random numbers between (mean - std) & (mean + std)
rand_1 = np.random.randint(average_age_titanic - std_age_titanic, 
                           average_age_titanic + std_age_titanic, 
                           size = count_nan_age_titanic)
rand_2 = np.random.randint(average_age_test - std_age_test, 
                           average_age_test + std_age_test, 
                           size = count_nan_age_test)

# fill NaN values in Age column with random values generated
train.loc[np.isnan(train["Age"]), "Age"] = rand_1
test.loc[np.isnan(test["Age"]), "Age"] = rand_2

# convert from float to int
train['Age'] = train['Age'].astype(int)
test['Age']  = test['Age'].astype(int)

### Age*Pclass

In [1128]:
'''for df in combine:
    df['Age*Pclass'] = df.Age * df.Pclass'''

"for df in combine:\n    df['Age*Pclass'] = df.Age * df.Pclass"

### Fare

In [1129]:
train.Fare = train.Fare.fillna(train.Fare.dropna().median())
test.Fare = test.Fare.fillna(test.Fare.dropna().median())

train.Fare = train.Fare.astype(int)
test.Fare = test.Fare.astype(int)

In [1130]:
# pd.qcut(train.Fare, 4)
'''train.loc[ train.Fare <= 7.91, 'Fare'] = 0
train.loc[(train.Fare > 7.91) & (train.Fare <= 14.454), 'Fare'] = 1
train.loc[(train.Fare > 14.454) & (train.Fare <= 31.0), 'Fare'] = 2
train.loc[ train.Fare > 31.0, 'Fare'] = 3
train.Fare = train.Fare.astype(int)

test.loc[ test.Fare <= 7.91, 'Fare'] = 0
test.loc[(test.Fare > 7.91) & (test.Fare <= 14.454), 'Fare'] = 1
test.loc[(test.Fare > 14.454) & (test.Fare <= 31.0), 'Fare'] = 2
test.loc[ test.Fare > 31.0, 'Fare'] = 3
test.Fare = test.Fare.astype(int)'''

"train.loc[ train.Fare <= 7.91, 'Fare'] = 0\ntrain.loc[(train.Fare > 7.91) & (train.Fare <= 14.454), 'Fare'] = 1\ntrain.loc[(train.Fare > 14.454) & (train.Fare <= 31.0), 'Fare'] = 2\ntrain.loc[ train.Fare > 31.0, 'Fare'] = 3\ntrain.Fare = train.Fare.astype(int)\n\ntest.loc[ test.Fare <= 7.91, 'Fare'] = 0\ntest.loc[(test.Fare > 7.91) & (test.Fare <= 14.454), 'Fare'] = 1\ntest.loc[(test.Fare > 14.454) & (test.Fare <= 31.0), 'Fare'] = 2\ntest.loc[ test.Fare > 31.0, 'Fare'] = 3\ntest.Fare = test.Fare.astype(int)"

### Name ===> Title

In [1131]:
title_map = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

for df in combine:
    df['Title'] = df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    
    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
     'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    
    df['Title'] = df['Title'].map(title_map).fillna(0)

### Delete useless

In [1132]:
train = train.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Sex',
                   'Fare', 'Age'], axis=1)
test = test.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Sex', 'Fare', 'Age'], axis=1)

# Pclass (drop 3-Pclass)

In [1133]:
pclass_dummies_titanic  = pd.get_dummies(train['Pclass'])
pclass_dummies_titanic.columns = ['Class_1','Class_2','Class_3']
pclass_dummies_titanic.drop(['Class_3'], axis=1, inplace=True)

pclass_dummies_test  = pd.get_dummies(test['Pclass'])
pclass_dummies_test.columns = ['Class_1','Class_2','Class_3']
pclass_dummies_test.drop(['Class_3'], axis=1, inplace=True)

train = train.join(pclass_dummies_titanic)
test  = test.join(pclass_dummies_test)

train.drop(['Pclass'], axis=1,inplace=True)
test.drop(['Pclass'], axis=1,inplace=True)

# Model, prediction and solve

In [1134]:
from sklearn import tree
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import random

In [1135]:
x_train = train.drop('Survived', axis=1)
y_train = train.Survived
x_test = test.drop('PassengerId', axis=1).copy()

### LogReg

In [1136]:
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)
y_pred = log_reg.predict(x_test)
log_reg.score(x_train, y_train).round(3)

0.795

In [1137]:
coeff_df = pd.DataFrame(train.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(log_reg.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
1,Female,2.732994
0,Child,2.311282
5,Class_1,1.857058
6,Class_2,1.086944
3,C,0.617844
4,Q,0.327135
2,IsAlone,0.169294


### SVC

In [1138]:
svc = SVC()
svc.fit(x_train, y_train)
svc.score(x_train, y_train).round(3)

0.825

### RandomForest

In [1147]:
rf = RandomForestClassifier()
parameters = {'n_estimators': [100], 'max_depth': [3, 4, 5, 6],
             'min_samples_split': [20, 25, 30, 40], 'min_samples_leaf': [5, 10, 15]}

In [1148]:
rand_rf = RandomizedSearchCV(rf, parameters, cv=6)
rand_rf.fit(x_train, y_train)
rand_rf.score(x_train, y_train).round(3)

0.819

In [1149]:
rand_rf.best_params_

{'n_estimators': 100,
 'min_samples_split': 25,
 'min_samples_leaf': 10,
 'max_depth': 6}

In [1150]:
y_pred = rand_rf.predict(x_test)

In [1151]:
submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": y_pred})
submission.to_csv('answer.csv', index=False)