# Import libraries

In [127]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.optimize import differential_evolution, shgo
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.utils.np_utils import to_categorical
pd.options.mode.chained_assignment = None  # default='warn'

# Import data

In [128]:
data_train = pd.read_csv('data/train.csv')
data_test  = pd.read_csv('data/test.csv')

# Feature engineering

## Fill NaNs by median value and Categorizing the feature 'Age'

In [129]:
for dataset in [data_train,data_test]:
    dataset['Age'].fillna(dataset['Age'].median(), inplace = True)
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4

## Add two new feautures 'FamilySize' and 'IsAlone'

In [130]:
for dataset in [data_train,data_test]:
    dataset['FamilySize'] = dataset ['SibSp'] + dataset['Parch'] + 1
    dataset['IsAlone'] = 1 #initialize to yes/1 is alone
    dataset['IsAlone'].loc[dataset['FamilySize'] > 1] = 0

## Encode values

In [147]:
for dataset in [data_train,data_test]:
    dataset['Sex'] = LabelEncoder().fit_transform(dataset['Sex'])
    dataset['Embarked'].fillna('S', inplace = True)
    dataset['Embarked'] = LabelEncoder().fit_transform(dataset['Embarked'])

## Categorizing 'Fare'

In [145]:
for dataset in [data_train,data_test]:
    dataset['Fare'].fillna(data_test['Fare'].median(), inplace = True)
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

## Add new feauture 'Title'

In [146]:
for dataset in [data_train,data_test]:
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.')
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    dataset['Title'] = LabelEncoder().fit_transform(dataset['Title'])
    dataset['Title'] = dataset['Title'].fillna(0)

AttributeError: 'DataFrame' object has no attribute 'Name'

## Del useless values

In [None]:
for dataset in [data_train,data_test]:
    del dataset['Name']
    del dataset['Ticket']
    del dataset['Cabin']
    del dataset['SibSp']
    del dataset['Parch']

# Define features and targets

In [None]:
X_train = data_train.iloc[:,2:]
X_test = data_test.iloc[:,1:]
y_train = data_train['Survived']
y_test = pd.read_csv('data/gender_submission.csv')['Survived']

# Standardize features 

In [None]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# DL model

In [None]:
model = Sequential()
model.add(Dense(4221, activation='relu', input_dim=len(X_train[0])))
model.add(Dense(2000, activation = 'relu'))
model.add(Dense(1000, activation = 'relu'))
model.add(Dense(500, activation = 'relu'))
model.add(Dense(250, activation = 'relu'))
model.add(Dense(125, activation = 'relu'))
model.add(Dense(60, activation = 'relu'))

model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

model.fit(x=X_train,y=y_train, epochs=5)

In [None]:
y_pred = model.predict_classes(X_test)
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

In [None]:
submision = pd.DataFrame()
submision['PassengerId'] = data_test['PassengerId']
submision['Survived'] = y_pred
submision.to_csv('submit_dl.csv',index = False)

# RandomForest

In [None]:
def get_mae(max_depth):
    model =RandomForestClassifier(random_state=1,max_depth=max_depth)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    return mean_absolute_error(y_pred,y_test)
max_depth = differential_evolution(get_mae, bounds = [(1,100)])['x'][0]

In [None]:
model =RandomForestClassifier(random_state=1,max_depth=max_depth)
model.fit(X_train,y_train)

In [None]:
y_pred = model.predict(X_test)
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

In [None]:
submision = pd.DataFrame()
submision['PassengerId'] = data_test['PassengerId']
submision['Survived'] = y_pred
submision.to_csv('submit_random_forest.csv',index = False) #0.78708