In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

!pip install pandas-profiling #EDA
from pandas_profiling import ProfileReport

# Predicting Which Passengers Survived the Titanic Disaster

<font size='4'>**Problem Statement**</font>

The sinking of the Titanic is one of the most infamous shipwrecks in history.  

On April 15, 1912, during her maiden voyage, the widely considered “unsinkable” RMS Titanic sank after colliding with an iceberg. Unfortunately, there weren’t enough lifeboats for everyone onboard, resulting in the death of 1502 out of 2224 passengers and crew.  

While there was some element of luck involved in surviving, it seems some groups of people were more likely to survive than others.  

In this challenge, we ask you to build a predictive model that answers the question: “what sorts of people were more likely to survive?” using passenger data (ie name, age, gender, socio-economic class, etc).  

# 1. Import Library & Load Dataset

## 1.1. Import Library

In [None]:
# Scientific Libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
style.use('fivethirtyeight')
# Plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
# Booster for rendering
import plotly.io as pio
pio.renderers.default = 'iframe'

# Warning ignorance
import warnings
warnings.filterwarnings('ignore')

# Scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Selection
from scipy.stats import chi2_contingency

# Machine Learning Algorithm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score,StratifiedKFold

# Metrics Evaluation
from sklearn.metrics import accuracy_score, precision_score 
from sklearn.metrics import recall_score, f1_score
from sklearn.metrics import roc_curve, auc, confusion_matrix 
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix

# Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

## 1.2. Load Dataset

In [None]:
# Read the train and test CSV files into two DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/titanic/train.csv',
                     '/kaggle/input/titanic/test.csv')
)

# 2. Exploratory Data Analysis (EDA)

In [None]:
# Build a profiling report from the training data, save it as an HTML file,
# and leave the report object as the last expression so it is shown as the cell output
df = pd.DataFrame(train)
pr = ProfileReport(df)
pr.to_file(output_file='pandas_profiling.html')
pr

## 2.1. Separate Numerical & Categorical Data

In [None]:
# Columns treated as numerical
nums = [
    'PassengerId',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]
# Columns treated as categorical
cats = [
    'Survived',
    'Pclass',
    'Name',
    'Sex',
    'Ticket',
    'Cabin',
    'Embarked',
]

# 3. Data Pre Processing

In [None]:
# Work on an independent (deep) copy so the raw `train` frame stays untouched
pre1 = train.copy(deep=True)

In [None]:
# Numerical columns used during pre-processing
num = [
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]
# Categorical columns used during pre-processing
cat = [
    'Survived',
    'Pclass',
    'Sex',
    'Ticket',
    'Embarked',
]

## 3.1. Data Cleaning

### 3.1.1. Drop Irrelevant Features

In [None]:
# Drop PassengerId, Name and Ticket, which are not used as model features.
# `columns=` is used instead of a positional axis argument: pandas 2.0 made every
# argument of DataFrame.drop except `labels` keyword-only, so `drop(labels, 1)` fails there.
pre1 = pre1.drop(columns=['PassengerId', 'Name', 'Ticket'])

### 3.1.2. Handle Missing Values

Three features have missing values: `Age`, `Cabin`, and `Embarked`.

<font size='4'>**Train Data**</font>

In [None]:
# Handle Cabin & Embarked
# Cabin has the highest ratio of missing values, so the whole column is dropped.
# Embarked has only 2 missing values, so the affected rows are dropped instead.
# (`columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.)
pre1 = pre1.drop(columns='Cabin').dropna(subset=['Embarked'])

# Impute missing Age with the mean age.
# The result is assigned back to the column: calling fillna(inplace=True) on a column
# selection acts on an intermediate object and is not guaranteed to update the frame
# (it is a no-op under pandas Copy-on-Write).
pre1['Age'] = pre1['Age'].fillna(pre1['Age'].mean())

# Replace 0 values in Fare with the median, because the distribution is skewed
median_fare = pre1['Fare'].median(skipna=True)
pre1['Fare'] = pre1['Fare'].mask(pre1['Fare'] == 0, median_fare)

In [None]:
# Check the missing values that remain in the pre-processed training frame.
# The per-column null counts are computed once and reused for both lists.
missing_counts = pre1.isna().sum()
feature = missing_counts.index.tolist()
missing = missing_counts.values.tolist()
mv_check = pd.DataFrame({'feature': feature, 'missing_value': missing})
# The percentage is taken relative to the frame being inspected (pre1), which has
# fewer rows than the raw train set after the rows without Embarked were dropped.
mv_check['%missing'] = ((mv_check['missing_value'] / len(pre1)) * 100).round(2)
mv_check

**No missing values remain in any feature, and there are no zero values left in Fare.**

<font size='4'>**Test Data**</font>

In [None]:
# Handle Cabin, Age & Fare in the test set
# Cabin has the highest ratio of missing values, so the column is dropped.
# DataFrame.drop returns a new frame, so the raw `test` frame is left untouched.
# (`columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.)
test1 = test.drop(columns='Cabin')

# Impute Age with the mean and Fare with the median of the test set.
# The results are assigned back to the columns: fillna(inplace=True) on a column
# selection is not guaranteed to update the frame (it is a no-op under Copy-on-Write).
# NOTE(review): zero fares are replaced with the median in the training data but are
# left as-is here -- confirm whether the test set should get the same treatment.
test1['Age'] = test1['Age'].fillna(test['Age'].mean())
test1['Fare'] = test1['Fare'].fillna(test['Fare'].median())

In [None]:
# Count the remaining null entries per column of the cleaned test frame
test1.isnull().sum()

**No missing values in test dataset**

## 3.2. Feature Engineering

### 3.2.1. Log Transformation

In [None]:
# Check the distribution of every numerical feature, one subplot per feature.
# The grid size must be an integer: `len(num)/2` is a float, which recent matplotlib
# versions reject. Ceiling division also leaves enough cells for an odd number of features.
n_rows = 2
n_cols = -(-len(num) // n_rows)
plt.figure(figsize=(8,5))
for i, col in enumerate(num):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.distplot(pre1[col], color='orange')
# Adjust the layout once, after all subplots exist, instead of on every iteration
plt.tight_layout()

In [None]:
# Apply a log(x + 1) transformation to the selected columns of the training frame;
# adding 1 first keeps zero values finite
log = ['SibSp','Parch','Fare']
pre1[log] = np.log(pre1[log] + 1)

### 3.2.2. Standardization

In [None]:
# Apply standardization.
# Each scaler is fitted on the training column only and then reused for the matching
# test column, so both sets end up on the same scale. Previously the fitted scaler was
# discarded and the test features were passed to the model untransformed.
# test1 has not been log-transformed at this point, so the log(x + 1) step applied to the
# training columns listed in `log` is repeated for it here before scaling.
# Run this cell once per kernel session: re-running it would log-transform test1 again.
for col in num:
    scaler = StandardScaler()
    pre1[col] = scaler.fit_transform(pre1[col].to_numpy().reshape(-1, 1)).ravel()

    test_values = np.log(test1[col] + 1) if col in log else test1[col]
    test1[col] = scaler.transform(test_values.to_numpy().reshape(-1, 1)).ravel()

### 3.2.3. One Hot Encoding

In [None]:
# Train dataset: one-hot encode the listed columns and append the indicator columns.
# The loop variable is not called `cat`, because that would overwrite the
# categorical-column list of the same name defined earlier in the notebook.
cats_train = ['Sex','Embarked']
for cat_col in cats_train:
    onehots_train = pd.get_dummies(pre1[cat_col], prefix=cat_col)
    pre1 = pre1.join(onehots_train)

In [None]:
# Remove the original categorical columns now that their one-hot columns exist
pre1 = pre1.drop(columns=cats_train)

In [None]:
# Test dataset: one-hot encode the listed columns and append the indicator columns.
# The loop variable is not called `cat`, because that would overwrite the
# categorical-column list of the same name defined earlier in the notebook.
cats_test = ['Sex','Embarked']
for cat_col in cats_test:
    onehots_test = pd.get_dummies(test1[cat_col], prefix=cat_col)
    test1 = test1.join(onehots_test)

# 4. Modelling & Prediction

## 4.1. Separate Feature & Target For Validation & Prediction

In [None]:
# Separate the features from the target.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
X_train = pre1.drop(columns='Survived')
y_train = pre1['Survived']
X_test = test1.drop(columns=['Name','Ticket','PassengerId','Sex','Embarked'])
# Select the training columns in training order, so the model receives the same features
# in the same positions; this raises a KeyError if the test frame lacks one of them.
X_test = X_test[X_train.columns]

## 4.2. Cross Validation Score

In [None]:
# Helper that scores one estimator with stratified k-fold cross validation
def cross_val(Model, X_train, y_train, cval):
    """Return the mean and standard deviation of the cross-validated accuracy.

    Parameters
    ----------
    Model : estimator instance passed to `cross_val_score`
    X_train : feature matrix
    y_train : target values
    cval : number of stratified folds

    Returns
    -------
    tuple
        (mean accuracy, standard deviation of accuracy), both rounded to 4 decimals.
    """
    # Shuffled stratified folds with a fixed seed, so repeated calls use the same splits
    splitter = StratifiedKFold(n_splits=cval, shuffle=True, random_state=1)
    fold_scores = cross_val_score(Model, X_train, y_train,
                                  scoring='accuracy', cv=splitter)
    return round(fold_scores.mean(), 4), round(fold_scores.std(), 4)

In [None]:
# Instantiate one estimator per candidate algorithm.
# Every estimator that accepts a seed is given random_state=42.

# Linear / probabilistic models
lr = LogisticRegression(random_state=42)
nb = GaussianNB()

# Tree-based models
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
xgb = XGBClassifier(random_state=42)

# Distance / margin-based models
knn = KNeighborsClassifier(n_neighbors=5)
svc = SVC(random_state=42)

# Create function to make the result as dataframe
def model_cv_comparison(X_train,y_train):
    """Cross-validate every candidate model and return the scores as a DataFrame.

    Each model is scored with 10-fold cross validation through `cross_val`, using the
    module-level estimators `lr`, `nb`, `dt`, `rf`, `knn`, `svc` and `xgb`.

    Parameters
    ----------
    X_train : feature matrix
    y_train : target values

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns 'Model', 'CV_Mean' and 'CV_Stdev'.
    """
    # Pair every display name with its own estimator. Previously the Random Forest,
    # KNN, SVC and XGBoost rows were all computed with the logistic regression
    # estimator, so the comparison reported the same model five times.
    candidates = [
        ('Logistic Regression', lr),
        ('Naive Bayes', nb),
        ('Decision Tree', dt),
        ('Random Forest', rf),
        ('KNN', knn),
        ('SVC', svc),
        ('XGBoost', xgb),
    ]

    rows = []
    for model_name, estimator in candidates:
        cv_mean, cv_std = cross_val(estimator, X_train, y_train, 10)
        rows.append({'Model': model_name, 'CV_Mean': cv_mean, 'CV_Stdev': cv_std})

    # Building the frame from records keeps the score columns numeric
    model_comparison = pd.DataFrame(rows, columns=['Model', 'CV_Mean', 'CV_Stdev'])

    return model_comparison

In [None]:
# Show the cross-validation comparison table for the training data
model_cv_comparison(X_train=X_train, y_train=y_train)

**The highest average cross-validation score is 0.7919, with a standard deviation of 0.0285. Since simpler algorithms are preferable, I will use logistic regression.**

## 4.3. Prediction

In [None]:
# Fit logistic regression on the full training set and predict the test set
model = LogisticRegression(random_state=42).fit(X_train, y_train)
predictions = model.predict(X_test)

# Pair every test PassengerId with its predicted label and write the result to CSV
output = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': predictions})
output.to_csv('my_submission_3.csv', index=False)