In [None]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import dtale as dt

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression

from sklearn import svm
from sklearn.svm import SVC, LinearSVC

from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier

from sklearn import neighbors
from sklearn.neighbors import KNeighborsClassifier

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import SGDClassifier

from sklearn import naive_bayes
from sklearn.naive_bayes import GaussianNB

from sklearn import metrics
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

from sklearn import model_selection
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, cross_val_predict

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')


#### Load the dataset

The train and test were provided separately

In [None]:
# Read the separately provided training and test sets
train, test = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))

# Using the D-tale library to display the dataset

In [None]:
# Open the training set in an interactive D-Tale view
dt.show(train)

In [None]:
# Open the test set in an interactive D-Tale view
dt.show(test)

In [None]:
# (rows, columns) of the training set
train.shape

In [None]:
# (rows, columns) of the test set
test.shape

In [None]:
# Column dtypes and non-null counts of the raw training set
train.info()

## Performing Data Cleaning and Analysis
<!-- #### 1. Understanding meaning of each column: -->
<!-- <br>Data Dictionary: -->
<br>**Variable — Description**
1. Survived	- Survived (1) or died (0)
2. Pclass -	Passenger’s class (1 = 1st, 2 = 2nd, 3 = 3rd)
3. Name	- Passenger’s name
4. Sex -	Passenger’s sex
5. Age	- Passenger’s age
6. SibSp -	Number of siblings/spouses aboard
7. Parch -	Number of parents/children aboard (Some children travelled only with a nanny, therefore parch=0 for them.)
8. Ticket -	Ticket number
9. Fare -	Fare
10. Cabin -	Cabin
11. Embarked -	Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

In [None]:
# Number of missing values per column in the training set
train.isna().sum()

In [None]:
# Number of missing values per column in the test set
test.isna().sum()

## Observation:

From the above, it was discovered that:

1. The train dataset has 891 rows and 12 columns, and the test dataset has 418 rows and 11 columns.
2. There are missing values in the Age, Embarked and Cabin columns for both the train and test datasets.
3. Some columns are not needed to determine the survival of a person, e.g. Name, PassengerId, Ticket.

#### Deleting unnecessary column for both train and test dataset

In [None]:
# Remove, in place, the training columns that are not used
# for determining the survival of a person.
for unused_column in ('PassengerId', 'Ticket', 'Fare', 'Cabin', 'Name'):
    del train[unused_column]

In [None]:
# Remove the same unused columns from the test.csv data.
# PassengerId is kept here; it is dropped later when X_test is built.
for unused_column in ('Ticket', 'Fare', 'Cabin', 'Name'):
    del test[unused_column]

## Imputation method for missing value

The Cabin column has been removed since it is not useful.

#### The Embarked column has 2 missing values in the train data. Although removing these rows or imputing them will not make much of a difference, imputation was chosen. The missing values will be replaced by the mode since the column has an object data type.


#### The Age column has missing values for some of the people in both the training and testing data. This could be solved by
* filling the missing ages of those who survived with the mean age of the survivors
* similarly, filling the missing ages of those who did not survive with the mean age of all non-survivors.

#### Note, however, that this type of imputation is not appropriate for the test data: the nulls would have to be filled according to survival status, which is exactly what has to be predicted, so the approach will not generalise.

* To solve this issue, an array of random integers drawn from the range between (mean age − standard deviation) and (mean age + standard deviation), with one value per missing entry (`is_null`), will be used to impute the missing values.

In [None]:
# Impute the missing ages of both frames with random integers drawn from
# [mean - std, mean + std).
#
# Fixes compared with the previous version of this cell:
#   * the standard deviation now comes from the training set, like the mean
#     (it used to be taken from the test set);
#   * each frame keeps its OWN ages -- the old final cast assigned
#     train['Age'] to every frame, which overwrote the test ages with
#     index-aligned training ages;
#   * the closing null check is actually called.
# NOTE: the random generator is not seeded here, so the imputed values (and
# every score derived from them) change from run to run.

titanic_data = [train, test]

# Statistics are computed once, from the training data only, before any
# value is imputed.
mean = train['Age'].mean()
std = train['Age'].std()
low, high = int(mean - std), int(mean + std)

for data in titanic_data:
    missing_age = data['Age'].isnull()
    is_null = missing_age.sum()

    # one random age per missing entry
    random_age = np.random.randint(low, high, size=is_null)

    # fill NaN values in the Age column with the generated values
    age_slice = data['Age'].copy()
    age_slice[missing_age] = random_age
    data['Age'] = age_slice.astype(int)

# sanity check: number of missing ages left in the training set
train['Age'].isnull().sum()

In [None]:
# Re-check dtypes and non-null counts of the training set after the Age imputation
train.info()

In [None]:
# Replace the missing ports of embarkation with the most frequent port.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `train['Embarked']` is a chained assignment, which operates on a temporary
# object and leaves `train` unchanged when pandas Copy-on-Write is active.
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].value_counts().index[0])

In [None]:
# Dtypes and non-null counts of the test set
test.info()

In [None]:
# Dtypes and non-null counts of the training set after filling Embarked
train.info()

In [None]:
# Summary statistics of the numeric training columns
train.describe()

In [None]:
# Summary statistics of the numeric test columns
test.describe()

### Categorical Encoding

#### The Sex and Embarked columns need to be encoded as they are categorical features.
* One-Hot Encoding will be used in encoding the sex feature
* Label encoding will be used for Embarked features

In [None]:

# Give the string-valued features pandas' categorical dtype in both frames
for column in ('Sex', 'Embarked'):
    train[column] = train[column].astype('category')
    test[column] = test[column].astype('category')

# The same conversion for the passenger's class is currently disabled:
# train['Pclass'] = train['Pclass'].astype('category')
# test['Pclass'] = test['Pclass'].astype('category')

train.info()

In [None]:
# Encode Sex as a single 0/1 indicator (1 = female, 0 = male).
# pd.get_dummies() returns one column per category, and a two-column frame
# cannot be stored in the single 'Sex' column (recent pandas raises
# "Columns must be same length as key"), so the indicator is built explicitly.
train['Sex'] = (train['Sex'] == 'female').astype(int)
test['Sex'] = (test['Sex'] == 'female').astype(int)

# Label-encode Embarked with the integer codes of its categories.
# NOTE(review): the codes only match between the two frames when both contain
# the same set of ports -- confirm against the data.
train['Embarked'] = train['Embarked'].cat.codes
test['Embarked'] = test['Embarked'].cat.codes

In [None]:
# Confirm the dtypes of the training set after encoding
train.info()

In [None]:
# Preview the first rows of the encoded training set
train.head()

## Analyzing Data by Visualization

In order to understand who had a better probability of survival, we visualize the passengers who survived based on age, passenger class, etc.


In [None]:

# Number of passengers per survival outcome (0 = died, 1 = survived).
# The column is passed by keyword: in current seaborn the first positional
# argument of countplot is `data`, so a bare Series is no longer read as `x`.
sns.countplot(x='Survived', data=train)

In [None]:
# Age histograms, one panel per survival outcome
age_hist = sns.FacetGrid(train, col='Survived').map(plt.hist, 'Age')
age_hist.set_ylabels('Number')

Most of those who didn't survive were aged 20-40 years, and the infants have a higher number of survivors than the teenagers.

#### Using Passenger class(Pclass) and Age to determine who survived

In [None]:
# Age histograms split by passenger class (rows) and survival outcome (columns)
pclass_age_grid = (
    sns.FacetGrid(train, col='Survived', row='Pclass', height=2.0, aspect=1.6)
    .map(plt.hist, 'Age', alpha=0.5, bins=20)
    .add_legend()
)
pclass_age_grid.set_ylabels('Number')

People in the 3rd class aged 20 - 40 years are those who mostly didn't survive compared to the others, especially the 1st Pclass, which has the most survivors.

In [None]:
# Combine 'Parch' and 'SibSp' into 'Family_Members' (relatives aboard plus the
# passenger), then remove the two source columns from each frame.
for frame in (train, test):
    frame['Family_Members'] = frame['Parch'] + frame['SibSp'] + 1
    del frame['SibSp']
    del frame['Parch']

## Age Grouping

For better prediction, the ages will be grouped into sub-groups

In [None]:
titanic_data = [train, test]

# (exclusive lower bound, inclusive upper bound, group label) of the inner age groups
inner_age_groups = ((10, 20, 1), (20, 35, 2), (35, 45, 3))

for data in titanic_data:
    # Replace each age, in place, by the label of its group (0 .. 4).
    # NOTE: running this cell a second time would map every label (all <= 10) to 0.
    data.loc[data['Age'] <= 10, 'Age'] = 0
    for lower, upper, group in inner_age_groups:
        data.loc[(data['Age'] > lower) & (data['Age'] <= upper), 'Age'] = group
    data.loc[data['Age'] > 45, 'Age'] = 4

train['Age'].value_counts()

In [None]:
# Annotated correlation matrix of the training features
train_corr = train.corr()
plt.figure(figsize=(11, 11))
sns.heatmap(train_corr, annot=True)

In [None]:
# Annotated correlation matrix of the test features
test_corr = test.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(test_corr, annot=True)

#### Both the training and testing datasets do not seem to have any correlated variables

In [None]:
# Inspect the prepared training set in D-Tale
dt.show(train)

In [None]:
# Inspect the prepared test set in D-Tale
dt.show(test)

## Building Model for Prediction

#### Since survival is represented as either 0 or 1, this is a classification problem. The algorithms that will be used are:

* #### Logistic Regression
* #### Support Vector Machines
* #### KNN or K-Nearest Neighbors
* #### Decision Trees
* #### Random Forest
* #### Stochastic Gradient descent (SGD)
* #### Gaussian Naive Bayes

In [None]:
# Separate the target (Survived) from the training features, and build the
# test features by leaving out the PassengerId column.

y_train = train['Survived']
X_train = train.drop(columns=['Survived'])

X_test = test.drop(columns='PassengerId').copy()

print('X_train: {}\nX_test: {}\ny_train: {}'.format(X_train.shape, X_test.shape, y_train.shape ))


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training features only,
# then apply the same transformation to both feature sets.
sc = StandardScaler()
sc.fit(X_train)
X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)

## Logistic Regression

In [None]:
# Logistic regression trained on the standardized features
log_reg = LogisticRegression().fit(X_train_std, y_train)

# Predicted survival for the (standardized) test passengers
y_pred = log_reg.predict(X_test_std)
print(y_pred)

In [None]:
# Training accuracy (in percent) of the logistic regression model
# fitted on the standardized features

train_accuracy = log_reg.score(X_train_std, y_train)
log_reg_acc = round(train_accuracy*100, 2)
print(log_reg_acc,'%')

In [None]:
# Re-fit the logistic regression, this time on the unscaled features
log_reg = LogisticRegression().fit(X_train, y_train)

# Predicted survival for the (unscaled) test passengers
y_pred = log_reg.predict(X_test)
print(y_pred)

In [None]:
# Mean of the predictions stored in y_pred, rounded to two decimals
round(y_pred.mean(), 2)

In [None]:
# Training accuracy (in percent) of the logistic regression model
# fitted on the unscaled features

train_accuracy = log_reg.score(X_train, y_train)
log_reg_acc = round(train_accuracy*100, 2)
print(log_reg_acc,'%')

In [None]:
# df_coeff= pd.DataFrame(train.columns.delete(0))
# df_coeff.columns = ['Feature']
# df_coeff['Correlation'] = pd.Series(log_reg.coef_[0])

# df_coeff.sort_values(by='Correlation', ascending=False)

## Cross-Validation
#### Cross-Validation protects against overfitting.
#### It is a resampling method which tells us how well our model would generalize to unseen data. This is achieved by fixing a number of partitions of the dataset called folds, predicting each fold separately, and averaging the predictions in the end.


In [None]:
# 10-fold cross-validation scheme.
# `random_state` only has an effect when the folds are shuffled, and current
# scikit-learn raises a ValueError when it is set while shuffle=False, so
# shuffling is enabled explicitly to get reproducible, shuffled folds.
kf = KFold(n_splits = 10, shuffle = True, random_state = 2)

# Accuracy of the logistic regression on each held-out fold
scores= cross_val_score(log_reg, X_train, y_train, cv = kf, scoring='accuracy')

mean_acc_log = scores.mean()*100

print('Scores: ', scores*100, '%')
print('Mean: {0:.2f}%'.format(mean_acc_log))
print('Standard Deviation: ', scores.std()*100, '%\n')

# Out-of-fold predictions for every training passenger; rows of the confusion
# matrix are the true classes, columns the predicted classes.
pred= cross_val_predict(log_reg, X_train, y_train, cv=kf)
print('Confusion Matrix: \n' ,confusion_matrix(y_train, pred),'\n')

print("Precision: ", round(precision_score(y_train, pred)*100, 2),'%')
print("Recall: ", round(recall_score(y_train, pred)*100, 2), '%')
print('F1 Score: ', round(f1_score(y_train, pred)*100, 2), '%')


### The first row is about the passengers who did not survive: 470 were correctly classified as not survived (true negatives) and 79 were wrongly classified as survived (false positives).
### The second row is about the passengers who survived: 110 were wrongly classified as not survived (false negatives) and 230 were correctly classified as survived (true positives).

# Support Vector Machine (SVM)

In [None]:
# Support vector classifier with default settings, trained on the unscaled features
svc = SVC().fit(X_train, y_train)

# Predicted survival for the test passengers
y_pred = svc.predict(X_test)
print(y_pred)
y_pred.shape

In [None]:
# Training accuracy (in percent) of the support vector model.
# `svc` was fitted on the unscaled X_train, so it is scored on the same
# representation; scoring it on X_train_std would feed the model features on
# a different scale from the one it was trained on.

svc_acc = round(svc.score(X_train, y_train)*100, 2)
print(svc_acc,'%')

In [None]:
# Cross-validated accuracy of a default SVC on the unscaled training features,
# using the folds defined by `kf`
scores = cross_val_score(SVC(), X_train, y_train, scoring='accuracy', cv=kf)
mean_acc_svc = scores.mean()*100

print('Scores: ', scores*100, '%')
print('Mean: ', mean_acc_svc, '%')
print('Standard Deviation: ', scores.std()*100, '%\n')

# Out-of-fold predictions, used for the confusion matrix and the class-wise metrics
pred = cross_val_predict(svc, X_train, y_train, cv=kf)
print('Confusion Matrix: \n', confusion_matrix(y_train, pred), '\n')

for metric_label, metric in (("Precision: ", precision_score),
                             ("Recall: ", recall_score),
                             ('F1 Score: ', f1_score)):
    print(metric_label, round(metric(y_train, pred)*100, 2), '%')


In [None]:
# Candidate values of the SVC parameter C explored by the grid search
params = {
    'C': (0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000),
}

In [None]:
# Support vector classifier with a linear kernel, used as the grid-search estimator
clf_svm_l = SVC(kernel='linear')

In [None]:
# Grid search over `params` for the linear SVM, selecting by
# 10-fold cross-validated accuracy and using all available cores
svm_grid_lin = GridSearchCV(
    estimator=clf_svm_l,
    param_grid=params,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the search on the standardized training features and show the selected parameters
svm_grid_lin.fit(X_train_std, y_train).best_params_

In [None]:
# Keep the best estimator found by the grid search
linsvm_clf = svm_grid_lin.best_estimator_

In [None]:
# Training accuracy (in percent) of the grid-searched linear SVM on the
# standardized features; this overwrites the earlier value of `svc_acc`
grid_train_accuracy = svm_grid_lin.score(X_train_std, y_train)
svc_acc = round(grid_train_accuracy*100, 2)
print(svc_acc,'%')

In [None]:

# Confusion-matrix heatmap for the tuned linear SVM.
# The previous version used `y_test` and `class_names`, neither of which is
# defined anywhere above in this notebook, and `y_pred` holds predictions for
# the test features, which have no `Survived` column to compare against.
# The matrix is therefore built from out-of-fold predictions on the training data.
class_names = ['Not Survived', 'Survived']  # label 0, label 1

cv_pred = cross_val_predict(linsvm_clf, X_train_std, y_train, cv=10)
cm = confusion_matrix(y_train, cv_pred)

plt.figure(figsize=(15, 15))
ax =sns.heatmap(cm, square=True, annot=True, cbar=False, fmt='d')
ax.xaxis.set_ticklabels(class_names, fontsize = 12)
ax.yaxis.set_ticklabels(class_names, fontsize = 12, rotation=0)
ax.set_xlabel('Predicted Labels',fontsize = 15)
ax.set_ylabel('True Labels',fontsize = 15)
plt.show()