In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, cross_val_predict
#from sklearn.grid_search import GridSearchCV

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression

from sklearn import svm
from sklearn.svm import SVC, LinearSVC

from sklearn import ensemble
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import SGDClassifier


from sklearn import naive_bayes
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')


In [None]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.info()

## Performing Data Cleaning and Analysis
<!-- #### 1. Understanding meaning of each column: -->
<!-- <br>Data Dictionary: -->
<br>**Variable        Description**</br>
1. Survived	- Survived (1) or died (0)
2. Pclass -	Passenger’s class (1 = 1st, 2 = 2nd, 3 = 3rd)
3. Name	- Passenger’s name
4. Sex -	Passenger’s sex
5. Age	- Passenger’s age
6. SibSp -	Number of siblings/spouses aboard
7. Parch -	Number of parents/children aboard (Some children travelled only with a nanny, therefore parch=0 for them.)
8. Ticket -	Ticket number
9. Fare -	Fare
10. Cabin -	Cabin
11. Embarked -	Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)


In [None]:
"""
Deleting the columns not required for determining the survival of a person
"""

del train['PassengerId']
del train['Ticket']
del train['Fare']
del train['Cabin']
del train['Name']


In [None]:
train.head()

In [None]:
train.shape

In [None]:
train.describe()

In [None]:
train.isnull().sum()

In [None]:
"""
Deleting the columns not required for determining the survival of a person for the data stored in test.csv
"""

del test['Ticket']
del test['Fare']
del test['Cabin']
del test['Name']


test.head()

In [None]:
test.describe()

In [None]:
test.isnull().sum()

In [None]:
#We will create a new column called "Gender" and
#fill it with values 1 ,2 based on the values of sex column in which male = 1 and female = 2

def getNum(str):
    if str == 'male':
        return '1'
    if str == 'female':
        return '2'

train['Gender'] = train['Sex'].apply(getNum)
print(train.head())

print("#"*50)

test['Gender'] = test['Sex'].apply(getNum)

# Delete the Sex columns
del train['Sex']
del test['Sex']

#Renaming "gender" column
train.rename(columns={'Gender':'Sex'}, inplace=True)

test.rename(columns={'Gender':'Sex'}, inplace=True)
test.head()

## Analyzing Data by Visualization

Inorder to understand who would have had a better probability of survival, we should visualize the patients who survived based on age, passenger class and etc.


In [None]:
sns.countplot(train['Survived'])

In [None]:
age_hist = sns.FacetGrid(train, col='Survived')
age_hist.map(plt.hist, 'Age')
age_hist.set_ylabels('Number')

## Number of people who survived using the age and passenger class


In [None]:
pclass_age_grid = sns.FacetGrid(train, col='Survived', row='Pclass', height=2.0, aspect=1.6)
pclass_age_grid.map(plt.hist, 'Age', alpha=0.5, bins=20)
pclass_age_grid.add_legend()
pclass_age_grid.set_ylabels('Number')

## Handling the NULL values in the column Age

#### We have NULL value inplace of Age for some of the people in both training and testing data.
#### So, one way is to fill them with the mean values i.e. fill the ones who have survived with the mean age of the survived people and similarly fill those who haven't survived with the mean age of all non-survived people.

#### But, this will only solve the problem for the training data and not for the testing data as we have to predict their survival status.

#### So, we can tackle the issue by creating an array that contains random numbers, which are computed based on the mean age value in regard to the standard deviation and is_null.



In [None]:
# finding average survived age
age_sur_mean = train[train.Survived==1]['Age'].mean()

age_sur_mean


### The avarage age of survived is 28.

In [None]:
# Finding the mean age of "Not Survived" people
age_nsur_mean = train[train.Survived==0]['Age'].mean()

age_nsur_mean

In [None]:
# Solving the null values in Age column

data = [train, test]
for data_point in data:
    mean = train['Age'].mean()
    std = test["Age"].std()
    is_null = data_point['Age'].isnull().sum()

    # compute the random range of age where mean-std, mean+std and is_null -> (start, high, size)
    random_age = np.random.randint(mean - std, mean + std, size=is_null)

    # Fill the random_age into NaN in Age columns
    age_copy = data_point['Age'].copy()
    age_copy[np.isnan(age_copy)] = random_age
    data_point['Age'] = age_copy
    data_point['Age'] = train['Age'].astype(int)

train['Age'].isnull().sum()

In [None]:
train.head()

In [None]:
"""Since there are ONLY 2 rows whose Embarked data is not known,
therefore we can neglect those 2 rows by dropping them as they will not make much of a difference"""

# Removing the 2 rows having null value for Embarked column
train.dropna(inplace=True)

In [None]:
train.info()

### Grouping the Age data

#### The age groups need to be converted into different sub-groups so that better prediction model can be formed


In [None]:
print(train['Age'].min())
print(train['Age'].max())
print(train['Age'].unique())

In [None]:
data=[train,test]

for data_point in data:
    data_point.loc[ data_point['Age'] <= 10, 'Age'] = 0
    data_point.loc[(data_point['Age'] > 10) & (data_point['Age'] <= 20), 'Age'] = 1
    data_point.loc[(data_point['Age'] > 20) & (data_point['Age'] <= 30), 'Age'] = 2
    data_point.loc[(data_point['Age'] > 30) & (data_point['Age'] <= 40), 'Age'] = 3
    data_point.loc[(data_point['Age'] > 40) & (data_point['Age'] <= 50), 'Age'] = 4
    data_point.loc[(data_point['Age'] > 50) & (data_point['Age'] <= 60), 'Age'] = 5
    data_point.loc[(data_point['Age'] > 60) & (data_point['Age'] <= 70), 'Age'] = 6
    data_point.loc[ data_point['Age'] > 70 , 'Age'] = 7

train['Age'].value_counts()


In [None]:
train.head()

In [None]:
data = [train, test]

for dataset in data:
    dataset['Embarked'] = dataset['Embarked'].map({'C': 0, 'S': 1, 'Q': 2}).astype(int)

## Probability of Survived to other features

Like Pclass, Gender, SibSp, Parch etc.


In [None]:
train[['Embarked', "Survived"]].groupby(['Embarked']).mean().sort_values(by='Survived', ascending=True)

In [None]:
train[['Age', 'Survived']].groupby(['Age']).mean().sort_values(by='Survived', ascending=True)

In [None]:
train[['Pclass', 'Survived']].groupby(['Pclass']).mean().sort_values(by='Survived', ascending=True)

In [None]:

train[['Sex', 'Survived']].groupby(['Sex']).mean().sort_values(by='Survived', ascending=True)

In [None]:
train[['Parch', 'Survived']].groupby(['Parch']).mean().sort_values(by='Survived', ascending=True)

In [None]:
train[['SibSp', 'Survived']].groupby(['SibSp']).mean().sort_values(by='Survived', ascending=True)

In [None]:
"""
Observation is that 'Parent/children' and 'Sibling/Spouse' can be combine to make 'Relative' in which column family_members will be created.
"""

# Combine the columns 'Parch' and 'SibSp' as 'Family_Member'

train['Family_Member'] = train['Parch'] + train['SibSp'] + 1
test['Family_Member'] = test['Parch'] + test['SibSp'] + 1

del train['Parch']
del train['SibSp']
del test['Parch']
del test['SibSp']


In [None]:
plt.figure(figsize=(12, 10))
sns.heatmap(train.corr(), annot=True)
plt.show()

In [None]:
plt.figure(figsize=(12, 10))
sns.heatmap(test.corr(), annot=True)
plt.show()

## Observation:

The correlation matrix revealed that the features in our dataset are not correlated which mean they are fit for the task.

In [None]:
train

In [None]:
test

## Building Models for Prediction

#### Now we will train multiple Machine Learning algorithms over the training data to predict the survival on our testing data and analyze the results thus obtained.

#### We might also use use cross-validation in the end.

####  We know that since the survival is represented as either 0 or 1, therefore it is a Classification problem. The algorithms used for the same are:
##### Logistic Regression
##### Support Vector Machines
##### KNN or K-Nearest Neighbors
##### Decision Trees
##### Random Forest
##### Stochastic Gradient descent (SGD)
##### Gaussian Naive Bayes



In [None]:
x_train = train.drop(['Survived'], axis = 1)
y_train = train['Survived']

x_test = test.drop('PassengerId', axis=1).copy()


## Logistic Regression


In [None]:
turned_param = [{'C': [10**-4, 10**-2, 10**0, 10**2, 10**4],
                 'penalty' : ['l1', 'l2', 'none', 'elasticnet'],
                'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}]
#Using GridSearch
log_reg = GridSearchCV(LogisticRegression(), turned_param, scoring = 'accuracy')
log_reg.fit(x_train, y_train)

print(log_reg.best_estimator_)
#print(model.score(x_test))

pred = log_reg.predict(x_test)

log_reg_acc = round(log_reg.score(x_train,y_train)*100, 2)
print('Accuracy: ', log_reg_acc,'%')

## Cross-Validation
#### Cross-Validation protects against overfitting.
#### It is a resampling method which tells us how well our model would generalize to unseen data. This is achieved by fixing a number of partitions of the dataset called folds, predicting each fold separately, and averaging the predictions in the end.



In [None]:
# Set our robust cross-validation scheme!
kf = KFold(n_splits = 5, random_state = 2)

# Print CV accuracy estimate:
#print(cross_val_score(logisticRegression, X_test, y_test, cv = kf).mean())
scores= cross_val_score(log_reg, x_train, y_train, cv = kf, scoring='f1')

mean_acc_log = scores.mean()*100

print('Scores: ', scores*100, '%')
print('Mean: ', mean_acc_log, '%')
print('Standard Deviation: ', scores.std()*100, '%\n')
 
pred= cross_val_predict(log_reg, x_train, y_train, cv=kf)
print('Confusion Matrix: \n' ,confusion_matrix(y_train, pred),'\n')

print("Precision: ", round(precision_score(y_train, pred)*100, 2),'%')
print("Recall: ", round(recall_score(y_train, pred)*100, 2), '%')
print('F1 Score: ', round(f1_score(y_train, pred)*100, 2), '%')


## KNN

In [None]:
# Creating odd list of K values
my_K_list = list(range(0, 30))
neighbors = list(filter(lambda x: x%2 != 0, my_K_list))

# empty list that will hold cv scores
cv_scores = []

# Perform K-fold cross validation
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, x_train, y_train, cv=3, scoring='f1')
    cv_scores.append(scores.mean())

# changing to misclassification error
MSE = [1 - x for x in cv_scores]

# determining best k
optimal_k =neighbors[MSE.index(min(MSE))]
print('\nThe optimal number of neighbors is %d.' % optimal_k)

# Plotting misclassification error vs optimal_k
plt.plot(neighbors, MSE)

for xy in zip(neighbors, np.round(MSE, 3)):
    plt.annotate('(%s, %s)' % xy, xy=xy, textcoords='data')

plt.xlabel('Number of Neighbors k')
plt.ylabel('Misclassification Error')
plt.show()

print('The misclassification error for each k value is :', np.round(MSE, 3))

In [None]:
# ============================== KNN with k = optimal_k ===============================================
# instantiate learning model k = optimal_k
knn = KNeighborsClassifier(n_neighbors=optimal_k)

# fitting the model with
knn.fit(x_train, y_train)

# predict the response
pred = knn.predict(x_test)

pred

In [None]:
knn_acc = round(knn.score(x_train, y_train)*100, 2)

print("Accuracy:", knn_acc, '%')

## Cross-Validation

In [None]:

# Print CV accuracy estimate:
scores = cross_val_score(KNeighborsClassifier(), x_train, y_train, cv=kf, scoring='accuracy')

mean_acc_knn = scores.mean()*100

print('Scores: ', scores*100, '%')
print('Mean: ', mean_acc_knn, '%')
print('Standard Deviation: ', scores.std()*100, '%\n')

knn_pred = cross_val_predict(knn, x_train, y_train, cv=kf)

cm = confusion_matrix(y_train, knn_pred)
ax = sns.heatmap(cm, square=True, annot=True, cbar=False, fmt='d')
#ax.xaxis.set_ticklabels(fon )
ax.set_xlabel('Actual Label', fontsize=13)
ax.set_ylabel('Predicted Label', fontsize=13)

print("Precision: ", round(precision_score(y_train, knn_pred)*100, 2),'%')
print("Recall: ", round(recall_score(y_train, knn_pred)*100, 2), '%')
print('F1 Score: ', round(f1_score(y_train, knn_pred)*100, 2), '%')



## Decision Tree



In [None]:
tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)

tree_pred = tree.predict(x_test)
tree_pred

In [None]:
#Checking accuracy of Decision Tree model

tree_acc = round(tree.score(x_train, y_train)*100, 3)
print('Accuracy:',tree_acc,'%')

In [None]:
# CV_Tree
scores = cross_val_score(DecisionTreeClassifier(), x_train, y_train, cv=kf, scoring='accuracy')

mean_acc_tree = scores.mean() * 100

print('Scores: ', scores*100, '%')
print('Mean: ', mean_acc_tree, '%')
print('Standard Deviation: ', scores.std()*100, '%\n')

pred_tree = cross_val_predict(tree, x_train, y_train, cv=kf)


print("Precision: ", round(precision_score(y_train, pred_tree)*100, 2),'%')
print("Recall:  ", round(recall_score(y_train, pred_tree)*100, 2), '%')
print('F1 Score: ', round(f1_score(y_train, pred_tree)*100, 2), '%')


cm = confusion_matrix(y_train, pred_tree)
ax = sns.heatmap(cm, square=True, annot=True, cbar=False, fmt='d')
#ax.xaxis.set_ticklabels()
ax.set_xlabel('Actual Label', fontsize=13)
ax.set_ylabel('Predicted Label', fontsize=13)

## Random Forest

In [None]:
rf = RandomForestClassifier(n_estimators=30)

rf.fit(x_train, y_train)

pred_rf = rf.predict(x_test)

print(pred_rf)

In [None]:
# Checking accuracy of the Random Forest Classifier

#rf.score(X_train, y_train)
acc_rf = round(rf.score(x_train, y_train) * 100, 3)
print("Accuracy", acc_rf,'%')

In [None]:
# CV_Tree
scores = cross_val_score(RandomForestClassifier(), x_train, y_train, cv=kf, scoring='accuracy')

mean_acc_rf = scores.mean() * 100

print('Scores: ', scores*100, '%')
print('Mean: ', mean_acc_rf, '%')
print('Standard Deviation: ', scores.std()*100, '%\n')

pred_rf = cross_val_predict(rf, x_train, y_train, cv=kf)


print("Precision: ", round(precision_score(y_train, pred_tree)*100, 2),'%')
print("Recall:  ", round(recall_score(y_train, pred_tree)*100, 2), '%')
print('F1 Score: ', round(f1_score(y_train, pred_tree)*100, 2), '%')


cm = confusion_matrix(y_train, pred_rf)
ax = sns.heatmap(cm, square=True, annot=True, cbar=False, fmt='d')
#ax.xaxis.set_ticklabels()
ax.set_xlabel('Actual Label', fontsize=13)
ax.set_ylabel('Predicted Label', fontsize=13)


## Support Vector Machine (Support Vector Classification)

In [None]:
svc = SVC()

svc.fit(x_train, y_train)
svc_pred = svc.predict(x_test)

svc_acc = round(svc.score(x_train, y_train)*100, 2)

print('Accuracy: ',svc_acc , '%')

In [None]:
# CV_Tree
scores = cross_val_score(SVC(), x_train, y_train, cv=kf, scoring='accuracy')

mean_acc_svc = scores.mean() * 100

print('Scores: ', scores*100, '%')
print('Mean: ', mean_acc_svc, '%')
print('Standard Deviation: ', scores.std()*100, '%\n')

pred_svc  = cross_val_predict(svc, x_train, y_train, cv=kf)


print("Precision: ", round(precision_score(y_train, pred_svc)*100, 2),'%')
print("Recall:  ", round(recall_score(y_train, pred_svc)*100, 2), '%')
print('F1 Score: ', round(f1_score(y_train, pred_svc)*100, 2), '%')


cm = confusion_matrix(y_train, pred_svc)
plt.figure(figsize=(12, 8))
ax = sns.heatmap(cm, square=True, annot=True, cbar=False, fmt='d')
ax.xaxis.set_ticklabels(["False","True"], fontsize = 12)
ax.yaxis.set_ticklabels(["False","True"], fontsize = 12, rotation=0)
ax.set_xlabel('Actual Label', fontsize=13)
ax.set_ylabel('Predicted Label', fontsize=13)
plt.show()

print('Confusion Matrix: \n' ,confusion_matrix(y_train, pred_svc),'\n')

## Naive Bayes

In [None]:
gaussian = GaussianNB() 
gaussian.fit(x_train, y_train)  

y_pred = gaussian.predict(x_test)
print(y_pred)


In [None]:
#Checking accuracy for the Gaussian Naive Bayes model

acc_gaussian = round(gaussian.score(x_train, y_train) * 100, 2)
print(acc_gaussian,'%')

In [None]:
# Print CV accuracy estimate:
#print(cross_val_score(GaussianNB() , X_test, y_test, cv = kf).mean())

scores= cross_val_score(GaussianNB() , x_train, y_train, cv = kf, scoring='f1')

mean_acc_gau = scores.mean()*100

print('Scores: ', scores*100, '%')
print('Mean: ', mean_acc_gau, '%')
print('Standard Deviation: ', scores.std()*100, '%\n')

#Confusion Matrix
pred= cross_val_predict(gaussian, x_train, y_train, cv=kf)

print("Precision: ", round(precision_score(y_train, pred)*100, 2),'%')
print("Recall: ", round(recall_score(y_train, pred)*100, 2), '%')
print('F1 Score: ', round(f1_score(y_train, pred)*100, 2), '%')


cm = confusion_matrix(y_train, pred)
plt.figure(figsize=(12, 8))
ax = sns.heatmap(cm, square=True, annot=True, cbar=False, fmt='d')
ax.xaxis.set_ticklabels(["False","True"], fontsize = 12)
ax.yaxis.set_ticklabels(["False","True"], fontsize = 12, rotation=0)
ax.set_xlabel('Actual Label', fontsize=13)
ax.set_ylabel('Predicted Label', fontsize=13)
plt.show()

## Using Bagging Classifier

In [None]:

bag_clf = BaggingClassifier(base_estimator=tree, n_estimators=2000,
                            bootstrap=True, n_jobs=-1,
                            random_state=42)

In [None]:
bag_clf.fit(x_train, y_train)

In [None]:
bagg_clf_pred = bag_clf.predict(x_train)

bagg_acc = round(bag_clf.score(x_train, y_train)*100, 2)
print('Accuracy: ',bagg_acc , '%')


In [None]:
# CV_Tree
#scores = cross_val_score(BaggingClassifier(), x_train, y_train, cv=kf, scoring='accuracy')

bag_pred = cross_val_predict(BaggingClassifier(), x_train, y_train, cv=kf)

mean_acc_bag = scores.mean()*100

print('Scores: ', scores*100, '%')
print('Mean: ', mean_acc_bag, '%')
print('Standard Deviation: ', scores.std()*100, '%\n')

print("Precision: ", round(precision_score(y_train, bag_pred)*100, 2),'%')
print("Recall:  ", round(recall_score(y_train, bag_pred)*100, 2), '%')
print('F1 Score: ', round(f1_score(y_train, bag_pred)*100, 2), '%')

#Confusion Matrix
pred= cross_val_predict(bag_clf, x_train, y_train, cv=kf)
cm = confusion_matrix(y_train, pred)
plt.figure(figsize=(12, 8))
ax = sns.heatmap(cm, square=True, annot=True, cbar=False, fmt='d')
ax.xaxis.set_ticklabels(["False","True"], fontsize = 12)
ax.yaxis.set_ticklabels(["False","True"], fontsize = 12, rotation=0)
ax.set_xlabel('Actual Label', fontsize=13)
ax.set_ylabel('Predicted Label', fontsize=13)
plt.show()

In [None]:
ada_clf = AdaBoostClassifier(learning_rate =0.02, n_estimators =5000)#%%

ada_clf.fit(x_train, y_train)

In [None]:
pred_ada = ada_clf.predict(x_train)

ada_acc = round(ada_clf.score(x_train, y_train)*100, 2)
print('Accuracy: ', ada_acc, '%')

In [None]:
# Print CV accuracy estimate:
#print(cross_val_score(GaussianNB() , X_test, y_test, cv = kf).mean())

scores= cross_val_score(AdaBoostClassifier() , x_train, y_train, cv = kf, scoring='f1')

mean_acc_ada = scores.mean()*100

print('Scores: ', scores*100, '%')
print('Mean: ', mean_acc_ada, '%')
print('Standard Deviation: ', scores.std()*100, '%\n')

print("Precision: ", round(precision_score(y_train, pred)*100, 2),'%')
print("Recall: ", round(recall_score(y_train, pred)*100, 2), '%')
print('F1 Score: ', round(f1_score(y_train, pred)*100, 2), '%')

#Confusion Matrix
pred= cross_val_predict(ada_clf, x_train, y_train, cv=kf)
cm = confusion_matrix(y_train, pred)
plt.figure(figsize=(12, 8))
ax = sns.heatmap(cm, square=True, annot=True, cbar=False, fmt='d')
ax.xaxis.set_ticklabels(["False","True"], fontsize = 12)
ax.yaxis.set_ticklabels(["False","True"], fontsize = 12, rotation=0)
ax.set_xlabel('Actual Label', fontsize=13)
ax.set_ylabel('Predicted Label', fontsize=13)
plt.show()

## Finding the Best Model



In [283]:
results = pd.DataFrame({
    'Model': ['Logistic Regression', 'Support Vector Machines', 'K-Nearest Neighbors', 'Decision Tree', 'Random forest', 'Gaussian Naive Bayes', 'Bagging Classifier', 'AdaBoost'],
    'Accuracy Score': [log_reg_acc, svc_acc, knn_acc, tree_acc, acc_rf, acc_gaussian, bagg_acc, ada_acc],
    'Mean Score': [mean_acc_log, mean_acc_svc, mean_acc_knn, mean_acc_tree, mean_acc_rf, mean_acc_gau, mean_acc_bag, mean_acc_ada]
})
df_result = results.sort_values(by='Mean Score', ascending=False)
df_result = df_result.set_index('Model')

df_result

Unnamed: 0_level_0,Accuracy Score,Mean Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
Support Vector Machines,83.01,82.457945
Random forest,86.389,79.869866
Decision Tree,86.389,78.97099
K-Nearest Neighbors,83.35,76.71999
AdaBoost,81.1,73.886835
Gaussian Naive Bayes,79.98,72.049361
Bagging Classifier,86.39,72.049361
Logistic Regression,79.53,70.395898
