# Dataset Information
   
   This is a standard supervised classification task: we have to predict whether a passenger survived the Titanic or not. Below are the dataset attributes with their descriptions.
   
Variable | Description
----------|--------------
Passenger ID | Identification Number of Passenger
Sex | Male/ Female
Pclass | Passenger class (1, 2, 3)
Name | Name of the passenger
Age | Age of the passenger
SibSp | Number of sibling or spouse on the ship
Parch | Number of the children or parent on the ship
Ticket | Ticket Number
Fare | Price of the ticket
Cabin | Cabin number of the passenger
Embarked | Port of embarkation
Survived | Target variable (0 for perished, 1 for survived)

### Import module

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
df = pd.read_csv('train.csv')

In [None]:
df.head(10)

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
print('The training dataset has ', df.shape )

# Preprocessing the dataset

### Finding Null Values 

In [None]:
# find the null values
df.isnull().sum()

In [None]:
# Age, Cabin and Embarked contain missing values.
# The global-mean fill for Age below is disabled; Age is imputed later in the
# notebook from the mean age of each Title group instead.
#df['Age'] = df['Age'].fillna(round(df['Age'].mean(),0))

## Dropping Columns 

#### Dropping Column Cabin

In [None]:
# dropping Cabin Column
df = df.drop(['Cabin'], axis = 1)

#### Dropping Columns (PassengerID and Ticket)

In [None]:
# Remove the columns treated as unnecessary (PassengerId, Ticket) in one call.
df = df.drop(columns=['PassengerId', 'Ticket'])

## Handling Null Values 

In [None]:
df.isnull().sum()

#### Handling Null Values in Embarked Column

In [None]:
#filling the Embarked with mode values (categorical)
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [None]:
df.isnull().sum()

## Additional Columns 

#### Addition of Family Size Column

In [None]:
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

#### Addition of Title Column

In [None]:
df['Name']

In [None]:
def get_title(x):
    """Return the honorific found between the first comma and the following period.

    For a name such as 'Braund, Mr. Owen Harris' this yields 'Mr'.
    Raises IndexError when the string contains no comma.
    """
    after_surname = x.split(',')[1]
    honorific = after_surname.split('.')[0]
    return honorific.strip()

In [None]:
df['Title'] = df['Name'].apply(get_title)

In [None]:
df['Title'].unique()

## Handling null values in Age Column

In [None]:
age_mean = df.groupby('Title')['Age'].mean()
age_mean

In [None]:
def fill_age(x):
    """Return the mean age recorded in `age_mean` for the row's Title.

    Parameters
    ----------
    x : mapping or pandas row exposing a 'Title' entry.

    Returns
    -------
    The value stored in the notebook-level `age_mean` Series under that
    title, or None when the title is not one of its index labels.
    """
    title = x['Title']
    # Direct label lookup instead of scanning every (label, value) pair.
    if title in age_mean.index:
        return age_mean.loc[title]
    return None

In [None]:
# Fill missing ages with the mean age of the passenger's Title group, then keep
# one decimal place. Mapping Title through age_mean is vectorised, so no Python
# function is called once per row.
df['Age'] = df['Age'].fillna(df['Title'].map(age_mean))
df['Age'] = df['Age'].round(1)

In [None]:
df.head(10)


### Drop Name Column

In [None]:
df = df.drop(['Name'], axis = 1)

In [None]:
df.head(10)

In [None]:
# IsAlone flag: 1 when the passenger's FamilySize is exactly 1, otherwise 0.
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

In [None]:
# Mean of survival by IsAlone

df[['IsAlone', 'Survived']].groupby('IsAlone', as_index = False).mean().sort_values(by = 'Survived', ascending = False)

In [None]:
df.head(10)

## Pclass and Sex

In [None]:
#df['PoorMen'] = (df['Sex'] == "male") + (df['Age'] > 16) and (df['Pclass'] == 3)
#third_class_adult_men = [(df['Sex'] == "male") and (df['Age'] > 16) and (df['Pclass'] == 3)]
#df['third_class_adult_men'] = third_class_adult_men
#df.head()
#poor_female = (df.Sex == "female") & (df.Pclass == 3) & (df.Age > 16)
#df["poor_female"] = poor_female
#rich_female = (df.Sex == "female") & (df.Pclass == 1) & (df.Age > 16)
#df["rich_female"] = rich_female

#poor_male = (df.Sex == "male") & (df.Pclass == 3) & (df.Age > 16)
#df["poor_male"] = poor_male
#rich_male = (df.Sex == "male") & (df.Pclass == 1) & (df.Age > 16)
#df["rich_male"] = rich_male

In [None]:
df.head(10)

# Exploratory Data Analysis

### Counting the number of males and females on board the Titanic

In [None]:
df['Sex'].value_counts(dropna = False)


### Graph plotting the number of males and females on board the Titanic

In [None]:
df['Sex'].value_counts().plot(kind='bar', color = ['r', 'b'])

### Counting the number of passengers within each class on board the Titanic

In [None]:
df['Pclass'].value_counts(dropna = False)

### Graph plotting the count of the number of passengers by Pclass on board the Titanic

In [None]:
df['Pclass'].value_counts().plot(kind = 'bar', color = ['b', 'r', 'g'])

### Graph showing how many passengers by sex were in each Pclass

In [None]:
sns.countplot(x = df['Pclass'], hue=df["Sex"])

### Pie chart plotting count of passengers from point of Embarkation

In [None]:
df['Embarked'].value_counts().plot(kind = 'pie', autopct='%1.2f%%')
plt.legend()


### Graph showing how many passengers by sex embarked from each location  

In [None]:
sns.countplot(x = df['Embarked'], hue=df["Sex"])

### Counting the distribution of SibSp on board the Titanic

In [None]:
df['SibSp'].value_counts(dropna = False)

### Graph showing distribution of SibSp on board the Titanic

In [None]:
df['SibSp'].value_counts().plot(kind = 'bar', color = "purple")
plt.ylabel("count")
plt.xlabel("SibSp")
plt.title("Distribution of SibSp on board the Titanic")

### Counting the distribution of Parch on board the Titanic

In [None]:
df['Parch'].value_counts(dropna = False)

### Graph showing distribution of Parch on board the Titanic

In [None]:
df['Parch'].value_counts().plot(kind = 'bar', color = "red")
plt.ylabel("count")
plt.xlabel("Parch")
plt.title("Distribution of Parch on board the Titanic")

### Counting the distribution of family size on board the Titanic

In [None]:
df['FamilySize'].value_counts(dropna = False)

### Graph plotting the distribution of family size  on board the Titanic

In [None]:
df['FamilySize'].value_counts().plot(kind = 'bar')
plt.ylabel("count")
plt.xlabel("FamilySize")
plt.title("Distribution of Family Size on board the Titanic")


### Age Distribution on the Titanic 

In [None]:
sns.distplot(df['Age'])

### Graph plot showing the Age Distribution by survival on the Titanic 

In [None]:
# Overlay the age density of passengers who died and of those who survived.
sns.kdeplot(df.loc[df['Survived'] == 0, 'Age'], label='Died')
sns.kdeplot(df.loc[df['Survived'] == 1, 'Age'], label='Survived')
plt.xlabel('Age')
plt.title('Age Distribution by Survival')
# Draw the legend explicitly so the two label= entries are shown.
plt.legend()

### Fare Distribution on the Titanic

In [None]:
sns.distplot(df['Fare'])

# Subplots for Distribution of Features 

In [None]:
# One 3x3 grid holding the eight feature-distribution plots. The grid is created
# with its final shape and every plot is drawn on its own Axes, so no axes from
# a different layout are left underneath the plots.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(20, 25))
(ax_sex, ax_pclass, ax_embarked,
 ax_pclass_sex, ax_embarked_sex, ax_sibsp,
 ax_parch, ax_family, ax_unused) = axes.flat

df['Sex'].value_counts().plot(kind='bar', color=['r', 'b'], ax=ax_sex)
ax_sex.set_title("The distribution of males and females on board the Titanic")
ax_sex.set_ylabel("count")
ax_sex.set_xlabel("Sex")

df['Pclass'].value_counts().plot(kind='bar', color=['b', 'r', 'g'], ax=ax_pclass)
ax_pclass.set_title("The distribution of Pclass on board the Titanic")
ax_pclass.set_ylabel("count")
ax_pclass.set_xlabel("Pclass")

df['Embarked'].value_counts().plot(kind='pie', ax=ax_embarked)
ax_embarked.legend()
ax_embarked.set_title("The distribution of passengers from point of Embarkation")

sns.countplot(x=df['Pclass'], hue=df["Sex"], ax=ax_pclass_sex)
ax_pclass_sex.set_title("The distribution of passengers by sex were in each Pclass")

sns.countplot(x=df['Embarked'], hue=df["Sex"], ax=ax_embarked_sex)
ax_embarked_sex.set_title("The distribution of passengers by sex from point of embarkation")

df['SibSp'].value_counts().plot(kind='bar', color="purple", ax=ax_sibsp)
ax_sibsp.set_ylabel("count")
ax_sibsp.set_xlabel("SibSp")
ax_sibsp.set_title("Distribution of SibSp on board the Titanic")

df['Parch'].value_counts().plot(kind='bar', color="red", ax=ax_parch)
ax_parch.set_ylabel("count")
ax_parch.set_xlabel("Parch")
ax_parch.set_title("Distribution of Parch on board the Titanic")

df['FamilySize'].value_counts().plot(kind='bar', ax=ax_family)
ax_family.set_ylabel("count")
ax_family.set_xlabel("FamilySize")
ax_family.set_title("Distribution of Family Size on board the Titanic")

# Only eight plots are drawn, so hide the ninth cell of the grid.
ax_unused.set_visible(False)

plt.show()

# Graphs showing how the features correlate with survival

### Mean of Survival by Sex 

Female passengers have a higher mean survival rate than male passengers; the reason for this is that female passengers were prioritized during the Titanic evacuation.

In [None]:
df[['Sex', 'Survived']].groupby('Sex', as_index = False).mean().sort_values(by = 'Survived', ascending = False)

### Barplot showing the survival rate by Sex

In [None]:
sns.barplot(x = 'Sex', y = 'Survived', data = df)
plt.ylabel('Survival Rate')
plt.xlabel('Sex')
plt.title('Survival Rate by sex')
plt.show()

### Mean of Survival by Pclass

In [None]:
df[['Pclass', 'Survived']].groupby('Pclass', as_index = False).mean().sort_values(by = 'Survived', ascending = False)

### Barplot showing the survival rate by Pclass

In [None]:
sns.barplot(x = 'Pclass', y = 'Survived', data = df)
plt.ylabel('Survival Rate')
plt.xlabel('Pclass')
plt.title('Survival Rate by Pclass')

plt.show()

### Catplot showing the survival rate of passengers by sex for each Pclass

In [None]:
sns.catplot(x="Sex", y="Survived", hue="Pclass", kind="bar", data=df)
plt.title("Survival Rate by Sex and Pclass")

### Mean of Survival by point of Embarkation

In [None]:
df[['Embarked', 'Survived']].groupby('Embarked', as_index = False).mean().sort_values(by = 'Survived', ascending = False)

### Barplot showing the survival rate by point of Embarkation

In [None]:
sns.barplot(x = 'Embarked', y = 'Survived', data = df)
plt.ylabel('Survival Rate')
plt.xlabel('Embarked')
plt.title('Survival Rate by point of Embarkation')

plt.show()

## need to check but shows that majority of first class passengers embarked from location C

### Factorplot of count of people by Pclass that Embarked from each location
Proof of the hypothesis that the majority of 1st class passengers embarked from C, compared to S, where the majority of 3rd class passengers embarked and which therefore had a lower survival rate.

In [None]:
# Count of passengers per Pclass, with one panel per port of embarkation.
# catplot is the current name of the old factorplot function, and the plotted
# column is passed by keyword rather than positionally.
sns.catplot(x='Pclass', col='Embarked', data=df, kind='count')

 ### Mean of Survival by SibSp

In [None]:
df[["SibSp", "Survived"]].groupby(['SibSp'], as_index=False).mean().sort_values(by='Survived', ascending=False)

### Barplot showing the Survival Rate by SibSp

In [None]:
sns.barplot(x = 'SibSp', y = 'Survived', data = df)
plt.ylabel('Survival Rate')
plt.xlabel('SibSp')
plt.title('Survival Rate by SibSp')

plt.show()


 ### Mean of Survival by Parch

In [None]:
df[["Parch", "Survived"]].groupby(['Parch'], as_index=False).mean().sort_values(by='Survived', ascending=False)

### Barplot showing the Survival Rate by Parch

In [None]:
sns.barplot(x = 'Parch', y = 'Survived', data = df)
plt.ylabel('Survival Rate')
plt.xlabel('Parch')
plt.title('Survival Rate by Parch')

plt.show()

### Mean of Survival by Family Size

In [None]:
df[["FamilySize", "Survived"]].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)

### Barplot showing the Survival Rate by Family Size

In [None]:
sns.barplot(x = 'FamilySize', y = 'Survived', data = df)
plt.ylabel('Survival Rate')
plt.xlabel('FamilySize')
plt.title('Survival Rate by FamilySize')

plt.show()

# Subplots of Survival Rate by Feature

The "survival rate by Sex and Pclass" graph is not included here because it could not be fitted into a subplot. The factorplot of the count of people by Pclass that embarked from each location is not included for the same reason. Both of these graphs will go into the report.

In [None]:
# 2x3 grid of survival-rate bar plots, one panel per feature. The grid is
# created with its final shape and each barplot is drawn on its own Axes.
# The Sex-by-Pclass catplot is left out: catplot builds its own figure and
# cannot be drawn into one cell of this grid.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(19, 14))

panels = [
    ('Sex', 'Survival Rate by sex'),
    ('Pclass', 'Survival Rate by Pclass'),
    ('Embarked', 'Survival Rate by point of Embarkation'),
    ('SibSp', 'Survival Rate by SibSp'),
    ('Parch', 'Survival Rate by Parch'),
    ('FamilySize', 'Survival Rate by FamilySize'),
]

for ax, (feature, title) in zip(axes.flat, panels):
    sns.barplot(x=feature, y='Survived', data=df, ax=ax)
    # Labels are set after plotting because barplot writes its own axis labels.
    ax.set_ylabel('Survival Rate')
    ax.set_xlabel(feature)
    ax.set_title(title)

plt.show()

# Changing Categorical Data to Numerical

### Changing Sex and Embarked to numerical values 

In [None]:
# Changing from Categorical (Sex, Embarked) to numerical values for data analytics
from sklearn.preprocessing import LabelEncoder

In [None]:
# Replace the text values of Sex and Embarked with integer codes.
cols = ['Sex', 'Embarked']
le = LabelEncoder()
for col in cols:
    # fit_transform refits the encoder on every column, so after the loop `le`
    # only holds the classes of the last column processed.
    df[col] = le.fit_transform(df[col])

In [None]:
df.head(10)

### Replacing Titles

In [None]:
# Consolidate titles in a single replace call: rare honorifics become 'Other',
# 'Mlle' and 'Ms' become 'Miss', and 'Mme' becomes 'Mrs'.
rare_titles = ['Dr', 'Rev', 'Col', 'Major', 'Lady', 'Jonkheer', 'Don', 'Capt', 'the Countess', 'Sir', 'Dona']
title_aliases = {rare: 'Other' for rare in rare_titles}
title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
df['Title'] = df['Title'].replace(title_aliases)

### Changing Titles to numerical values 

In [None]:
# Integer code for each consolidated title. The codes are 0-3 and 5; the value 4
# is not used.
# (Disabled earlier mapping, with one code per original title:)
#change = {'Mr': 0, 'Mrs': 1, 'Miss': 2 , 'Master': 3, 'Don': 4, 'Rev': 5, 'Dr': 6,'Major': 7, 'Lady': 8, 'Sir': 9, 'Col': 10, 'Capt': 11, 'the Countess': 12,'Jonkheer': 13}
change = {'Mr': 0, 'Mrs': 1, 'Miss': 2 , 'Master':3,'Other': 5}
# The list holds only df, so the loop below runs once.
data = [df]
for dataset in data:
    # Series.map gives NaN for any title that has no key in `change`.
    dataset['Title'] = dataset['Title'].map(change)

In [None]:
df.head()

In [None]:
df.isnull().sum()

## Table showing Title with corresponding number 


Title | Number
----------|--------------
Mr | 0
Mrs (including Mme) | 1
Miss (including Mlle, Ms) | 2
Master | 3
Other (Dr, Rev, Col, Major, Lady, Jonkheer, Don, Capt, the Countess, Sir, Dona) | 5

# Normalization and Scaling All Data in DF

In [None]:
#df = df.drop(['Survived'], axis = 1)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column of df. fit_transform fits and transforms in one
# step, so no separate fit() call is needed beforehand.
scaler = MinMaxScaler()
# The scaled values are stored in df_scaled; df itself is not reassigned and
# stays unscaled.
# NOTE(review): df still contains the Survived column at this point, so the
# target is scaled together with the features.
df_scaled = scaler.fit_transform(df)

In [None]:
df.head(30)

In [None]:
# Analysing numerical data for normalised distribution
#plt.figure(figsize=(20,5))
sns.distplot(df['Age'])

In [None]:
# log1p (log(1 + age)) is used instead of log: any age below 1 would give a
# negative logarithm and an age of 0 would give -inf. Age_Log stays in df and so
# becomes a feature, and the chi2 scoring used further down only accepts
# non-negative feature values.
df['Age_Log'] = np.log1p(df['Age'])

In [None]:
sns.distplot(df['Age_Log'])

In [None]:
#df['Age'] = df['Age'].map(lambda x: np.log(x) if x > 0 else 0)

In [None]:
# Passenger age distribution after log transformation (disabled)

#sns.distplot(df['Age'], label = 'Skewness: %.2f'%(df['Age'].skew()))
#plt.legend(loc = 'best')
#plt.title('Passenger Age Distribution After Log Transformation')

In [None]:
sns.distplot(df['Fare'])

In [None]:
# Apply log transformation to Fare column to reduce skewness

df['Fare'] = df['Fare'].map(lambda x: np.log(x) if x > 0 else 0)

In [None]:
# Passenger fare distribution after log transformation

sns.distplot(df['Fare'], label = 'Skewness: %.2f'%(df['Fare'].skew()))
plt.legend(loc = 'best')
plt.title('Passenger Fare Distribution After Log Transformation')

In [None]:
#df['Fare_Log'] = np.log(df['Fare'])

In [None]:
#sns.distplot(df['Fare_Log'])

In [None]:
# normalising the Fare and Age data
# https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html 

#from sklearn.preprocessing import Normalizer
#Scalar = Normalizer()
#dffnew = np.column_stack((df['Fare'], df['Age']))
#dffnew = Scalar.fit_transform(dffnew)
#dff1 = pd.DataFrame(dffnew)
#dff1.columns = ['FareScaled', 'AgeScaled']
#dff['FareScaled'] = dff1['FareScaled']
#dff['AgeScaled'] = dff1['AgeScaled']

In [None]:
#sns.distplot(df['AgeScaled'])

In [None]:
#sns.distplot(df['FareScaled'])

# Feature Engineering 

### Correlation Matrix

In [None]:
corr=df.corr()

In [None]:
plt.figure(figsize=(15,10))
sns.heatmap(corr, annot = True, cmap="BuPu")

### Drop SibSp and Parch column due to high correlation to family size

In [None]:
# Remove SibSp and Parch together in one call.
df = df.drop(columns=['SibSp', 'Parch'])

In [None]:
df.head(10)

In [None]:
# Target column (y) and the remaining columns as input features (X).
y = df['Survived']
X = df.drop(columns=['Survived'])

In [None]:
# select k best features
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
# apply k-best algorithm
kbest = SelectKBest(score_func = chi2, k = 'all')
ordered_features = kbest.fit(X,y)

In [None]:
df_scores = pd.DataFrame(ordered_features.scores_, columns=['Score'])

In [None]:
df_columns = pd.DataFrame(X.columns, columns = ['Feature_name'])

In [None]:
feature_rank = pd.concat([df_scores,df_columns],axis=1)

In [None]:
feature_rank.nlargest(7,'Score')

### Feature Importance

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Fixed seed so the feature importances are the same on every run; 42 matches
# the random_state used for train_test_split in this notebook.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, y)

In [None]:
model.feature_importances_

In [None]:
ranked_features = pd.Series(model.feature_importances_, index = X.columns)

In [None]:
ranked_features.nlargest(7).plot(kind='bar')

### Information Gain

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Mutual information between each feature and the target. The estimator is
# randomised, so a fixed seed keeps the ranking stable between runs; 42 matches
# the random_state used for train_test_split in this notebook.
mu_ifo = mutual_info_classif(X, y, random_state=42)

In [None]:
mu_data = pd.Series(mu_ifo, index = X.columns)
mu_data.sort_values(ascending=False)

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y, test_size = 0.2, random_state=42)

### Count of items in test and train datasets

In [None]:
print("x_train shape is: ", x_train.shape)
print("x_test shape is: ", x_test.shape)


# Modelling and Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
def classify(model, x, y):
    """Fit `model` on a hold-out split and print two accuracy figures.

    Parameters
    ----------
    model : estimator exposing fit() and score(), usable with cross_val_score.
    x : input features.
    y : target labels.

    Prints the accuracy on a 20% hold-out split (random_state=42) and the mean
    accuracy of a 5-fold cross validation over all of `x` and `y`, both as
    percentages. Returns None.
    """
    # Split the x argument (not the notebook-level X) so the function honours
    # the data it was given.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    model.fit(x_train, y_train)
    print('Accuracy is: ', model.score(x_test, y_test)*100)
    score = cross_val_score(model, x, y, cv=5)
    print('Cross validation Accuaracy: ', np.mean(score)*100)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
classify(clf, X,y)

## Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
classify(clf, X,y)
#decision_tree.fit(X_train, Y_train)
#Y_pred = decision_tree.predict(X_test)
#acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
#acc_decision_tree

## Random Forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
classify(clf, X,y)


## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
classify(knn,X,y)

## SVC

In [None]:
from sklearn.svm import SVC
clf = SVC(kernel='linear', C = 1)
classify(clf, X,y)

## Confusion Matrix

### Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()

In [None]:
clf.fit(x_train,y_train)

In [None]:
y_pred = clf.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test,y_pred)

In [None]:
# fmt='d' prints the cell counts as plain integers; the default annotation
# format switches to scientific notation for counts of 100 or more.
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
df['Survived'].value_counts()

### Random Forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Fit a random forest on the training split and chart the confusion matrix of
# its predictions on the test split.
clf = RandomForestClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
# fmt='d' prints the counts as plain integers instead of scientific notation.
sns.heatmap(cm, annot=True, fmt='d')

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Fit a logistic regression on the training split and chart the confusion
# matrix of its predictions on the test split.
clf = LogisticRegression()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
# fmt='d' prints the counts as plain integers instead of scientific notation.
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# NOTE(review): Score_model is created here but never filled in this cell.
Score_model = {}
clfs = [RandomForestClassifier(), LogisticRegression(), DecisionTreeClassifier()]
models = ['RandomForest', 'LogisticRegression', 'DecisionTreeClassificer']

# Fit each classifier on the training split and print its binary precision,
# recall and F-score on the test split.
for model_name, clf in zip(models, clfs):
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    # The fourth returned value (support) is not used in the report.
    precision, recall, fscore, _support = precision_recall_fscore_support(y_test, y_pred, average='binary')
    print("\nModel: ", model_name,"[\nprecision:", precision,"],[recall:",recall, "],[fscore:", fscore,"] ")