#### Copyright 2020 Google LLC.

In [None]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Classification Project

In this project you will apply what you have learned about classification and TensorFlow to complete a project from Kaggle. The challenge is to achieve a high accuracy score when predicting which passengers survived the sinking of the Titanic. After building your model, you will upload your predictions to Kaggle and submit the score that you get.

## The Titanic Dataset

[Kaggle](https://www.kaggle.com) has a [dataset](https://www.kaggle.com/c/titanic/data) containing the passenger list on the Titanic. The data contains passenger features such as age, gender, ticket class, as well as whether or not they survived.

Your job is to create a binary classifier using TensorFlow to determine if a passenger survived or not. The `Survived` column lets you know if the person survived. Then, upload your predictions to Kaggle and submit your accuracy score at the end of this Colab, along with a brief conclusion.


To get the dataset, you'll need to accept the competition's rules by clicking the "I understand and accept" button on the [competition rules page](https://www.kaggle.com/c/titanic/rules). Then upload your `kaggle.json` file and run the code below.

In [None]:
# Install the uploaded Kaggle API token at ~/.kaggle/kaggle.json.
# `mkdir -p` succeeds whether or not the directory already exists, and the
# installed copy is restricted to owner read/write even if an older token
# with different permissions was already present.
! chmod 600 kaggle.json && mkdir -p ~/.kaggle && cp kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json && echo 'Done'
# Download the competition files into the working directory, then list them.
! kaggle competitions download -c titanic
! ls

**Note: If you see a "403 - Forbidden" error above, you still need to click "I understand and accept" on the [competition rules page](https://www.kaggle.com/c/titanic/rules).**

Three files are downloaded:

1. `train.csv`: training data (contains features and targets)
1. `test.csv`: feature data used to make predictions to send to Kaggle
1. `gender_submission.csv`: an example competition submission file

## Step 1: Exploratory Data Analysis

Perform exploratory data analysis and data preprocessing. Use as many text and code blocks as you need to explore the data. Note any findings. Repair any data issues you find.

**Student Solution**

In [None]:
# Getting Titanic Data and summary statistics

# Imports
import zipfile

import kaggle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Unpack the downloaded competition archive into the working directory
with zipfile.ZipFile('titanic.zip', mode='r') as archive:
  archive.extractall(path='./')

# Load the extracted CSVs into DataFrames
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Print every column's dtype followed by a separator line; the cell's last
# expression then displays the summary statistics of the training DataFrame
print(train_df.dtypes, end='\n-----------------------')
train_df.describe()

In [None]:
# Inspecting the Data

# Print the number of missing values per column, first for the training
# DataFrame and then (after a separator line) for the testing DataFrame
missing_reports = (
    ("Training Missing Values:", train_df),
    ("-----------------------\nTesting Missing Values:", test_df),
)
for heading, frame in missing_reports:
  print(heading)
  print(frame.isnull().sum())

We start by checking our Training DataFrame for missing values in each column. We find that there are 177 missing Age values, 687 missing Cabin values, and 2 missing Embarked values. Afterwards, we check our Testing DataFrame for missing values in each column. We find that there are 86 missing Age values, 1 missing Fare value, and 327 missing Cabin values.

## Data Preprocessing

The main goals of our data preprocessing are to clean our datasets of null values and to encode features so that our model can fit and predict using them. As you'll see in the code blocks below, we expand our 12 columns into 69 columns using dummy encoding.

In [None]:
# Combine both our datasets to simplify preprocessing.
# The training rows are placed first and the index is renumbered from 0, so the
# two sets can later be recovered by position (the first 891 rows are training).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
df = pd.concat([train_df, test_df], ignore_index=True)

# Titles found in Name are collapsed into a small set of groups.
# Any title missing from this mapping becomes NaN in the Title column.
title_groups = {
  "Capt": "Officer",
  "Col": "Officer",
  "Major": "Officer",
  "Jonkheer": "Royalty",
  "Don": "Royalty",
  # Feminine form of "Don"; NOTE(review): believed to occur in the Kaggle test
  # set, where it would otherwise be left unmapped - confirm against test.csv
  "Dona": "Royalty",
  "Sir" : "Royalty",
  "Dr": "Officer",
  "Rev": "Officer",
  "the Countess":"Royalty",
  "Mme": "Mrs",
  "Mlle": "Miss",
  "Ms": "Mrs",
  "Mr" : "Mr",
  "Mrs" : "Mrs",
  "Miss" : "Miss",
  "Master" : "Master",
  "Lady" : "Royalty"
}

# Extract the title: the text between the first comma and the following period
# of Name, with surrounding whitespace removed, then map it to its group
df['Title'] = (
    df['Name']
    .map(lambda name: name.split(',')[1].split('.')[0].strip())
    .map(title_groups)
)

In [None]:
# Group factors and their respective Age Medians

# Median Age of every (Sex, Pclass, Title) group, computed once from the
# training rows only (the first 891 rows of the combined DataFrame).
# 'Age' is selected before aggregating so that median() never touches the
# non-numeric columns, and the table is built here rather than per row.
train_age_medians = (
    df.iloc[:891]
    .groupby(['Sex', 'Pclass', 'Title'])['Age']
    .median()
    .to_dict()
)

# Used when a row's group is absent from the training rows or has no known ages
train_overall_age_median = df.iloc[:891]['Age'].median()

def get_age_median(row):
  """Return the training-set median Age for the row's Sex/Pclass/Title group.

  Args:
    row: a mapping-like object (such as a DataFrame row) providing the
      'Sex', 'Pclass' and 'Title' values of one passenger.

  Returns:
    The median Age of the matching training group. If there is no such group,
    or its median is NaN, the median Age of all training rows is returned.
  """
  group_key = (row['Sex'], row['Pclass'], row['Title'])
  group_median = train_age_medians.get(group_key, np.nan)
  if pd.isna(group_median):
    return train_overall_age_median
  return group_median

# Only rows whose Age is missing are replaced; known ages are kept unchanged
df['Age'] = df.apply(
    lambda row: get_age_median(row) if np.isnan(row['Age']) else row['Age'],
    axis=1,
)


In [None]:
# Dummy encode our Titles & drop Name Column

# One indicator column per title group, each named with the 'Title' prefix
titles_dummies = pd.get_dummies(df['Title'], prefix='Title')

# Replace the Name and Title columns with the indicator columns
df = pd.concat([df.drop(columns='Name'), titles_dummies], axis=1).drop(columns='Title')

In [None]:
# Replace empty Embarked values w/ the Mode (S) and Dummy encode the column

# The filled column is assigned back to df. Calling fillna(inplace=True) on
# df.Embarked acts on an intermediate object, which is not guaranteed to
# update df (and never does once pandas Copy-on-Write is enabled).
df['Embarked'] = df['Embarked'].fillna('S')

embarked_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked')
df = pd.concat([df, embarked_dummies], axis=1).drop(columns='Embarked')


In [None]:
# Set null Cabin to "X", set existing values to their 1st letter, and Dummy encode

# The result is assigned back to df. Calling fillna(inplace=True) on df.Cabin
# acts on an intermediate object, which is not guaranteed to update df (and
# never does once pandas Copy-on-Write is enabled).
df['Cabin'] = df['Cabin'].fillna('X').str[0]

cabin_dummies = pd.get_dummies(df['Cabin'], prefix='Cabin')
df = pd.concat([df, cabin_dummies], axis=1).drop(columns='Cabin')

In [None]:
# Dummy encode Pclass

# One indicator column per ticket class; the original Pclass column is then removed
pclass_dummies = pd.get_dummies(df['Pclass'], prefix="Pclass")
df = pd.concat([df, pclass_dummies], axis=1).drop(columns='Pclass')

In [None]:
# Extract the ticket prefix and Dummy encode the column 

def clean_ticket(ticket):
  """Return the first non-numeric token of a ticket string, or 'XXX' if none exists.

  Periods and slashes are removed before the string is split on whitespace,
  so 'A/5 21171' yields 'A5' while a purely numeric ticket yields 'XXX'.
  """
  without_punctuation = ticket.replace('.', '').replace('/', '')
  # split() without a separator yields tokens that carry no surrounding whitespace
  for token in without_punctuation.split():
    if not token.isdigit():
      return token
  return 'XXX'

# Reduce every ticket to its prefix (or 'XXX' when it has none)
df['Ticket'] = df['Ticket'].apply(clean_ticket)

# Replace the Ticket column with one indicator column per prefix
tickets_dummies = pd.get_dummies(df['Ticket'], prefix='Ticket')
df = pd.concat([df, tickets_dummies], axis=1).drop(columns='Ticket')

In [None]:
# Create a Family Size column and columns that correspond to groups of sizes

# Family size counts parents/children, siblings/spouses and the passenger
family_size = df['Parch'] + df['SibSp'] + 1
df['FamilySize'] = family_size

# 0/1 flags for the three size groups: exactly 1, 2 to 4 inclusive, 5 or more
df['Singleton'] = family_size.eq(1).astype('int64')
df['SmallFamily'] = family_size.between(2, 4).astype('int64')
df['LargeFamily'] = family_size.ge(5).astype('int64')

In [None]:
# Replace missing values with the average fare of the training rows.
# The filled column is assigned back to df: fillna(inplace=True) on df['Fare']
# acts on an intermediate object, which is not guaranteed to update df (and
# never does once pandas Copy-on-Write is enabled).
df['Fare'] = df['Fare'].fillna(df.iloc[:891]['Fare'].mean())

# Replace "male" with 1 and "female" with 0
df['Sex'] = df['Sex'].map({"male":1, "female":0})

# Split back into our two DataFrames (the first 891 rows are the training set).
# Copies are taken so the two frames are independent of the combined df.
train_df = df.iloc[:891].copy()
test_df = df.iloc[891:].copy()

## Exploratory Data Analysis (cont.)

In [None]:
# Define target and feature columns
target_column = 'Survived'
feature_columns = [name for name in test_df.columns
                   if name not in ('PassengerId', 'Survived')]
orig_columns = ['Survived','Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3']

def _show_correlation_heatmap(frame, columns, title):
  """Draw an annotated correlation heatmap of `columns` taken from `frame`."""
  plt.figure(figsize=(10, 10))
  sns.heatmap(frame[columns].corr(), vmax=0.6, annot=True, square=True, cmap="coolwarm")
  plt.title(title)
  plt.show()

# Visualize the correlation using heatmaps for both DataFrames
_show_correlation_heatmap(train_df, orig_columns, 'Training Data Column Correlations')
print("\n")

# 'Survived' is left out of the heatmap for the testing DataFrame
orig_columns.remove('Survived')
_show_correlation_heatmap(test_df, orig_columns, 'Testing Data Column Correlations')

The heatmaps above show the correlation between columns for our Training and Testing DataFrames. We can see some **similar patterns** in both sets, with relatively strong negative correlations between Lower Classes & Fare and between Lower Classes & Age. These are reasonable correlations, as people in lower classes (i.e. 3rd class) often cannot afford expensive tickets and younger people are more likely to be lower class. We can see the opposite is true for Higher Classes & the mentioned columns, as there are strong positive correlations.

When comparing Training & Testing correlations, it's important to note **differences**. A significant difference is that there seems to be a much higher correlation between Age and Fare for Testing than for Training.

In [None]:
# Create a Violin Plot to show the difference in survival based on age & sex
fig = plt.figure(figsize=(25, 7))

# Survived == 0 is drawn in red and Survived == 1 in green
survival_colors = {0: "r", 1: "g"}
sns.violinplot(data=train_df, x='Sex', y='Age', hue='Survived',
               split=True, palette=survival_colors);

*Note: 0 = Female, 1 = Male*

Based on the Violin Plot above, we can see that women survive more than men (bigger green section). We can also see the distribution of age in the survival for each sex. The age distribution for men seems approximately normal, centered around ~30 but we see a jump in survival for young boys.

In [None]:
# Point plot of Survived against FamilySize for the training DataFrame.
# catplot(kind='point') is the replacement for factorplot, which has been
# removed from seaborn, and x/y are passed by keyword as current versions
# require. catplot creates its own figure, so there is no plt.figure() call
# here; one would only add an empty figure to the output.
# Despite its name, `axes` holds the grid object returned by catplot.
axes = sns.catplot(x='FamilySize', y='Survived',
                   data=train_df, kind='point', aspect=2.5)
plt.show()

The point plot above shows the survival rate in relation to the size of a passenger's family (including the passenger). Based on the plot, the peak survival rates occurred with families of size 3-4. It seems that large families had a low likelihood of survival, with the biggest dropoff happening after 4 family members.

---

## Step 2: The Model

Build, fit, and evaluate a classification model. Perform any model-specific data processing that you need to perform. If the toolkit you use supports it, create visualizations for loss and accuracy improvements. Use as many text and code blocks as you need to explore the data. Note any findings.

**Student Solution**

In [None]:
# Create a Logistic Regression Model (max_iter caps the solver's iterations)
lr = LogisticRegression(max_iter=100000)

# Fit the Model using our training data
train_features = train_df[feature_columns]
train_target = train_df[target_column]
lr.fit(train_features, train_target)

---

## Step 3: Make Predictions and Upload To Kaggle

In this step you will make predictions on the features found in the `test.csv` file and upload them to Kaggle using the [Kaggle API](https://github.com/Kaggle/kaggle-api). Use as many text and code blocks as you need to explore the data. Note any findings.

**Student Solution**

In [None]:
# Make a prediction using the testing input data; the labels are cast to int
lr_pred = lr.predict(test_df[feature_columns]).astype(int)

# Create a new DataFrame with our prediction for submission and convert it to a CSV
submission_df = test_df[['PassengerId']].assign(Survived=lr_pred)
submission_df.to_csv('submission.csv', index=False)

# Upload the CSV to Kaggle
# !kaggle competitions submit -c titanic -f submission.csv -m "Default LR"

What was your Kaggle score?

>0.77511

---

## Step 4: Iterate on Your Model

In this step you're encouraged to play around with your model settings and to even try different models. See if you can get a better score. Use as many text and code blocks as you need to explore the data. Note any findings.

**Student Solution**

In [None]:
# EXTREME GRADIENT BOOSTING

# Hyperparameters are collected in one place and passed to the classifier
xgb_params = dict(
    base_score=0.5, booster='gbtree', colsample_bylevel=1,
    colsample_bynode=1, colsample_bytree=1, gamma=1, gpu_id=0,
    importance_type='gain', interaction_constraints='',
    learning_rate=0.03, max_delta_step=0, max_depth=8, min_child_weight=1,
    n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
    reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
    tree_method='exact', validate_parameters=1, verbosity=0,
)
xgb = XGBClassifier(**xgb_params)

# Fit the Model using our training data
xgb.fit(train_df[feature_columns], train_df[target_column])

# Feature importances sorted in ascending order and indexed by feature name;
# the horizontal bar chart shows the last 20 rows (the 20 largest importances)
features = (
    pd.DataFrame({'feature': feature_columns, 'importance': xgb.feature_importances_})
    .sort_values(by='importance', ascending=True)
    .set_index('feature')
)
features.tail(20).plot(kind='barh', figsize=(15, 15))

# Make a prediction using the testing input data
xg_pred = xgb.predict(test_df[feature_columns]).astype(int)

# Create a new DataFrame with our prediction for submission and convert it to a CSV
sub_df = test_df[['PassengerId']].assign(Survived=xg_pred)
sub_df.to_csv('submission.csv', index=False)

# Upload the CSV to Kaggle
# !kaggle competitions submit -c titanic -f submission.csv -m "XGB"


> Kaggle score:  0.76555




In [None]:
# RANDOM FOREST CLASSIFIER

# random_state fixes the forest's randomness so that the importances,
# predictions and submission file are the same on every run of this cell.
rf = RandomForestClassifier(n_estimators=200, min_samples_leaf=3, max_features=.5,
                            n_jobs=-1, random_state=42)
rf.fit(train_df[feature_columns], train_df[target_column])

# Create a dataframe and plot of the importance of each feature.
# Rows are sorted in ascending order, so the last 20 rows plotted are the
# 20 largest importances.
features = pd.DataFrame()
features['feature'] = feature_columns
features['importance'] = rf.feature_importances_
features.sort_values(by=['importance'], ascending=True, inplace=True)
features.set_index('feature', inplace=True)
features[-20:].plot(kind='barh', figsize=(15, 15))

# Make a prediction using the testing input data
rf_pred = rf.predict(test_df[feature_columns]).astype(int)

# Create a new DataFrame with our prediction for submission and convert it to a CSV
submission_df = pd.DataFrame({'PassengerId':test_df['PassengerId'],'Survived':rf_pred})
submission_df.to_csv('submission.csv',index=False)

# Upload the CSV to Kaggle
# !kaggle competitions submit -c titanic -f submission.csv -m "RFC"



> Kaggle Score: 0.77751


In [None]:
# BAGGED DECISION TREES (each tree tuned by grid search)
# BaggingClassifier, GridSearchCV and DecisionTreeClassifier come from the
# notebook's import cell.

# Search space for every tree: number of leaf nodes and minimum samples per split
params = [{'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}]

# Every ensemble member is a 3-fold grid search over `params`
tuned_tree = GridSearchCV(DecisionTreeClassifier(random_state=42), params, cv=3, verbose=1)

# 1000 members, each fitted on a bootstrap sample of 100 training rows.
# random_state makes the bootstrap draws repeatable between runs.
b_clf = BaggingClassifier(
    tuned_tree,
    n_estimators=1000,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)
b_clf.fit(train_df[feature_columns], train_df[target_column])

# Make a prediction using the testing input data
y_pred = b_clf.predict(test_df[feature_columns]).astype(int)

# Create a new DataFrame with our prediction for submission and convert it to a CSV
sub_df = pd.DataFrame({'PassengerId':test_df['PassengerId'],'Survived':y_pred})
sub_df.to_csv('submission.csv',index=False)

# Upload the CSV to Kaggle
# !kaggle competitions submit -c titanic -f submission.csv -m "BC + Dtree"



> Kaggle score: 0.78708



---