# Introduction




# Prepare for analysis

## Load packages

In [None]:
import pandas as pd
import numpy as np
import sys
import os
import random
from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns

## Read the data

In [None]:
train_df = pd.read_csv("/kaggle/input/titanic/train.csv")
test_df = pd.read_csv("/kaggle/input/titanic/test.csv")

# Preliminary data inspection

## Quick glimpse of the data

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
train_df.info()

In [None]:
test_df.info()

In [None]:
train_df.describe()

In [None]:
test_df.describe()

## Few statistics on the data

### Missing data

In [None]:
def missing_data(data):
    """Summarise the missing values of every column in `data`.

    Returns a frame with one column per column of `data` and three rows:
    'Total' (number of nulls), 'Percent' (nulls as a percentage of all rows)
    and 'Types' (the column dtype as a string).
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    summary = pd.concat(
        [null_counts, null_counts / null_mask.count() * 100],
        axis=1,
        keys=['Total', 'Percent'],
    )
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    return summary.T

In [None]:
missing_data(train_df)

In [None]:
missing_data(test_df)

### Most frequent data

In [None]:
def most_frequent_values(data):
    """Report the most frequent value of every column in `data`.

    Returns a frame with one column per column of `data` and four rows:
    'Total' (non-null count), 'Most frequent item', 'Frequence' (how often
    that item occurs) and 'Percent from total' (frequency as a percentage of
    the non-null count, rounded to 3 decimals).

    Columns without any countable value (all null, or unhashable cells)
    report 0 for both the item and its frequency.
    """
    total = data.count()
    tt = pd.DataFrame(total)
    tt.columns = ['Total']
    items = []
    vals = []
    for col in data.columns:
        try:
            # value_counts() orders by descending frequency and skips nulls,
            # so the first entry is the mode; compute it only once per column.
            counts = data[col].value_counts()
        except TypeError as ex:
            # Unhashable cell values (e.g. lists) cannot be counted.
            print(ex)
            counts = None
        if counts is None or counts.empty:
            items.append(0)
            vals.append(0)
        else:
            items.append(counts.index[0])
            vals.append(counts.iloc[0])
    tt['Most frequent item'] = items
    tt['Frequence'] = vals
    tt['Percent from total'] = np.round(vals / total * 100, 3)
    return tt.T

In [None]:
most_frequent_values(train_df)

In [None]:
most_frequent_values(test_df)

### Unique values

In [None]:
def unique_values(data):
    """Count the distinct values of every column in `data`.

    Returns a frame with one column per column of `data` and two rows:
    'Total' (non-null count) and 'Uniques' (number of distinct non-null values).
    """
    summary = pd.DataFrame({
        'Total': data.count(),
        'Uniques': [data[name].nunique() for name in data.columns],
    })
    return summary.T

In [None]:
unique_values(train_df)

In [None]:
unique_values(test_df)

# Exploratory data analysis

## Univariate analysis for all features


We show here two graphs in parallel:
* distribution of class values, split per Survived value
* comparison of class values, in train and test data


Let's first aggregate train and test data into one single dataframe, `all_df`.

In [None]:
# Tag every row with the frame it came from *before* stacking, so the label
# does not depend on `Survived` happening to be missing only in the test data.
# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating the row labels of both inputs.
all_df = pd.concat(
    [train_df.assign(set="train"), test_df.assign(set="test")],
    axis=0,
    ignore_index=True,
)

In [None]:
all_df.head()

### Visual identity of our Notebook

We will use a unified set of colors for the graphs in our Notebook.  

For this Titanic Notebook, let's select a group of **marine** shades of blue.

We use as well a small function to visualize this color map.

In [None]:
def set_color_map(color_list):
    """Preview a list of colours and return it wrapped as a colormap.

    Prints a header, displays the colours as a seaborn palette strip and
    returns a `ListedColormap` built from the same list.
    """
    colormap = ListedColormap(color_list)
    print("Notebook Color Schema:")
    palette = sns.color_palette(color_list)
    sns.palplot(palette)
    plt.show()
    return colormap

In [None]:
color_list = ["#A5D7E8", "#576CBC", "#19376D", "#0b2447"]
cmap_custom = set_color_map(color_list)

In [None]:
def plot_count_pairs(data_df, feature, title, hue="set"):
    """Show a count plot of `feature`, with the bars split by `hue`.

    Bar colours come from the notebook-level `color_list`; `title` is
    appended to the fixed "Number of passengers / " prefix.
    """
    _, axis = plt.subplots(nrows=1, ncols=1, figsize=(8, 4))
    sns.countplot(data=data_df, x=feature, hue=hue, palette=color_list, ax=axis)
    axis.grid(color="black", linestyle="-.", linewidth=0.5, axis="y", which="major")
    axis.set_title(f"Number of passengers / {title}")
    plt.show()

In [None]:
def plot_distribution_pairs(data_df, feature, title, hue="set"):
    """Overlay one histogram of `feature` per distinct value of `hue`.

    Each group is drawn on the same axes with a colour taken from the
    notebook-level `color_list` and is labelled with its hue value in the
    legend. `title` is appended to the fixed "Number of passengers / " prefix.
    """
    f, ax = plt.subplots(1, 1, figsize=(8, 4))
    for i, h in enumerate(data_df[hue].unique()):
        sns.histplot(
            data_df.loc[data_df[hue] == h, feature],
            # Wrap around so more hue levels than colours cannot raise IndexError.
            color=color_list[i % len(color_list)],
            ax=ax,
            label=h,
        )
    ax.set_title(f"Number of passengers / {title}")
    # Use the axes created above; the loop may not have run at all.
    ax.legend()
    plt.show()

In [None]:
plot_count_pairs(all_df,  "Sex", "Sex")

In [None]:
plot_count_pairs(train_df,  "Sex", "Sex", hue="Survived")

In [None]:
plot_count_pairs(all_df,  "Pclass", "Passenger Class")

In [None]:
plot_count_pairs(train_df,  "Pclass", "Passenger Class", hue="Survived")

In [None]:
plot_count_pairs(all_df,  "SibSp", "Sibilings or Spouse")

In [None]:
plot_count_pairs(train_df,  "SibSp", "Sibilings or Spouse", hue="Survived")

In [None]:
plot_count_pairs(all_df,  "Parch", "Parents or Children aboard")

In [None]:
plot_count_pairs(train_df,  "Parch", "Parents or Children aboard", hue="Survived")

In [None]:
plot_count_pairs(all_df,  "Embarked", "Embarking port")

In [None]:
plot_count_pairs(train_df,  "Embarked", "Embarking port", hue="Survived")

In [None]:
plot_distribution_pairs(all_df, "Age", "Age (grouped by dataset)")

In [None]:
plot_distribution_pairs(train_df, "Age", "Age (grouped by survival)", hue="Survived")

In [None]:
plot_distribution_pairs(all_df, "Fare", "Fare (grouped by dataset)")

In [None]:
plot_distribution_pairs(train_df, "Fare", "Fare (grouped by survival)", hue="Survived")

## Family size


Based on SibSp (siblings or spouse) and Parch (parents or children), we set the Family Size field.

In [None]:
all_df["Family Size"] = all_df["SibSp"] + all_df["Parch"] + 1

In [None]:
train_df["Family Size"] = train_df["SibSp"] + train_df["Parch"] + 1

In [None]:
plot_count_pairs(all_df, "Family Size", "Family Size (grouped by dataset)")

In [None]:
plot_count_pairs(all_df, "Family Size", "Family Size (grouped by survival)", hue="Survived")

## Age interval

In [None]:
# Bin Age into five ordinal intervals (0-4): <=16, (16, 32], (32, 48],
# (48, 64] and >64. Rows with a missing Age stay NaN instead of silently
# being counted in interval 0.
all_df["Age Interval"] = pd.cut(
    all_df["Age"],
    bins=[-np.inf, 16, 32, 48, 64, np.inf],
    labels=False,  # return the bin position (0..4) rather than an Interval
).astype(float)

In [None]:
# Bin Age into five ordinal intervals (0-4): <=16, (16, 32], (32, 48],
# (48, 64] and >64. Rows with a missing Age stay NaN instead of silently
# being counted in interval 0.
train_df["Age Interval"] = pd.cut(
    train_df["Age"],
    bins=[-np.inf, 16, 32, 48, 64, np.inf],
    labels=False,  # return the bin position (0..4) rather than an Interval
).astype(float)

In [None]:
all_df.head()

In [None]:
plot_count_pairs(all_df, "Age Interval", "Age (grouped by dataset)")

In [None]:
plot_count_pairs(all_df, "Age Interval", "Age (grouped by survival)", hue="Survived")

## Fare interval

In [None]:
# Bin Fare into four ordinal intervals (0-3): <=7.91, (7.91, 14.454],
# (14.454, 31] and >31. Rows with a missing Fare stay NaN instead of
# silently being counted in interval 0.
all_df['Fare Interval'] = pd.cut(
    all_df['Fare'],
    bins=[-np.inf, 7.91, 14.454, 31, np.inf],
    labels=False,  # return the bin position (0..3) rather than an Interval
).astype(float)

In [None]:
# Bin Fare into four ordinal intervals (0-3): <=7.91, (7.91, 14.454],
# (14.454, 31] and >31. Rows with a missing Fare stay NaN instead of
# silently being counted in interval 0.
train_df['Fare Interval'] = pd.cut(
    train_df['Fare'],
    bins=[-np.inf, 7.91, 14.454, 31, np.inf],
    labels=False,  # return the bin position (0..3) rather than an Interval
).astype(float)

In [None]:
plot_count_pairs(all_df, "Fare Interval", "Fare (grouped by dataset)")

Let's create a composed feature: Pclass + Sex.

In [None]:
train_df["Sex_Pclass"] = train_df.apply(lambda row: row['Sex'][0].upper() + "_C" + str(row["Pclass"]), axis=1)

In [None]:
all_df["Sex_Pclass"] = all_df.apply(lambda row: row['Sex'][0].upper() + "_C" + str(row["Pclass"]), axis=1)

In [None]:
plot_count_pairs(all_df, "Fare Interval", "Fare (grouped by survival)", hue="Survived")

## Deck

Based on Cabin code, we extract the deck name

In [None]:
def get_deck(text):
    """Return the deck letter, i.e. the first character of a cabin code.

    Values that cannot be subscripted (such as the NaN/None of a missing
    cabin) or that are empty yield "Unknown".
    """
    try:
        return text[0]
    except (TypeError, LookupError):
        # TypeError: not subscriptable (NaN, None); LookupError: empty value.
        return "Unknown"

In [None]:
all_df["Deck"] = all_df["Cabin"].apply(lambda x: get_deck(x))

In [None]:
plot_count_pairs(all_df, "Deck", "Deck (grouped by dataset)")

In [None]:
# The plotted feature is Deck, so the title names Deck (it previously said "Fare").
plot_count_pairs(all_df, "Deck", "Deck (grouped by survival)", hue="Survived")

In [None]:
np.transpose(pd.crosstab(all_df['Deck'], all_df['Pclass']))

## Process names


When we process names, we would like to extract the following information:

- Family name - this is the first word (or few first words, if a family name with multiple names), followed by a comma  
- Title - this follows just after the comma   
- Given name - this is the word or group of words following the title  
- Maiden name - for ladies, it is given between parentheses  

We start by creating a function that parses the Name string and extracts (if possible) these 4 elements.


In [None]:
def parse_names(row):
    """Split a passenger name into family name, title, given name and maiden name.

    Expected layout: "<family name>, <title>. <given name> (<maiden name>)",
    where the parenthesised part is optional.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a "Name" entry.

    Returns:
        pd.Series with four values, in order: family name, title (with its
        trailing period), given name, maiden name. All parts are stripped of
        surrounding whitespace. The maiden name is None when there are no
        parentheses. When the name cannot be parsed, a message is printed and
        all four values are None, so a row-wise `apply` always yields four
        columns.
    """
    try:
        text = row["Name"]
        # partition() splits on the FIRST separator only, so extra commas or
        # periods later in the name no longer truncate the remaining parts.
        family_name, comma, remainder = text.partition(",")
        title, dot, remainder = remainder.partition(".")
        if not comma or not dot:
            raise ValueError(f"unexpected name format: {text!r}")
    except (AttributeError, LookupError, TypeError, ValueError) as ex:
        print(f"Exception: {ex}")
        return pd.Series([None, None, None, None])

    given_name, paren, maiden_name = remainder.partition("(")
    if paren:
        maiden_name = maiden_name.strip()
        # Drop only the closing parenthesis that matches the opening one.
        if maiden_name.endswith(")"):
            maiden_name = maiden_name[:-1].strip()
    else:
        maiden_name = None
    return pd.Series(
        [family_name.strip(), title.strip() + ".", given_name.strip(), maiden_name]
    )
    
    

In [None]:
all_df[["Family Name", "Title", "Given Name", "Maiden Name"]] = all_df.apply(lambda row: parse_names(row), axis=1)

In [None]:
train_df[["Family Name", "Title", "Given Name", "Maiden Name"]] = train_df.apply(lambda row: parse_names(row), axis=1)

## Deep dive into titles


Let's check a few things about titles:  
- relationship between Age Interval and Title;   
- relationship between Sex and Title;  
- relationship between SibSp, Parch, Family Size and Title;  


In [None]:
np.transpose(pd.crosstab(all_df['Title'], all_df['Age Interval']))

In [None]:
np.transpose(pd.crosstab(all_df['Title'], all_df['Sex']))

In [None]:
np.transpose(pd.crosstab(all_df['Title'], all_df['SibSp']))

In [None]:
np.transpose(pd.crosstab(all_df['Title'], all_df['Parch']))

In [None]:
np.transpose(pd.crosstab(all_df['Title'], all_df['Family Size']))

In [None]:
np.transpose(pd.crosstab(all_df['Title'], all_df['Pclass']))

## Deep dive into families data

We would like to understand what happened to different families, so we will follow their fate through the data.

Let's look first at a few large families.

In [None]:
sel_columns = ["Name", "Sex","Age", "Title", "Family Name", "Given Name", "Maiden Name", "SibSp", "Parch", "Family Size", "Ticket", "Cabin", "Pclass", "Survived"]

In [None]:
all_df["Family Name"].value_counts()[0:5]

In [None]:
all_df.loc[all_df["Family Name"]=="Andersson"][sel_columns].sort_values(by=["Family Size", "Ticket", "Age"], ascending=False)

In [None]:
all_df.loc[all_df["Family Name"]=="Sage"][sel_columns].sort_values(by=["Family Size", "Ticket", "Age"], ascending=False)

In [None]:
all_df.loc[all_df["Family Name"]=="Asplund"][sel_columns].sort_values(by=["Family Size", "Ticket", "Age"], ascending=False)

Let's understand more about the special case of the woman with a Dr. title. This is rather rare for that period.

In [None]:
all_df.loc[(all_df['Title'] == 'Dr.') & (all_df['Sex'] == 'female')][sel_columns]

Let's see if she is traveling alone in cabin D17.

In [None]:
all_df.loc[all_df['Cabin'] == 'D17'][sel_columns]

She is actually traveling with a Mrs. Swift, a woman companion, in the same 1st class cabin, on a separate ticket. They both survived.

### Family Names wordclouds

In [None]:
from wordcloud import WordCloud, STOPWORDS
stopwords = set(STOPWORDS)

def show_wordcloud(data, mask=None, title=""):
    """Render a word cloud from the non-null strings in `data`.

    The strings are joined with spaces, filtered through the wordcloud
    package's default stop words plus a few extra tokens, and drawn with the
    notebook-level `cmap_custom` colormap. `mask` is forwarded to `WordCloud`
    and `title` is used as the figure title.
    """
    corpus = " ".join(data.dropna())
    excluded = set(STOPWORDS) | {
        "t", "co", "https", "amp", "U", "Comment", "text", "attr", "object",
    }
    cloud = WordCloud(
        stopwords=excluded,
        scale=4,
        max_font_size=50,
        max_words=500,
        mask=mask,
        background_color="white",
        colormap=cmap_custom,
    ).generate(corpus)

    figure = plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    figure.suptitle(title, fontsize=14)
    figure.subplots_adjust(top=2.3)
    plt.imshow(cloud, interpolation='bilinear')
    plt.show()

In [None]:
show_wordcloud(all_df["Family Name"], title="Family Names on Titanic")

In [None]:
show_wordcloud(all_df.loc[all_df["Title"].isin(["Miss.", "Mlle."])]["Given Name"], title="Girls and Young Women Names on Titanic")

In [None]:
show_wordcloud(all_df.loc[all_df["Sex"]=="male"]["Given Name"], title="Boys and Men Names on Titanic")

In [None]:
all_df["Embarked"].unique()

In [None]:
# One family-name word cloud per embarkation port.
# Missing ports are dropped up front: NaN never compares equal to itself, so
# the filter below would select no rows for it.
for embark in all_df["Embarked"].dropna().unique():
    try:
        show_wordcloud(all_df.loc[all_df["Embarked"]==embark]["Family Name"], title=f"Family Names on Titanic, embarked in {embark}")
    except ValueError as ex:
        # NOTE(review): WordCloud appears to raise ValueError when it has no
        # words to draw - confirm. Report and move on to the next port
        # instead of silently hiding every possible error.
        print(f"No word cloud for port {embark}: {ex}")

## Multivariate analysis


Let's now look at the interaction of multiple features.

In [None]:
plot_count_pairs(all_df.sort_values(by=("Sex_Pclass")), "Sex_Pclass", "Survival by Sex and Passenger Class", "Survived")

In [None]:
plot_count_pairs(all_df, "Age Interval", "Age Interval (grouped by Class)", "Pclass")

In [None]:
plot_count_pairs(all_df, "Pclass", "Class (grouped by Embarked)", "Embarked")

In [None]:
plot_count_pairs(all_df, "Age Interval", "Age Interval (grouped by Embarked)", "Embarked")

In [None]:
plot_count_pairs(all_df, "Pclass", "Pclass (grouped by Fare Interval)", "Fare Interval")

In [None]:
plot_count_pairs(all_df, "Deck", "Deck (grouped by Fare Interval)", "Fare Interval")

In [None]:
plot_count_pairs(all_df, "Family Size", "Family Size (grouped by Fare Interval)", "Fare Interval")

In [None]:
def plot_count_distrib_pairs(data_df, f_one, f_two, title, hue="Survived"):
    """Draw a two-feature density (KDE) plot of `f_one` vs `f_two`, coloured by `hue`.

    Switches seaborn to the "whitegrid" style, creates a 6x6 figure with a
    single 3D subplot and draws the KDE on it using the notebook-level
    `color_list` palette. The z axis is labelled 'Density'.
    """
    sns.set_style("whitegrid", {'axes.grid' : True})
    figure = plt.figure(figsize=(6, 6))
    axes_3d = figure.add_subplot(1, 1, 1, projection='3d')
    x_values = data_df[f_one]
    y_values = data_df[f_two]
    groups = data_df[hue]
    # The freshly added subplot is the current axes, so passing it explicitly
    # targets the same axes seaborn would pick by default.
    sns.kdeplot(x=x_values, y=y_values, hue=groups, palette=color_list, ax=axes_3d)
    axes_3d.set_zlabel('Density')
    axes_3d.set_title(title)
    plt.show()

In [None]:
plot_count_distrib_pairs(all_df, "Family Size", "Pclass", "Survival by Family Size and Class")

In [None]:
plot_count_distrib_pairs(all_df, "Age Interval", "Pclass", "Survival by Age Interval and Class")

In [None]:
sns.kdeplot(x=all_df["Family Size"], y=all_df["Pclass"], hue=all_df["Survived"], palette= color_list[0:3])
plt.show()

### Few more engineered data 


Let's create two more engineered features:  
* Family size interval: Single, Small, Large  
* Aggregated titles: Mr, Mrs, Master, Miss, and Rare  

In [None]:
for dataset in [all_df, train_df]:
    dataset["Family Type"] = dataset["Family Size"]

In [None]:
# Label each passenger's family as Single (1), Small (2-4) or Large (5+).
for dataset in [all_df, train_df]:
    # Start from an object-typed copy of the sizes: writing string labels into
    # an integer column is an incompatible-dtype assignment that recent pandas
    # versions warn about or reject. Sizes matching no rule keep their number.
    dataset["Family Type"] = dataset["Family Size"].astype(object)
    dataset.loc[dataset["Family Size"] == 1, "Family Type"] = "Single"
    dataset.loc[(dataset["Family Size"] > 1) & (dataset["Family Size"] < 5), "Family Type"] = "Small"
    dataset.loc[(dataset["Family Size"] >= 5), "Family Type"] = "Large"

In [None]:
for dataset in [all_df, train_df]:
    dataset["Titles"] = dataset["Title"]

In [None]:

# Collapse the raw titles into broader groups: French/alternative spellings
# are folded into 'Miss.' and 'Mrs.', and the uncommon titles listed below
# become 'Rare'. Titles not mentioned here are left unchanged.
rare_titles = ['Lady.', 'the Countess.', 'Capt.', 'Col.', 'Don.', 'Dr.',
               'Major.', 'Rev.', 'Sir.', 'Jonkheer.', 'Dona.']
title_groups = {'Mlle.': 'Miss.', 'Ms.': 'Miss.', 'Mme.': 'Mrs.'}
title_groups.update(dict.fromkeys(rare_titles, 'Rare'))

for dataset in [all_df, train_df]:
    dataset['Titles'] = dataset['Titles'].replace(title_groups)

In [None]:
train_df[['Titles', 'Sex', 'Survived']].groupby(['Titles', 'Sex'], as_index=False).mean()

## Complex analysis in one graph

In [None]:
# 2x3 grid of survival count plots on the training data, one panel per
# feature; every bar is annotated with its share of all training passengers.
f, ax = plt.subplots(2, 3, figsize=(15, 10))

features = ["Sex", "Pclass", "Age Interval", "Fare Interval", "Family Type", "Titles"]
total = float(len(train_df))

# ax.flat walks the grid row by row, matching the order of `features`.
for crt_ax, feature in zip(ax.flat, features):
    sns.countplot(x=feature, data=train_df, hue="Survived", palette=color_list, ax=crt_ax)
    crt_ax.grid(color="black", linestyle="-.", linewidth=0.5, axis="y", which="major")
    crt_ax.set_title(f"Survived passenger / {feature}")
    for p in crt_ax.patches:
        height = p.get_height()
        crt_ax.text(
            p.get_x() + p.get_width() / 2.,
            height,
            f"{100 * height / total:1.1f}%",
            ha="center",
            fontsize=10,
        )

plt.show()



# Baseline model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

## Map categorical value to numerical values

In [None]:
for dataset in [train_df, test_df]:
    dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

Create train-validation split.

In [None]:
VALID_SIZE = 0.2
train, valid = train_test_split(train_df, test_size=VALID_SIZE, random_state=42, shuffle=True)

Define predictor features and target feature.

In [None]:
predictors = ["Sex", "Pclass"]
target = 'Survived'

Define the training and validation data and labels.

In [None]:
train_X = train[predictors]
train_Y = train[target].values
valid_X = valid[predictors]
valid_Y = valid[target].values

Initialize the classification algorithm.

In [None]:
clf = RandomForestClassifier(n_jobs=-1, 
                             random_state=42,
                             criterion="gini",
                             n_estimators=100,
                             verbose=False)

Fit the classifier with the training data.

In [None]:
clf.fit(train_X, train_Y)

Predict the train data (to check the training classification error).

In [None]:
preds_tr = clf.predict(train_X)

Predict the validation data.

In [None]:
preds = clf.predict(valid_X)

Classification report for training data.

In [None]:
print(metrics.classification_report(train_Y, preds_tr, target_names=['Not Survived', 'Survived']))

Classification report for validation data.

In [None]:
print(metrics.classification_report(valid_Y, preds, target_names=['Not Survived', 'Survived']))