In [6]:
# Visualisation report
# Import Packages

import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import altair as alt
import typing

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

import sys
import os

# Add the src directory to the Python path
sys.path.append(os.path.abspath('../src'))
from prelim_utils import *
from eda_univariate import *
from feature_engineering import *
from baseline import *
from name_proc import *

In [3]:
# PATHS
# All data files live in the sibling "data" directory, relative to the notebook.
DATA_PATH = Path("..") / "data"
TRAIN_PATH = DATA_PATH.joinpath("train.csv")
TEST_PATH = DATA_PATH.joinpath("test.csv")

In [13]:
# Color palette: four hex colours passed as the color-scale range in the chart helpers
color_list = [
    "#A5D7E8",
    "#576CBC",
    "#19376D",
    "#0b2447",
]

In [4]:
# Load the train and test CSV files into DataFrames
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

In [5]:
# Combine the train and test frames into a single frame using the project helper.
# NOTE(review): the column listing printed below includes a 'set' column — presumably
# the train/test marker added by the helper; confirm against its definition in ../src.
all_df = combine_train_test(train_df, test_df)

In [9]:
# Inspect the columns of the combined frame; a bare last expression lets the
# notebook render the value as the cell output instead of going through print().
all_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'set'],
      dtype='object')


In [22]:
def facet_group_countplot(df: pd.DataFrame, x: str, group: str, facet: str, title: str) -> alt.Chart:
    '''
    Build a bar chart of row counts per ``x`` category, coloured by ``group``
    and split into one column panel per ``facet`` category.

    Parameters:
        df: source data; only the ``x``, ``group`` and ``facet`` columns are used.
        x: column shown on the x axis (treated as nominal).
        group: column mapped to bar colour (treated as nominal).
        facet: column used to split the chart into column panels (treated as nominal).
        title: chart title.

    Returns:
        The configured Altair chart.
    '''
    # Restrict to the plotted columns and discard rows with a null in any of them
    plot_data = df[[x, group, facet]].dropna()

    # Encoding channels, collected up front so the chart construction reads as one line
    channels = {
        "x": alt.X(f"{x}:N", title=f"{x}"),
        "y": alt.Y("count()", title="Count"),
        "color": alt.Color(f"{group}:N", scale=alt.Scale(range=color_list)),
        "column": alt.Column(f"{facet}:N", title=f"{facet}"),
    }

    bars = alt.Chart(plot_data).mark_bar()
    return bars.encode(**channels).properties(title=title)

In [23]:
# The y axis of this chart is a passenger count (not a rate), so the title says "Count".
facet_group_countplot(
    all_df,
    x="Pclass",
    group="Survived",
    facet="Sex",
    title="Survival Count by Pclass and Sex",
)

This visualisation plots the survival count on the y axis versus the passenger class on the x axis. The facet aspect of the chart splits the dataset between female and male passengers. This allows for easy analysis of whether gender impacts the likelihood of survival given passenger class.

Firstly, in terms of the number of passengers per class and their gender split, we see that there are more male passengers than female passengers in all 3 classes. However, the ratio of male to female passengers is not equal between classes. We see that 1st class has the most equal ratio, with only a fraction more males than females, then 2nd class, and finally in 3rd class we see a huge disparity between the number of males and females.

This is important because general intuition would predict that females ('women and children first') and passengers in higher classes would be more likely to survive. A plot that looked only at class or sex vs survival might overestimate the correlation between being male or in 3rd class and survival, due to the confounding effect of having more males in 3rd class.

By visualising both class and sex on the same plot we can dig into the joint effects of both variables. For example, we see that on average female passengers are more likely to survive than male passengers. Yet the relationship is not equal across classes. Nearly all female passengers in 1st and 2nd class survived, but in 3rd class it is closer to a 50/50 chance of survival.
On the male section of the dataset the relationship between class and survival is less clear, although we can conclude that 1st class males are more likely to survive than 2nd and 3rd class males.

To account for these joint effects, I would include a joint feature of class and sex. This is an improvement on just selecting sex and class as separate features, as it controls for the joint impact of class and gender on survival rate, as shown most clearly in the female section of the dataset.

In [31]:
# Engineer the "Family Size" feature with the project helper and store it as a new column on all_df.
# NOTE(review): per the notebook narrative this presumably adds SibSp and Parch, with a lone
# traveller counted as 1 — confirm against the helper's definition in ../src.
all_df["Family Size"] = calculate_family_size(all_df)

In [41]:
def proportion_chart(df: pd.DataFrame, x: str, group: str, title: str) -> alt.Chart:
    '''
    Build a bar chart showing, for each ``x`` category, the share of rows that
    fall into each ``group`` category.

    Parameters:
        df: source data containing the ``x`` and ``group`` columns.
        x: column shown on the x axis (treated as nominal).
        group: column mapped to bar colour (treated as nominal).
        title: chart title.

    Returns:
        The configured Altair chart, with the y axis formatted as a percentage.
    '''
    proportions = (
        # Number of rows for every (x, group) combination
        df.groupby([x, group]).size().reset_index(name='Count')
        # Number of rows for the whole x category, repeated on each of its rows
        .assign(Total=lambda counts: counts.groupby(x)['Count'].transform('sum'))
        # Share of the x category taken by this group value
        .assign(Proportion=lambda counts: counts['Count'] / counts['Total'])
    )

    x_axis = alt.X(f"{x}:N", title=f"{x}")
    y_axis = alt.Y('Proportion:Q', title='Proportion', axis=alt.Axis(format='%'))
    fill = alt.Color(f"{group}:N", scale=alt.Scale(range=color_list))

    return alt.Chart(proportions).mark_bar().encode(x=x_axis, y=y_axis, color=fill).properties(title=title)

In [42]:
# Survival proportions split by sex only
proportion_chart(
    all_df,
    x="Sex",
    group="Survived",
    title="Survival Rate by Sex",
)

From the original analysis, family size clearly had an impact on survival rate. Based on our above plot, it may be that class or sex has interaction effects with family size.
Possible interaction with Pclass was explored but the results were insignificant. Let's investigate interaction with the Sex feature.
First, above is the survival rate split only on sex. We see that females have a 75% chance of survival whereas males have a 20% chance.

In [43]:
# Survival proportions split by family size only
proportion_chart(
    all_df,
    x="Family Size",
    group="Survived",
    title="Survival Rate by Family Size",
)

Here is the survival rate by family size alone. We see that single travellers have a 30% chance of survival, but small to medium size families have an above-even chance of survival. This decreases for larger families.

Family size is an engineered variable adding together spouse/siblings and parents/children.
From the original analysis we saw that single travelers (family size = 1) were about twice as likely to die as to survive; however, for small to medium families (family size 2-4 inclusive) the chances of survival were above even.

In [46]:
def barchart_proportions(df: pd.DataFrame, x: str, group: str, facet: str, title: str) -> alt.Chart:
    '''
    Build a faceted bar chart showing, within each (``x``, ``facet``) cell, the
    share of rows that fall into each ``group`` category.

    The aggregation is driven entirely by the ``x``, ``group`` and ``facet``
    arguments, so the function works for any column combination rather than
    only Family Size / Sex / Survived.

    Parameters:
        df: source data containing the ``x``, ``group`` and ``facet`` columns.
        x: column shown on the x axis (treated as nominal).
        group: column mapped to bar colour (treated as nominal).
        facet: column used to split the chart into column panels (treated as nominal).
        title: chart title.

    Returns:
        The configured Altair chart, with the y axis formatted as a percentage.
    '''
    # Step 1: count rows for every (x, facet, group) combination
    grouped_df = df.groupby([x, facet, group]).size().reset_index(name='Count')

    # Step 2: total count for each (x, facet) cell, broadcast back onto its rows
    grouped_df['Total'] = grouped_df.groupby([x, facet])['Count'].transform('sum')

    # Step 3: share of the cell taken by each group value
    grouped_df['Proportion'] = grouped_df['Count'] / grouped_df['Total']

    # Create the bar chart with proportions
    chart = alt.Chart(grouped_df).mark_bar().encode(
        x=alt.X(f"{x}:N", title=f"{x}"),
        y=alt.Y('Proportion:Q', title='Proportion', axis=alt.Axis(format='%')),
        color=alt.Color(f"{group}:N", scale=alt.Scale(range=color_list)),
        column=alt.Column(f"{facet}:N", title=f"{facet}")
    ).properties(
        title=title
    )

    return chart

In [47]:
# Survival proportions by family size, shown separately for each sex
barchart_proportions(
    all_df,
    x="Family Size",
    group="Survived",
    facet="Sex",
    title="Survival Rate by Family Size and Sex",
)

Recall that the average survival rate for women was 75%. We see that for family sizes up to 4 the survival chance for females is slightly higher than this average.
Additionally, recall that the average survival rate for men was 20%. We see that single men have a lower than average chance of survival, but men with small to medium families have a higher than average survival rate.

This motivates the inclusion of an interaction term between Sex and Family Size. Family size has an impact on survival chances for both males and females, but
its impact is not constant across the sexes.