# Week 09 - The Titanic

* https://towardsdatascience.com/powerful-one-liners-in-pandas-every-data-scientist-should-know-737e721b81b6
* https://www.quora.com/What-kind-of-statistics-should-be-learned-to-make-a-good-data-analyst
* https://towardsdatascience.com/understanding-train-test-split-scikit-learn-python-ea676d5e3d1
* https://towardsdatascience.com/8-seaborn-plots-for-univariate-exploratory-data-analysis-eda-in-python-9d280b6fe67f
* https://www.encyclopedia-titanica.org/
* https://www.encyclopedia-titanica.org/class-gender-titanic-disaster-1912~chapter-2~part-2.html
* https://github.com/davidjamesknight/SQLite_databases_for_learning_data_science
* the Titanic dataset is from https://data.world/datasets/titanic-dataset

In [None]:
# create seed
import random

# provide your student id as stud_id (e.g. an integer)
stud_id = ...

# random.seed() seeds the generator in place and returns None, so keep the
# seed value itself for later use as a random_state argument
random.seed(stud_id)
my_seed = stud_id

In [None]:
import sqlite3
import pandas as pd

# read the whole passenger table, then release the database handle even if
# the query fails
db_connection = sqlite3.connect('titanic.db')
try:
    titanic = pd.read_sql_query('SELECT * FROM titanic', db_connection)
finally:
    db_connection.close()

# NOTE(review): axis=1 samples columns, so this shuffles the column order and
# leaves the rows (and their labels) untouched -- confirm that is intended
titanic = titanic.sample(frac=1, axis=1, random_state=my_seed).reset_index(drop=True)
titanic.drop('id', axis=1, inplace=True)
titanic.head()

In [None]:
# print shape and info


In [None]:
# sample


## Preprocessing

In [None]:
# find nulls


In [None]:
# find index of blank row


In [None]:
# delete row by index


In [None]:
# find nulls


In [None]:
# observation for missing fare


In [None]:
# average 3rd class fare


In [None]:
# impute missing fare


In [None]:
# observation for missing embarked


In [None]:
# find name contains Stone


In [None]:
# find cabin B28


In [None]:
# https://www.encyclopedia-titanica.org/titanic-survivor/martha-evelyn-stone.html
# fillna embarked with S


In [None]:
# recheck null count


In [None]:
# feature with only one value

# a feature is constant when it holds exactly one distinct value
# (a missing value counts as a value of its own)
constant_features = [
    feat
    for feat in titanic.columns
    if titanic[feat].nunique(dropna=False) == 1
]

constant_features

In [None]:
# quasi constant values: show the value counts of every feature that has
# fewer than three distinct values, walking the features in sorted order
for feat in titanic.columns.sort_values():
    distinct_values = titanic[feat].unique()
    if len(distinct_values) >= 3:
        continue
    print(titanic[feat].value_counts())

In [None]:
# check for duplicated features: compare every column with each column to its
# right and record the right-hand one whenever the two hold identical data
duplicated_feat = []
for position, orig in enumerate(titanic.columns, start=1):
    for dupe in titanic.columns[position:]:
        if not titanic[orig].equals(titanic[dupe]):
            continue
        duplicated_feat.append(dupe)

duplicated_feat

In [None]:
# count the missing values left in each column
titanic.isnull().sum()

In [None]:
# find missing age groups: print the shape of the rows that have no age,
# one line per title found in the passenger name
# (raw strings keep the regex escape \. from being read as an invalid
# string escape sequence)
missing_age = pd.isna(titanic['age'])
for title in (r'Mr\.', r'Dr\.', r'Ms\.', r'Mrs\.', r'Miss\.', 'Master'):
    print(titanic[missing_age & titanic['name'].str.contains(title)].shape)

In [None]:
# passengers titled 'Dr.' with no recorded age (raw string: \. is a regex escape)
titanic[(pd.isna(titanic['age'])) & titanic['name'].str.contains(r'Dr\.')] # 46

In [None]:
# presumably row label 40 is the passenger returned by the 'Dr.' lookup and
# 46 the age chosen for them -- verify against that cell's output
titanic.at[40, 'age'] = 46

In [None]:
# passengers titled 'Ms.' with no recorded age (raw string: \. is a regex escape)
titanic[(pd.isna(titanic['age'])) & titanic['name'].str.contains(r'Ms\.')] # 21

In [None]:
# presumably row label 1076 is the passenger returned by the 'Ms.' lookup and
# 21 the age chosen for them -- verify against that cell's output
titanic.at[1076, 'age'] = 21

In [None]:
def _fill_age_with_title_mean(title_pattern):
    """Fill the missing ages of passengers whose name matches a title.

    The gaps are replaced by the mean age of the passengers matching
    ``title_pattern`` (a regular expression). Returns the boolean name mask.
    """
    has_title = titanic['name'].str.contains(title_pattern)
    ages = titanic.loc[has_title, 'age']
    titanic.loc[has_title, 'age'] = ages.fillna(ages.mean())
    return has_title


# the titles are processed one after the other, so a later mean already
# includes ages filled in by an earlier step for names matching both patterns
# (raw strings keep the regex escape \. from being read as an invalid
# string escape sequence)
cond1 = _fill_age_with_title_mean('Master')
cond2 = _fill_age_with_title_mean(r'Miss\.')
cond3 = _fill_age_with_title_mean(r'Mrs\.')
cond4 = _fill_age_with_title_mean(r'Mr\.')

In [None]:
# recount the missing values per column after the age imputation
titanic.isnull().sum()

In [None]:
# create adult male feature
import pandas as pd

def is_adult_male(row):
    """Flag a passenger row as an adult male.

    Returns None when the age is missing, 1 for a male strictly older than 15
    and strictly younger than 81, and 0 for everyone else.
    """
    age = row['age']
    if pd.isna(age):
        return None
    in_adult_range = 15 < age < 81
    return 1 if in_adult_range and row['sex'] == 'male' else 0
    
# evaluate the flag row by row and store it as a new column
titanic['adult_male'] = titanic.apply(is_adult_male, axis=1)
# dropna=False also counts the rows where the flag is missing
titanic['adult_male'].value_counts(dropna=False)

## Exploratory Data Analysis

In [None]:
# split the titanic data into train and test sets: 'survived' is the target,
# every other column is a feature, and a quarter of the rows is held out
from sklearn.model_selection import train_test_split

features = titanic.drop(columns=['survived'])
target = titanic['survived']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.25, random_state=42)

for frame in (X_train, X_test):
    print(frame.shape)
print(X_train.head())

In [None]:
# summary statistics of the training features
X_train.describe()

In [None]:
# plot histograms with tight layout
import matplotlib.pyplot as plt


In [None]:
# print boxplots with tight layout


### Outliers

In [None]:
# report every numeric feature that has values beyond its fences
# (quartiles -/+ 1.5 * IQR, each fence rounded to a whole number)
for feat in X_train._get_numeric_data().columns:
    values = X_train[feat]
    q1, q3 = (values.quantile(q) for q in (0.25, 0.75))
    iqr = q3 - q1
    lower_fence = (q1 - 1.5 * iqr).round()
    upper_fence = (q3 + 1.5 * iqr).round()
    lower_count = values[values < lower_fence].count()
    upper_count = values[values > upper_fence].count()
    if not (lower_count > 0 or upper_count > 0):
        continue
    print(f'{feat} outliers = {lower_count + upper_count}: lower_fence: {lower_fence}, upper_fence: {upper_fence}, lower_count: {lower_count}, upper_count: {upper_count}')

### Handling Outliers

https://www.projectpro.io/recipes/deal-with-outliers-in-python

* Drop
* Mark
* Rescale

In [None]:
# code here

## Feature Engineering

In [None]:
# encode sex as a number in both splits: female -> 0, male -> 1
sex_codes = {'female': 0, 'male': 1}
for frame in (X_train, X_test):
    frame['sex'] = frame['sex'].map(sex_codes)
X_train['sex'].value_counts()

In [None]:
# reducing labels
import re

def cat_home(r):
    """Reduce a passenger's free-text ``home_dest`` to a coarse region label.

    Parameters:
        r: a row (or any mapping) with a ``home_dest`` entry.

    Returns:
        'Missing' when the value is absent, 'North America' when the stripped
        text ends in two capital letters, and 'Not North America' otherwise.
    """
    value = r['home_dest']
    # A missing value may be None rather than NaN (e.g. a NULL in a text
    # column); str(None) is 'None', which the 'nan' string test below misses.
    if pd.isna(value):
        return 'Missing'
    text = str(value).strip()
    if text == 'nan':
        return 'Missing'
    if re.search(r'[A-Z]{2}$', text[-2:]):
        return 'North America'
    return 'Not North America'

# add the reduced home label to both splits, then compare the distributions
for frame in (X_train, X_test):
    frame['cat_home'] = frame.apply(cat_home, axis=1)

train_home_counts = X_train['cat_home'].value_counts()
test_home_counts = X_test['cat_home'].value_counts()
print(train_home_counts)
print()
print(test_home_counts)

In [None]:
# check info (dtype and non-null count per column) to spot the remaining
# categorical features
X_train.info()

In [None]:
# start list of features we won't use for analysis
# (the columns stay in X_train; later cells drop them on the fly)
features_to_drop = ['name', 'ticket', 'cabin', 'boat', 'body', 'home_dest']

## More EDA

### nlargest

* n = 6
* data.nlargest(n, "Employee Salary", keep = "all")

In [None]:
# rows with the n highest fares; keep='all' also returns every row tied with
# the last one, so more than n rows can come back
n = 6
X_train.nlargest(n, 'fare', keep='all')

### nsmallest

* n = 7
* data.nsmallest(n, "Employee Salary", keep = "all")

In [None]:
# rows with the n lowest fares; keep='all' also returns every row tied with
# the last one, so more than n rows can come back
n = 7
X_train.nsmallest(n, 'fare', keep='all')

### Crosstab

In [None]:
# passenger counts for each home region (rows) by passenger class (columns)
pd.crosstab(X_train['cat_home'], X_train['pclass'])

In [None]:
import numpy as np
import seaborn as sns

# mean fare for each home region (rows) by passenger class (columns);
# the aggregation is named by string because recent pandas releases warn
# when a NumPy callable such as np.mean is passed instead
result_crosstab = pd.crosstab(index=X_train['cat_home'],
                              columns=X_train['pclass'],
                              values=X_train['fare'],
                              aggfunc='mean')

sns.heatmap(result_crosstab, annot=True, fmt='g')

### Pivot Table

In [None]:
# passenger counts per home region (rows) and passenger class (columns);
# combinations that never occur are shown as 0
X_train.pivot_table(index=['cat_home'],
                    columns=['pclass'],
                    aggfunc='size',
                    fill_value=0)

In [None]:
# same count table as above, kept in a variable and drawn as a heatmap
pivot_options = {
    'index': ['cat_home'],
    'columns': ['pclass'],
    'aggfunc': 'size',
    'fill_value': 0,
}
result_pivot = pd.pivot_table(X_train, **pivot_options)

sns.heatmap(result_pivot, annot=True, fmt='g')

### Rug Plot

In [None]:
# one tick per observed fare along the x axis ...
sns.rugplot(x='fare', data=X_train, height=.03, color='darkblue')
# ... drawn under a histogram of the same feature with a kde curve
sns.histplot(x='fare', data=X_train, kde=True);

### Strip Plot

In [None]:
# every fare drawn as an individual point along the x axis
sns.stripplot(x=X_train['fare']);

### Describe Include Object

In [None]:
# summary (count, unique, top, freq) of the text (object) columns only
X_train.describe(include='object')

### Count Plot

In [None]:
# one count plot per low-cardinality text feature, laid out on a single row
all_cats = X_train.select_dtypes(include='object')
cat_cols = all_cats.columns[all_cats.nunique() < 10]

rows = 1
# keep the original four-slot row, but widen it when there are more features
# so add_subplot never receives a position beyond the grid
cols = max(4, len(cat_cols))
fig = plt.figure(figsize=(16, 6))
for i, col in enumerate(cat_cols):
    ax = fig.add_subplot(rows, cols, i + 1)
    sns.countplot(x=X_train[col], ax=ax)
    # applies to the axes that was just added (the current axes)
    plt.xticks(rotation=90, ha='right')

fig.tight_layout()
plt.show()

### Group numeric features by each categorical feature

https://towardsdatascience.com/11-simple-code-blocks-for-complete-exploratory-data-analysis-eda-67c2817f56cd

In [None]:
# mean of the numeric features within each level of every low-cardinality
# text feature
for column in X_train.select_dtypes(include='object'):
    if X_train[column].nunique() < 10:
        # numeric_only=True: the remaining text columns (name, ticket, ...)
        # cannot be averaged and make pandas >= 2.0 raise a TypeError
        display(X_train.groupby(column).mean(numeric_only=True))

### Correlation

In [None]:
# feature on feature
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style='white')

# compute the correlation matrix on the numeric features only: the text
# columns that are still present (e.g. embarked, cat_home) cannot be
# correlated and make pandas >= 2.0 raise instead of being skipped
corr = (X_train.drop(features_to_drop, axis=1)
               .select_dtypes(include='number')
               .corr(method='pearson'))

# generate a mask for the upper triangle (diagonal included)
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# set up the matplotlib figure
f, ax = plt.subplots(figsize=(10,10))

# generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={'shrink': .5}, annot=True);

### Variance Inflation Factor

* Measures how much one predictor is influenced, or inflated, by the presence, or correlation, of another predictor
* Quick measure of the contribution of a predictor to the standard error, the standard deviation of a sample, in regression

https://www.statisticshowto.com/variance-inflation-factor/

In [None]:
# vif: one variance inflation factor per numeric feature that is kept
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_train = X_train.drop(features_to_drop, axis=1)._get_numeric_data()
design = vif_train.values
vif_data = []
for position in range(vif_train.shape[1]):
    vif_data.append(variance_inflation_factor(design, position))

vif = pd.DataFrame({'feature': vif_train.columns.values, 'vif': vif_data})
vif

In [None]:
# add adult_male to features to drop and possibly fare
# for a list, we use append to add one item and extend to add multiple items
# (extend mutates the list in place, so running this cell again adds the
# same two names a second time)
features_to_drop.extend(['adult_male', 'fare'])

### Mutual Information

* Measures the mutual dependence between two variables
* Pearson's r captures only linear relationships, whereas MI also captures non-linear relationships
* How much information can be extracted from one variable by observing another variable

In [None]:
# obtain the mutual information values and select features
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectPercentile

mi_train = X_train.drop(features_to_drop, axis=1)._get_numeric_data()

# survived is a class label, so the classification estimator is the one that
# applies (mutual_info_regression expects a continuous target); the fixed
# random_state makes the estimate repeatable between runs
mi = mutual_info_classif(mi_train, y_train, random_state=42)
mi = pd.Series(mi)
mi.index = mi_train.columns
mi.sort_values(ascending=False).plot.bar()
plt.ylabel('Mutual Information')

In [None]:
# correlation analysis against our titanic target (survived):
# one bar per numeric feature that is kept
X_train.drop(features_to_drop, axis=1)._get_numeric_data().corrwith(y_train).plot.bar(
        title = "Correlation with Target", fontsize = 15,
        rot = 45, grid = True);

In [None]:
import matplotlib.pyplot as plt

# one histogram per numeric feature that is kept
X_train.drop(features_to_drop, axis=1)._get_numeric_data().hist()
plt.tight_layout();

In [None]:
# example of a histogram with kde: age in 50 bins, with a thick kde curve
import matplotlib.pyplot as plt
import seaborn as sns

kde_line_style = {'linewidth': 4}
sns.histplot(X_train['age'], bins=50, kde=True, alpha=0.3,
             color='red', line_kws=kde_line_style)

plt.tight_layout()
plt.show();

In [None]:
# an interesting way to view sex and age
# (sex was encoded earlier with .map({'female':0,'male':1}))
import seaborn as sns

sns.swarmplot(x='sex', y='age', data=X_train.drop(features_to_drop, axis=1)._get_numeric_data(), size=1);

In [None]:
# look at pclass counts, bars ordered from the most to the least frequent class
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x=X_train.pclass, order=X_train.pclass.value_counts().index);

In [None]:
# bar chart with 95% confidence intervals (error bars):
# mean of y_train (survived) per passenger class, split by sex
sns.barplot(x='pclass', y=y_train, hue='sex', data=X_train);

In [None]:
# example of using groupby: one bar per (pclass, sex) pair showing its row count
X_train.groupby('pclass')['sex'].value_counts().plot(kind='bar')

In [None]:
# same (pclass, sex) counts as a horizontal bar chart with labelled axes
X_train.groupby('pclass')['sex'].value_counts().plot.barh()
plt.xlabel('count')
plt.ylabel('(pclass, sex)')
plt.title('breakdown of pclass and sex')
plt.show()

In [None]:
# groupby data views: number of rows for each (pclass, embarked) pair
# (computed on the full titanic frame, not only the training split)
titanic.groupby(['pclass', 'embarked']).size()

In [None]:
# using aggregates: several fare statistics per sex in one call
# (computed on the full titanic frame, not only the training split)
titanic.groupby('sex')['fare'].agg(['min', 'max', 'mean', 'median'])

## More Feature Engineering

* sibsp Number of Siblings/Spouses Aboard
* parch Number of Parents/Children Aboard

### One Hot Encoding

In [None]:
# dtypes of the features that are kept, to see which ones still need encoding
X_train.drop(features_to_drop, axis=1).info()

In [None]:
# use sklearn one hot encoder
from sklearn.preprocessing import OneHotEncoder

cat_features = ['embarked', 'cat_home']

# the dense-output switch was renamed from ``sparse`` to ``sparse_output`` in
# scikit-learn 1.2 and the old name was later removed, so try the new
# spelling first and fall back to the old one
ohe_options = {'categories': 'auto', 'drop': 'first', 'handle_unknown': 'ignore'}
try:
    ohe = OneHotEncoder(sparse_output=False, **ohe_options)
except TypeError:
    ohe = OneHotEncoder(sparse=False, **ohe_options)


def _encoded_frame(encoded, index):
    """Wrap an encoded array in a DataFrame aligned on the given row index."""
    return pd.DataFrame(encoded,
                        columns=ohe.get_feature_names_out(cat_features),
                        index=index)


# fit on the training split only, then reuse the fitted encoder on the test split
ohe_train = _encoded_frame(ohe.fit_transform(X_train[cat_features]), X_train.index)
X_train = X_train.join(ohe_train).drop(cat_features, axis=1)

ohe_test = _encoded_frame(ohe.transform(X_test[cat_features]), X_test.index)
X_test = X_test.join(ohe_test).drop(cat_features, axis=1)

print(X_train.drop(features_to_drop, axis=1).shape)
print(X_test.drop(features_to_drop, axis=1).shape)
# info() prints its report itself and returns None, so it is not wrapped in print
X_train.drop(features_to_drop, axis=1).info()

## Feature Selection

* https://towardsdatascience.com/the-power-of-ridge-regression-4281852a64d6

In [None]:
# list features we targeted to drop (display the current content of the list)
features_to_drop

In [None]:
# create new list that we can add and subtract from
# (a separate list object: changing it does not touch features_to_drop)
drop_features = ['name', 'ticket', 'cabin', 'boat', 'body', 'home_dest', 'adult_male', 'fare']

### Variance Threshold

* Feature selector that removes all low-variance features

https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html

In [None]:
from sklearn.feature_selection import VarianceThreshold

# names of the kept features whose variance is above 0.2
candidates = X_train.drop(drop_features, axis=1)
selections = VarianceThreshold(threshold=0.2)
selections.fit(candidates)
candidates.columns.values[selections.get_support()]

### Select K Best

* Select features according to the k highest scores
* Chi-squared stats of non-negative features for classification tasks

https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# names of the six kept features with the highest chi-squared score
candidates = X_train.drop(drop_features, axis=1)
selections = SelectKBest(chi2, k=6)
selections.fit(candidates, y_train)
candidates.columns.values[selections.get_support()]

### Select Features Using Logistic Regression

* Meta-transformer for selecting features based on importance weights

https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

# names of the kept features retained by a logistic-regression based selector
candidates = X_train.drop(drop_features, axis=1)
selections = SelectFromModel(estimator=LogisticRegression(solver='liblinear'))
selections = selections.fit(candidates, y_train)
candidates.columns.values[selections.get_support()]

### Recursive Feature Selection

* Feature ranking with recursive feature elimination
* Given an external estimator that assigns weights to features (e.g., the coefficients of a linear model), the goal of recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features

https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# names of the five kept features left after recursive feature elimination
candidates = X_train.drop(drop_features, axis=1)
selections = RFE(estimator=LogisticRegression(solver='liblinear'),
                 n_features_to_select=5)
selections = selections.fit(candidates, y_train)
candidates.columns.values[selections.get_support()]

## The Model

### Check for Balanced Dataset

In [None]:
# see if the titanic survived (target) is balanced
print(y_train.value_counts())
# NOTE(review): the labels are assigned by position and value_counts() orders
# the classes by descending frequency, so '0' is only correct while class 0
# is the more frequent one -- confirm against the printed counts
y_train.value_counts().plot.pie(labels=['0', '1']).legend();

In [None]:
# we can now build our titanic regression model
# is it balanced? no, use class_weight='balanced'
# is it a small dataset? yes, use liblinear for solver
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, confusion_matrix, roc_auc_score

# drop the unused features once instead of on every call
model_train = X_train.drop(drop_features, axis=1)
model_test = X_test.drop(drop_features, axis=1)

model = LogisticRegression(class_weight='balanced', solver='liblinear')
model.fit(model_train, y_train)
predictions = model.predict(model_test)
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

# ROC-AUC measures how well the scores rank the two classes, so it is fed the
# predicted probability of the positive class (second column of
# predict_proba) rather than the hard 0/1 predictions
rocauc_score = roc_auc_score(y_test, model.predict_proba(model_test)[:, 1])

print(f'Training Score: {model.score(model_train, y_train)}')
print(f'Test Score: {model.score(model_test, y_test)}')

# what is our accuracy? (tn + tp / (total length of our data))
print(f'Test Accuracy (tn + tp / (total length of our data)): {(tn + tp) / (tn + fp + fn + tp)}')
print(f'Roc-Auc Score: {rocauc_score}')

print()
print(confusion_matrix(y_test, predictions))