# Titanic ML First Attempt

## Imports and check file locations

In [714]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization

# Input data files are available in the "./inputs/" directory.
import os
for dirname, _, filenames in os.walk('./inputs'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


./inputs/test.csv
./inputs/train.csv
./inputs/gender_submission.csv


## Read files to pandas dataframes

In [715]:
test_df = pd.read_csv('./inputs/test.csv')
train_df = pd.read_csv('./inputs/train.csv')

## Look at the first few rows

In [716]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [717]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Prepare dataset 

We will apply the following transformations on the dataset.

- Tokenize the names. For example, "Braund, Mr. Owen Harris" will become ["Braund", "Mr.", "Owen", "Harris"].
- Extract any prefix in the ticket. For example ticket "STON/O2. 3101282" will become "STON/O2." and 3101282.

In [718]:
# Define the pre-processing function
def preprocess(df):
    df = df.copy()
    
    def ticket_number(x):
        return x.split(" ")[-1]
        
    def ticket_item(x):
        items = x.split(" ")
        if len(items) == 1:
            return "NONE"
        return "_".join(items[0:-1])
    
    def family_name(x):
        return x.split(",")[0]
    
    df["Name"] = df["Name"].apply(family_name)
    df["Ticket_number"] = df["Ticket"].apply(ticket_number)
    df["Ticket_item"] = df["Ticket"].apply(ticket_item)                     
    return df

In [719]:
## Preprocess the data with our function
prep_train_df = preprocess(train_df)
prep_test_df = preprocess(test_df)

prep_train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Ticket_number,Ticket_item
0,1,0,3,Braund,male,22.0,1,0,A/5 21171,7.25,,S,21171,A/5
1,2,1,1,Cumings,female,38.0,1,0,PC 17599,71.2833,C85,C,17599,PC
2,3,1,3,Heikkinen,female,26.0,0,0,STON/O2. 3101282,7.925,,S,3101282,STON/O2.
3,4,1,1,Futrelle,female,35.0,1,0,113803,53.1,C123,S,113803,NONE
4,5,0,3,Allen,male,35.0,0,0,373450,8.05,,S,373450,NONE


## Feature selection

Let's keep the list of the input features of the model. Notably, we don't want to train our model on the "PassengerId" and "Ticket" features.

In [720]:
# Define the input features
input_features = list(prep_train_df.columns)
input_features.remove("Ticket")
input_features.remove("PassengerId")
input_features.remove("Survived")
input_features.remove("Ticket_number")

print(f"Input features: {input_features}")

Input features: ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked', 'Ticket_item']


In [721]:
# Split the data into training and testing sets and separate the input features from the target
X_train = prep_train_df[input_features]
y_train = prep_train_df["Survived"].to_numpy()
X_test = prep_test_df[input_features]

In [722]:
X_train.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Ticket_item
0,3,Braund,male,22.0,1,0,7.25,,S,A/5
1,1,Cumings,female,38.0,1,0,71.2833,C85,C,PC
2,3,Heikkinen,female,26.0,0,0,7.925,,S,STON/O2.
3,1,Futrelle,female,35.0,1,0,53.1,C123,S,NONE
4,3,Allen,male,35.0,0,0,8.05,,S,NONE


## Handle categorical features

Most ML models can only 

> Even though `pandas.get_dummies` is straightforward to use, a more common approach is to use `OneHotEncoder` from the sklearn library, especially when you are doing machine learning tasks. The primary difference is `pandas.get_dummies` cannot learn encodings; it can only perform one-hot-encoding on the dataset you pass as an input. On the other hand, `sklearn.OneHotEncoder` is a class that can be saved and used to transform other incoming datasets in the future.

Excerpts from https://www.datacamp.com/tutorial/categorical-data

In [723]:
# Look at the data a bit before we change it
print("Data Types:\n" + str(X_train.dtypes))

Data Types:
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Cabin           object
Embarked        object
Ticket_item     object
dtype: object


In [724]:
# TODO: Look at whether a column transformer makes this code much simpler!!!

from sklearn.preprocessing import OneHotEncoder

categorical_features = [ 'Pclass', 'Name', 'Sex', 'Cabin', 'Embarked', 'Ticket_item' ]
other_features = [x for x in input_features if x not in categorical_features]

print(f"Categorical Features: {categorical_features}")
print(f"Other Features: {other_features}")

# Instantiating the Scikit-Learn OHE object
ohe = OneHotEncoder(handle_unknown='ignore')

# Check the shape of the data before transformation
print("X_train categorical features shape:")
print(X_train[categorical_features].shape)

# Fitting the categorical features from the DataFrame to the Scikit-Learn one-hot encoder
X_train_encoded = ohe.fit_transform(X_train[categorical_features])
flattened_categories = [item for sublist in ohe.categories_ for item in sublist]

# Using the output dummies and transformer categories to produce a cleaner looking dataframe
X_train_encoded_df = pd.DataFrame(data = X_train_encoded.toarray(), 
                                  columns = flattened_categories)

# Concatenating the one-hot encoded dataframe with the original dataframe
X_train = pd.concat([X_train[other_features], X_train_encoded_df], axis=1)

Categorical Features: ['Pclass', 'Name', 'Sex', 'Cabin', 'Embarked', 'Ticket_item']
Other Features: ['Age', 'SibSp', 'Parch', 'Fare']
X_train categorical features shape:
(891, 6)


In [725]:
# Fitting the categorical features from the DataFrame to the Scikit-Learn one-hot encoder
X_test_encoded = ohe.transform(X_test[categorical_features])

# Using the output dummies and transformer categories to produce a cleaner looking dataframe
X_test_encoded_df = pd.DataFrame(data = X_test_encoded.toarray(), 
                                  columns = flattened_categories)

# Concatenating the one-hot encoded dataframe with the original dataframe
X_test = pd.concat([X_test[other_features], X_test_encoded_df], axis=1)

In [726]:
print(X_train)

      Age  SibSp  Parch     Fare    1    2    3  Abbing  Abbott  Abelson  ...  \
0    22.0      1      0   7.2500  0.0  0.0  1.0     0.0     0.0      0.0  ...   
1    38.0      1      0  71.2833  1.0  0.0  0.0     0.0     0.0      0.0  ...   
2    26.0      0      0   7.9250  0.0  0.0  1.0     0.0     0.0      0.0  ...   
3    35.0      1      0  53.1000  1.0  0.0  0.0     0.0     0.0      0.0  ...   
4    35.0      0      0   8.0500  0.0  0.0  1.0     0.0     0.0      0.0  ...   
..    ...    ...    ...      ...  ...  ...  ...     ...     ...      ...  ...   
886  27.0      0      0  13.0000  0.0  1.0  0.0     0.0     0.0      0.0  ...   
887  19.0      0      0  30.0000  1.0  0.0  0.0     0.0     0.0      0.0  ...   
888   NaN      1      2  23.4500  0.0  0.0  1.0     0.0     0.0      0.0  ...   
889  26.0      0      0  30.0000  1.0  0.0  0.0     0.0     0.0      0.0  ...   
890  32.0      0      0   7.7500  0.0  0.0  1.0     0.0     0.0      0.0  ...   

     SOTON/O.Q.  SOTON/O2  

In [727]:
print(y_train)

[0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1 0 1 1 1 0 1 0 0 1 0 0 1 1 0 0 0 1
 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0
 1 0 0 0 1 1 0 1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0
 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1
 0 1 1 0 0 1 0 1 1 1 1 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 1 1 0 1 0 1 0
 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 1
 1 0 1 0 0 0 0 0 1 1 1 0 1 1 0 1 1 0 0 0 1 0 0 0 1 0 0 1 0 1 1 1 1 0 0 0 0
 0 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 0 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 0 0
 0 1 0 0 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 1 1 1
 1 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0 1 1 1 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0
 1 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 1 1 1 0 0 1 0 1 0 0 1 0 0 1
 1 1 1 1 1 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0
 0 0 1 1 0 1 0 0 1 0 0 0 

In [728]:
# Check there are not any columns that are not float64, int64, or bool
for col in X_train.dtypes:
    if col not in ["float64", "int64", "bool"]:
        print(col)

## Feature scaling

If the data your model uses are not scaled similarly then it can be skewed when one feature runs 1-100000000 and another (which could even be more important) runs from 0-7 

Scaling beings these all to the same scale 
https://www.datacamp.com/tutorial/preprocessing-in-data-science-part-3-scaling-synthesized-data

In [729]:
# Print raw data
print(test_df.head(1))

   PassengerId  Pclass              Name   Sex   Age  SibSp  Parch  Ticket  \
0          892       3  Kelly, Mr. James  male  34.5      0      0  330911   

     Fare Cabin Embarked  
0  7.8292   NaN        Q  


In [730]:
fn = ohe.get_feature_names_out()

In [731]:
X_train.head() 

Unnamed: 0,Age,SibSp,Parch,Fare,1,2,3,Abbing,Abbott,Abelson,...,SOTON/O.Q.,SOTON/O2,SOTON/OQ,STON/O2.,STON/O_2.,SW/PP,W./C.,W.E.P.,W/C,WE/P
0,22.0,1,0,7.25,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,38.0,1,0,71.2833,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,26.0,0,0,7.925,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,35.0,1,0,53.1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,35.0,0,0,8.05,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [732]:
scalable_columns = ['Age', 'SibSp', 'Parch', 'Fare']
other_features = [x for x in X_train.columns if x not in scalable_columns]

# To use ColumnTransformer we need all the columns to be strings
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

X_train[scalable_columns].head()

Unnamed: 0,Age,SibSp,Parch,Fare
0,22.0,1,0,7.25
1,38.0,1,0,71.2833
2,26.0,0,0,7.925
3,35.0,1,0,53.1
4,35.0,0,0,8.05


In [733]:
X_test[scalable_columns].head()

Unnamed: 0,Age,SibSp,Parch,Fare
0,34.5,0,0,7.8292
1,47.0,1,0,7.0
2,62.0,0,0,9.6875
3,27.0,0,0,8.6625
4,22.0,1,1,12.2875


In [734]:
print("Check that shapes of the data are the same (in terms of columns)")
print("Check that the shape (rows, columns)")
print(X_train.shape)
print(X_test.shape)

Check that shapes of the data are the same (in terms of columns)
Check that the shape (rows, columns)
(891, 872)
(418, 872)


In [735]:
# TODO: Consider using a pipeline to make this code more readable!!!

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Instantiating the Scikit-Learn Scaler as part of a column transformer to target certain columns
ct = ColumnTransformer([
        ('scaler', StandardScaler(), scalable_columns)
    ], remainder='passthrough')

# Scale just what we want to
X_train_scaled = ct.fit_transform(X_train)
X_test_scaled = ct.transform(X_test)

# Get the feature names from the column transformer
feature_names = ct.get_feature_names_out()

# When you use fit_transform method of ColumnTransformer, it returns a numpy array and not a DataFrame, 
# so the column names are lost. However, you can convert the output back to a DataFrame and assign the column names manually.
# Convert the array back to a dataframe and assign column names
X_train = pd.DataFrame(X_train_scaled, columns=feature_names)
X_test = pd.DataFrame(X_test_scaled, columns=feature_names)

ValueError: Shape of passed values is (418, 876), indices imply (418, 872)

In [None]:
X_train.head()

In [None]:
print("Check that shapes of the data are the same (in terms of columns)")
print("Check that the shape (rows, columns)")
print(f"Train {X_train.shape}")
print(f"Test  {X_test.shape}")
print(f"feature_names length {len(feature_names)}")

In [None]:
X_test.head()