# Titanic ML Streamlined Attempt

## Imports and check file locations

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization

# Input data files are available in the "./inputs/" directory.
import os
# Walk the ./inputs tree and print the full path of every file found, so the
# CSV locations used by the cells below can be confirmed by eye.
for dirname, _, filenames in os.walk('./inputs'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


## Read files to pandas dataframes

In [None]:
# Load the test and training CSVs from ./inputs into separate dataframes.
test_df = pd.read_csv('./inputs/test.csv')
train_df = pd.read_csv('./inputs/train.csv')

## Look at the first few rows

In [None]:
# Preview the first rows of the test set (rendered as the cell output).
test_df.head()

In [None]:
# Preview the first rows of the training set (rendered as the cell output).
train_df.head()

## Prepare dataset 

We will apply the following transformations on the dataset.

- Keep only the family name from each passenger name. For example, "Braund, Mr. Owen Harris" will become "Braund".
- Split each ticket into its prefix and its number. For example, ticket "STON/O2. 3101282" will become the prefix "STON/O2." and the number "3101282"; a ticket without a prefix gets the prefix "NONE".

In [None]:
# Define the pre-processing function
def preprocess(df):
    """Return a copy of *df* with simplified name and ticket columns.

    The input dataframe is left untouched. In the returned copy:

    - "Name" holds only the text before the first comma (the family name).
    - "Ticket_number" holds the text after the last space of "Ticket".
    - "Ticket_item" holds everything before the last space of "Ticket",
      with remaining spaces replaced by underscores, or "NONE" when the
      ticket contains no space at all.
    """

    def _surname(full_name):
        # Everything ahead of the first comma.
        surname, _, _ = full_name.partition(",")
        return surname

    def _ticket_tail(ticket):
        # Last space-separated piece; the whole ticket when it has no space.
        return ticket.rsplit(" ", 1)[-1]

    def _ticket_prefix(ticket):
        # Split once at the last space; an empty separator means no prefix.
        head, separator, _ = ticket.rpartition(" ")
        return head.replace(" ", "_") if separator else "NONE"

    result = df.copy()
    tickets = result["Ticket"]
    result["Name"] = result["Name"].apply(_surname)
    result["Ticket_number"] = tickets.apply(_ticket_tail)
    result["Ticket_item"] = tickets.apply(_ticket_prefix)
    return result

In [None]:
## Preprocess the data with our function
# Run the same transformation on both sets so they end up with the same
# derived columns (Ticket_number, Ticket_item) and the shortened Name.
prep_train_df = preprocess(train_df)
prep_test_df = preprocess(test_df)

# Preview the processed training rows (rendered as the cell output).
prep_train_df.head()

## Feature selection

Let's build the list of input features for the model. Notably, we don't want to train our model on the "PassengerId", "Ticket" and "Ticket_number" features, and the "Survived" target must be excluded as well.

In [None]:
# Define the input features
# Start from every pre-processed column, then drop the raw ticket, the
# passenger identifier, the target and the extracted ticket number.
input_features = prep_train_df.columns.tolist()
for dropped_column in ("Ticket", "PassengerId", "Survived", "Ticket_number"):
    input_features.remove(dropped_column)

print(f"Input features: {input_features}")

In [None]:
# Select the model inputs for the training and test sets and pull the target
# out of the training set. No hold-out split is made here: the two sets come
# from the separate train/test dataframes pre-processed earlier.
X_train = prep_train_df[input_features]
y_train = prep_train_df["Survived"].to_numpy()  # target as a NumPy array
X_test = prep_test_df[input_features]

In [None]:
# Preview the selected training features (rendered as the cell output).
X_train.head()

In [None]:
# Look at the data a bit before we change it
# Show the dtype of every selected input column.
column_types = X_train.dtypes
print(f"Data Types:\n{column_types}")

In [None]:
# TODO: Look at whether a column transformer makes this code much simpler!!!

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Column groups: the categoricals are one-hot encoded, the numeric columns are
# standardized, and anything left over is passed through unchanged.
categorical_features = ['Pclass', 'Name', 'Sex', 'Cabin', 'Embarked', 'Ticket_item']
scalable_columns = ['Age', 'SibSp', 'Parch', 'Fare']
other_features = [x for x in input_features if x not in categorical_features and x not in scalable_columns]

print(f"Categorical Features: {categorical_features}")
print(f"Scalable Features: {scalable_columns}")
print(f"Other Features: {other_features}")

# Instantiating the Scikit-Learn Scaler as part of a column transformer to target certain columns.
# handle_unknown='ignore' lets categories that only occur in the test set
# encode as all zeros instead of raising. sparse_threshold=0 forces a dense
# array so the result can be wrapped directly in a DataFrame below.
ct = ColumnTransformer([
        ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('scaler', StandardScaler(), scalable_columns)
    ], remainder='passthrough', sparse_threshold=0)

# Learn the encoding/scaling parameters from the training set only, then apply
# the already-fitted transformer to the test set.
X_train_scaled = ct.fit_transform(X_train)
X_test_scaled = ct.transform(X_test)

# Output column names are only available once the transformer has been fitted.
feature_names = ct.get_feature_names_out()

# Rebind X_train / X_test to the transformed frames, keeping the original row
# index. NOTE: re-running this cell without re-running the feature-selection
# cell will fail, because the raw columns no longer exist in X_train.
X_train = pd.DataFrame(X_train_scaled, columns=feature_names, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=feature_names, index=X_test.index)

In [None]:
# Preview the training features after the column transformer (cell output).
X_train.head()

In [None]:
# Preview the test features after the column transformer (cell output).
X_test.head()