In [None]:
!pip install -U scikit_learn pandas 'numpy<=1.23.0' seaborn matplotlib

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pathlib as pl
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.calibration import CalibratedClassifierCV, CalibrationDisplay
from sklearn.metrics import f1_score

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Load the Dataset

In [None]:
# Resolve the data directory: use the Kaggle input mount when it exists,
# otherwise fall back to a local ``data`` folder.
kaggle_dir = pl.Path('/kaggle/input/spaceship-titanic/')
data_path = kaggle_dir if kaggle_dir.exists() else pl.Path('data')

# Name of the target column used throughout the notebook.
label = 'Transported'

# Read the training split into a DataFrame and report its dimensions.
df = pd.read_csv(data_path.joinpath('train.csv'))
print("Full train dataset shape is {}".format(df.shape))
df.head(5)

Before starting, there are some columns that can be further broken down into more data, which will make our analysis more complete.

After we impute missing data in the original columns, we'll have to generate these surrogate columns again.

In [None]:
def extract_l1_features(df):
    """Derive first-level features from the raw passenger columns.

    Reads ``PassengerId``, ``Cabin``, ``Name`` and the five spending columns
    and returns a new frame with these additional columns:

    - ``PassengerGroup`` / ``PassengerGroupId``: the two halves of
      ``PassengerId`` split on ``_``.
    - ``NoInPassengerGroup``: number of rows sharing the same ``PassengerGroup``.
    - ``CabinDeck`` / ``CabinNum`` / ``CabinSide``: ``Cabin`` split on ``/``.
    - ``TotalSpendings``: sum of the five spending columns (NaN if any is NaN).
    - ``FirstName`` / ``FamilyName``: ``Name`` split on the first space.
    - ``NoRelatives``: number of rows sharing the same ``FamilyName``.

    ``Cabin`` and ``Name`` are dropped from the result. The input frame is
    left untouched. The merges give the result a fresh default index.
    """
    # Work on a copy so the caller's frame is never modified.
    df = df.copy()

    df[["PassengerGroup", "PassengerGroupId"]] = df["PassengerId"].str.split('_', expand=True)

    # Size of each travelling group, attached to every member of the group.
    group_sizes = (
        df.groupby('PassengerGroup')
        .aggregate({'PassengerId': 'size'})
        .reset_index()
        .rename(columns={"PassengerId": "NoInPassengerGroup"})
    )
    # The count column has to be part of the merged selection, otherwise the
    # merge adds nothing to the frame.
    df = df.merge(
        group_sizes[["PassengerGroup", "NoInPassengerGroup"]],
        how='left',
        on=['PassengerGroup'],
    )

    # Split Cabin into Deck, Number and Side features
    df[["CabinDeck", "CabinNum", "CabinSide"]] = df["Cabin"].str.split("/", expand=True)

    # Create TotalSpendings feature
    df["TotalSpendings"] = (
        df["RoomService"] + df["FoodCourt"] + df["ShoppingMall"] + df["Spa"] + df["VRDeck"]
    )

    # n=1 splits on the first space only, so a name with more than two tokens
    # still yields exactly two columns instead of raising on assignment.
    df[["FirstName", "FamilyName"]] = df["Name"].str.split(' ', n=1, expand=True)

    # Create NoRelatives feature: number of passengers sharing a family name.
    relatives = (
        df.groupby('FamilyName')['PassengerId']
        .count()
        .reset_index()
        .rename(columns={"PassengerId": "NoRelatives"})
    )
    df = df.merge(relatives[["FamilyName", "NoRelatives"]], how='left', on=['FamilyName'])

    return df.drop(columns=['Cabin', 'Name'])

# Build the first-level features from a copy of the raw frame and preview them.
df1 = extract_l1_features(df.copy())
df1.head()

Helper functions to plot univariate and bivariate charts for categorical and numeric data.

In [None]:
def config_axes(list_df_cols, dependent: str | None = None, n_per_row=None):
    """Create a grid of matplotlib axes sized for a list of columns.

    Parameters
    ----------
    list_df_cols : list
        Column names to be plotted. If ``dependent`` is in the list it is
        removed **in place**, so the caller's list is modified.
    dependent : str or None
        Name of the dependent variable, which does not get a panel of its own.
    n_per_row : int or None
        When given, the grid has one row per column and ``n_per_row`` axes in
        each row. When omitted, the columns are packed into a near-square grid.

    Returns
    -------
    numpy.ndarray
        Always a 2-D array of Axes (``squeeze=False``), so ``.flat`` works
        even when the grid is 1x1.
    """
    if dependent is not None and dependent in list_df_cols:
        list_df_cols.remove(dependent)

    n_vars = len(list_df_cols)
    if n_per_row:
        # At least one row so that plt.subplots never gets a zero dimension.
        nrows = max(n_vars, 1)
        ncols = n_per_row
    else:
        ncols = max(int(np.ceil(np.sqrt(n_vars))), 1)
        # Only as many rows as needed, instead of a full square grid that
        # leaves whole rows empty.
        nrows = max(int(np.ceil(n_vars / ncols)), 1)

    figsize = (ncols * 5, nrows * 4)
    _, axes = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)
    return axes


def plot_cat_cols(df: pd.DataFrame, dependent: str | None = None):
    """Plot every categorical column of ``df``, one panel per column.

    Columns of dtype object, category or bool are selected and plotted in
    alphabetical order. Without ``dependent`` each panel is a count plot;
    with ``dependent`` each panel is a bar plot of ``dependent`` per category.
    The first bar container of every panel is annotated with its values.

    Parameters
    ----------
    df : pd.DataFrame
        Data to plot.
    dependent : str or None
        Optional column plotted on the y axis; it gets no panel of its own.
    """
    list_df_cols = sorted(
        df.select_dtypes(["object", "category", "bool"]).columns
    )
    if dependent is not None and dependent in list_df_cols:
        list_df_cols.remove(dependent)

    if not list_df_cols:
        # Nothing to plot; avoid asking matplotlib for an empty grid.
        return

    # np.ravel copes with a bare Axes, a 1-D array and a 2-D grid alike.
    all_axes = np.ravel(config_axes(list_df_cols, dependent))

    for col, ax in zip(list_df_cols, all_axes):
        # Missing values are not a category: keep them out of the bar order.
        order = df[col].dropna().sort_values().unique()
        if dependent is None:
            sns.countplot(data=df, x=col, order=order, ax=ax)
        else:
            sns.barplot(
                data=df,
                x=col,
                y=dependent,
                order=order,
                orient="v",
                ax=ax,
            )
        if ax.containers:
            ax.bar_label(ax.containers[0])

    # Hide the grid cells that did not receive a plot.
    for ax in all_axes[len(list_df_cols):]:
        ax.set_visible(False)

    plt.tight_layout()


def plot_numeric_cols(df: pd.DataFrame, dependent: str | None = None):
    """Plot every numeric column of ``df``, two panels per column.

    Without ``dependent`` each row shows a histogram and a box plot of the
    column. With ``dependent`` each row shows a violin plot and a box plot of
    the column grouped by ``dependent``.

    Parameters
    ----------
    df : pd.DataFrame
        Data to plot.
    dependent : str or None
        Optional grouping column placed on the x axis; it gets no row of its
        own even when it is numeric.
    """
    # Exclude the dependent variable here rather than relying on config_axes
    # removing it from the list as a side effect.
    list_df_cols = [
        col for col in df.select_dtypes(np.number).columns if col != dependent
    ]
    if not list_df_cols:
        # Nothing to plot; avoid asking matplotlib for an empty grid.
        return

    # np.ravel copes with a bare Axes, a 1-D array and a 2-D grid alike.
    axes = iter(np.ravel(config_axes(list_df_cols, dependent, 2)))
    for col in list_df_cols:
        left_ax = next(axes)
        right_ax = next(axes)
        if dependent is None:
            sns.histplot(data=df, color="b", x=col, ax=left_ax)
            sns.boxplot(data=df, color="b", y=col, ax=right_ax)
        else:
            sns.violinplot(data=df, x=dependent, y=col, ax=left_ax)
            sns.boxplot(data=df, x=dependent, y=col, ax=right_ax)

    plt.tight_layout()

# Univariate analysis

## Categorical variables

In [None]:
# Count plots for the categorical columns. Identifier-like columns are dropped
# first: they have too many distinct values to be plotted as categories.
plot_cat_cols(
    df1.drop(
        columns=["PassengerId", "FirstName", "FamilyName", "CabinNum", "PassengerGroup"]
    )
)

### Conclusions

- Cabin decks F and G have a lot more people than the other ones, it may be valuable to see if other features explain this disparity. Maybe rich and poor people travel separately, like in the original Titanic?
- Cabin deck T has only 5 people, who are they?
- Most people are going to TRAPPIST-1e.
- Most people come from Earth.
- The dataset is balanced: roughly the same amount of transported and non-transported people in the ship.
- PassengerGroupId has logarithmic behavior, but this is just the nature of the data (all groups have at least one person with group ID 1 and larger groups are more rare).

## Numerical variables

In [None]:
# Histogram and box plot for every numeric column of the feature-enriched frame.
plot_numeric_cols(df1)

### Conclusions

- Age is not totally normally distributed. There are more young people than old.
- Most people spend very little to no money. Are there lots of poor people or is there some explanation to this?

# Bivariate analysis against dependent variable

## Categorical variables

In [None]:
# Bar plots of the target per category. Identifier-like columns are dropped
# first: they have too many distinct values to be plotted as categories.
plot_cat_cols(df1.drop(columns=["PassengerId", "FirstName",'FamilyName','CabinNum','PassengerGroup']), dependent=label)

### Conclusions

- Cabin decks B and C have more transported people (proportionally, inside the group).
- Even though we have more people coming from Earth and going to TRAPPIST-1e, these are the sources of the fewest transported people.
- People in cryogenic sleep have been transported much more than awake people.

## Numerical variables

In [None]:
# Violin and box plots of each numeric column of the raw frame, grouped by the target.
plot_numeric_cols(df, dependent=label)

### Conclusions

- Age has no bearing on who gets transported.
- For some reason, people who get transported spend **less** money on room service, Spa and VR deck.

# Bivariate analysis, misc

There could be something going on with lots of people in cryogenic sleep being transported and people who spend less in certain activities also being transported. Let's check out the relationship between CryoSleep and the numeric variables. 

In [None]:
# Violin and box plots of each numeric column of the raw frame, grouped by CryoSleep.
plot_numeric_cols(df, dependent='CryoSleep')

### Conclusions

- Age has no bearing on who gets turned into a popsicle.
- People in cryogenic sleep spend no money **at all**, but because of that, we don't know whether they are wealthy or not.

Let's also look at passenger spending by age. Maybe old people are richer and spend more, while kids spend less.

In [None]:
# Bucket Age into brackets. pd.cut builds right-closed intervals by default,
# i.e. (0, 4], (4, 12], ...; include_lowest=True also closes the first bin on
# the left so that passengers aged exactly 0 fall into "0 - 4" instead of
# becoming NaN.
df["AgeCat"] = pd.cut(
    df.Age,
    bins=[0, 4, 12, 17, 25, 34, 55, 80],
    labels=["0 - 4", "5 - 12", "13 - 17", "18 - 25", "26 - 34", "35 - 55", "56 - 80"],
    include_lowest=True,
)

In [None]:
# Numeric columns grouped by age bracket; Age itself is dropped because
# AgeCat, which is derived from it, is the grouping variable.
plot_numeric_cols(df.drop(columns='Age'), dependent='AgeCat')

### Conclusions

- Kids also do not spend any money.

A little bird told me to take a look at this one...

In [None]:
# Contingency table of age bracket versus VIP status.
pd.crosstab(df['AgeCat'],df['VIP'])

### Conclusions

- People under 18 are not VIPs.

In [None]:
# Remove the temporary AgeCat helper column from the raw frame again.
del df['AgeCat']

Let's look at who comes from where and who goes where in this ship.

In [None]:
# Passenger counts for every (HomePlanet, Destination) pair, drawn as an annotated heatmap.
sns.heatmap(pd.crosstab(df1.HomePlanet,df1.Destination), annot=True, fmt="d", linewidths=.5);

### Conclusions

- Most people who embark on Earth go to TRAPPIST-1e.

## TODO

- Visualize spending, cryo sleep, home planet and destination by deck.
- Analyze the compositions of cabins B/C (the ones with most transported) and F/G (the ones with the most people) against CryoSleep and spending.

# Dealing with missing values

In this section, we'll deal with the missing data in the original dataset, without the generated features. After imputing as many missing values as we can, we'll create those columns again.

In [None]:
# Number of missing values in each column of the raw frame.
df.isna().sum()

Let's use our knowledge that kids and sleepers don't spend and fill those missing values in a more informed way.

In [None]:
def fill_nans_by_age_and_cryosleep(df):
    """Fill missing spending and VIP values using age and cryosleep rules.

    Rules applied (to missing cells only; observed values are never changed):

    - passengers younger than 13 or in cryosleep get 0 for a missing value in
      ``RoomService``, ``FoodCourt``, ``ShoppingMall``, ``Spa`` or ``VRDeck``;
    - passengers younger than 18 get ``False`` for a missing ``VIP``.

    Rows whose ``Age`` or ``CryoSleep`` is itself missing do not match the
    corresponding rule. Returns a new frame; the input is left untouched.
    """
    # Work on a copy so the caller's frame is never modified.
    df = df.copy()

    spending_columns = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    non_spenders = (df["Age"] < 13) | (df["CryoSleep"] == True)

    for col in spending_columns:
        # Restrict the assignment to NaN cells so recorded amounts survive.
        df.loc[non_spenders & df[col].isna(), col] = 0

    df.loc[(df["Age"] < 18) & df["VIP"].isna(), "VIP"] = False
    return df

# Apply the rule-based fill, then re-count the missing values per column.
df = fill_nans_by_age_and_cryosleep(df)
df.isna().sum()

The rest of categorical variables will be filled with the mode.

In [None]:
# Categorical columns that actually contain missing values. The boolean mask
# has to be used to filter the counts: taking `.index` of the mask itself
# would return every categorical column, missing values or not.
cat_na_counts = df.select_dtypes(["object", "category", "bool"]).isna().sum()
list_missing_cat_columns = list(cat_na_counts[cat_na_counts > 0].index)

# Replace the missing entries of each such column with its most frequent value.
for col in list_missing_cat_columns:
    df[col] = df[col].fillna(df[col].mode()[0])
df.isna().sum()

Rows with missing numeric values will be filled with sklearn Iterative Imputer. 

In [None]:
# Numeric columns that actually contain missing values (filter the counts by
# the mask; `.index` of the mask alone would list every numeric column).
numeric_na_counts = df.select_dtypes(np.number).isna().sum()
list_missing_numeric_col = list(numeric_na_counts[numeric_na_counts > 0].index)
list_numeric_col = list(df.select_dtypes(np.number).columns)

# The imputer is fitted on all numeric columns so each one can inform the
# others, and it returns one output column per input column. The raw array is
# therefore written back to that same column list; no index or column
# alignment is involved.
df[list_numeric_col] = np.asarray(IterativeImputer().fit_transform(df[list_numeric_col]))

# Every column that had gaps must now be complete.
assert not df[list_missing_numeric_col].isna().any().any()
df.isna().sum()

Clip outliers in numerical columns on the 99% quantile.

In [None]:
def clipping_quantile(dataframe, quantile_values = None, quantile = 0.99, columns = None):
    """Cap the values of numeric columns at an upper threshold.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Input data; it is copied, never modified.
    quantile_values : mapping or pd.Series, optional
        Per-column thresholds, indexed by column name. When omitted they are
        computed from ``dataframe`` as the ``quantile`` of each column.
    quantile : float
        Quantile used when ``quantile_values`` is not supplied.
    columns : list of str, optional
        Columns to clip. Defaults to the notebook-level ``list_numeric_col``.

    Returns
    -------
    pd.DataFrame
        Copy of the input in which values above the threshold are replaced by
        the threshold. Only the upper tail is clipped; NaN values are kept.
    """
    df = dataframe.copy()
    if columns is None:
        # Fall back to the notebook-level list built in the imputation cell.
        columns = list_numeric_col
    columns = list(columns)

    if quantile_values is None:
        quantile_values = df[columns].quantile(quantile)

    for num_column in columns:
        num_values = df[num_column].values
        threshold = quantile_values[num_column]
        df[num_column] = np.where(num_values > threshold, threshold, num_values)
    return df
    
# Cap the numeric columns at their own 99% quantile (thresholds are computed
# from `df` itself because None is passed), then re-plot the distributions.
df = clipping_quantile(df, None, 0.99)
plot_numeric_cols(df)

Now we'll create some additional features based on our previous findings.

In [None]:
def extract_l2_features(df):
    """Add second-level features built on top of the first-level ones.

    New columns: ``DeckPosition`` ("Lower" for decks A-D, "Higher" otherwise),
    ``Regular`` (FoodCourt + ShoppingMall), ``Luxury`` (RoomService + Spa +
    VRDeck), ``DeckAverageSpent`` (summed TotalSpendings of a deck divided by
    its passenger count) and ``FamilySizeCat`` (binned ``NoRelatives``).

    Side effects: the three spending-derived columns are written to the frame
    that was passed in, and ``PassengerId`` is removed from it in place. The
    returned frame is the result of a merge and is a new object.
    """
    def _deck_position(deck):
        return "Lower" if deck in ('A', 'B', 'C', 'D') else "Higher"

    # Create DeckPosition, Regular and Luxury features
    df["DeckPosition"] = df["CabinDeck"].apply(_deck_position)
    df["Regular"] = df["FoodCourt"].add(df["ShoppingMall"])
    df["Luxury"] = df["RoomService"].add(df["Spa"]).add(df["VRDeck"])

    # Per-deck totals: summed spending and head count.
    deck_stats = df.groupby('CabinDeck').agg(
        {'TotalSpendings': 'sum', 'PassengerId': 'size'}
    )
    deck_stats = deck_stats.reset_index()
    # Create DeckAverageSpent feature
    deck_stats['DeckAverageSpent'] = deck_stats['TotalSpendings'].div(deck_stats['PassengerId'])

    # The identifier is no longer needed once the head count is taken.
    del df['PassengerId']

    enriched = df.merge(
        deck_stats[["CabinDeck", "DeckAverageSpent"]],
        how='left',
        on=['CabinDeck'],
    )
    # Create FamilySizeCat feature
    enriched["FamilySizeCat"] = pd.cut(
        enriched["NoRelatives"],
        bins=[0, 2, 5, 10, 300],
        labels=['0 - 2', '3 - 5', '6 - 10', '11 - 208'],
    )

    return enriched

# Rebuild the engineered features from the cleaned (imputed and clipped) data.
# A copy is passed in so that `df` itself is not modified by the extraction.
df2 = extract_l1_features(df.copy())
df2 = extract_l2_features(df2)

In [None]:
# Columns left out of the model input: the individual spending columns, the
# name parts and the passenger group identifier.
irrelevant_columns = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "FirstName", "FamilyName", "PassengerGroup"]

# `drop` returns a new frame, so `df2` keeps all of its columns.
df3 = df2.drop(columns=irrelevant_columns)
df3.info()

# Feature encoding

Depending on the algorithm and library used, some features need to be transformed further for compatibility, e.g.:
- tfdf does not work with Boolean values, but works with Categorical columns and treats string columns as Categorical.
- sklearn does not work with Categorical or string columns.

It is usually a good idea to convert:
- Boolean values into 0/1 integers
- Categorical values into one-hot representations
- Ordinal categorical values into ordinal representations

In [None]:
# Preprocessing for sklearn decision trees
# Categorical encoding, drop redundant columns
df3 = pd.get_dummies(
    df3,
    columns=["HomePlanet", "CryoSleep", "Destination", "VIP", "CabinSide"],
    dtype=int, drop_first=True,
)

# Ordinal encoding. The integer codes must not depend on the order in which
# values happen to appear in the rows, otherwise another frame (e.g. the test
# set) would get a different mapping for the same category. Categorical
# columns therefore use their declared category order, and plain columns use
# sorted unique values. Any other frame fed to the model has to be encoded
# with this same rule.
for col in ["CabinDeck", "DeckPosition", "FamilySizeCat"]:
    if isinstance(df3[col].dtype, pd.CategoricalDtype):
        df3[col] = df3[col].cat.codes
    else:
        df3[col], _ = df3[col].factorize(sort=True)

df3.head()

## Train/Val split

Because random forests can be evaluated on out-of-bag (OOB) data, a train/validation split is not strictly necessary for evaluating the model, but we'll create one anyway.

In [None]:
# Separate the predictors from the 0/1 target, then hold out a stratified 20%
# of the rows for validation.
y = df3['Transported'].astype(int)
X = df3.drop(columns='Transported')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Configure the model

Let's do a grid search over a random forest classifier. The grid search will use stratified cross-validation and the F1-score as the selection metric.

In [None]:
# Create a grid search tuner over forest size and tree depth.
parameters = {"n_estimators": (10, 20, 50, 100), "max_depth": (10, 20, 50, 100, None)}
grid_search = GridSearchCV(
    # `oob_score` takes a plain metric(y_true, y_pred) callable; the fixed
    # random_state makes the forests, and so the search, reproducible.
    RandomForestClassifier(criterion="log_loss", oob_score=f1_score, random_state=42),
    parameters,
    n_jobs=-1,
    # `scoring` expects a scorer name or a scorer(estimator, X, y) callable.
    # Passing the bare metric function `f1_score` is rejected, so the named
    # "f1" scorer is used instead.
    scoring="f1",
    cv=5,
)

# Train the model

We will train the model using a one-liner.

In [None]:
# Run the grid search on the training split only; the held-out split stays unseen.
grid_search.fit(X_train, y_train)

Let's check the parameters of the best model found by the grid search.

In [None]:
# Parameter combination selected by the grid search.
grid_search.best_params_


# Evaluate the model on the Out of bag (OOB) data and the validation dataset

Before training the model, we manually set aside 20% of the dataset for validation (`X_test` and `y_test`).

We can also use Out of bag (OOB) score to validate our random forest estimator.
Each tree in a random forest is trained on a random sample drawn from the training set. The samples that were not drawn for a given tree are known as out-of-bag (OOB) data, and they can be used to evaluate the model without a separate validation set.
OOB score is computed on the OOB data.

Read more about OOB data [here](https://developers.google.com/machine-learning/decision-forests/out-of-bag).

The `oob_score_` attribute holds the metric evaluated on the out-of-bag data.

Note: Larger values are better for this metric.

In [None]:

# Keep the best forest found by the grid search and show its out-of-bag score.
rf = grid_search.best_estimator_
rf.oob_score_

Gini importances

In [None]:
# Horizontal bars keep the feature names readable without rotating tick labels.
plt.barh(rf.feature_names_in_, rf.feature_importances_)
plt.xlabel("Importances")
plt.ylabel("Features")
plt.show()

In [None]:
# F1 score of the best forest on the held-out split.
y_pred=rf.predict(X_test)
f1_score(y_test, y_pred)

# Model calibration

This step is mostly useless, as it does not change the output of the model. But I was learning about it and decided to include it here.

In [None]:
# Draw the calibration curve on the held-out split only. `X` also contains the
# rows the forest was trained on, which would make the curve look better than
# it really is.
CalibrationDisplay.from_estimator(rf, X_test, y_test);

In [None]:
# Fit the calibrator on the training split and judge it on the held-out
# split, so the calibration curve is not drawn on data the model has seen.
calib_rf = CalibratedClassifierCV(rf, cv=3)
calib_rf.fit(X_train, y_train)
CalibrationDisplay.from_estimator(calib_rf, X_test, y_test);

# Submission

In [None]:
def process_features(df: pd.DataFrame) -> pd.DataFrame:
    """Turn a raw passenger frame into the model's input features.

    The steps run in the same order as the training cells of this notebook:
    rule-based fill, mode fill of categorical gaps, iterative imputation of
    numeric gaps, quantile clipping, first- and second-level feature
    extraction, column pruning and encoding.

    The mode, the imputer and the clipping thresholds are all computed from
    ``df`` itself, not reused from the training data. The input frame is not
    modified.
    """
    df = df.copy()

    # --- Missing values (before any derived feature is computed, so that
    # --- TotalSpendings & co. are built from complete, clipped inputs).
    df = fill_nans_by_age_and_cryosleep(df)

    # Only the categorical columns that really have gaps are filled.
    cat_na_counts = df.select_dtypes(["object", "category", "bool"]).isna().sum()
    for col in cat_na_counts[cat_na_counts > 0].index:
        df[col] = df[col].fillna(df[col].mode()[0])

    # The imputer returns one column per input column; write the raw array
    # back to that same column list.
    numeric_columns = list(df.select_dtypes(np.number).columns)
    df[numeric_columns] = np.asarray(IterativeImputer().fit_transform(df[numeric_columns]))

    df = clipping_quantile(df, None, 0.99)

    # --- Feature extraction
    df = extract_l1_features(df)
    df = extract_l2_features(df)

    irrelevant_columns = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "FirstName", "FamilyName", "PassengerGroup"]
    df = df.drop(columns=irrelevant_columns)

    # Preprocessing for sklearn decision trees
    # Categorical Encoding
    df = pd.get_dummies(
        df,
        columns=["HomePlanet", "CryoSleep", "Destination", "VIP", "CabinSide"],
        dtype=int, drop_first=True,
    )

    # Ordinal Encoding. Codes come from the declared category order (for
    # categorical columns) or from the sorted unique values, never from the
    # order of appearance in the rows, which differs from frame to frame.
    # The frame the model was fitted on has to be encoded with the same rule.
    for col in ["CabinDeck", "DeckPosition", "FamilySizeCat"]:
        if isinstance(df[col].dtype, pd.CategoricalDtype):
            df[col] = df[col].cat.codes
        else:
            df[col], _ = df[col].factorize(sort=True)

    return df

In [None]:
# Load the test dataset, remember the passenger ids for the submission file
# (the feature pipeline does not keep them), then build the model features.
test_df = pd.read_csv(data_path / 'test.csv')
submission_id = test_df.PassengerId
test_df = process_features(test_df)

In [None]:
# Get the predictions for the test data.
# The columns are first aligned with the ones the forest was fitted on: same
# names, same order. A dummy column that is absent because its category does
# not occur in the test set is added as all zeros.
test_features = test_df.reindex(columns=rf.feature_names_in_, fill_value=0)
predictions = rf.predict(test_features)

# The forest predicts 0/1 class labels; the submission expects booleans.
n_predictions = (predictions > 0.5).astype(bool)
output = pd.DataFrame({'PassengerId': submission_id,
                       'Transported': n_predictions.squeeze()})
output.to_csv('submission.csv', index=False)
output.head()