In [None]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
import tensorflow_decision_forests as tfdf

# Report the installed TF-DF version alongside the results of this run.
print(f"Found TF-DF {tfdf.__version__}")

# Read the Kaggle Titanic CSVs in order: the training split first, then the
# serving (test) split that predictions are made for.
train_df, serving_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)

# Preview the first rows of the training data.
train_df.head(10)

In [None]:
def preprocess(df):
    """Return a copy of ``df`` with a cleaned ``Name`` and two ticket columns.

    The input frame itself is left untouched. On the returned copy:

    * ``Name``: every space-separated token has leading and trailing commas,
      parentheses, square brackets, periods and quote characters stripped,
      and the tokens are joined back together with single spaces.
    * ``Ticket_number``: the last space-separated token of ``Ticket``.
    * ``Ticket_item``: all tokens before the last one joined with ``_``, or
      the string ``"NONE"`` when the ticket consists of a single token.

    Args:
        df: DataFrame holding string-valued ``Name`` and ``Ticket`` columns.

    Returns:
        A new DataFrame with ``Name`` replaced and the two ticket columns added.
    """
    strip_chars = ",()[].\"'"

    def normalize_name(raw_name):
        cleaned_tokens = (token.strip(strip_chars) for token in raw_name.split(" "))
        return " ".join(cleaned_tokens)

    def ticket_number(ticket):
        # Text after the final space; the whole ticket when it has no space.
        return ticket.rpartition(" ")[2]

    def ticket_item(ticket):
        prefix, separator, _ = ticket.rpartition(" ")
        # An empty separator means the ticket had no space, i.e. one token only.
        return prefix.replace(" ", "_") if separator else "NONE"

    result = df.copy()
    result["Name"] = result["Name"].apply(normalize_name)
    result["Ticket_number"] = result["Ticket"].apply(ticket_number)
    result["Ticket_item"] = result["Ticket"].apply(ticket_item)
    return result
    
# Run the same preprocessing over the training and the serving frames.
preprocessed_train_df, preprocessed_serving_df = map(
    preprocess, (train_df, serving_df)
)

# Preview the first five preprocessed training rows (head() defaults to 5).
preprocessed_train_df.head()

In [None]:
# Start from every preprocessed column, then drop the ones that must not be
# fed to the model: the raw ticket string, the row identifier and the label.
input_features = preprocessed_train_df.columns.tolist()
for excluded_column in ("Ticket", "PassengerId", "Survived"):
    # list.remove raises ValueError if an expected column is absent.
    input_features.remove(excluded_column)
# "Ticket_number" stays in the feature list; add it to the tuple above to
# experiment with excluding it as well.

print(f"Input features: {input_features}")

In [None]:
def tokenize_names(features, labels=None):
    """Divide the names into tokens. TF-DF can consume text tokens natively.

    Args:
        features: Mapping of feature name to value; its "Name" entry is
            replaced in place by the result of ``tf.strings.split``.
        labels: Optional labels, passed through unchanged.

    Returns:
        A ``(features, labels)`` tuple, where ``features`` is the same mapping
        object that was passed in.
    """
    name_tokens = tf.strings.split(features["Name"])
    features["Name"] = name_tokens
    return features, labels

# Training dataset: "Survived" is the label column; names are tokenized.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    preprocessed_train_df, label="Survived"
)
train_ds = train_ds.map(tokenize_names)

# Serving dataset: built without a label, with the same name tokenization.
serving_ds = tfdf.keras.pd_dataframe_to_tf_dataset(preprocessed_serving_df)
serving_ds = serving_ds.map(tokenize_names)

In [None]:
# One FeatureUsage entry per selected input column.
feature_usages = [
    tfdf.keras.FeatureUsage(name=feature_name) for feature_name in input_features
]

model = tfdf.keras.GradientBoostedTreesModel(
    verbose=0,  # Very few logs
    features=feature_usages,
    exclude_non_specified_features=True,  # Only use the features in "features"
    min_examples=1,
    categorical_algorithm="RANDOM",
    shrinkage=0.05,
    split_axis="SPARSE_OBLIQUE",
    sparse_oblique_normalization="MIN_MAX",
    sparse_oblique_num_projections_exponent=2.0,
    num_trees=2000,
    random_seed=1234,  # Fixed seed for the training run
)
model.fit(train_ds)

# Evaluation exposed by the model inspector after fitting.
self_evaluation = model.make_inspector().evaluation()
print(f"Accuracy: {self_evaluation.accuracy} Loss:{self_evaluation.loss}")

#### Step 1: Inspect the trained model to determine feature importances

In [None]:
# NOTE: despite its name, `tfdf_model` holds the model *inspector*, not the model.
tfdf_model = model.make_inspector()
# Maps each importance metric name to its list of (feature, score) entries.
feature_importances = tfdf_model.variable_importances()

# Print feature importance, one metric per line.
for metric_name, ranked_features in feature_importances.items():
    print(f"{metric_name}: {ranked_features}")

In [None]:
INV_MEAN_MIN_DEPTH: [("Sex" (4; #7), 0.8052448900887005), ("Age" (1; #0), 0.37236877049769707), ("Fare" (1; #3), 0.27302635385086366), ("Name" (5; #4), 0.18730685507680786), ("Pclass" (1; #6), 0.1808049957610191), ("Ticket_item" (4; #9), 0.17841578358982724), ("Ticket_number" (4; #10), 0.17836635726902894), ("Parch" (1; #5), 0.1776837530643011), ("Embarked" (4; #2), 0.1760701569016847), ("SibSp" (1; #8), 0.17255726347322736)]
SUM_SCORE: [("Sex" (4; #7), 461.52320827601943), ("Age" (1; #0), 378.0191557586612), ("Fare" (1; #3), 275.2302527445295), ("Name" (5; #4), 118.34333113066077), ("Pclass" (1; #6), 36.22843893558047), ("Parch" (1; #5), 23.516134310058987), ("Ticket_item" (4; #9), 21.838539421965834), ("Ticket_number" (4; #10), 17.21795472210397), ("Embarked" (4; #2), 7.074696451425552), ("SibSp" (1; #8), 0.4004818166622943)]
NUM_AS_ROOT: [("Sex" (4; #7), 34.0), ("Name" (5; #4), 2.0)]
NUM_NODES: [("Age" (1; #0), 428.0), ("Fare" (1; #3), 278.0), ("Name" (5; #4), 55.0), ("Ticket_item" (4; #9), 38.0), ("Sex" (4; #7), 36.0), ("Ticket_number" (4; #10), 23.0), ("Parch" (1; #5), 19.0), ("Pclass" (1; #6), 10.0), ("Embarked" (4; #2), 9.0), ("SibSp" (1; #8), 4.0)]

#### Step 2: Feature selection
- Remove the less important features.