In [9]:
import numpy as np
import pandas as pd
import os

import tensorflow as tf
import tensorflow_decision_forests as tfdf



In [10]:
#load Dataset

train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

train_df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [13]:
#preparando o dataset
#tokenizando nomes, normalizando os dados

def preprocess(df):
    df = df.copy()

    # Normalizing the text
    def normalize_name(x):
        return " ".join([v.strip(",()[].\"'") for v in x.split(" ")])
    
    def ticket_number(x):
        return x.split(" ")[-1]
    
    def ticket_item(x):
        items = x.split(" ")
        if len(items) == 1:
            return "NONE"
        return "_".join(items[0:-1])
    df["Name"] = df["Name"].apply(normalize_name)
    df["Ticket_number"] = df["Ticket"].apply(ticket_number)
    df["Ticket_item"] = df["Ticket"].apply(ticket_item)
    return df

preprocessed_train_df = preprocess(train_df)
preprocessed_test_df = preprocess(test_df)

preprocessed_train_df.head(5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Ticket_number,Ticket_item
0,1,0,3,Braund Mr Owen Harris,male,22.0,1,0,A/5 21171,7.25,,S,21171,A/5
1,2,1,1,Cumings Mrs John Bradley Florence Briggs Thayer,female,38.0,1,0,PC 17599,71.2833,C85,C,17599,PC
2,3,1,3,Heikkinen Miss Laina,female,26.0,0,0,STON/O2. 3101282,7.925,,S,3101282,STON/O2.
3,4,1,1,Futrelle Mrs Jacques Heath Lily May Peel,female,35.0,1,0,113803,53.1,C123,S,113803,NONE
4,5,0,3,Allen Mr William Henry,male,35.0,0,0,373450,8.05,,S,373450,NONE


In [14]:
input_features = list(preprocessed_train_df.columns)
input_features.remove("Survived")
input_features.remove("PassengerId")
input_features.remove("Ticket")



In [16]:
#converting panda to ts

def tokenize_names(features, labels=None):
    features["Name"] = tf.strings.to_hash_bucket_fast(features["Name"], 1000)
    return features, labels

train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(preprocessed_train_df, label="Survived").map(tokenize_names)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(preprocessed_test_df).map(tokenize_names)



In [18]:
#trainando o modelo
model = tfdf.keras.GradientBoostedTreesModel(
    verbose=0,
    features=[tfdf.keras.FeatureUsage(name=n)
              for n in input_features],
              exclude_non_specified_features=True,
              min_examples=1,
              categorical_algorithm="RANDOM",
              shrinkage=0.05,
              split_axis="SPARSE_OBLIQUE",
              sparse_oblique_normalization="MIN_MAX",
              sparse_oblique_num_projections_exponent=2.0,
              num_trees=2000,
              random_seed=1234)
model.fit(train_ds)
# Avaliando o modelo
self_evaluation = model.make_inspector().evaluation()

print(f"Accuracy: {self_evaluation.accuracy} Loss:{self_evaluation.loss}")

W0000 00:00:1744477551.794491 1260581 gradient_boosted_trees.cc:1873] "goss_alpha" set but "sampling_method" not equal to "GOSS".
W0000 00:00:1744477551.794503 1260581 gradient_boosted_trees.cc:1883] "goss_beta" set but "sampling_method" not equal to "GOSS".
W0000 00:00:1744477551.794505 1260581 gradient_boosted_trees.cc:1897] "selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".
I0000 00:00:1744477551.882844 1260581 kernel.cc:782] Start Yggdrasil model training
I0000 00:00:1744477551.882853 1260581 kernel.cc:783] Collect training examples
I0000 00:00:1744477551.882857 1260581 kernel.cc:795] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
column_guides {
  column_name_pattern: "^Pclass$"
}
column_guides {
  column_name_pattern: "^Name$"
}
column_guides {
  column_name_pattern: "^Sex$"
}
column_guides {
  column_name_pattern: "^Age$"
}
column_guide

Accuracy: 0.75 Loss:1.0703905820846558


I0000 00:00:1744477552.080247 1284910 early_stopping.cc:54] Early stop of the training because the validation loss does not decrease anymore. Best valid-loss: 1.07039
I0000 00:00:1744477552.080259 1284910 gradient_boosted_trees.cc:1669] Create final snapshot of the model at iteration 70
I0000 00:00:1744477552.081689 1284910 gradient_boosted_trees.cc:279] Truncates the model to 41 tree(s) i.e. 41  iteration(s).
I0000 00:00:1744477552.082079 1284910 gradient_boosted_trees.cc:341] Final model num-trees:41 valid-loss:1.070391 valid-accuracy:0.750000
I0000 00:00:1744477552.082658 1284910 kernel.cc:926] Export model in log directory: /var/folders/bn/t0gq4m154_9gm5fx3ztc465c0000gn/T/tmp2bmug883 with prefix ca9aaa9f4aa24962
I0000 00:00:1744477552.083503 1284910 kernel.cc:944] Save model in resources
I0000 00:00:1744477552.084026 1260581 abstract_model.cc:921] Model self evaluation:
Task: CLASSIFICATION
Label: __LABEL
Loss (BINOMIAL_LOG_LIKELIHOOD): 1.07039

Accuracy: 0.75  CI95[W][0 1]
ErrorRa

In [19]:
model.summary()

Model: "gradient_boosted_trees_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1 (1.00 Byte)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 1 (1.00 Byte)
_________________________________________________________________
Type: "GRADIENT_BOOSTED_TREES"
Task: CLASSIFICATION
Label: "__LABEL"

Input Features (11):
	Age
	Cabin
	Embarked
	Fare
	Name
	Parch
	Pclass
	Sex
	SibSp
	Ticket_item
	Ticket_number

No weights

Variable Importance: INV_MEAN_MIN_DEPTH:
    1.           "Sex"  1.000000 ################
    2.        "Pclass"  0.410322 ####
    3.           "Age"  0.350223 ###
    4.          "Fare"  0.347501 ###
    5.         "SibSp"  0.297281 ##
    6.         "Parch"  0.276020 #
    7.          "Name"  0.220359 
    8.      "Embarked"  0.180701 
    9. "Ticket_number"  0.176137 
   10.   "Ticket_item"  0.175622 

Variable Importance: NUM_AS_ROOT:
    1. "Sex" 41.000000 

Var

In [22]:
#predictions

def prediction_to_kaggle_format(model, threshold=0):
    proba_survive = model.predict(test_ds, verbose=0)[:,0]
    return pd.DataFrame({
        "PassengerId": preprocessed_test_df["PassengerId"],
        "Survived": (proba_survive > threshold).astype(int)
    })

def make_submission(kaggle_predictions):
    path="./kaggle/working/submission.csv"
    kaggle_predictions.to_csv(path, index=False)
    print(f"Submission file saved to {path}")

kaggle_predictions = prediction_to_kaggle_format(model)
make_submission(kaggle_predictions)

Submission file saved to ./kaggle/working/submission.csv
