In [None]:
import tensorflow_decision_forests as tfdf
import tensorflow as tf
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
import sys

import warnings
warnings.filterwarnings("ignore")
import math

# Comment this out if the data visualisations don't work on your side
%matplotlib inline



In [None]:
# Show which TensorFlow and TF-DF versions are loaded in this kernel.
print(f"TensorFlow v{tf.__version__}")
print(f"TensorFlow Decision Forests v{tfdf.__version__}")

In [None]:
def predict_function(model, data):
  """Run `model` on `data` through a graph-compiled wrapper and return the result.

  The `tf.function` wrapper is created once and cached on this function
  object. Building a fresh `tf.function` on every call would start each call
  with an empty trace cache, forcing a new trace every time and making
  `reduce_retracing=True` pointless.

  Args:
    model: Callable model; invoked as `model(data)`.
    data: Input passed to the model unchanged.

  Returns:
    Whatever `model(data)` returns.
  """
  traced_call = getattr(predict_function, "_traced_call", None)
  if traced_call is None:
    @tf.function(reduce_retracing=True)
    def traced_call(model, data):
      return model(data)

    # Keep the compiled wrapper so later calls reuse its traces.
    predict_function._traced_call = traced_call

  return traced_call(model, data)

In [None]:
# Load the training CSV into `dataset` and preview its first rows.
dataset = pd.read_csv("data/house-prices-advanced-regression-techniques/train.csv")
dataset.head()


In [None]:
# Count missing values per column, remember which columns have any,
# and chart only those columns.
null_counts = dataset.isna().sum()
null_columns = null_counts.loc[null_counts > 0].index.tolist()
null_counts.loc[null_counts > 0].plot.bar()
plt.show()

In [None]:
# List the names of the columns that contain at least one missing value.
print("Columns with null values:", null_columns)

In [None]:
# Remove the listed columns from the training frame (mutates `dataset` in place).
cols_to_drop = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
dataset.drop(cols_to_drop, axis=1, inplace=True)

In [None]:
# Refresh the missing-value summary for the current state of `dataset`.
null_counts = dataset.isna().sum()
null_columns = null_counts.loc[null_counts > 0].index.tolist()

In [None]:
# Preview the first rows of the training frame in its current state.
dataset.head()

In [None]:
# Impute missing values in every column listed in `null_columns`:
# numeric columns get the column mean, everything else (strings, booleans)
# gets the most frequent value.
for column in null_columns:
    column_values = dataset[column]
    datatype = column_values.dtype
    null_count = column_values.isnull().sum()

    print(f"Column: {column}, DataType: {datatype}, Null Count: {null_count}")

    is_bool = pd.api.types.is_bool_dtype(datatype)
    if pd.api.types.is_numeric_dtype(datatype) and not is_bool:
        dataset[column] = column_values.fillna(column_values.mean())
    else:
        # `mode()` returns a Series; passing it to `fillna` directly would
        # align on index labels and fill only rows 0..k-1. Use the scalar
        # most-frequent value instead.
        modes = column_values.mode()
        if not modes.empty:  # empty when the column holds no non-null value
            dataset[column] = column_values.fillna(modes.iloc[0])

# Recompute the summary; the printed list shows any columns that still have gaps.
null_counts = dataset.isnull().sum()
null_columns = null_counts[null_counts > 0].index.tolist()
print(null_columns)

In [None]:
# Display correlation only if it is more than the threshold (0.8 by default).
def corelated_graph(threshold=0.8):
    """Plot a boolean heatmap of feature pairs whose correlation exceeds `threshold`.

    Reads the module-level `dataset` DataFrame and shows the figure; returns
    nothing.

    Args:
        threshold: Pairs with a correlation strictly greater than this value
            are marked True in the heatmap. Defaults to 0.8.
    """
    # Correlate numeric columns only: passing the full frame to
    # `DataFrame.corr` raises on string columns in pandas >= 2.0.
    numerical_dataset = dataset.select_dtypes(include=['number'])
    plt.figure(figsize=(10,10))
    sns.heatmap(numerical_dataset.corr() > threshold,
            annot=True,
            cbar=False)
    plt.show()


In [None]:
# Draw the high-correlation heatmap for the current `dataset`.
corelated_graph()

In [None]:
# Drop the listed features in place (treated here as highly correlated
# duplicates), then redraw the heatmap to check the result.
dataset.drop(['TotRmsAbvGrd', 'TotalBsmtSF', 'GarageCars'], axis=1, inplace=True)
corelated_graph()


In [None]:
# Train a default Random Forest that predicts the "SaleCondition" column
# from the remaining columns of `dataset`.
tf_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(dataset, label="SaleCondition")

model = tfdf.keras.RandomForestModel()
model.fit(tf_dataset)

# `summary()` prints the report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

In [None]:
# Load the test CSV into `test_data` and preview its first rows.
test_file_path = "data/house-prices-advanced-regression-techniques/test.csv"
test_data = pd.read_csv(test_file_path)
test_data.head()


In [None]:
# Print column dtypes and non-null counts for the test frame.
test_data.info()

In [None]:
def compare_dataframe_info(df1, df2):
  """
  Compares the column names and column dtypes of two DataFrames.

  All returned lists follow DataFrame column order (df1's order for shared
  columns), so the result is identical from run to run instead of depending
  on set iteration order.

  Args:
    df1: The first DataFrame.
    df2: The second DataFrame.

  Returns:
    A dictionary with two entries:
      'column_names': {'df1_only': [...], 'df2_only': [...]} - columns
        present in only one of the frames.
      'data_types': {'different_data_types': [...]} - shared columns whose
        dtype differs between the frames.
  """
  # dict.fromkeys de-duplicates while keeping first-seen order.
  columns1 = list(dict.fromkeys(df1.columns))
  columns2 = list(dict.fromkeys(df2.columns))
  in_df1 = set(columns1)
  in_df2 = set(columns2)

  data_types1 = df1.dtypes
  data_types2 = df2.dtypes

  return {
      'column_names': {
          'df1_only': [name for name in columns1 if name not in in_df2],
          'df2_only': [name for name in columns2 if name not in in_df1],
      },
      'data_types': {
          'different_data_types': [
              name for name in columns1
              if name in in_df2 and data_types1[name] != data_types2[name]
          ],
      },
  }


In [None]:
# Show which columns and dtypes differ between the training and test frames.
compare_dataframe_info(dataset, test_data)

In [None]:
# Report each test column's dtype and convert boolean columns to 0/1.
#
# Object (string) columns are deliberately left as they are. The training
# frame is passed to TF-DF with its raw string columns, so integer-encoding
# only the test frame would give the model features it was not trained on.
# The previous code also called `LabelEncoder`, which is never imported in
# this notebook and raised NameError.
for col in test_data.columns:
    print(f"{test_data[col].name}: {test_data[col].dtype}")

    # In case of boolean data type
    # convert them to binary
    if test_data[col].dtype == 'bool':
        test_data[col] = test_data[col].astype(int)

In [None]:
# Impute gaps in the test frame: most frequent value for object/bool
# columns, column mean for everything else.
for col in test_data.columns:
    if not test_data[col].isnull().any():
        # Nothing missing in this column.
        continue

    if test_data[col].dtype in (object, bool):
        replacement = test_data[col].mode()[0]
    else:
        replacement = test_data[col].mean()

    test_data[col] = test_data[col].fillna(replacement)

# Total number of missing cells left in the test frame (shown as cell output).
test_data.isnull().sum().sum()


In [None]:
# Preview the first three rows of the training frame.
dataset.head(3)

In [None]:
# Take row 0 as a Series so that printing it lists one field per line.
first_record = dataset.iloc[0]

print(first_record)

In [None]:
# Remove the 'Id' column from the training frame, then preview the result.
dataset = dataset.drop(columns=['Id'])
dataset.head(3)

In [None]:
# Print column dtypes and non-null counts for the training frame.
dataset.info()

In [None]:
# Summary statistics for the target column, followed by its histogram
# (100 bins) with a KDE curve overlaid.
print(dataset['SalePrice'].describe())
plt.figure(figsize=(9, 8))
# Trailing semicolon suppresses the Axes repr in the cell output.
sns.histplot(dataset['SalePrice'], bins=100, kde=True);

In [None]:
# Distinct column dtypes present in the training frame.
list(set(dataset.dtypes.tolist()))

In [None]:
# Keep only the float64/int64 columns for the numeric plots, and preview them.
df_num = dataset.select_dtypes(include = ['float64', 'int64'])
df_num.head()

In [None]:
# One histogram per numeric column; trailing semicolon hides the Axes array repr.
df_num.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8);

In [None]:
def split_dataset(dataset, test_ratio=0.30, seed=None):
  """Randomly split a DataFrame into two parts, row by row.

  Each row is assigned independently to the second part with probability
  `test_ratio`, so the split sizes are only approximately proportional.

  Args:
    dataset: DataFrame to split.
    test_ratio: Probability that a row lands in the second (held-out) part.
    seed: Optional seed for a reproducible split. When None (the default)
      the draw comes from NumPy's global random state, as before.

  Returns:
    A `(train, held_out)` tuple of DataFrames that together contain every
    row of `dataset` exactly once.
  """
  if seed is None:
    draws = np.random.rand(len(dataset))
  else:
    # A dedicated generator keeps the split repeatable without touching
    # the global random state.
    draws = np.random.default_rng(seed).random(len(dataset))

  test_indices = draws < test_ratio
  return dataset[~test_indices], dataset[test_indices]

# Split the training frame once and report how many rows each part received.
train_ds_pd, valid_ds_pd = split_dataset(dataset)
print("{} examples in training, {} examples in testing.".format(
    *map(len, (train_ds_pd, valid_ds_pd))))

In [None]:
# Convert both pandas splits into TF datasets for a regression task,
# using 'SalePrice' as the label column.
label = 'SalePrice'
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    train_ds_pd,
    label=label,
    task=tfdf.keras.Task.REGRESSION,
)
valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    valid_ds_pd,
    label=label,
    task=tfdf.keras.Task.REGRESSION,
)

Select a Model

There are several tree-based models for you to choose from.

RandomForestModel
GradientBoostedTreesModel
CartModel
DistributedGradientBoostedTreesModel

To start, we'll work with a Random Forest. This is the most well-known of the Decision Forest training algorithms.

A Random Forest is a collection of decision trees, each trained independently on a random subset of the training dataset (sampled with replacement). The algorithm is unique in that it is robust to overfitting, and easy to use.

We can list all the available models in TensorFlow Decision Forests using the following code:

In [None]:
# List the model classes exposed by TF-DF.
tfdf.keras.get_all_models()

In [None]:
# Random Forest regressor configured from the "benchmark_rank1" hyperparameter template.
# NOTE(review): `rf` is rebound by the following cell before any fit() call,
# so this template-based model is never trained — confirm which one is intended.
rf = tfdf.keras.RandomForestModel(hyperparameter_template="benchmark_rank1", task=tfdf.keras.Task.REGRESSION)

In [None]:
# Random Forest regressor with default hyperparameters; MSE is tracked as an
# evaluation metric.
rf = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION)
rf.compile(metrics=["mse"]) # Optional, you can use this to include a list of eval metrics

In [None]:
# Train the Random Forest on the training dataset.
rf.fit(x=train_ds)

In [None]:
# Render tree 0 of the fitted forest, limited to depth 3.
tfdf.model_plotter.plot_model_in_colab(rf, tree_idx=0, max_depth=3)

In [None]:

# Plot the RMSE recorded in the training logs against the number of trees.
logs = rf.make_inspector().training_logs()
plt.plot(
    [entry.num_trees for entry in logs],
    [entry.evaluation.rmse for entry in logs],
)
plt.xlabel("Number of trees")
plt.ylabel("RMSE (out-of-bag)")
plt.show()

In [None]:
# Keep an inspector handle for later cells and show the evaluation it stores.
inspector = rf.make_inspector()
inspector.evaluation()

In [None]:
# Evaluate on the validation dataset and print each returned metric
# to four decimal places.
evaluation = rf.evaluate(x=valid_ds,return_dict=True)

for name, value in evaluation.items():
  print(f"{name}: {value:.4f}")

In [None]:
# Print the name of every variable-importance measure the inspector provides.
print("Available variable importances:")
for importance in inspector.variable_importances().keys():
  print("\t", importance)

In [None]:
# Show the per-feature values for the "NUM_AS_ROOT" importance measure.
inspector.variable_importances()["NUM_AS_ROOT"]

In [None]:
plt.figure(figsize=(12, 4))

# Importance measure to plot. The TF-DF documentation describes NUM_AS_ROOT as
# the number of trees in which the feature is used for the root split.
variable_importance_metric = "NUM_AS_ROOT"
variable_importances = inspector.variable_importances()[variable_importance_metric]

# Extract the feature name and importance values.
#
# `variable_importances` is a list of <feature, importance> tuples.
feature_names = [vi[0].name for vi in variable_importances]
feature_importances = [vi[1] for vi in variable_importances]
# The features are ordered in decreasing importance value.
feature_ranks = range(len(feature_names))

bar = plt.barh(feature_ranks, feature_importances, label=[str(x) for x in feature_ranks])
plt.yticks(feature_ranks, feature_names)
# Put the most important feature at the top.
plt.gca().invert_yaxis()

# TODO: Replace with "plt.bar_label()" when available.
# Label each bar with its value.
for importance, patch in zip(feature_importances, bar.patches):
  plt.text(patch.get_x() + patch.get_width(), patch.get_y(), f"{importance:.4f}", va="top")

plt.xlabel(variable_importance_metric)
# The model is a regressor, so the title names the metric instead of the
# classification wording ("class 1 vs the others") used previously.
plt.title(f"Variable importance ({variable_importance_metric})")
plt.tight_layout()
plt.show()

In [None]:
# Remove 'Id' from the test frame and keep it for the output table.
# Note: pop() mutates `test_data`, so this cell cannot be re-run as-is.
ids = test_data.pop('Id')

# No label is passed: the test frame is converted for prediction only.
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    test_data,
    task = tfdf.keras.Task.REGRESSION)

# squeeze() drops the size-1 axes of the prediction array
# (assumed to be shaped (n, 1) — TODO confirm) before building the column.
preds = rf.predict(test_ds)
output = pd.DataFrame({'Id': ids,
                       'SalePrice': preds.squeeze()})

output.head()

In [None]:
# Load the sample submission file and overwrite its 'SalePrice' column
# with the model's predictions for the test dataset.
sample_submission_df = pd.read_csv('data/house-prices-advanced-regression-techniques/sample_submission.csv')
sample_submission_df['SalePrice'] = rf.predict(test_ds)
# Writing the submission file is left disabled:
# sample_submission_df.to_csv('/kaggle/working/submission.csv', index=False)
sample_submission_df.head()
