In [1]:
# Import necessary libraries
import pandas as pd
import tensorflow_decision_forests as tfdf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

ModuleNotFoundError: No module named 'tensorflow_decision_forests'

In [None]:
pip install tensorflow_decision_forests

In [None]:
# Read the training and test CSV files into dataframes
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

In [None]:
# Report the shape of each dataset, one line per frame
print(
    f"Training data shape: {train_data.shape}",
    f"Test data shape: {test_data.shape}",
    sep="\n",
)

In [None]:
# Handle missing values
# For numerical features, fill missing values with the training-set median.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on a column selection operates on a temporary object
# and is not guaranteed to update the dataframe (it has no effect under
# pandas copy-on-write).
for col in train_data.select_dtypes(include=[np.number]).columns:
    train_median = train_data[col].median()
    train_data[col] = train_data[col].fillna(train_median)
    if col in test_data.columns:
        # Reuse the training statistic so both frames are imputed with the
        # same value instead of one derived from the test set.
        test_data[col] = test_data[col].fillna(train_median)

In [None]:
# For categorical features, fill missing values with the training-set mode.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on a column selection operates on a temporary object
# and is not guaranteed to update the dataframe.
for col in train_data.select_dtypes(include=[object]).columns:
    train_modes = train_data[col].mode()
    if train_modes.empty:
        # Every training value is missing, so there is no mode to impute with.
        continue
    train_mode = train_modes.iloc[0]
    train_data[col] = train_data[col].fillna(train_mode)
    if col in test_data.columns:
        # Reuse the training statistic so both frames are imputed with the
        # same value instead of one derived from the test set.
        test_data[col] = test_data[col].fillna(train_mode)

In [None]:
# One-hot encode the categorical columns of each frame independently
train_data = train_data.pipe(pd.get_dummies)
test_data = test_data.pipe(pd.get_dummies)

In [None]:
# Align the train and test dataframes by the columns.
# join='left' keeps exactly the training columns; any column the test frame
# lacks is created and filled with 0.
train_data, test_data = train_data.align(test_data, join='left', axis=1, fill_value=0)
# The left join also copies the target into the test frame as an all-zero
# column. Drop it so the test features do not carry a fabricated SalePrice.
test_data = test_data.drop(columns='SalePrice', errors='ignore')

In [None]:
# Separate the target column (y) from the predictor columns (X)
y = train_data['SalePrice']
X = train_data.drop(columns='SalePrice')

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Convert the dataframes to TensorFlow datasets.
# The notebook scores predictions with RMSE, i.e. SalePrice is a regression
# target, so the labelled datasets are built with the regression task instead
# of the helper's default (classification).
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    X_train.join(y_train),
    label='SalePrice',
    task=tfdf.keras.Task.REGRESSION,
)
valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    X_valid.join(y_valid),
    label='SalePrice',
    task=tfdf.keras.Task.REGRESSION,
)
# The test set has no label, so no label/task arguments are needed.
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_data)

In [None]:
# Initialize and train the TFDF model.
# The model's default task is classification; SalePrice is evaluated with
# RMSE further down, so the model must be created as a regressor.
model = tfdf.keras.GradientBoostedTreesModel(task=tfdf.keras.Task.REGRESSION)
model.fit(train_ds)

In [None]:
# Score the model on the held-out validation split using RMSE
valid_predictions = model.predict(valid_ds)
valid_mse = mean_squared_error(y_valid, valid_predictions)
rmse = np.sqrt(valid_mse)
print(f"Validation RMSE: {rmse}")

In [None]:

# Generate predictions for the test set by running the trained model over
# test_ds; the result is flattened later when the submission frame is built.
test_predictions = model.predict(test_ds)

In [None]:
# Assemble the submission table: each test Id paired with its predicted SalePrice
submission = pd.DataFrame(
    dict(Id=test_data['Id'], SalePrice=test_predictions.flatten())
)

In [None]:
# Write the submission table to disk, omitting the dataframe index
submission.to_csv(path_or_buf='submission2.csv', index=False)
print("Submission file has been generated.")