# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

# Read the housing dataset from the Excel workbook.
# pandas needs the `openpyxl` package for this (`pip install openpyxl`).
df = pd.read_excel('./synthetic_housing_data.xlsx')

# Columns used as model inputs, followed by the regression target.
features = [
    'Age',
    'Adults',
    'Children',
    'Rent',
    'IsStudent',
    'Distance_to_New_Tenancy',
    'Total_Rooms',
    'Area_m2',
    'Hospital_distance',
    'Gym_distance',
    'School_distance',
    'Supermarket_distance',
    'Distance_to_University',
]
X = df[features]
y = df['Label']  # replace 'Label' to predict a different target column

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Gradient-boosted regressors: one XGBoost model and one LightGBM model,
# configured with the same number of boosting rounds, learning rate and seed.
#
# Early stopping is NOT enabled, so every one of the 1000 rounds is trained;
# the validation split is passed as `eval_set` for evaluation during fitting
# only. `fit` returns the estimator itself, which is why construction and
# training are chained into a single expression.

# XGBoost model
xgb_model = XGBRegressor(
    n_estimators=1000, learning_rate=0.05, max_depth=6, random_state=42
).fit(X_train, y_train, eval_set=[(X_val, y_val)])

# LightGBM model (max_depth=-1 places no limit on tree depth)
lgbm_model = LGBMRegressor(
    n_estimators=1000, learning_rate=0.05, max_depth=-1, random_state=42
).fit(X_train, y_train, eval_set=[(X_val, y_val)])

# Score every row of the dataset (training and validation rows alike) with
# both models and store the results as new columns on the frame.
df['XGB_Pred'] = xgb_model.predict(X)
df['LGBM_Pred'] = lgbm_model.predict(X)

# For each model, pick the row with the highest predicted value.
best_xgb = df.loc[df['XGB_Pred'].idxmax()]
best_lgbm = df.loc[df['LGBM_Pred'].idxmax()]

# Heading and row are separated by a newline, one report per model.
print("Best tenancy by XGBoost:", best_xgb, sep="\n")
print("\nBest tenancy by LightGBM:", best_lgbm, sep="\n")

# Evaluation metrics on the held-out validation split.
# `mean_squared_error` returns the MSE, so take its square root to report
# the RMSE that the labels promise (same units as the target column).
print(
    "\nValidation RMSE XGB:",
    mean_squared_error(y_val, xgb_model.predict(X_val)) ** 0.5,
)
print(
    "Validation RMSE LGBM:",
    mean_squared_error(y_val, lgbm_model.predict(X_val)) ** 0.5,
)
