#### This file is going to deal with training a model to predict property price 

In [67]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor 
from xgboost import XGBRegressor 
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer 
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [68]:
df_vlc = pd.read_csv("../working_data/properties_vlc_clean.csv")

In [69]:
X = df_vlc.drop(columns = ["price"])
y = df_vlc["price"]

In [70]:
boolean_columns = []
one_hot_columns = ["location_cluster"]
integer_columns = []

for column in df_vlc.columns: 
    if column == "price": 
        continue 
    elif df_vlc[column].dtype == "int64": 
        if column not in one_hot_columns: 
            integer_columns.append(column)
    elif df_vlc[column].dtype == "bool": 
        boolean_columns.append(column)

In [71]:
df_vlc = pd.get_dummies(df_vlc, columns = ["location_cluster"], prefix = "location", drop_first = False)

In [72]:
scaler = StandardScaler()
for col in integer_columns: 
    df_vlc[col] = scaler.fit_transform(df_vlc[[col]])

for col in boolean_columns: 
    df_vlc[col] = df_vlc[col].astype("int64")

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [74]:
# Custom scorer using MAE
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Example hyperparameter grids
param_grids = {
    'LinearRegression': {},  # no params to tune
    'Ridge': {
        'alpha': [0.01, 0.1, 1, 10, 50, 100, 200],
        'max_iter': [1000, 5000, 10000]
    },
    'Lasso': {
        'alpha': [0.001, 0.01, 0.1, 0.5, 1, 5, 10],
        'max_iter': [5000, 10000, 20000]
    },
    'RandomForest': {
        'n_estimators': [100, 200, 500],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'XGBoost': {
        'n_estimators': [100, 200, 500],
        'max_depth': [3, 5, 7, 10],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0]
    }
}

# Dictionary of models
models = {
    'LinearRegression': LinearRegression(), 
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}

In [75]:
best_models = {}

for name, model in models.items():
    print(f"Tuning {name}...")
    grid = GridSearchCV(model, param_grids[name], scoring=mae_scorer, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    best_models[name] = grid.best_estimator_
    print(f"Best params for {name}: {grid.best_params_}")

Tuning LinearRegression...


Best params for LinearRegression: {}
Tuning Ridge...
Best params for Ridge: {'alpha': 50, 'max_iter': 1000}
Tuning Lasso...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best params for Lasso: {'alpha': 10, 'max_iter': 5000}
Tuning RandomForest...




Best params for RandomForest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
Tuning XGBoost...
Best params for XGBoost: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 500, 'subsample': 0.8}


In [76]:
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred)
    print(f"{name}: MAE = {mae:.2f}, RMSE = {rmse:.2f}")

LinearRegression: MAE = 142196.59, RMSE = 64051783481.80
Ridge: MAE = 142276.98, RMSE = 64034671566.63
Lasso: MAE = 142247.07, RMSE = 64078042366.38
RandomForest: MAE = 85318.94, RMSE = 28548316300.94
XGBoost: MAE = 80202.80, RMSE = 26288441344.00


In [77]:
for column in df_vlc.columns: 
    if sum(df_vlc[column].isna()) > 0: 
        print(f"Column: {column} contains {sum(df_vlc[column].isna())} NaN values")

In [78]:
df_vlc["price"].mean()

315910.70606991707

In [79]:
80202.80 / 315910.70606991707 * 100 

25.38780688940931