#### This file is going to deal with training a model to predict property price 

In [19]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor 
from xgboost import XGBRegressor 
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer 
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [20]:
df_vlc = pd.read_csv("../working_data/properties_vlc_clean2.csv")

In [21]:
X = df_vlc.drop(columns = ["price"])
y = df_vlc["price"]

In [22]:
boolean_columns = []
one_hot_columns = ["location_cluster"]
integer_columns = []

for column in df_vlc.columns: 
    if column == "price": 
        continue 
    elif df_vlc[column].dtype == "int64": 
        if column not in one_hot_columns: 
            integer_columns.append(column)
    elif df_vlc[column].dtype == "bool": 
        boolean_columns.append(column)

In [23]:
df_vlc = pd.get_dummies(df_vlc, columns = ["location_cluster"], prefix = "location", drop_first = False)

In [24]:
# Prepare features without scaling yet (to prevent data leakage)
print("Preparing feature matrix...")

# Convert boolean columns to integers
for col in boolean_columns: 
    df_vlc[col] = df_vlc[col].astype("int64")

# Create feature matrix and target
X = df_vlc.drop(columns=["price"])
y = df_vlc["price"]

print(f"Feature matrix shape: {X.shape}")
print(f"Integer columns to scale: {integer_columns}")
print(f"Boolean columns (already 0/1): {boolean_columns}")

Preparing feature matrix...
Feature matrix shape: (15726, 108)
Integer columns to scale: ['rooms', 'baths', 'm2_cons', 'm2_property', 'floor', 'prop_age', 'luxury_score', 'parking_price', 'consumption', 'emissions', 'amenity_count', 'convenience_score']
Boolean columns (already 0/1): ['garage', 'balcony', 'terrace', 'lift', 'AC', 'pool', 'east', 'north', 'south', 'west', 'adosado', 'pareado', 'chalet', 'masia', 'atico', 'duplex', 'estudio', 'piso', 'casa_rustica', 'villa', 'heating', 'trastero', 'fireplace', 'garden', 'wardrobes', 'mobility', 'sea_views', 'nuda', 'ocupada', 'rented', 'new', 'good', 'renovate', 'missing_prop_type', 'missing_prop_age', 'missing_consumption', 'missing_emissions', 'is_house', 'is_apartment', 'is_small', 'top_floor', 'ground_floor', 'is_tiny', 'is_small_apt', 'is_medium', 'is_large', 'is_mansion', 'is_vintage', 'optimal_age', 'has_outdoor', 'has_storage', 'energy_premium', 'energy_penalty', 'is_premium_location', 'bathroom_luxury', 'family_home', 'luxury_pr

In [25]:
# STEP 1: Split data FIRST (before any scaling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, shuffle = True)

# STEP 2: Apply StandardScaler properly (fit on train only, transform both)
print("Applying StandardScaler correctly to prevent data leakage...")

scaler = StandardScaler()

# Fit scaler ONLY on training data
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Scale only the integer columns
scaler.fit(X_train[integer_columns])  # Learn statistics from training data only
X_train_scaled[integer_columns] = scaler.transform(X_train[integer_columns])
X_test_scaled[integer_columns] = scaler.transform(X_test[integer_columns])  # Use same transformation

# Update the variables
X_train = X_train_scaled
X_test = X_test_scaled

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
print("✅ Scaling applied correctly - no data leakage!")

# Save scaler for future predictions
import joblib
joblib.dump(scaler, '../working_data/feature_scaler.pkl')

Applying StandardScaler correctly to prevent data leakage...
Train shape: (12580, 108), Test shape: (3146, 108)
✅ Scaling applied correctly - no data leakage!


['../working_data/feature_scaler.pkl']

In [26]:
# ACCURACY MAXIMIZATION: Target transformation for better modeling
print("Applying log transformation to handle price skewness...")

# Check price distribution skewness
import scipy.stats as stats
skewness = stats.skew(y_train)
print(f"Original price skewness: {skewness:.3f}")

# Apply log transformation (highly effective for price data)
y_train_log = np.log(y_train)
y_test_log = np.log(y_test)

skewness_log = stats.skew(y_train_log)
print(f"Log-transformed skewness: {skewness_log:.3f} (closer to 0 = better)")

# Custom scorer using MAE (but for log-transformed targets)
def log_mae(y_true, y_pred):
    """MAE in original price scale after inverse log transform"""
    return mean_absolute_error(np.exp(y_true), np.exp(y_pred))

log_mae_scorer = make_scorer(log_mae, greater_is_better=False)

# ENHANCED hyperparameter grids (more comprehensive search)
param_grids = {
    'LinearRegression': {},  # no params to tune
    'Ridge': {
        'alpha': [0.001, 0.01, 0.1, 1, 10, 50, 100, 500],
        'max_iter': [1000, 5000, 10000]
    },
    'Lasso': {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 0.5, 1, 5, 10, 20, 50],
        'max_iter': [5000, 10000, 20000]
    },
    'RandomForest': {
        'n_estimators': [200, 500, 800],  # More trees for better performance
        'max_depth': [None, 15, 25, 35],  # Deeper trees
        'min_samples_split': [2, 5, 10],  # Lower for more flexibility
        'min_samples_leaf': [1, 2, 4],    # Lower for more flexibility
        'max_features': ['sqrt', 'log2', 0.5]  # Feature selection strategies
    },
    'XGBoost': {
        'n_estimators': [300, 500, 800],
        'max_depth': [4, 6, 8, 10],
        'learning_rate': [0.01, 0.05, 0.1, 0.15],
        'subsample': [0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
        'reg_alpha': [0, 0.1, 0.5],       # L1 regularization
        'reg_lambda': [1, 1.5, 2]         # L2 regularization
    }
}

# Enhanced model dictionary with better configurations
models = {
    'LinearRegression': LinearRegression(), 
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'RandomForest': RandomForestRegressor(n_jobs = -1, random_state=42),
    'XGBoost': XGBRegressor(n_jobs = -1, random_state=42)
}

Applying log transformation to handle price skewness...
Original price skewness: 2.249
Log-transformed skewness: 0.101 (closer to 0 = better)


In [27]:
# ENHANCED TRAINING WITH BETTER VALIDATION
from sklearn.model_selection import KFold
from sklearn.ensemble import VotingRegressor
import time

# Use stratified validation for more robust hyperparameter selection
kfold = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold for faster training

best_models = {}
training_times = {}

print("Training models with log-transformed targets and enhanced validation...")

for name, model in models.items():
    start_time = time.time()
    print(f"\nTuning {name}...")
    
    # Use log-transformed targets for training
    grid = GridSearchCV(
        model, 
        param_grids[name], 
        scoring=log_mae_scorer, 
        cv=kfold,
        n_jobs=-1,
        verbose=1 if name in ['RandomForest', 'XGBoost'] else 0
    )
    
    grid.fit(X_train, y_train_log)
    best_models[name] = grid.best_estimator_
    training_times[name] = time.time() - start_time
    
    print(f"Best params for {name}: {grid.best_params_}")
    print(f"Best CV score: {-grid.best_score_:.4f}")
    print(f"Training time: {training_times[name]:.1f}s")

print("\n" + "="*50)
print("INDIVIDUAL MODEL TRAINING COMPLETED")
print("="*50)

Training models with log-transformed targets and enhanced validation...

Tuning LinearRegression...
Best params for LinearRegression: {}
Best CV score: 47192.9552
Training time: 4.8s

Tuning Ridge...
Best params for LinearRegression: {}
Best CV score: 47192.9552
Training time: 4.8s

Tuning Ridge...
Best params for Ridge: {'alpha': 0.01, 'max_iter': 1000}
Best CV score: 47186.9361
Training time: 4.9s

Tuning Lasso...
Best params for Ridge: {'alpha': 0.01, 'max_iter': 1000}
Best CV score: 47186.9361
Training time: 4.9s

Tuning Lasso...
Best params for Lasso: {'alpha': 0.0001, 'max_iter': 5000}
Best CV score: 47910.6657
Training time: 143.7s

Tuning RandomForest...
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best params for Lasso: {'alpha': 0.0001, 'max_iter': 5000}
Best CV score: 47910.6657
Training time: 143.7s

Tuning RandomForest...
Fitting 5 folds for each of 324 candidates, totalling 1620 fits




KeyboardInterrupt: 

In [None]:
mae_scores = dict() 
for name, model in best_models.items():
    # Predict on log scale, then transform back
    y_pred_log = model.predict(X_test)
    y_pred = np.exp(y_pred_log)  # Transform back to original price scale
    
    # Calculate metrics in original price scale
    mae = mean_absolute_error(y_test, y_pred)
    mae_scores[name] = mae 
    rmse = mean_squared_error(y_test, y_pred)
    print(f"{name}: MAE = {mae:.2f}, RMSE = {rmse:.2f}")


LinearRegression: MAE = 142196.59, RMSE = 64051783481.80
Ridge: MAE = 142276.98, RMSE = 64034671566.63
Lasso: MAE = 142247.07, RMSE = 64078042366.38
RandomForest: MAE = 85318.94, RMSE = 28548316300.94
XGBoost: MAE = 80202.80, RMSE = 26288441344.00


In [None]:
df_vlc["price"].mean()

315910.70606991707

In [None]:
80202.80 / 315910.70606991707 * 100

25.38780688940931

In [None]:
for name, model in best_models.items(): 
    y_pred_log = model.predict(X_train)
    y_pred = np.exp(y_pred_log)
    mae = mean_absolute_error(y_train, y_pred)
    print(f"{name}: MAE = {mae:.2f}")                          