# 04 – Model Training

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import joblib

# Read the cleaned CSV into a DataFrame; the relative path is resolved
# against the current working directory.
df = pd.read_csv(filepath_or_buffer="../data/processed/cleaned_data.csv")

In [None]:
# The "price_lkr" column is the regression target; every other column is a feature.
y = df["price_lkr"]
X = df.drop(columns="price_lkr")

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to both splits so the test rows never influence it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Random forest regressor: 200 trees, fixed seed, all available CPU cores.
rf = RandomForestRegressor(n_jobs=-1, random_state=42, n_estimators=200)

# Fitted on the unscaled training features (X_train, not X_train_scaled).
rf.fit(X_train, y_train)


In [None]:
from pathlib import Path

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing the artifacts.
Path("../models").mkdir(parents=True, exist_ok=True)

# NOTE(review): rf was fitted on the unscaled X_train, so inputs must not be
# passed through the saved scaler before calling rf.predict.
joblib.dump(rf, "../models/house_price_model.pkl")
joblib.dump(scaler, "../models/scaler.pkl")

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV
import xgboost as xgb

from sklearn.pipeline import make_pipeline

# Models to train
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'SVR': SVR(),
    'Random Forest': rf,
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(random_state=42)
}

# Estimators that are cross-validated on standardized features; the
# tree-based models are evaluated on the raw features.
scale_sensitive = {'SVR', 'Linear Regression', 'Ridge', 'Lasso'}

# Train and evaluate with cross-validation
results = {}
for name, model in models.items():
    if name in scale_sensitive:
        # Scale inside a pipeline so the scaler is re-fitted on each training
        # fold. Using the pre-scaled X_train_scaled would leak the validation
        # fold's mean/variance into training, because that scaler was fitted
        # on the whole training set.
        estimator = make_pipeline(StandardScaler(), model)
    else:
        estimator = model
    # cross_val_score clones the estimator, so the objects in `models`
    # (including the already-fitted rf) are left untouched.
    scores = cross_val_score(estimator, X_train, y_train, cv=5, scoring='r2')
    results[name] = scores.mean()
    print(f"{name}: R2 = {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}
# 27 parameter combinations x 5 folds = 135 fits; n_jobs=-1 spreads them
# over all available cores instead of running them one after another.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# With the default refit=True, GridSearchCV has already retrained the best
# configuration on the full training set, so best_estimator_ is ready to use
# and a second fit would only repeat that work.
best_rf = grid_search.best_estimator_
print("Best RF params:", grid_search.best_params_)

# Save best model
joblib.dump(best_rf, "../models/best_house_price_model.pkl")