# Initial Modelling for the Ames, Iowa Housing Dataset

## Goals
- Train models on a minimally processed dataset to get an understanding of baseline performance
- Identify which models seem most promising for the given dataset

## Imports

In [70]:
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from scipy.stats import boxcox

from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Single seed passed to the KFold splitter and to the estimators that accept one,
# so that repeated runs of the notebook use the same folds and model randomness.
RANDOM_SEED = 6


## Reading Data

In [71]:
# Read the processed train and test CSVs in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_processed.csv', 'test_processed.csv')
)

# The last column of the training frame is taken as the target;
# every column before it is treated as a feature.
target = train.iloc[:, -1]
features = train.iloc[:, :-1]

# Models are fitted on log(1 + target); np.expm1 maps predictions back.
target_transformed = np.log1p(target)

In [72]:
# Display the shapes of the feature matrix, the target and the test frame, in that order.
tuple(frame.shape for frame in (features, target, test))

((1460, 244), (1460,), (1459, 244))

## Initialising Models

In [73]:
# One shuffled, seeded 10-fold splitter. It is reused both for the inner
# hyperparameter search of the *CV estimators and for the outer evaluation.
kfolds = KFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)

# Linear models and SVR are fitted on RobustScaler-scaled features via a pipeline.
ridge = make_pipeline(RobustScaler(), RidgeCV(cv=kfolds))
lasso = make_pipeline(RobustScaler(), LassoCV(random_state=RANDOM_SEED, cv=kfolds))
# random_state only matters when selection='random'; it is passed anyway so the
# estimator stays reproducible (and consistent with LassoCV) if that is changed.
elasticnet = make_pipeline(RobustScaler(), ElasticNetCV(random_state=RANDOM_SEED, cv=kfolds))
svr = make_pipeline(RobustScaler(), SVR())

# Tree ensembles are used on the unscaled features with default hyperparameters.
rfr = RandomForestRegressor(random_state=RANDOM_SEED)
gbr = GradientBoostingRegressor(random_state=RANDOM_SEED)
# verbose=-1 suppresses LightGBM's per-fit [Info]/[Warning] log lines, which
# otherwise flood the cell output once per CV fold.
lightgbm = LGBMRegressor(random_state=RANDOM_SEED, verbose=-1)
# random_state is the scikit-learn-API keyword for the seed (seed is the legacy alias).
xgboost = XGBRegressor(random_state=RANDOM_SEED)

## Initial Modelling
- No hyperparameter tuning
- Target log-transformed (`np.log1p`)

In [75]:
# Cross-validate every candidate model on the log-transformed target and
# collect the mean/std RMSE across folds together with the elapsed time.
results = {}
# scikit-learn scorers are "higher is better", so MSE is reported negated.
scoring = 'neg_mean_squared_error'
models = {
    'Ridge': ridge,
    'Lasso': lasso,
    'ElasticNet': elasticnet,
    'SVR': svr,
    'RandomForest': rfr,
    'GradientBoostingRegressor': gbr,
    'LightGBM': lightgbm,
    'XGBoost': xgboost
}

for name, model in models.items():
    print(f"Training {name}...")
    # perf_counter is monotonic and high-resolution, so the measured duration
    # cannot be distorted by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    scores = cross_val_score(model, features, target_transformed, cv=kfolds, scoring=scoring)
    # Stop the clock right after fitting so metric post-processing is not timed.
    training_time = time.perf_counter() - start_time

    # Flip the sign back to MSE and take the root to get one RMSE per fold.
    rmse_scores = np.sqrt(-scores)
    mean_rmse = np.mean(rmse_scores)
    std_rmse = np.std(rmse_scores)

    results[name] = {
        'Mean RMSE': mean_rmse,
        'Std RMSE': std_rmse,
        'Training Time (s)': training_time
    }

    print(f"{name} - Training completed in {training_time:.2f} seconds.")
    print("-" * 50)

# One row per model; the model name moves from the index into a 'Model' column.
results_df = pd.DataFrame(results).T.reset_index()
results_df.columns = ['Model', 'Mean RMSE', 'Std RMSE', 'Training Time (s)']
# Best (lowest RMSE) model first; last expression so the table is displayed.
results_df.sort_values(by='Mean RMSE')

Training Ridge...
Ridge - Training completed in 1.47 seconds.
--------------------------------------------------
Training Lasso...
Lasso - Training completed in 2.02 seconds.
--------------------------------------------------
Training ElasticNet...
ElasticNet - Training completed in 2.15 seconds.
--------------------------------------------------
Training SVR...
SVR - Training completed in 0.63 seconds.
--------------------------------------------------
Training RandomForest...
RandomForest - Training completed in 10.73 seconds.
--------------------------------------------------
Training GradientBoostingRegressor...
GradientBoostingRegressor - Training completed in 3.95 seconds.
--------------------------------------------------
Training LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002862 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [In

Unnamed: 0,Model,Mean RMSE,Std RMSE,Training Time (s)
3,SVR,0.123208,0.022918,0.626987
1,Lasso,0.127197,0.028523,2.024949
2,ElasticNet,0.127372,0.028488,2.152916
0,Ridge,0.127381,0.030866,1.47423
6,LightGBM,0.128994,0.019726,5.836633
5,GradientBoostingRegressor,0.12912,0.021801,3.954023
7,XGBoost,0.142119,0.018606,2.411495
4,RandomForest,0.142213,0.02262,10.732236
