# Initial Modelling for Ames Iowa Housing Dataset

## Goals
- Train models with minimally processed dataset to get understand of baseline performance
- Identify which models seem more promising for given dataset

## Imports

In [2]:
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from scipy.stats import boxcox

from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

RANDOM_SEED = 6




## Reading Data

In [12]:
train = pd.read_csv('train_processed.csv')
test = pd.read_csv('test_processed.csv')

features = train.iloc[:, :-1]
target = train.iloc[:, -1]

target_transformed = np.log1p(target)

In [4]:
features.shape, target.shape, test.shape

((1460, 244), (1460,), (1459, 244))

## Initialising Models

In [5]:
kfolds = KFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)

ridge = make_pipeline(RobustScaler(), RidgeCV(cv=kfolds))
lasso = make_pipeline(RobustScaler(), LassoCV(random_state=RANDOM_SEED, cv=kfolds))
elasticnet = make_pipeline(RobustScaler(), ElasticNetCV(cv=kfolds))                                
svr = make_pipeline(RobustScaler(), SVR())
rfr = RandomForestRegressor(random_state=RANDOM_SEED)
gbr = GradientBoostingRegressor(random_state=RANDOM_SEED)
lightgbm = LGBMRegressor(random_state=RANDOM_SEED)
xgboost = XGBRegressor(seed=RANDOM_SEED)

## Initial Modelling
- No hyperparameter tuning
- Target Transformed

In [None]:
results = {}
scoring = 'neg_mean_squared_error'
models = {
    'Ridge': ridge,
    'Lasso': lasso,
    'ElasticNet': elasticnet,
    'SVR': svr,
    'RandomForest': rfr,
    'GradientBoostingRegressor': gbr,
    'LightGBM': lightgbm,
    'XGBoost': xgboost
}

for name, model in models.items():
    print(f"Training {name}...")
    start_time = time.time()
    scores = cross_val_score(model, features, target_transformed, cv=kfolds, scoring=scoring)
    
    rmse_scores = np.sqrt(-scores)
    mean_rmse = np.mean(rmse_scores)
    std_rmse = np.std(rmse_scores)
    training_time = time.time() - start_time
    
    results[name] = {
        'Mean RMSE': mean_rmse,
        'Std RMSE': std_rmse,
        'Training Time (s)': training_time
    }
    
    print(f"{name} - Training completed in {training_time:.2f} seconds.")
    print("-" * 50)

results_df = pd.DataFrame(results).T.reset_index()
results_df.columns = ['Model', 'Mean RMSE', 'Std RMSE', 'Training Time (s)']
results_df.sort_values(by='Mean RMSE')

Training Ridge...
Ridge - Training completed in 1.49 seconds.
--------------------------------------------------
Training Lasso...
Lasso - Training completed in 1.91 seconds.
--------------------------------------------------
Training ElasticNet...
ElasticNet - Training completed in 2.23 seconds.
--------------------------------------------------
Training SVR...
SVR - Training completed in 0.60 seconds.
--------------------------------------------------
Training RandomForest...
RandomForest - Training completed in 10.48 seconds.
--------------------------------------------------
Training GradientBoostingRegressor...
GradientBoostingRegressor - Training completed in 3.97 seconds.
--------------------------------------------------
Training LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003311 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [In

Unnamed: 0,Model,Mean RMSE,Std RMSE,Training Time (s)
3,SVR,0.123208,0.022918,0.597942
1,Lasso,0.127197,0.028523,1.914059
2,ElasticNet,0.127372,0.028488,2.228375
0,Ridge,0.127381,0.030866,1.485419
6,LightGBM,0.128994,0.019726,5.926607
5,GradientBoostingRegressor,0.12912,0.021801,3.972504
7,XGBoost,0.142119,0.018606,2.387853
4,RandomForest,0.142213,0.02262,10.483357


## Next Steps
- Initial modelling of data is promising, all models perform reasonable well on data
- There is lots of possibility for feature engineering
- Models need hyperparameter tuning
- I'm not well educated on the topic but combining predictions from several models or using a stacking algorithm could further improve performance

In [None]:
lightgbm.fit(features, target_transformed)
scaled_predictions = np.expm1(lightgbm.predict(test))
submission = pd.DataFrame({
    'Id': list(range(1461, 2920)),
    'SalePrice': scaled_predictions
})
submission.head()

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004027 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3414
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 165
[LightGBM] [Info] Start training from score 12.024057


Unnamed: 0,Id,SalePrice
0,1461,136561.881733
1,1462,146569.937857
2,1463,161640.29579
3,1464,167950.781224
4,1465,205669.681839


In [26]:
submission.to_csv('first_submission_lgbm.csv', index=False)