In [18]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import (LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor, HuberRegressor)
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle

In [7]:
# Load dataset
data = pd.read_csv(r"D:\Dropbox\DS & AI\Full stack DS and AI course\27 Jun\HOUSING REGRESSOR- capstone project\USA_Housing.csv")

In [8]:
# Preprocessing
X = data.drop(['Price', 'Address'], axis=1) 
y = data['Price']

In [9]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [11]:
# Define models
models = {
    'LinearRegression': LinearRegression(),
    'RobustRegression': HuberRegressor(),
    'RidgeRegression': Ridge(),
    'LassoRegression': Lasso(),
    'ElasticNet': ElasticNet(),
    'PolynomialRegression': Pipeline([
        ('poly', PolynomialFeatures(degree=4)),
        ('linear', LinearRegression())
    ]),
    'SGDRegressor': SGDRegressor(),
    'ANN': MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000),
    'RandomForest': RandomForestRegressor(),
    'SVM': SVR(),
    'LGBM': lgb.LGBMRegressor(),
    'XGBoost': xgb.XGBRegressor(),
    'KNN': KNeighborsRegressor()
}



In [13]:
# Train and evaluate models
results = []
 
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred=model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae= mean_absolute_error(y_test, y_pred)

    results.append({
        'Model': name,
        'MSE': mse,
        'R2 Score': r2,
        'MAE': mae
    })

    with open(f'{name}_model.pkl', 'wb') as file:
        pickle.dump(model, file)    


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000327 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1256
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 5
[LightGBM] [Info] Start training from score 1231911.452183


In [14]:
# Convert results to DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('model_evaluation_results.csv', index=False)

print("Models have been trained and saved as pickle files. Evaluation results have been saved to model_evaluation_results.csv.")

Models have been trained and saved as pickle files. Evaluation results have been saved to model_evaluation_results.csv.


In [15]:
#Convert results to DataFrame
results_df = pd.DataFrame(results)  

In [16]:
#Save the results to a CSV file
results_df.to_csv('model_results.csv', index=False)

In [17]:
#Print the results
print("Model Evaluation Results:")
print(results_df)

Model Evaluation Results:
                   Model           MSE      R2 Score           MAE
0       LinearRegression  1.054972e+10  9.146455e-01  8.265795e+04
1       RobustRegression  6.166491e+10  5.010882e-01  1.994656e+05
2        RidgeRegression  1.054975e+10  9.146453e-01  8.265967e+04
3        LassoRegression  1.054972e+10  9.146455e-01  8.265795e+04
4             ElasticNet  1.508138e+10  8.779812e-01  9.912681e+04
5   PolynomialRegression  1.087026e+10  9.120521e-01  8.410057e+04
6           SGDRegressor  8.426740e+35 -6.817816e+24  9.098289e+17
7                    ANN  6.149688e+10  5.024476e-01  1.993568e+05
8           RandomForest  1.505877e+10  8.781642e-01  9.805428e+04
9                    SVM  1.235466e+11  4.227862e-04  2.829477e+05
10                  LGBM  1.309771e+10  8.940305e-01  9.213399e+04
11               XGBoost  1.613868e+10  8.694269e-01  1.015652e+05
12                   KNN  6.039581e+10  5.113561e-01  1.980862e+05
