In [None]:
import os

import joblib
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor


In [None]:
# Registry of evaluation results, keyed by model display name.
model_scores = {}

# Features are every column except the target; the target is the 'log_price' column.
# (The original line was missing the closing quote on 'log_price', a SyntaxError.)
X = df_with_dummies.drop(columns=['log_price'])
y = df_with_dummies['log_price']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Tuned linear regression: grid-search once, cache the winning estimator on disk,
# then evaluate it on the train and test splits and record the results.
saved_model_name = 'model_lr_TUNED.joblib'

# Only 'fit_intercept' is searched. The 'normalize' option was removed from
# LinearRegression in scikit-learn 1.2, so keeping it in the grid makes every
# candidate fit fail with an invalid-parameter error. Dropping it should not change
# the fitted predictions: unregularised least squares is insensitive to feature
# rescaling (scale inside a Pipeline with StandardScaler if scaling is wanted).
param_grid = {
    'fit_intercept': [True, False]
}

if os.path.exists(saved_model_name):
    # Reuse the estimator cached by a previous run instead of re-running the search.
    loaded_model = joblib.load(saved_model_name)
    model_lr_TUNED = loaded_model
else:
    grid_search = GridSearchCV(LinearRegression(), param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    model_lr_TUNED = grid_search.best_estimator_
    joblib.dump(model_lr_TUNED, saved_model_name)

y_pred = model_lr_TUNED.predict(X_test)

# .score() is the estimator's default regression score (R²), reported for both splits.
train_score = model_lr_TUNED.score(X_train, y_train)
test_score = model_lr_TUNED.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

params = model_lr_TUNED.get_params()

model_scores['Linear Regression TUNED'] = {
    'Train Score': train_score,
    'Test Score': test_score,
    'Mean Squared Error': mse,
    'R2 Score': r2,
    'Used Parameters': params
}

print(f"Train Score (R²): {train_score}")
print(f"Test Score (R²): {test_score}")
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")


In [None]:
# K-nearest-neighbours regressor: tune (or reload from cache), evaluate, record.
saved_model_name_knn = 'model_knn_TUNED.joblib'

# Candidate values for the number of neighbours and for the 'p' parameter.
param_grid_knn = dict(
    n_neighbors=[3, 5, 7, 10, 15],
    p=[1, 2],
)

# Run the cv=5 grid search only when no cached estimator exists on disk.
if not os.path.exists(saved_model_name_knn):
    grid_search_knn = GridSearchCV(KNeighborsRegressor(), param_grid_knn, cv=5)
    grid_search_knn.fit(X_train, y_train)
    model_knn_TUNED = grid_search_knn.best_estimator_
    joblib.dump(model_knn_TUNED, saved_model_name_knn)
else:
    model_knn_TUNED = joblib.load(saved_model_name_knn)

# Test-split predictions and the metrics derived from them.
y_pred_knn = model_knn_TUNED.predict(X_test)
mse_knn = mean_squared_error(y_test, y_pred_knn)
r2_knn = r2_score(y_test, y_pred_knn)

# Estimator's own score on each split.
train_score_knn, test_score_knn = (
    model_knn_TUNED.score(X_train, y_train),
    model_knn_TUNED.score(X_test, y_test),
)

params_knn = model_knn_TUNED.get_params()

# Store everything under the model's display name.
model_scores['KNN Regression TUNED'] = {
    'Train Score': train_score_knn,
    'Test Score': test_score_knn,
    'Mean Squared Error': mse_knn,
    'R² Score': r2_knn,
    'Used Parameters': params_knn,
}

# One report line per metric.
print(
    f"KNN Regression - Train Score (R²): {train_score_knn}",
    f"KNN Regression - Test Score (R²): {test_score_knn}",
    f"KNN Regression - Mean Squared Error: {mse_knn}",
    f"KNN Regression - R² Score: {r2_knn}",
    f"KNN Regression - Best Parameters: {params_knn}",
    sep="\n",
)

In [None]:
# Cache file names for the two regularised linear models handled in this cell.
saved_model_name_ridge = 'model_ridge_TUNED.joblib'
saved_model_name_lasso = 'model_lasso_TUNED.joblib'

# Search spaces: regularisation strength and whether to fit an intercept.
param_grid_ridge = dict(
    alpha=[0.01, 0.1, 1, 10, 100],
    fit_intercept=[True, False],
)

param_grid_lasso = dict(
    alpha=[0.01, 0.1, 1, 10, 100],
    fit_intercept=[True, False],
)

# Ridge: search and cache only when no saved estimator is available.
if not os.path.exists(saved_model_name_ridge):
    grid_search_ridge = GridSearchCV(Ridge(), param_grid_ridge, cv=5)
    grid_search_ridge.fit(X_train, y_train)
    model_ridge_TUNED = grid_search_ridge.best_estimator_
    joblib.dump(model_ridge_TUNED, saved_model_name_ridge)
else:
    model_ridge_TUNED = joblib.load(saved_model_name_ridge)

# Evaluate on the held-out rows.
y_pred_ridge = model_ridge_TUNED.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

train_score_ridge, test_score_ridge = (
    model_ridge_TUNED.score(X_train, y_train),
    model_ridge_TUNED.score(X_test, y_test),
)

params_ridge = model_ridge_TUNED.get_params()

# Record the results under the model's display name.
model_scores['Ridge Regression TUNED'] = {
    'Train Score': train_score_ridge,
    'Test Score': test_score_ridge,
    'Mean Squared Error': mse_ridge,
    'R2 Score': r2_ridge,
    'Used Parameters': params_ridge,
}

print(
    f"Ridge Regression - Train Score (R²): {train_score_ridge}",
    f"Ridge Regression - Test Score (R²): {test_score_ridge}",
    f"Ridge Regression - Mean Squared Error: {mse_ridge}",
    f"Ridge Regression - R² Score: {r2_ridge}",
    sep="\n",
)

# Lasso: reuse the cached estimator when present, otherwise grid-search and cache it.
if not os.path.exists(saved_model_name_lasso):
    grid_search_lasso = GridSearchCV(Lasso(), param_grid_lasso, cv=5)
    grid_search_lasso.fit(X_train, y_train)
    model_lasso_TUNED = grid_search_lasso.best_estimator_
    joblib.dump(model_lasso_TUNED, saved_model_name_lasso)
else:
    model_lasso_TUNED = joblib.load(saved_model_name_lasso)

# Score on both splits first, then compute the prediction-based metrics.
train_score_lasso = model_lasso_TUNED.score(X_train, y_train)
test_score_lasso = model_lasso_TUNED.score(X_test, y_test)

y_pred_lasso = model_lasso_TUNED.predict(X_test)
mse_lasso, r2_lasso = (
    mean_squared_error(y_test, y_pred_lasso),
    r2_score(y_test, y_pred_lasso),
)

params_lasso = model_lasso_TUNED.get_params()

# Pair each result label with its value and store the mapping for this model.
model_scores['Lasso Regression TUNED'] = dict(
    zip(
        ('Train Score', 'Test Score', 'Mean Squared Error', 'R2 Score', 'Used Parameters'),
        (train_score_lasso, test_score_lasso, mse_lasso, r2_lasso, params_lasso),
    )
)

print(
    f"Lasso Regression - Train Score (R²): {train_score_lasso}",
    f"Lasso Regression - Test Score (R²): {test_score_lasso}",
    f"Lasso Regression - Mean Squared Error: {mse_lasso}",
    f"Lasso Regression - R² Score: {r2_lasso}",
    sep="\n",
)

In [None]:
# Elastic Net: tune alpha / l1_ratio / intercept (or reload the cached winner), then evaluate.
saved_model_name_elastic = 'model_elastic_TUNED.joblib'

param_grid_elastic = dict(
    alpha=[0.01, 0.1, 1, 10, 100],
    l1_ratio=[0.1, 0.5, 0.7, 0.9],
    fit_intercept=[True, False],
)

# The grid search is skipped whenever a saved estimator is found on disk.
if not os.path.exists(saved_model_name_elastic):
    grid_search_elastic = GridSearchCV(ElasticNet(), param_grid_elastic, cv=5)
    grid_search_elastic.fit(X_train, y_train)
    model_elastic_TUNED = grid_search_elastic.best_estimator_
    joblib.dump(model_elastic_TUNED, saved_model_name_elastic)
else:
    model_elastic_TUNED = joblib.load(saved_model_name_elastic)

# Metrics on the held-out split, plus the estimator's own score on each split.
y_pred_elastic = model_elastic_TUNED.predict(X_test)
mse_elastic = mean_squared_error(y_test, y_pred_elastic)
r2_elastic = r2_score(y_test, y_pred_elastic)
train_score_elastic = model_elastic_TUNED.score(X_train, y_train)
test_score_elastic = model_elastic_TUNED.score(X_test, y_test)

# Pull the two headline hyper-parameters out of the full parameter dict.
params_elastic = model_elastic_TUNED.get_params()
alpha_used, l1_ratio_used = params_elastic['alpha'], params_elastic['l1_ratio']

model_scores['Elastic Net Regression TUNED'] = {
    'Train Score': train_score_elastic,
    'Test Score': test_score_elastic,
    'Mean Squared Error': mse_elastic,
    'R² Score': r2_elastic,
    'Lambda (Alpha)': alpha_used,
    'L1 Ratio': l1_ratio_used,
    'Used Parameters': params_elastic,
}

print(
    f"Elastic Net Regression - Train Score (R²): {train_score_elastic}",
    f"Elastic Net Regression - Test Score (R²): {test_score_elastic}",
    f"Elastic Net Regression - Mean Squared Error: {mse_elastic}",
    f"Elastic Net Regression - R² Score: {r2_elastic}",
    f"Elastic Net Regression - Lambda (Alpha): {alpha_used}",
    f"Elastic Net Regression - L1 Ratio: {l1_ratio_used}",
    sep="\n",
)


In [None]:
# Tuned decision-tree regressor: grid-search once, cache the winner, evaluate, record.
saved_model_name_dt = 'model_dt_TUNED.joblib'

# The criterion names 'mse' and 'mae' were renamed to 'squared_error' and
# 'absolute_error' (the old spellings were removed in scikit-learn 1.2), so the
# old values make those candidate fits fail instead of being evaluated.
param_grid_dt = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error']
}

# Decision Tree Regression with GridSearchCV
if os.path.exists(saved_model_name_dt):
    # Reuse the estimator cached by a previous run instead of re-running the search.
    model_dt_TUNED = joblib.load(saved_model_name_dt)
else:
    # Fixed seed so repeated searches select and fit the same tree.
    grid_search_dt = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid_dt, cv=5)
    grid_search_dt.fit(X_train, y_train)
    model_dt_TUNED = grid_search_dt.best_estimator_
    joblib.dump(model_dt_TUNED, saved_model_name_dt)

y_pred_dt = model_dt_TUNED.predict(X_test)

# .score() is the estimator's default regression score (R²), reported for both splits.
train_score_dt = model_dt_TUNED.score(X_train, y_train)
test_score_dt = model_dt_TUNED.score(X_test, y_test)
mse_dt = mean_squared_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)

params_dt = model_dt_TUNED.get_params()

model_scores['Decision Tree Regression TUNED'] = {
    'Train Score': train_score_dt,
    'Test Score': test_score_dt,
    'Mean Squared Error': mse_dt,
    'R² Score': r2_dt,
    'Used Parameters': params_dt
}

print(f"Decision Tree Regression - Train Score (R²): {train_score_dt}")
print(f"Decision Tree Regression - Test Score (R²): {test_score_dt}")
print(f"Decision Tree Regression - Mean Squared Error: {mse_dt}")
print(f"Decision Tree Regression - R² Score: {r2_dt}")
print(f"Decision Tree Regression - Best Parameters: {params_dt}")

In [None]:
# Tuned random-forest regressor: grid-search once, cache the winner, evaluate, record.
saved_model_name_rf = 'model_rf_TUNED.joblib'

param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Random Forest Regression with GridSearchCV
if os.path.exists(saved_model_name_rf):
    # Reuse the estimator cached by a previous run instead of re-running the search.
    model_rf_TUNED = joblib.load(saved_model_name_rf)
else:
    # The forest is randomised; without a seed each run could select and fit a
    # different model. n_jobs=-1 runs the cross-validation fits on all cores and
    # does not affect the results.
    grid_search_rf = GridSearchCV(
        RandomForestRegressor(random_state=42),
        param_grid_rf,
        cv=5,
        n_jobs=-1,
    )
    grid_search_rf.fit(X_train, y_train)
    model_rf_TUNED = grid_search_rf.best_estimator_
    joblib.dump(model_rf_TUNED, saved_model_name_rf)

y_pred_rf = model_rf_TUNED.predict(X_test)

# .score() is the estimator's default regression score (R²), reported for both splits.
train_score_rf = model_rf_TUNED.score(X_train, y_train)
test_score_rf = model_rf_TUNED.score(X_test, y_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

params_rf = model_rf_TUNED.get_params()

model_scores['Random Forest Regression TUNED'] = {
    'Train Score': train_score_rf,
    'Test Score': test_score_rf,
    'Mean Squared Error': mse_rf,
    'R² Score': r2_rf,
    'Used Parameters': params_rf
}

print(f"Random Forest Regression - Train Score (R²): {train_score_rf}")
print(f"Random Forest Regression - Test Score (R²): {test_score_rf}")
print(f"Random Forest Regression - Mean Squared Error: {mse_rf}")
print(f"Random Forest Regression - R² Score: {r2_rf}")
print(f"Random Forest Regression - Best Parameters: {params_rf}")

In [None]:
# Tuned gradient-boosting regressor: grid-search once, cache the winner, evaluate, record.
saved_model_name_gb = 'model_gb_TUNED.joblib'

# 3 values for each of 5 parameters = 243 candidates, each fitted cv=5 times.
param_grid_gb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Gradient Boosting Regression with GridSearchCV
if os.path.exists(saved_model_name_gb):
    # Reuse the estimator cached by a previous run instead of re-running the search.
    model_gb_TUNED = joblib.load(saved_model_name_gb)
else:
    # Fixed seed so repeated searches select and fit the same model. n_jobs=-1 runs
    # the cross-validation fits on all cores and does not affect the results.
    grid_search_gb = GridSearchCV(
        GradientBoostingRegressor(random_state=42),
        param_grid_gb,
        cv=5,
        n_jobs=-1,
    )
    grid_search_gb.fit(X_train, y_train)
    model_gb_TUNED = grid_search_gb.best_estimator_
    joblib.dump(model_gb_TUNED, saved_model_name_gb)

y_pred_gb = model_gb_TUNED.predict(X_test)

# .score() is the estimator's default regression score (R²), reported for both splits.
train_score_gb = model_gb_TUNED.score(X_train, y_train)
test_score_gb = model_gb_TUNED.score(X_test, y_test)
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

params_gb = model_gb_TUNED.get_params()

model_scores['Gradient Boosting Regression TUNED'] = {
    'Train Score': train_score_gb,
    'Test Score': test_score_gb,
    'Mean Squared Error': mse_gb,
    'R² Score': r2_gb,
    'Used Parameters': params_gb
}

print(f"Gradient Boosting Regression - Train Score (R²): {train_score_gb}")
print(f"Gradient Boosting Regression - Test Score (R²): {test_score_gb}")
print(f"Gradient Boosting Regression - Mean Squared Error: {mse_gb}")
print(f"Gradient Boosting Regression - R² Score: {r2_gb}")
print(f"Gradient Boosting Regression - Best Parameters: {params_gb}")

In [None]:
# Grouped bar chart comparing each recorded model's train and test score.
models = list(model_scores)
train_scores = [model_scores[name]["Train Score"] for name in models]
test_scores = [model_scores[name]["Test Score"] for name in models]

x = range(len(models))
bar_width = 0.4

fig, ax = plt.subplots(figsize=(10, 6))

# Train bars sit on the integer positions; test bars are shifted right by one bar width.
ax.bar(x, train_scores, width=bar_width, label='Train Score', color='b', align='center')
ax.bar(
    [pos + bar_width for pos in x],
    test_scores,
    width=bar_width,
    label='Test Score',
    color='orange',
    align='center',
)

ax.set_xlabel('Models')
ax.set_ylabel('Scores')
ax.set_title('Train and Test Scores of Different Models')

# Put each tick midway between a model's two bars and rotate the labels to fit.
ax.set_xticks([pos + 0.2 for pos in x])
ax.set_xticklabels(models, rotation=90)

# The y-axis is limited to the 0.5-1 range.
ax.set_ylim(0.5, 1)
ax.legend()
plt.show()

In [None]:
# Report the models with the highest and lowest test score among all recorded results.
# The parameter dict is stored under 'Used Parameters' (capital P) in every
# model_scores entry; looking it up as 'Used parameters' raised KeyError.
best_model = max(model_scores, key=lambda name: model_scores[name]['Test Score'])
print('the best model -', best_model)
print(model_scores[best_model]['Train Score'])
print(model_scores[best_model]['Test Score'])
print(model_scores[best_model]['Used Parameters'])

worst_model = min(model_scores, key=lambda name: model_scores[name]['Test Score'])
print('the worst model -', worst_model)
print(model_scores[worst_model]['Train Score'])
print(model_scores[worst_model]['Test Score'])
print(model_scores[worst_model]['Used Parameters'])