In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import KFold, cross_validate, cross_val_predict
import statsmodels.api as sm
from statsmodels.tools.eval_measures import mse, rmse

house = pd.read_csv("https://djl-lms-assets.s3.eu-central-1.amazonaws.com/datasets/house_prices.csv", sep = ";")
house.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Model 1: OLS Regresyonu

In [2]:
# Modelimizi oluşturalım:
house.ExterQual = house.ExterQual.replace({"Ex":4, "Gd":3, "TA":2, "Fa":1, "Po":0})
house.BsmtQual = house.BsmtQual.replace({"Ex":4, "Gd":3, "TA":2, "Fa":1, np.nan:0})
house.GarageQual = house.GarageQual.replace({"Ex":4, "Gd":3, "TA":2, "Fa":1, np.nan:0, "Po":0})
house["TotalSF"] = house["TotalBsmtSF"] + house["1stFlrSF"] + house["2ndFlrSF"]

x1 = house[["TotalSF", "ExterQual","BsmtQual","GarageQual","OverallQual", "GrLivArea","YearBuilt","TotalBsmtSF","GrLivArea","FullBath","GarageCars","TotalSF"]]
y1 = house["SalePrice"]

In [3]:
kFold = KFold(10, shuffle=True, random_state=42)
mae1, mse1, rmse1, mape1, trainR_mean, testR_mean = ([] for i in range(6))

for train, test in kFold.split(x1, y1): 
    x1_train = x1.iloc[train]
    x1_test = x1.iloc[test]
    y1_train = y1.iloc[train]
    y1_test = y1.iloc[test]
    
    # Eğitim verilerinin R-kare değeri
    x1_train = sm.add_constant(x1_train)
    results1_model = sm.OLS(y1_train, x1_train)
    results1_train = results1_model.fit()
    trainR_mean.append(results1_train.rsquared_adj)

    # Test verilerinin R-kare değeri
    x1_test = sm.add_constant(x1_test)
    results1_model = sm.OLS(y1_test, x1_test)
    results1_test = results1_model.fit()
    testR_mean.append(results1_test.rsquared_adj)

    y1_predict = results1_train.predict(x1_test)

    mae0 = mean_absolute_error(y1_test, y1_predict)
    mse0 = mse(y1_test, y1_predict)
    rmse0 = rmse(y1_test, y1_predict)
    mape0 = (abs((y1_test - y1_predict)) / y1_test).mean() * 100
    
    mae1.append(mae0)
    mse1.append(mse0)
    rmse1.append(rmse0)
    mape1.append(mape0)

print(f"Eğitim verilerinin ayarlanmış R-kare değeri : {np.array(trainR_mean).mean():.3f}")
print(f"Test verilerinin ayarlanmış R-kare değeri   : {np.array(testR_mean).mean():.3f}")
print("\n---------Test Kümesi Hata İstatistikleri---------")
print(f"Ortalama Mutlak Hata (MAE)        : {np.array(mae1).mean():.2f}")
print(f"Ortalama Kare Hata (MSE)          : {np.array(mse1).mean():.2f}")
print(f"Kök Ortalama Kare Hata (RMSE)     : {np.array(rmse1).mean():.2f}")
print(f"Ortalama Mutlak Yüzde Hata (MAPE) : %{np.array(mape1).mean():.2f}")

Eğitim verilerinin ayarlanmış R-kare değeri : 0.780
Test verilerinin ayarlanmış R-kare değeri   : 0.804

---------Test Kümesi Hata İstatistikleri---------
Ortalama Mutlak Hata (MAE)        : 24096.95
Ortalama Kare Hata (MSE)          : 1476389737.26
Kök Ortalama Kare Hata (RMSE)     : 37533.09
Ortalama Mutlak Yüzde Hata (MAPE) : %14.53


## Model 2: Ridge Regresyonu

In [4]:
cv_results2 = cross_validate(Ridge(), x1_train, y1_train, cv=5, 
                             return_train_score = True)
cv_results2

{'fit_time': array([0.004987  , 0.00398946, 0.00299239, 0.00292039, 0.00304127]),
 'score_time': array([0.0009973 , 0.00299144, 0.00199413, 0.00103617, 0.00099993]),
 'test_score': array([0.81207802, 0.78251721, 0.78274387, 0.7884199 , 0.61366468]),
 'train_score': array([0.76429763, 0.7710003 , 0.76892021, 0.76992167, 0.80442761])}

In [5]:
y_test_predict = cross_val_predict(Ridge(), x1_test, y1_test, cv=5)

var1 = cv_results2["train_score"].mean()
var2 = cv_results2["test_score"].mean()

print(f"Eğitim verilerinin ayarlanmış R-kare değeri: {var1:.3f}")
print(f"Test verilerinin ayarlanmış R-kare değeri: {var2:.3f}")

ridge_CV = RidgeCV(alphas=np.logspace(-1, 5, 7)).fit(x1_train, y1_train)
var3 = ridge_CV.score(x1_train, y1_train)
var4 = ridge_CV.score(x1_test, y1_test)
print(f"\nRidgeCV modülü ile; \nEğitim verilerinin ayarlanmış R-kare değeri: {var3:.3f}")
print(f"Test verilerinin ayarlanmış R-kare değeri: {var4:.3f}")

mae2 = mean_absolute_error(y1_test, y_test_predict)
mse2 = mse(y1_test, y_test_predict)
rmse2 = rmse(y1_test, y_test_predict)
mape2 = (abs((y1_test - y_test_predict)) / y1_test).mean() * 100

print("\n---------Test Kümesi Hata İstatistikleri---------")
print(f"Ortalama Mutlak Hata (MAE)        : {mae2:.2f}")
print(f"Ortalama Kare Hata (MSE)          : {mse2:.2f}")
print(f"Kök Ortalama Kare Hata (RMSE)     : {rmse2:.2f}")
print(f"Ortalama Mutlak Yüzde Hata (MAPE) : %{mape2:.2f}")
      
y_test_predict = ridge_CV.predict(x1_test)
mae2 = mean_absolute_error(y1_test, y_test_predict)
mse2 = mse(y1_test, y_test_predict)
rmse2 = rmse(y1_test, y_test_predict)
mape2 = (abs((y1_test - y_test_predict)) / y1_test).mean() * 100

print("\nRidgeCV Modülü ile;\n---------Test Kümesi Hata İstatistikleri---------")
print(f"Ortalama Mutlak Hata (MAE)        : {mae2:.2f}")
print(f"Ortalama Kare Hata (MSE)          : {mse2:.2f}")
print(f"Kök Ortalama Kare Hata (RMSE)     : {rmse2:.2f}")
print(f"Ortalama Mutlak Yüzde Hata (MAPE) : %{mape2:.2f}")

Eğitim verilerinin ayarlanmış R-kare değeri: 0.776
Test verilerinin ayarlanmış R-kare değeri: 0.756

RidgeCV modülü ile; 
Eğitim verilerinin ayarlanmış R-kare değeri: 0.774
Test verilerinin ayarlanmış R-kare değeri: 0.851

---------Test Kümesi Hata İstatistikleri---------
Ortalama Mutlak Hata (MAE)        : 21066.42
Ortalama Kare Hata (MSE)          : 835140546.21
Kök Ortalama Kare Hata (RMSE)     : 28898.80
Ortalama Mutlak Yüzde Hata (MAPE) : %12.31

RidgeCV Modülü ile;
---------Test Kümesi Hata İstatistikleri---------
Ortalama Mutlak Hata (MAE)        : 19605.79
Ortalama Kare Hata (MSE)          : 799056156.11
Kök Ortalama Kare Hata (RMSE)     : 28267.58
Ortalama Mutlak Yüzde Hata (MAPE) : %11.10


## Model 3: Lasso Regresyonu

In [6]:
cv_results3 = cross_validate(Lasso(), x1_train, y1_train, cv=5, 
                             return_train_score = True)
cv_results3

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'fit_time': array([0.03091669, 0.02199626, 0.0156219 , 0.0374496 , 0.02194095]),
 'score_time': array([0.00199485, 0.        , 0.        , 0.00099754, 0.00199533]),
 'test_score': array([0.81275845, 0.78243129, 0.7827679 , 0.78858437, 0.61369945]),
 'train_score': array([0.76429598, 0.77100068, 0.76891931, 0.76991725, 0.80442779])}

In [7]:
ey_test_predict = cross_val_predict(Lasso(), x1_test, y1_test, cv=5)

var1 = cv_results3["train_score"].mean()
var2 = cv_results3["test_score"].mean()

print(f"Eğitim verilerinin ayarlanmış R-kare değeri: {var1:.3f}")
print(f"Test verilerinin ayarlanmış R-kare değeri: {var2:.3f}")

lasso_CV = LassoCV(alphas=np.logspace(-1, 5, 7)).fit(x1_train, y1_train)
var3 = lasso_CV.score(x1_train, y1_train)
var4 = lasso_CV.score(x1_test, y1_test)
print(f"\nLassoCV modülü ile; \nEğitim verilerinin ayarlanmış R-kare değeri: {var3:.3f}")
print(f"Test verilerinin ayarlanmış R-kare değeri: {var4:.3f}")

mae3 = mean_absolute_error(y1_test, y_test_predict)
mse3 = mse(y1_test, y_test_predict)
rmse3 = rmse(y1_test, y_test_predict)
mape3 = (abs((y1_test - y_test_predict)) / y1_test).mean() * 100

print("\n---------Test Kümesi Hata İstatistikleri---------")
print(f"Ortalama Mutlak Hata (MAE)        : {mae3:.2f}")
print(f"Ortalama Kare Hata (MSE)          : {mse3:.2f}")
print(f"Kök Ortalama Kare Hata (RMSE)     : {rmse3:.2f}")
print(f"Ortalama Mutlak Yüzde Hata (MAPE) : %{mape3:.2f}")
      
y_test_predict = lasso_CV.predict(x1_test)
mae3 = mean_absolute_error(y1_test, y_test_predict)
mse3 = mse(y1_test, y_test_predict)
rmse3 = rmse(y1_test, y_test_predict)
mape3 = (abs((y1_test - y_test_predict)) / y1_test).mean() * 100

print("\nLassoCV Modülü ile;\n---------Test Kümesi Hata İstatistikleri---------")
print(f"Ortalama Mutlak Hata (MAE)        : {mae3:.2f}")
print(f"Ortalama Kare Hata (MSE)          : {mse3:.2f}")
print(f"Kök Ortalama Kare Hata (RMSE)     : {rmse3:.2f}")
print(f"Ortalama Mutlak Yüzde Hata (MAPE) : %{mape3:.2f}")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast

Eğitim verilerinin ayarlanmış R-kare değeri: 0.776
Test verilerinin ayarlanmış R-kare değeri: 0.756

LassoCV modülü ile; 
Eğitim verilerinin ayarlanmış R-kare değeri: 0.774
Test verilerinin ayarlanmış R-kare değeri: 0.851

---------Test Kümesi Hata İstatistikleri---------
Ortalama Mutlak Hata (MAE)        : 19605.79
Ortalama Kare Hata (MSE)          : 799056156.11
Kök Ortalama Kare Hata (RMSE)     : 28267.58
Ortalama Mutlak Yüzde Hata (MAPE) : %11.10

LassoCV Modülü ile;
---------Test Kümesi Hata İstatistikleri---------
Ortalama Mutlak Hata (MAE)        : 19600.48
Ortalama Kare Hata (MSE)          : 795313366.68
Kök Ortalama Kare Hata (RMSE)     : 28201.30
Ortalama Mutlak Yüzde Hata (MAPE) : %11.10


## Model 4: ElasticNet Regresyonu

In [8]:
cv_results4 = cross_validate(ElasticNet(), x1_train, y1_train, cv=5, 
                             return_train_score = True)
cv_results4

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'fit_time': array([0.02892256, 0.02194095, 0.00800395, 0.01560926, 0.03539038]),
 'score_time': array([0.00199413, 0.00199509, 0.01563406, 0.        , 0.00199461]),
 'test_score': array([0.78934743, 0.77863439, 0.77963594, 0.78734335, 0.56946822]),
 'train_score': array([0.75122288, 0.7548449 , 0.75336884, 0.75429446, 0.7921006 ])}

In [9]:
y_test_predict = cross_val_predict(ElasticNet(), x1_test, y1_test, cv=5)

var1 = cv_results4["train_score"].mean()
var2 = cv_results4["test_score"].mean()

print(f"Eğitim verilerinin ayarlanmış R-kare değeri: {var1:.3f}")
print(f"Test verilerinin ayarlanmış R-kare değeri: {var2:.3f}")

elasticNet_CV = ElasticNetCV(alphas=np.logspace(-1, 5, 7)).fit(x1_train, y1_train)
var3 = elasticNet_CV.score(x1_train, y1_train)
var4 = elasticNet_CV.score(x1_test, y1_test)
print(f"\nElasticNetCV modülü ile; \nEğitim verilerinin ayarlanmış R-kare değeri: {var3:.3f}")
print(f"Test verilerinin ayarlanmış R-kare değeri: {var4:.3f}")

mae4 = mean_absolute_error(y1_test, y_test_predict)
mse4 = mse(y1_test, y_test_predict)
rmse4 = rmse(y1_test, y_test_predict)
mape4 = (abs((y1_test - y_test_predict)) / y1_test).mean() * 100

print("\n---------Test Kümesi Hata İstatistikleri---------")
print(f"Ortalama Mutlak Hata (MAE)        : {mae4:.2f}")
print(f"Ortalama Kare Hata (MSE)          : {mse4:.2f}")
print(f"Kök Ortalama Kare Hata (RMSE)     : {rmse4:.2f}")
print(f"Ortalama Mutlak Yüzde Hata (MAPE) : %{mape4:.2f}")
      
y_test_predict = elasticNet_CV.predict(x1_test)
mae4 = mean_absolute_error(y1_test, y_test_predict)
mse4 = mse(y1_test, y_test_predict)
rmse4 = rmse(y1_test, y_test_predict)
mape4 = (abs((y1_test - y_test_predict)) / y1_test).mean() * 100

print("\nElasticNetCV Modülü ile;\n---------Test Kümesi Hata İstatistikleri---------")
print(f"Ortalama Mutlak Hata (MAE)        : {mae4:.2f}")
print(f"Ortalama Kare Hata (MSE)          : {mse4:.2f}")
print(f"Kök Ortalama Kare Hata (RMSE)     : {rmse4:.2f}")
print(f"Ortalama Mutlak Yüzde Hata (MAPE) : %{mape4:.2f}")
del var1, var2, var3, var4

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


Eğitim verilerinin ayarlanmış R-kare değeri: 0.761
Test verilerinin ayarlanmış R-kare değeri: 0.741

ElasticNetCV modülü ile; 
Eğitim verilerinin ayarlanmış R-kare değeri: 0.773
Test verilerinin ayarlanmış R-kare değeri: 0.848

---------Test Kümesi Hata İstatistikleri---------
Ortalama Mutlak Hata (MAE)        : 22016.89
Ortalama Kare Hata (MSE)          : 1007551056.90
Kök Ortalama Kare Hata (RMSE)     : 31741.94
Ortalama Mutlak Yüzde Hata (MAPE) : %12.72

ElasticNetCV Modülü ile;
---------Test Kümesi Hata İstatistikleri---------
Ortalama Mutlak Hata (MAE)        : 19591.49
Ortalama Kare Hata (MSE)          : 814212583.66
Kök Ortalama Kare Hata (RMSE)     : 28534.41
Ortalama Mutlak Yüzde Hata (MAPE) : %11.05


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(


## Yorumlar:
Test verileri için Ayarlanmış R-kare Değerleri:
ElasticNet: 0.727
Ridge & Lasso: 0.739
OLS: 0.776
RidgeCV, LassoCV & ElasticNetCV: 0.819
Test Kümesi, Ortalama Mutlak Yüzde Hata Değerleri:
OLS: %15.08
ElasticNet: %13.45
Ridge & Lasso: %12.92
RidgeCV & LassoCV: %12.06
ElasticNetCV: %12.00
En iyi modelin, ElasticNetCV modülü ile oluşturulan ElasticNet regresyon modeli olduğunu görürüz.