In [1]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
df = pd.read_csv('data_cleaned.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6116 entries, 0 to 6115
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   city          6116 non-null   object
 1   district      6116 non-null   object
 2   neighborhood  6116 non-null   object
 3   room          6116 non-null   int64 
 4   livingRoom    6116 non-null   int64 
 5   area          6116 non-null   int64 
 6   age           6116 non-null   int64 
 7   floor         6116 non-null   int64 
 8   price         6116 non-null   int64 
dtypes: int64(6), object(3)
memory usage: 430.2+ KB


In [4]:
df['city'] = df['city'].astype('category')
df['district'] = df['district'].astype('category')
df['neighborhood'] = df['neighborhood'].astype('category')
df['room'] = df['room'].astype('int')
df['livingRoom'] = df['livingRoom'].astype('int')
df['area'] = df['area'].astype('int')
df['age'] = df['age'].astype('int')
df['floor'] = df['floor'].astype('int')
df['price'] = df['price'].astype('int')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6116 entries, 0 to 6115
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          6116 non-null   category
 1   district      6116 non-null   category
 2   neighborhood  6116 non-null   category
 3   room          6116 non-null   int32   
 4   livingRoom    6116 non-null   int32   
 5   area          6116 non-null   int32   
 6   age           6116 non-null   int32   
 7   floor         6116 non-null   int32   
 8   price         6116 non-null   int32   
dtypes: category(3), int32(6)
memory usage: 191.7 KB


In [6]:
categoricalFeatures = ['city','district','neighborhood']
numericalFeatures = ['room','livingRoom','area','age','floor']

In [7]:
fullPipeline = ColumnTransformer([
    ('num',StandardScaler(),numericalFeatures),
    ('cat',OneHotEncoder(handle_unknown='ignore'),categoricalFeatures)
])

In [8]:
X = df.drop('price',axis=1)
y = df['price']

In [9]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [10]:
model = Pipeline([
    ('preparation', fullPipeline),
    ('model', LinearRegression())
])

In [11]:
model.fit(X_train,y_train)

In [12]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test,y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test,y_pred)

In [13]:
print(f"MSE:{mse}")
print(f"RMSE:{rmse}")
print(f"R^2:{r2}")

MSE:42472210.43862266
RMSE:6517.070694615999
R^2:0.5816311312131912


In [14]:
featureImportances = model.named_steps['model'].coef_

In [15]:
featureImportances

array([ 9.23616380e+02,  0.00000000e+00,  3.27932435e+03, -2.21293134e+03,
        1.84089786e+02, -3.21623416e+03,  1.12655715e+03, -3.73853812e+03,
        5.45325285e+03, -2.19728964e+03,  2.57225191e+03, -8.68558250e+02,
       -1.24282286e+03, -1.44318609e+03, -5.44715844e+03,  5.01442913e+03,
       -9.68492766e+03, -6.99930488e+03, -7.24451066e+03,  1.71011304e+04,
       -2.40937268e+03, -2.45262740e+03, -2.44221991e+03,  3.06486295e+03,
       -1.90327546e+03, -1.27648346e+03,  1.74332327e+04, -4.98038574e+03,
       -2.20806984e+03, -2.59155170e+03, -3.05871742e+03,  6.24272134e+03,
        1.15864550e+03, -9.46478178e+02, -1.57146370e+03, -2.92092047e+03,
        9.27953672e+02,  1.08695266e+04, -2.24172336e+03,  9.13248341e+03,
        3.13932015e+03,  6.96726550e+02, -7.53562806e+03, -7.96229039e+02,
       -7.22564866e+02,  1.63387116e+03, -9.51356906e+03,  1.44385650e+04,
       -4.37630557e+03,  3.73442149e+03, -1.02312408e+03,  1.28247332e+04,
        8.61666015e+03,  

In [16]:
print("Numerical Feateres")
for i in range(len(numericalFeatures)):
    print(numericalFeatures[i],featureImportances[i])

Numerical Feateres
room 923.6163804009475
livingRoom 0.0
area 3279.3243512171216
age -2212.931342690237
floor 184.08978630644995


In [17]:
print("Categorical Feateres")
for i in range(len(categoricalFeatures)):
    for j in range(len(model.named_steps['preparation'].transformers_[1][1].categories_[i])):
        print(model.named_steps['preparation'].transformers_[1][1].categories_[i][j],featureImportances[len(numericalFeatures)+ j])

Categorical Feateres
afyonkarahisar -3216.234161380194
aydin 1126.5571536475509
denizli -3738.5381157806955
izmir 5453.252849893863
manisa -2197.289636958381
mugla 2572.251910554541
acipayam -3216.234161380194
akhisar 1126.5571536475509
alasehir -3738.5381157806955
aliaga 5453.252849893863
balcova -2197.289636958381
bayindir 2572.251910554541
bayrakli -868.558249841615
bergama -1242.822858416953
bodrum -1443.1860938864497
bornova -5447.158439557202
buca 5014.429131230781
buharkent -9684.927657291943
cameli -6999.304877043124
cardak -7244.51065519287
cay 17101.130389494225
cesme -2409.3726826849934
cigli -2452.6273988212606
cine -2442.2199122196257
civril 3064.862952086173
dalaman -1903.2754575179897
datca -1276.4834598027755
demirci 17433.232680815003
didim -4980.385742394562
dikili -2208.0698405164503
efeler -2591.5516980401844
fethiye -3058.717416869649
foca 6242.721344529302
gaziemir 1158.645501310554
germencik -946.4781776240742
guzelbahce -1571.4637029576131
honaz -2920.9204668131

The state with a tolerance value.

In [27]:
def tolerance_r2(y_true, y_pred,tolerance):
    residuals = y_pred - y_true
    residuals[np.abs(residuals) <= tolerance] = 0
    ssr = np.sum(residuals**2)
    sst = np.sum((y_true - np.mean(y_true))**2)
    return 1 - (ssr / sst)


def tolerance_percentange_r2(y_true, y_pred,tolerance):
    residuals = y_pred - y_true
    residuals[(np.abs(residuals) / y_true )<= tolerance] = 0
    ssr = np.sum(residuals**2)
    sst = np.sum((y_true - np.mean(y_true))**2)
    return 1 - (ssr / sst)


In [28]:
print(r2_score(y_test,y_pred))

0.5816311312131912


In [29]:
print(tolerance_r2(y_test,y_pred,1000))

0.5823401100987761


In [30]:
print(tolerance_percentange_r2(y_test,y_pred,0.30))

0.6594658853703971
