# Estimation Of Rent By Linear Regression

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [4]:
df = pd.read_csv('data_cleaned.csv')

In [5]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6116 entries, 0 to 6115
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   city          6116 non-null   object
 1   district      6116 non-null   object
 2   neighborhood  6116 non-null   object
 3   room          6116 non-null   int64 
 4   living_room   6116 non-null   int64 
 5   area          6116 non-null   int64 
 6   age           6116 non-null   int64 
 7   floor         6116 non-null   int64 
 8   price         6116 non-null   int64 
dtypes: int64(6), object(3)
memory usage: 430.2+ KB
None


In [6]:
df['city'] = df['city'].astype('category')
df['district'] = df['district'].astype('category')
df['neighborhood'] = df['neighborhood'].astype('category')
df['room'] = df['room'].astype('int')
df['living_room'] = df['living_room'].astype('int')
df['area'] = df['area'].astype('int')
df['age'] = df['age'].astype('int')
df['floor'] = df['floor'].astype('int')
df['price'] = df['price'].astype('int')

In [7]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6116 entries, 0 to 6115
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          6116 non-null   category
 1   district      6116 non-null   category
 2   neighborhood  6116 non-null   category
 3   room          6116 non-null   int64   
 4   living_room   6116 non-null   int64   
 5   area          6116 non-null   int64   
 6   age           6116 non-null   int64   
 7   floor         6116 non-null   int64   
 8   price         6116 non-null   int64   
dtypes: category(3), int64(6)
memory usage: 335.0 KB
None


In [8]:
categorical_features = ['city', 'district', 'neighborhood']
numerical_features = ['room', 'living_room', 'area', 'age', 'floor']

In [9]:
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [10]:
X = df.drop('price', axis=1)
y = df['price']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
model = Pipeline([
    ('preparation', full_pipeline),
    ('model', LinearRegression())
])

In [13]:
model.fit(X_train, y_train)

In [14]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [15]:
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2: {r2}")

MSE: 42478739.39277743
RMSE: 6517.57158708498
R^2: 0.5815668183098626


In [16]:
feature_importances = model.named_steps['model'].coef_
print(len(feature_importances))
print(feature_importances)

717
[ 9.23512028e+02  0.00000000e+00  3.28043638e+03 -2.21252456e+03
  1.72126051e+02 -3.21743111e+03  1.12812957e+03 -3.73826037e+03
  5.45123126e+03 -2.19322459e+03  2.56955524e+03 -8.67475006e+02
 -1.24932825e+03 -1.44216093e+03 -5.44137163e+03  5.01664867e+03
 -9.67870016e+03 -7.00681971e+03 -7.23749940e+03  1.71021625e+04
 -2.41487996e+03 -2.45402900e+03 -2.43967717e+03  3.06225708e+03
 -1.90022767e+03 -1.27672467e+03  1.74260587e+04 -4.97326249e+03
 -2.19726214e+03 -2.59308799e+03 -3.05844643e+03  6.24414258e+03
  1.17015430e+03 -9.49563368e+02 -1.57632440e+03 -2.92153528e+03
  9.27727830e+02  1.08679770e+04 -2.24202820e+03  9.12788740e+03
  3.14995315e+03  7.00639238e+02 -7.52875587e+03 -7.91428384e+02
 -7.32768895e+02  1.63788869e+03 -9.51462123e+03  1.44272501e+04
 -4.38236710e+03  3.74099903e+03 -1.03099984e+03  1.28220005e+04
  8.62032605e+03  8.33972885e+02 -4.25101411e+03  3.75979956e+03
 -1.94070644e+03  1.68537504e+03 -4.03674239e+03  7.39800766e+03
 -5.10269648e+03 -8.2

In [17]:
print("Numerical Features")
for i in range(len(numerical_features)):
    print(numerical_features[i], feature_importances[i])

Numerical Features
room 923.5120277600364
living_room 0.0
area 3280.436380029853
age -2212.5245565668606
floor 172.12605149771298


In [18]:
print("Categorical Features")
for i in range(len(categorical_features)):
    for j in range(len(model.named_steps['preparation'].transformers_[1][1].categories_[i])):
        print(model.named_steps['preparation'].transformers_[1][1].categories_[i][j], feature_importances[len(numerical_features) + j])

Categorical Features
afyonkarahisar -3217.4311052321004
aydin 1128.1295654462563
denizli -3738.260372250595
izmir 5451.231258080736
manisa -2193.224590627441
mugla 2569.5552442800613
acipayam -3217.4311052321004
akhisar 1128.1295654462563
alasehir -3738.260372250595
aliaga 5451.231258080736
balcova -2193.224590627441
bayindir 2569.5552442800613
bayrakli -867.4750063857821
bergama -1249.3282502996149
bodrum -1442.1609348797435
bornova -5441.371626208222
buca 5016.648672973431
buharkent -9678.700156150639
cameli -7006.81971188102
cardak -7237.499395085256
cay 17102.162499376052
cesme -2414.8799554942634
cigli -2454.0290007696076
cine -2439.6771672677805
civril 3062.2570787988125
dalaman -1900.227671991918
datca -1276.724669269891
demirci 17426.058668631922
didim -4973.262491535466
dikili -2197.262144186339
efeler -2593.087992264147
fethiye -3058.4464308782494
foca 6244.142577922938
gaziemir 1170.1542982467015
germencik -949.5633683342294
guzelbahce -1576.3243991818074
honaz -2921.5352750

In [19]:
new_data = pd.DataFrame({
    'city': ['manisa'],
    'district': ['yunusemre'],
    'neighborhood': ['guzelyurt'],
    'room': [4],
    'living_room': [1],
    'area': [200],
    'age': [5],
    'floor': [3]
})

print(model.predict(new_data))

[30087.56935418]


In [20]:
print(df[(df['city'] == 'manisa') & (df['district'] == 'yunusemre') & (df['neighborhood'] == 'guzelyurt')])

        city   district neighborhood  room  living_room  area  age  floor  \
2712  manisa  yunusemre    guzelyurt     1            1    65   13      5   
2759  manisa  yunusemre    guzelyurt     2            1    85    2      3   
2783  manisa  yunusemre    guzelyurt     4            1   196    5      1   
2800  manisa  yunusemre    guzelyurt     1            1    60   11      5   

      price  
2712  15000  
2759  15000  
2783  36000  
2800  11000  


In [21]:
def tolerance_r2(y_true, y_pred, tolerance):
    residuals = y_pred - y_true
    residuals[np.abs(residuals) <= tolerance] = 0
    ssr = np.sum(residuals**2)
    sst = np.sum((y_true - np.mean(y_true))**2)
    return 1 - (ssr / sst)

def tolerance_percentage_r2(y_true, y_pred, tolerance):
    residuals = y_pred - y_true
    residuals[(np.abs(residuals) / y_true) <= tolerance] = 0
    ssr = np.sum(residuals**2)
    sst = np.sum((y_true - np.mean(y_true))**2)
    return 1 - (ssr / sst)

In [22]:
print(r2_score(y_test, y_pred))
print(tolerance_r2(y_test, y_pred, 10000))
print(tolerance_percentage_r2(y_test, y_pred, 0.50))

0.5815668183098626
0.7230209235301088
0.821090914834199
