In [1]:
import numpy as np
import pandas as pd

# Build a reproducible synthetic housing dataset (fixed seed, 1200 rows).
rng = np.random.default_rng(42)
n = 1200

# Feature draws. Every call advances the same generator, so the order of
# these statements determines the resulting dataset -- do not reorder them.
size_m2 = np.clip(rng.normal(110, 35, size=n), 30, 300)
rooms = rng.integers(1, 6, size=n)          # upper bound is exclusive: 1..5
building_age = rng.integers(0, 41, size=n)  # 0..40
floor = rng.integers(0, 15, size=n)         # 0..14
distance_center_km = np.clip(rng.normal(7, 4, size=n), 0.5, 30)

# Gaussian noise, drawn last (after all feature draws).
noise = rng.normal(0, 250000, size=n)

# Ground-truth pricing formula (the "hidden truth" behind the data).
price = (
    size_m2 * 18000
    + rooms * 120000
    - building_age * 15000
    - distance_center_km * 35000
    + floor * 20000
    + noise
)

# Enforce a price floor of 300000 (no upper bound).
price = np.maximum(price, 300000)

# Assemble the table; continuous columns are rounded for storage only --
# the price above was computed from the unrounded feature values.
columns = {
    "size_m2": np.round(size_m2, 1),
    "rooms": rooms,
    "building_age": building_age,
    "floor": floor,
    "distance_center_km": np.round(distance_center_km, 2),
    "price": np.round(price, 0),
}
df = pd.DataFrame(columns)

# Persist the dataset without the index column, then preview the first rows.
df.to_csv("house_prices_regression_1200.csv", index=False)
df.head()


Unnamed: 0,size_m2,rooms,building_age,floor,distance_center_km,price
0,120.7,3,24,0,15.13,1619045.0
1,73.6,2,0,8,6.87,1610269.0
2,136.3,1,26,2,11.11,2099236.0
3,142.9,3,17,9,6.79,2771782.0
4,41.7,3,15,6,6.88,783566.0


### Keşif

In [2]:
# 1200 rows and 6 columns: 5 feature columns (X) and 1 target column (y).
df.shape

(1200, 6)

In [4]:
# No column contains null values, so no imputation is needed before training.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   size_m2             1200 non-null   float64
 1   rooms               1200 non-null   int64  
 2   building_age        1200 non-null   int64  
 3   floor               1200 non-null   int64  
 4   distance_center_km  1200 non-null   float64
 5   price               1200 non-null   float64
dtypes: float64(3), int64(3)
memory usage: 56.4 KB


In [5]:
# size_m2 has a large standard deviation (~34 around a mean of ~109), i.e. house sizes are widely spread.
# distance_center_km spans 0.5 to ~20.8: some houses are close to the center, others far away.
df.describe()

Unnamed: 0,size_m2,rooms,building_age,floor,distance_center_km,price
count,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0
mean,109.353917,3.015833,19.501667,7.008333,7.097425,1929585.0
std,34.146054,1.417953,11.94748,4.239683,3.789276,720260.2
min,30.0,1.0,0.0,0.0,0.5,300000.0
25%,86.15,2.0,9.75,3.0,4.32,1455771.0
50%,110.1,3.0,19.0,7.0,6.925,1900428.0
75%,131.525,4.0,30.0,11.0,9.5625,2409326.0
max,221.3,5.0,40.0,14.0,20.82,4405199.0


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [10]:
# Predictor columns, in the order they are fed to the model.
feature_columns = [
    "size_m2",
    "rooms",
    "building_age",
    "floor",
    "distance_center_km",
]
X = df[feature_columns]

# Regression target.
y = df["price"]

In [11]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Fit a linear regression on the training split. fit() returns the
# estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

# Keep the fitted estimator as the cell's displayed value.
model

In [17]:
# The model has already been fitted; this step only predicts prices for the held-out test set.
prediction = model.predict(X_test)

In [18]:
# Mean absolute error of the model's predictions on the test set.
mae = mean_absolute_error(y_test, prediction)
# The printed MAE for this run is roughly 181,000 (not 50k): the average absolute miss per house.
print(mae)

180992.59419912993


In [22]:
# Mean price over the whole dataset -- presumably shown to put the error metrics into context.
df["price"].mean()

1929585.2775

In [23]:
# Median price over the whole dataset -- presumably shown alongside the mean as a scale reference for the errors.
df["price"].median()

1900427.5

In [20]:
# Root mean squared error on the test set: square root of the MSE, so it is
# expressed in the same units as price and penalises large misses more than MAE.
mse = mean_squared_error(y_test, prediction)
rmse = np.sqrt(mse)
print(rmse)

232507.10909174863


In [21]:
# Naive baseline: predict the training-set mean price for every test row.
train_mean = y_train.mean()
baseline_pred = [train_mean for _ in range(len(y_test))]

# MAE of the constant predictor, for comparison with the model's MAE.
baseline_mae = mean_absolute_error(y_test, baseline_pred)
print(baseline_mae)

587456.52515625
