In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# veride okunamayan karakterleri okutmak için encoding
df = pd.read_csv(r"C:\Users\ASUS\Desktop\Bitirme\4-exploratory_data_analysis\df_next2_winsorized.csv") 

df.head(4)

Unnamed: 0,price,m2,age,floor,total_room,district_encoded,neighbor_encoded
0,2229999,110,0,1.0,3,4970296.0,6487466.0
1,2650000,90,0,4.0,2,2769582.0,2487815.0
2,3449999,130,0,2.0,3,4970296.0,6057293.0
3,5475000,125,21,4.0,3,5848908.0,7428125.0


**Bölgesine göre m² fiyatı anormal yüksek**

In [8]:
import pandas as pd
from scipy.stats.mstats import winsorize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# 1. CSV dosyasını oku
df = pd.read_csv(r"C:\Users\ASUS\Desktop\Bitirme\4-exploratory_data_analysis\df_next2_winsorized.csv")

# 2. district bazlı ortalama m2 ve fiyat hesapla
district_stats = df.groupby("district_encoded")[["m2", "price"]].mean().rename(
    columns={"m2": "mean_m2", "price": "mean_price"}
)

# 3. Ortalamaları ana veriyle birleştir
df_rel = df.merge(district_stats, on="district_encoded", how="left")

# 4. m2 başına fiyat ve ortalama fiyat oranlarını hesapla
df_rel["price_per_m2"] = df_rel["price"] / df_rel["m2"]
df_rel["mean_price_per_m2"] = df_rel["mean_price"] / df_rel["mean_m2"]

# 5. Bağlamsal outlier tespiti (m2 başına fiyatı ortalamanın %50 üzerindeyse çıkar)
df_rel["strong_context_outlier"] = df_rel["price_per_m2"] > (df_rel["mean_price_per_m2"] * 1.5)

# 6. Outlier'ları çıkar ve modellemeye hazırla
df_cleaned = df_rel[~df_rel["strong_context_outlier"]].drop(
    columns=["mean_m2", "mean_price", "price_per_m2", "mean_price_per_m2", "strong_context_outlier"]
)

# 7. Modelleme: Random Forest ile R² hesapla
X = df_cleaned.drop(columns="price")
y = df_cleaned["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
print("Final R² Skoru (district bazlı bağlamsal outlier temizliği sonrası):", r2)


Final R² Skoru (district bazlı bağlamsal outlier temizliği sonrası): 0.8876642980982441


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# 1. CSV dosyasını oku
df = pd.read_csv(r"C:\Users\ASUS\Desktop\Bitirme\4-exploratory_data_analysis\df_next2_winsorized.csv")

# 2. neighbor bazlı ortalama m2 ve fiyat hesapla
neighbor_stats = df.groupby("neighbor_encoded")[["m2", "price"]].mean().rename(
    columns={"m2": "mean_m2", "price": "mean_price"}
)

# 3. Ortalamaları ana veriyle birleştir
df_rel = df.merge(neighbor_stats, on="neighbor_encoded", how="left")

# 4. m2 başına fiyat ve ortalama fiyat oranlarını hesapla
df_rel["price_per_m2"] = df_rel["price"] / df_rel["m2"]
df_rel["mean_price_per_m2"] = df_rel["mean_price"] / df_rel["mean_m2"]

# 5. Bağlamsal outlier tespiti (price/m2 oranı çok yüksek olanlar)
df_rel["strong_context_outlier"] = df_rel["price_per_m2"] > (df_rel["mean_price_per_m2"] * 1.5)

# 6. Outlier'ları çıkar
df_cleaned = df_rel[~df_rel["strong_context_outlier"]].drop(
    columns=["mean_m2", "mean_price", "price_per_m2", "mean_price_per_m2", "strong_context_outlier"]
)

# 7. Modelleme
X = df_cleaned.drop(columns="price")
y = df_cleaned["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# 8. Sonuç
r2 = r2_score(y_test, y_pred)
print("R² Skoru (neighbor bazlı bağlamsal outlier temizliği):", r2)


R² Skoru (neighbor bazlı bağlamsal outlier temizliği): 0.8952434647517871


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# 1. Veriyi oku
df = pd.read_csv(r"C:\Users\ASUS\Desktop\Bitirme\4-exploratory_data_analysis\df_next2_winsorized.csv")

# 2. district bazlı ortalama m2 ve fiyat
district_stats = df.groupby("district_encoded")[["m2", "price"]].mean().rename(
    columns={"m2": "mean_m2_d", "price": "mean_price_d"}
)
df = df.merge(district_stats, on="district_encoded", how="left")
df["price_per_m2"] = df["price"] / df["m2"]
df["mean_ppm_d"] = df["mean_price_d"] / df["mean_m2_d"]
df["outlier_district"] = df["price_per_m2"] > (df["mean_ppm_d"] * 1.5)

# 3. neighbor bazlı ortalama m2 ve fiyat
neighbor_stats = df.groupby("neighbor_encoded")[["m2", "price"]].mean().rename(
    columns={"m2": "mean_m2_n", "price": "mean_price_n"}
)
df = df.merge(neighbor_stats, on="neighbor_encoded", how="left")
df["mean_ppm_n"] = df["mean_price_n"] / df["mean_m2_n"]
df["outlier_neighbor"] = df["price_per_m2"] > (df["mean_ppm_n"] * 1.5)

# 4. En az bir bağlamda outlier olanları çıkar
df_filtered = df[~(df["outlier_district"] | df["outlier_neighbor"])].copy()

# 5. Gereksiz sütunları çıkar
df_filtered = df_filtered.drop(columns=[
    "mean_m2_d", "mean_price_d", "mean_ppm_d",
    "mean_m2_n", "mean_price_n", "mean_ppm_n",
    "price_per_m2", "outlier_district", "outlier_neighbor"
])

# 6. Modelleme
X = df_filtered.drop(columns="price")
y = df_filtered["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)

print("✅ R² Skoru (district + neighbor bağlamsal temizliği):", r2)


✅ R² Skoru (district + neighbor bağlamsal temizliği): 0.9065556791535067


**District ve neighbor içi fiyat IQR temelli outlier temizliği**

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# CSV'den oku
df = pd.read_csv(r"C:\Users\ASUS\Desktop\Bitirme\4-exploratory_data_analysis\df_next2_winsorized.csv")

# Denenecek IQR k değerleri
iqr_ks = [1.0, 1.5, 2.0]
results = []

for k in iqr_ks:
    df_filtered = df.copy()
    df_filtered["outlier_district_iqr"] = False
    df_filtered["outlier_neighbor_iqr"] = False

    # District bazında IQR
    for district, group in df.groupby("district_encoded"):
        Q1 = group["price"].quantile(0.25)
        Q3 = group["price"].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - k * IQR
        upper = Q3 + k * IQR
        mask = (group["price"] < lower) | (group["price"] > upper)
        df_filtered.loc[group.index, "outlier_district_iqr"] = mask

    # Neighbor bazında IQR
    for neighbor, group in df.groupby("neighbor_encoded"):
        Q1 = group["price"].quantile(0.25)
        Q3 = group["price"].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - k * IQR
        upper = Q3 + k * IQR
        mask = (group["price"] < lower) | (group["price"] > upper)
        df_filtered.loc[group.index, "outlier_neighbor_iqr"] = mask

    # Tüm outlier'ları çıkar
    df_cleaned = df_filtered[~(df_filtered["outlier_district_iqr"] | df_filtered["outlier_neighbor_iqr"])].copy()
    df_cleaned = df_cleaned.drop(columns=["outlier_district_iqr", "outlier_neighbor_iqr"])

    # Modelleme
    X = df_cleaned.drop(columns="price")
    y = df_cleaned["price"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)

    results.append((k, df_cleaned.shape[0], round(r2, 4)))

# Tablo olarak yazdır
print("| IQR Katsayısı (k) | Kalan Kayıt Sayısı | 🎯 R² Skoru  |")
print("| ----------------- | ------------------ | ------------ |")
for k, n, r2 in results:
    star = "✅" if r2 == max(x[2] for x in results) else ""
    print(f"| **{k:.1f}**           | {n:<18} | {r2:<10} {star}|")


| IQR Katsayısı (k) | Kalan Kayıt Sayısı | 🎯 R² Skoru  |
| ----------------- | ------------------ | ------------ |
| **1.0**           | 14758              | 0.8878     |
| **1.5**           | 15518              | 0.8897     |
| **2.0**           | 15906              | 0.8934     ✅|


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# 1. Veriyi oku
df = pd.read_csv(r"C:\Users\ASUS\Desktop\Bitirme\4-exploratory_data_analysis\df_next2_winsorized.csv")

# 2. Bağlamsal price_per_m2 outlier'ları tespit et (district)
district_stats = df.groupby("district_encoded")[["m2", "price"]].mean().rename(
    columns={"m2": "mean_m2_d", "price": "mean_price_d"}
)
df = df.merge(district_stats, on="district_encoded", how="left")
df["price_per_m2"] = df["price"] / df["m2"]
df["mean_ppm_d"] = df["mean_price_d"] / df["mean_m2_d"]
df["outlier_district_ppm"] = df["price_per_m2"] > (df["mean_ppm_d"] * 1.5)

# 3. Bağlamsal price_per_m2 outlier'ları tespit et (neighbor)
neighbor_stats = df.groupby("neighbor_encoded")[["m2", "price"]].mean().rename(
    columns={"m2": "mean_m2_n", "price": "mean_price_n"}
)
df = df.merge(neighbor_stats, on="neighbor_encoded", how="left")
df["mean_ppm_n"] = df["mean_price_n"] / df["mean_m2_n"]
df["outlier_neighbor_ppm"] = df["price_per_m2"] > (df["mean_ppm_n"] * 1.5)

# 4. District IQR outlier temizliği (k=2.0)
df["outlier_district_iqr"] = False
for d, group in df.groupby("district_encoded"):
    Q1 = group["price"].quantile(0.25)
    Q3 = group["price"].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 2.0 * IQR
    upper = Q3 + 2.0 * IQR
    mask = (group["price"] < lower) | (group["price"] > upper)
    df.loc[group.index, "outlier_district_iqr"] = mask

# 5. Neighbor IQR outlier temizliği (k=2.0)
df["outlier_neighbor_iqr"] = False
for n, group in df.groupby("neighbor_encoded"):
    Q1 = group["price"].quantile(0.25)
    Q3 = group["price"].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 2.0 * IQR
    upper = Q3 + 2.0 * IQR
    mask = (group["price"] < lower) | (group["price"] > upper)
    df.loc[group.index, "outlier_neighbor_iqr"] = mask

# 6. Tüm outlier'ları çıkar
df_cleaned = df[
    ~(df["outlier_district_ppm"] | df["outlier_neighbor_ppm"] |
      df["outlier_district_iqr"] | df["outlier_neighbor_iqr"])
].copy()

# 7. Gereksiz sütunları sil
df_cleaned = df_cleaned.drop(columns=[
    "mean_m2_d", "mean_price_d", "mean_m2_n", "mean_price_n",
    "mean_ppm_d", "mean_ppm_n", "price_per_m2",
    "outlier_district_ppm", "outlier_neighbor_ppm",
    "outlier_district_iqr", "outlier_neighbor_iqr"
])

# 8. Modelleme
X = df_cleaned.drop(columns="price")
y = df_cleaned["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)

print("🎯 Final R² Skoru (bağlamsal + IQR temizliği):", r2)


🎯 Final R² Skoru (bağlamsal + IQR temizliği): 0.9134391843942011


bir oda başına en fazla 80 m2 olacak şekilde olan durumu inceleyelim

In [22]:
# Denenecek m2/oda eşikleri
thresholds = [80, 100, 120]

# Sonuçları tut
results = []

for threshold in thresholds:
    df["m2_per_room"] = df["m2"] / df["total_room"]
    df_filtered = df[df["m2_per_room"] <= threshold].copy()

    X = df_filtered.drop(columns=["price", "m2_per_room"])
    y = df_filtered["price"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    results.append((threshold, df_filtered.shape[0], round(r2, 4)))

# Sonuçları yazdır
print("| m² / Oda Sınırı | Kalan Kayıt | R² Skoru |")
print("|------------------|--------------|-----------|")
for threshold, count, r2 in results:
    print(f"| {threshold:<16} | {count:<12} | {r2:<9} |")

| m² / Oda Sınırı | Kalan Kayıt | R² Skoru |
|------------------|--------------|-----------|
| 80               | 16753        | 0.8608    |
| 100              | 16763        | 0.8585    |
| 120              | 16763        | 0.8585    |


Olumsuz etkiledi, ekstra büyük odalı evler gerekli bir outlier

**Zemin/Bodrum Katta Aşırı Pahalı Evler (floor ≤ 0 & price üst %5):**

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Veri setini oku
df = pd.read_csv(r"C:\Users\ASUS\Desktop\Bitirme\4-exploratory_data_analysis\df_next2_winsorized.csv")

# 1. Zemin ve bodrum katları seç
ground_below = df[df["floor"] < 0]

# 2. Bu grubun %95'lik fiyat eşiğini al
price_threshold = ground_below["price"].quantile(0.95)

# 3. Outlier olarak işaretle: floor <= 0 ve price > eşik
df["ground_outlier"] = (df["floor"] <= 0) & (df["price"] > price_threshold)

# 4. Kaç tane böyle outlier var?
num_outliers = df["ground_outlier"].sum()

# 5. Bu outlier'ları çıkar
df_cleaned = df[~df["ground_outlier"]].drop(columns=["ground_outlier"])

# 6. Modelleme (Random Forest ile)
X = df_cleaned.drop(columns="price")
y = df_cleaned["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# 7. R² skorunu hesapla
r2 = r2_score(y_test, y_pred)

# 8. Sonucu yazdır
print("Tespit edilen outlier sayısı:", num_outliers)
print("Kalan kayıt sayısı:", df_cleaned.shape[0])
print("R² Skoru:", round(r2, 4))


Tespit edilen outlier sayısı: 424
Kalan kayıt sayısı: 16340
R² Skoru: 0.8566


**FİNAL**

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# 1. Veriyi oku
df = pd.read_csv(r"C:\Users\ASUS\Desktop\Bitirme\4-exploratory_data_analysis\df_next2_winsorized.csv")

original_len = len(df)  # Başlangıçtaki veri sayısı

# 2. District bağlamsal outlier
district_stats = df.groupby("district_encoded")[["m2", "price"]].mean().rename(
    columns={"m2": "mean_m2_d", "price": "mean_price_d"}
)
df = df.merge(district_stats, on="district_encoded", how="left")
df["price_per_m2"] = df["price"] / df["m2"]
df["mean_ppm_d"] = df["mean_price_d"] / df["mean_m2_d"]
df["outlier_district_ppm"] = df["price_per_m2"] > (df["mean_ppm_d"] * 1.5)

# 3. Neighbor bağlamsal outlier
neighbor_stats = df.groupby("neighbor_encoded")[["m2", "price"]].mean().rename(
    columns={"m2": "mean_m2_n", "price": "mean_price_n"}
)
df = df.merge(neighbor_stats, on="neighbor_encoded", how="left")
df["mean_ppm_n"] = df["mean_price_n"] / df["mean_m2_n"]
df["outlier_neighbor_ppm"] = df["price_per_m2"] > (df["mean_ppm_n"] * 1.5)

# 4. District IQR (k=2.0)
df["outlier_district_iqr"] = False
for d, group in df.groupby("district_encoded"):
    Q1 = group["price"].quantile(0.25)
    Q3 = group["price"].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 2.0 * IQR
    upper = Q3 + 2.0 * IQR
    mask = (group["price"] < lower) | (group["price"] > upper)
    df.loc[group.index, "outlier_district_iqr"] = mask

# 5. Neighbor IQR (k=2.0)
df["outlier_neighbor_iqr"] = False
for n, group in df.groupby("neighbor_encoded"):
    Q1 = group["price"].quantile(0.25)
    Q3 = group["price"].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 2.0 * IQR
    upper = Q3 + 2.0 * IQR
    mask = (group["price"] < lower) | (group["price"] > upper)
    df.loc[group.index, "outlier_neighbor_iqr"] = mask

# 6. Outlier'ları çıkar
df_cleaned = df[
    ~(df["outlier_district_ppm"] | df["outlier_neighbor_ppm"] |
      df["outlier_district_iqr"] | df["outlier_neighbor_iqr"])
].copy()

removed_count = original_len - len(df_cleaned)

# 7. Gereksiz sütunları temizle
df_cleaned = df_cleaned.drop(columns=[
    "mean_m2_d", "mean_price_d", "mean_m2_n", "mean_price_n",
    "mean_ppm_d", "mean_ppm_n", "price_per_m2",
    "outlier_district_ppm", "outlier_neighbor_ppm",
    "outlier_district_iqr", "outlier_neighbor_iqr"
])

# 8. Modelleme
X = df_cleaned.drop(columns="price")
y = df_cleaned["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)

# 9. Çıktı
print("🎯 Final R² Skoru (bağlamsal + IQR temizliği):", round(r2, 4))
print("🧹 Çıkarılan kayıt sayısı:", removed_count)
print("📊 Kalan kayıt sayısı:", len(df_cleaned))

df_cleaned.to_csv("df_cleaned.csv")
print("df_cleaned.csv başarıyla kaydedildi!")

🎯 Final R² Skoru (bağlamsal + IQR temizliği): 0.9134
🧹 Çıkarılan kayıt sayısı: 2722
📊 Kalan kayıt sayısı: 14042
df_cleaned.csv başarıyla kaydedildi!
