# Araba Fiyat Tahmini Projesi

Bu not defteri proje ödevinin gereksinimlerini kapsar:
- Veri ön işleme
- Veri görselleştirme (EDA)
- İstatistiksel analizler
- Kümeleme ve regresyon modelleri
- Eğitim/test değerlendirme
- (Gerekirse) hiperparametre araması
- Sonuçların tablo ve görsellerle analizi



In [1]:
# Kurulum ve veri yükleme
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.metrics import mean_absolute_error, r2_score, silhouette_score
from sklearn.ensemble import RandomForestRegressor

# Dataset path and loading.
# The path is relative ('data/cars.csv'), so it is resolved against the
# current working directory of the running kernel.
CSV_PATH = os.path.join('data', 'cars.csv')
df = pd.read_csv(CSV_PATH)
# Quick sanity check: (row count, column count), followed by the first rows.
print(df.shape)
df.head()


ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Preprocessing: basic cleaning and feature selection.
# Columns used: brand, series, model, year, km, price (+ optional: adId)

# Filters for invalid/extreme values.
# All conditions are combined into a single boolean mask and applied once;
# the result is then copied so that the column assignments further down write
# to an independent frame rather than to a slice of `df` (this avoids pandas'
# SettingWithCopyWarning). Rows whose year, km or price is missing fail the
# comparisons below and are therefore dropped as well.
valid_rows = (
    df['price'].notnull()
    & df['year'].between(1980, 2025)
    & df['km'].between(0, 1_000_000)
    & (df['price'] > 0)
    & (df['price'] <= 20_000_000)
)
clean = df.loc[valid_rows].copy()

# Replace missing text values with 'Unknown' (for the analysis and the model
# pipeline). Missing values are filled *before* the cast to str so that the
# result does not depend on how a missing value happens to be rendered as
# text ('nan', 'None', '<NA>').
for c in ['brand', 'series', 'model']:
    clean[c] = clean[c].fillna('Unknown').astype(str)

print(clean.shape)
clean[['brand', 'series', 'model', 'year', 'km', 'price']].head()


In [None]:
# EDA: distributions and relationships.
# One histogram (with a KDE overlay) per numeric column, side by side.
hist_panels = [
    ('year', 'Yıl dağılımı'),
    ('km', 'KM dağılımı'),
    ('price', 'Fiyat dağılımı'),
]
plt.figure(figsize=(12, 4))
for position, (column, title) in enumerate(hist_panels, start=1):
    plt.subplot(1, 3, position)
    sns.histplot(clean[column], bins=30, kde=True)
    plt.title(title)
plt.tight_layout()
plt.show()

# Price against year and against km: scatter plot with a fitted regression line.
plt.figure(figsize=(12, 5))
for position, predictor in enumerate(('year', 'km'), start=1):
    plt.subplot(1, 2, position)
    sns.regplot(data=clean, x=predictor, y='price', scatter_kws={'alpha': 0.2})
plt.tight_layout()
plt.show()

# Per-brand summary: median price, the 20 highest-priced brands first.
brand_summary = (
    clean.groupby('brand', as_index=False)['price']
    .median()
    .sort_values('price', ascending=False)
    .head(20)
)
brand_summary


In [None]:
# Statistical analysis: correlations and a group comparison test.
# Pairwise correlation between the numeric variables.
num_cols = ['year', 'km', 'price']
corr = clean[num_cols].corr(numeric_only=True)
print(corr)

# Simple example: do recent cars (year >= 2018) and older cars differ in
# price? Compared with a two-sided Mann-Whitney U test.
model_year = clean['year']
recent = clean.loc[model_year >= 2018, 'price']
old = clean.loc[model_year < 2018, 'price']
test_result = stats.mannwhitneyu(recent, old, alternative='two-sided')
stat, p = test_result.statistic, test_result.pvalue
print({'mannwhitneyu_stat': float(stat), 'p_value': float(p)})


In [None]:
# Clustering: KMeans segmentation in the numeric feature space.
from sklearn.preprocessing import MinMaxScaler

# Work on complete rows of the three numeric columns, rescaled with
# MinMaxScaler so that no single column dominates the distance computation.
clust_df = clean[['year', 'km', 'price']].dropna().copy()
scaler = MinMaxScaler()
Xc = scaler.fit_transform(clust_df)

# k=3 as an example; report the silhouette score of the resulting partition.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
labels = kmeans.fit_predict(Xc)
score = silhouette_score(Xc, labels)
print({'silhouette_score_k3': float(score)})

# Per-cluster profile: median of each feature plus the cluster size.
clust_df = clust_df.assign(cluster=labels)
cluster_aggregations = {
    'year': 'median',
    'km': 'median',
    'price': 'median',
    'cluster': 'count',
}
clust_df.groupby('cluster').agg(cluster_aggregations).rename(columns={'cluster': 'count'})


In [None]:
# Regression: train/test split and baseline model.
cat_features = ['brand', 'series', 'model']
num_features = ['year', 'km']
features = cat_features + num_features

X = clean[features]
y = clean['price']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into a sparse matrix; categories not seen during fit are ignored.
cat_tf = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True)),
])
# Numeric columns: median imputation, then scaling without mean-centering.
num_tf = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_mean=False)),
])

pre = ColumnTransformer([
    ('cat', cat_tf, cat_features),
    ('num', num_tf, num_features),
])

# Baseline: preprocessing followed by a random forest with fixed settings.
base_model = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
pipe = Pipeline([('pre', pre), ('model', base_model)])
pipe.fit(X_train, y_train)

# Score the baseline on the held-out test split.
pred = pipe.predict(X_test)
mae, r2 = mean_absolute_error(y_test, pred), r2_score(y_test, pred)
{'MAE': float(mae), 'R2': float(r2)}


In [None]:
# Hyperparameter search: RandomizedSearchCV over the random-forest pipeline.
from sklearn.model_selection import RandomizedSearchCV

# Candidate values per hyperparameter. The 'model__' prefix addresses the
# step named 'model' inside the pipeline.
param_distributions = dict(
    model__n_estimators=[200, 300, 400, 600, 800],
    model__max_depth=[None, 10, 15, 20, 30],
    model__min_samples_split=[2, 5, 10],
    model__min_samples_leaf=[1, 2, 4],
)

# 20 sampled configurations, each scored with 3-fold CV on negative MAE.
search = RandomizedSearchCV(
    pipe,
    param_distributions,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    random_state=42,
    cv=3,
    verbose=1,
)
search.fit(X_train, y_train)

# Evaluate the best configuration on the same test split as the baseline
# and show both sets of metrics together.
tuned_pipe = search.best_estimator_
y_pred_tuned = tuned_pipe.predict(X_test)
mae_tuned, r2_tuned = (
    mean_absolute_error(y_test, y_pred_tuned),
    r2_score(y_test, y_pred_tuned),
)
{
    'best_params': search.best_params_,
    'base_mae': float(mae),
    'base_r2': float(r2),
    'tuned_mae': float(mae_tuned),
    'tuned_r2': float(r2_tuned),
}


In [None]:
# Evaluation visuals: actual vs. predicted values, and the error distribution.
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
scatter_ax, error_ax = ax

# Left panel: predictions of both models against the true prices; the dashed
# red diagonal marks where a perfect prediction would fall.
for label, predictions in (('Base', pred), ('Tuned', y_pred_tuned)):
    scatter_ax.scatter(y_test, predictions, alpha=0.2, label=label)
price_range = [y_test.min(), y_test.max()]
scatter_ax.plot(price_range, price_range, 'r--')
scatter_ax.set_xlabel('Gerçek')
scatter_ax.set_ylabel('Tahmin')
scatter_ax.legend()
scatter_ax.set_title('Gerçek vs Tahmin')

# Right panel: signed errors (prediction minus actual) of both models,
# overlaid as density histograms.
base_errors = pred - y_test
best_errors = y_pred_tuned - y_test
shared_hist_args = dict(bins=40, kde=True, ax=error_ax, stat='density')
sns.histplot(base_errors, color='C0', label='Base', **shared_hist_args)
sns.histplot(best_errors, color='C1', label='Tuned', alpha=0.6, **shared_hist_args)
error_ax.set_title('Hata dağılımı')
error_ax.legend()
plt.tight_layout()
plt.show()
