In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import pandas as pd
from EngineerFeature import FeatureEngineer

In [16]:
# Load the dataset
from sklearn.pipeline import Pipeline

data = pd.read_csv('../final_data.csv')

# Separate the target column ("popularity") from the feature columns
y = data["popularity"]
X = data.drop(columns=["popularity"])

# Hold out 20% of the rows for testing; the remaining 80% is used for training.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=24,
)

In [17]:
# Build the pipeline: feature engineering -> standard scaling -> linear regression.
# Chaining the steps in one Pipeline is intended to prevent data leakage, since
# every step is fitted only on the data handed to pipeline.fit().
pipeline_steps = [
    ('feature_eng', FeatureEngineer()),
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

# Fit every step on the training split
pipeline.fit(X_train, y_train)

print("Model trained successfully with Pipeline!")

# Push a single training row through the fitted feature-engineering step and
# read the column count of the result to report the engineered feature count.
fitted_feature_eng = pipeline.named_steps['feature_eng']
n_engineered_features = fitted_feature_eng.transform(X_train.iloc[:1]).shape[1]
print(f"Özellik sayısı (after feature engineering): {n_engineered_features}")


Model trained successfully with Pipeline!
Özellik sayısı (after feature engineering): 18


In [18]:
# Cross-validation for a more reliable performance estimate
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the training split only. cross_val_score fits
# clones of `pipeline`, so the already-fitted `pipeline` object is not modified.
cv_scores = cross_val_score(
    pipeline, X_train, y_train,
    cv=5, scoring='r2', n_jobs=-1
)

# R² on the held-out test split. It is computed in this cell so the report below
# does not rely on a `test_r2` variable left over in the kernel from a cell that
# is not part of the notebook (which raises NameError on Restart & Run All).
# Pipeline.score delegates to the final regressor's score(), i.e. R².
test_r2 = pipeline.score(X_test, y_test)

print("=" * 60)
print("CROSS-VALIDATION RESULTS")
print("=" * 60)
print(f"CV R² Scores: {cv_scores}")
# Spread is reported as two standard deviations of the fold scores
print(f"Mean CV R² Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
print(f"Test R² Score: {test_r2:.4f}")

CROSS-VALIDATION RESULTS
CV R² Scores: [0.24147741 0.21889811 0.19853532 0.22441807 0.20871684]
Mean CV R² Score: 0.2184 (+/- 0.0291)
Test R² Score: 0.2518
