In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np
from EngineerFeature import FeatureEngineer

In [2]:
# Veri setini yükle
from sklearn.discriminant_analysis import StandardScaler
scaler = StandardScaler()

data = pd.read_csv('../final_data.csv')
#data = data.drop(columns=['artist'])
# Özellikler ve hedef değişkeni ayır
X = data.drop("popularity", axis=1)
y = data["popularity"]

# Eğitim ve test setlerine ayır (%80 eğitim, %20 test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

# Feature Engineering uygula
fe = FeatureEngineer()
X_train = fe.fit_transform(X_train, y_train)
X_test = fe.transform(X_test)
#X_train.drop('artist_song_count', axis=1, inplace=True)
#X_test.drop('artist_song_count', axis=1, inplace=True)
print(X_train.columns)
# Scaling uygula
cols = X_train.columns#.difference([''])#, 'artist_song_count'])
X_train = pd.DataFrame(
    scaler.fit_transform(X_train[cols]),
    columns=cols,#X_train.columns,
    index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test[cols]),
    columns=cols,#X_test.columns,
    index=X_test.index
)

print(f"Eğitim seti boyutu: {X_train.shape}")
print(f"Test seti boyutu: {X_test.shape}")
print(f"Özellik sayısı: {X_train.shape[1]}")

Index(['danceability', 'energy', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms', 'playlist_count', 'album_age_log',
       'playlist_count_final', 'artist_avg_popularity',
       'artist_high_pop_ratio', 'artist_song_count_bin',
       'avg_subgenre_popularity'],
      dtype='object')
Eğitim seti boyutu: (22684, 18)
Test seti boyutu: (5672, 18)
Özellik sayısı: 18


In [3]:
# Train Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

print("Model trained successfully!")

Model trained successfully!


In [4]:
# Make predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Evaluate the model
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_test_pred)

print("=" * 50)
print("LINEAR REGRESSION MODEL PERFORMANCE")
print("=" * 50)
print(f"Training R² Score: {train_r2:.4f}")
print(f"Test R² Score: {test_r2:.4f}")
print(f"Test MSE: {test_mse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")
print(f"Test MAE: {test_mae:.4f}")
print("=" * 50)

# Check overfitting
overfitting_gap = train_r2 - test_r2
if overfitting_gap > 0.05:
    print(f"⚠️ WARNING: Overfitting detected!")
    print(f"   Train-Test gap: {overfitting_gap:.4f}")
    print(f"   Consider using regularization (Ridge/Lasso) or Pipeline with CV")
else:
    print(f"✓ Good: Train-Test gap is acceptable ({overfitting_gap:.4f})")

LINEAR REGRESSION MODEL PERFORMANCE
Training R² Score: 0.5938
Test R² Score: 0.2518
Test MSE: 416.3371
Test RMSE: 20.4043
Test MAE: 15.6057
   Train-Test gap: 0.3420
   Consider using regularization (Ridge/Lasso) or Pipeline with CV


## Cross-Validation ile Daha Güvenilir Değerlendirme

In [5]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline

# Pipeline ile data leakage'ı önle
pipeline = Pipeline([
    ('feature_eng', FeatureEngineer()),
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

# Yeni train-test split (pipeline için raw data kullan)
X_raw = data.drop("popularity", axis=1)
y_raw = data["popularity"]
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X_raw, y_raw, test_size=0.2, random_state=24
)

# 5-fold cross-validation
cv_scores = cross_val_score(
    pipeline, X_train_raw, y_train_raw, 
    cv=5, scoring='r2', n_jobs=-1
)

# Pipeline'ı eğit
pipeline.fit(X_train_raw, y_train_raw)

# Test seti performansı
y_test_pred_cv = pipeline.predict(X_test_raw)
test_r2_cv = r2_score(y_test_raw, y_test_pred_cv)
test_mae_cv = mean_absolute_error(y_test_raw, y_test_pred_cv)
test_rmse_cv = np.sqrt(mean_squared_error(y_test_raw, y_test_pred_cv))

print("=" * 60)
print("LINEAR REGRESSION WITH PIPELINE (Data Leakage Prevented)")
print("=" * 60)
print(f"CV R² Scores: {cv_scores}")
print(f"Mean CV R² Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
print(f"Test R² Score: {test_r2_cv:.4f}")
print(f"Test MAE: {test_mae_cv:.4f}")
print(f"Test RMSE: {test_rmse_cv:.4f}")
print("=" * 60)

cv_test_gap = abs(cv_scores.mean() - test_r2_cv)
if cv_test_gap < 0.05:
    print(f"✓ Excellent: CV-Test gap is small ({cv_test_gap:.4f})")
    print("  No significant overfitting detected!")
else:
    print(f"⚠️ CV-Test gap: {cv_test_gap:.4f}")
    print("  Consider using Ridge/Lasso regularization")

LINEAR REGRESSION WITH PIPELINE (Data Leakage Prevented)
CV R² Scores: [0.24147741 0.21889811 0.19853532 0.22441807 0.20871684]
Mean CV R² Score: 0.2184 (+/- 0.0291)
Test R² Score: 0.2518
Test MAE: 15.6057
Test RMSE: 20.4043
✓ Excellent: CV-Test gap is small (0.0334)
  No significant overfitting detected!
