In [31]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
import pandas as pd
from EngineerFeature import FeatureEngineer

In [32]:
data = pd.read_csv('../final_data.csv')
X = data.drop('popularity', axis=1)
y = data['popularity']

In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.2, random_state=24)

In [34]:
# Create a pipeline to prevent data leakage
pipeline = Pipeline([
    ('feature_eng', FeatureEngineer()),
    ('scaler', StandardScaler()),
    ('ridge', Ridge())
])

# Update parameter grid for pipeline with stronger regularization
param_grid_pipeline = {
    'ridge__alpha': [8500.0, 10000.0, 12500.0, 15000.0],  # Daha yüksek alpha değerleri (daha güçlü regularization)
    'ridge__solver': ['auto', 'lsqr', 'sag'],
    'ridge__tol': [1e-3, 1e-2],
    'ridge__max_iter': [5000, 10000]
}


In [35]:
# GridSearchCV with pipeline
ridge_cv = GridSearchCV(
    pipeline,
    param_grid=param_grid_pipeline,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

ridge_cv.fit(X_train1, y_train1)

print("\n\nBest parameters:", ridge_cv.best_params_)
print(f"Best CV R2 score: {ridge_cv.best_score_:.4f}")

Fitting 5 folds for each of 48 candidates, totalling 240 fits


Best parameters: {'ridge__alpha': 10000.0, 'ridge__max_iter': 10000, 'ridge__solver': 'sag', 'ridge__tol': 0.01}
Best CV R2 score: 0.2671


In [36]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Train set predictions
y_train_pred = ridge_cv.predict(X_train1)
train_r2 = r2_score(y_train1, y_train_pred)
train_mae = mean_absolute_error(y_train1, y_train_pred)
train_rmse = np.sqrt(mean_squared_error(y_train1, y_train_pred))

print("\n=== Train Set Results ===")
print(f"R2 Score: {train_r2:.4f}")
print(f"MAE: {train_mae:.4f}")
print(f"RMSE: {train_rmse:.4f}")

# Test set predictions
y_pred = ridge_cv.predict(X_test1)
r2 = r2_score(y_test1, y_pred)
mae = mean_absolute_error(y_test1, y_pred)
rmse = np.sqrt(mean_squared_error(y_test1, y_pred))

print("\n=== Test Set Results ===")
print(f"R2 Score: {r2:.4f}")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")

print(f"\n=== Comparison ===")
print(f"CV R2:   {ridge_cv.best_score_:.4f}")
print(f"Train R2: {train_r2:.4f}")
print(f"Test R2: {r2:.4f}")
print(f"Difference (CV-Test): {abs(ridge_cv.best_score_ - r2):.4f}")
print(f"Difference (Train-Test): {abs(train_r2 - r2):.4f}")

if abs(ridge_cv.best_score_ - r2) < 0.05:
    print("✓ Good - CV and Test scores are similar!")
else:
    print("⚠ Warning - Large gap suggests overfitting")



=== Train Set Results ===
R2 Score: 0.5454
MAE: 12.1348
RMSE: 15.9990

=== Test Set Results ===
R2 Score: 0.2935
MAE: 15.7658
RMSE: 19.8278

=== Comparison ===
CV R2:   0.2671
Train R2: 0.5454
Test R2: 0.2935
Difference (CV-Test): 0.0264
Difference (Train-Test): 0.2520
✓ Good - CV and Test scores are similar!
