In [29]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
import pandas as pd
import itertools
from EngineerFeature import FeatureEngineer

In [30]:
# Read the prepared dataset, then separate the regression target
# ('popularity') from the remaining feature columns.
data = pd.read_csv('../final_data.csv')
# Tip: for a quick smoke run, subsample first with `data = data[:1000]`.
y = data['popularity']
X = data.drop(columns=['popularity'])

In [31]:
# Hyper-parameter candidates for Ridge; each key is a Ridge constructor
# argument and each value is the list of settings to try.
grid = dict(
    alpha=[0.1, 1.0, 10.0, 100.0, 200.0, 500.0],
    solver=['sag', 'auto', 'lsqr', 'sparse_cg'],
    tol=[1e-3, 1e-2, 1e-1],
    max_iter=[1000, 2000, 5000, 7500, 10000],
)

In [32]:
def grid_search_ridge(X, y, grid):
    """Exhaustively search Ridge hyper-parameters on a single hold-out split.

    The data is split 80/20 (fixed ``random_state=42``); one ``Ridge`` model is
    fit on the training part for every combination in ``grid`` and scored with
    R2 on the validation part.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` / ``Ridge.fit``.
    y : target values aligned with ``X``.
    grid : dict
        Maps ``Ridge`` constructor argument names to lists of candidate values.

    Returns
    -------
    (best_params, best_r2) : tuple
        The parameter dict with the highest validation R2 and that score.
        If no combination produces a comparable score (a candidate list is
        empty, or every score is NaN) the result is ``(None, -inf)``.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    keys = list(grid.keys())

    # Initialise both trackers up front: previously best_params was only bound
    # inside the loop, so a search with no winning candidate crashed with
    # UnboundLocalError at the return statement.
    best_params = None
    best_r2 = -float('inf')
    for combo in itertools.product(*grid.values()):
        params = dict(zip(keys, combo))

        model = Ridge(**params)
        model.fit(X_train, y_train)

        r2 = r2_score(y_val, model.predict(X_val))
        # Strict '>' keeps the first combination on ties; NaN never wins.
        if r2 > best_r2:
            best_r2 = r2
            best_params = params

    return best_params, best_r2

In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Hold out 20% of the data as a final test set; the grid search only sees the rest.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Train shape: {X_train1.shape}, Test shape: {X_test1.shape}")

# Create a pipeline to prevent data leakage: every step is fit inside each
# CV fold on that fold's training portion only.
# NOTE(review): FeatureEngineer is a project-local transformer; assumed to
# follow the scikit-learn fit/transform contract - confirm.
pipeline = Pipeline([
    ('feature_eng', FeatureEngineer()),
    ('scaler', StandardScaler()),
    # The 'sag' solver shuffles the data with a random generator. Pin the seed
    # (same value as the split) so a Restart & Run All reproduces the search.
    ('ridge', Ridge(random_state=42))
])

# Update parameter grid for pipeline ('ridge__' routes each key to the ridge step)
param_grid_pipeline = {
    'ridge__alpha': [0.1, 1.0, 10.0, 100.0],
    'ridge__solver': ['sag', 'auto', 'lsqr', 'sparse_cg'],
    'ridge__tol': [1e-3, 1e-2, 1e-1],
    'ridge__max_iter': [1000, 2000, 5000, 7500, 10000]
}

# GridSearchCV with pipeline: 5-fold CV, R2 scoring, all available cores
ridge_cv = GridSearchCV(
    pipeline,
    param_grid=param_grid_pipeline,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

ridge_cv.fit(X_train1, y_train1)

print("\n\nBest parameters:", ridge_cv.best_params_)
print(f"Best CV R2 score: {ridge_cv.best_score_:.4f}")

Train shape: (22684, 39), Test shape: (5672, 39)
Fitting 5 folds for each of 240 candidates, totalling 1200 fits






Best parameters: {'ridge__alpha': 0.1, 'ridge__max_iter': 1000, 'ridge__solver': 'sag', 'ridge__tol': 0.1}
Best CV R2 score: 0.2222


In [34]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Test the best model on the held-out split (GridSearchCV.predict delegates
# to the estimator refit with the best parameters).
y_pred = ridge_cv.predict(X_test1)

r2 = r2_score(y_test1, y_pred)
mae = mean_absolute_error(y_test1, y_pred)
rmse = np.sqrt(mean_squared_error(y_test1, y_pred))

print("\n=== Test Set Results ===")
print(f"R2 Score: {r2:.4f}")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")

# Signed gap: positive means the model scored better in CV than on the test set.
GAP_TOLERANCE = 0.05
cv_test_gap = ridge_cv.best_score_ - r2

print("\n=== Comparison ===")
print(f"CV R2:   {ridge_cv.best_score_:.4f}")
print(f"Test R2: {r2:.4f}")
print(f"Difference: {abs(cv_test_gap):.4f}")

if abs(cv_test_gap) < GAP_TOLERANCE:
    print("✓ Good - CV and Test scores are similar!")
elif cv_test_gap > 0:
    # Only a CV score well ABOVE the test score points at overfitting.
    print("⚠ Warning - Large gap suggests overfitting")
else:
    # Test score well above CV is not overfitting; previously it was reported as such.
    print("⚠ Warning - Test score is well above CV score; check the split and fold variance")


=== Test Set Results ===
R2 Score: 0.2450
MAE: 15.6774
RMSE: 20.6340

=== Comparison ===
CV R2:   0.2222
Test R2: 0.2450
Difference: 0.0228
✓ Good - CV and Test scores are similar!
