In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
import pandas as pd
import itertools
from EngineerFeature import FeatureEngineer

In [3]:
# Load the full dataset from disk. The 'popularity' column is the regression
# target (y); every remaining column is kept as a feature (X).
# For quick experiments the frame can be truncated first, e.g. data[:1000].
data = pd.read_csv('final_data.csv')
y = data['popularity']
X = data.drop(columns=['popularity'])

In [4]:
# Hyper-parameter search space for Ridge. Each key is a Ridge constructor
# argument; grid_search_ridge tries the full Cartesian product of the lists
# (6 * 4 * 3 * 5 = 360 combinations).
grid = dict(
    alpha=[0.1, 1.0, 10.0, 100.0, 200.0, 500.0],
    solver=['sag', 'auto', 'lsqr', 'sparse_cg'],
    tol=[0.001, 0.01, 0.1],
    max_iter=[1000, 2000, 5000, 7500, 10000],
)

In [5]:
def grid_search_ridge(X, y, grid, test_size=0.2, random_state=42):
    """Exhaustively search Ridge hyper-parameters on a single hold-out split.

    The data is split once into train/validation parts, features are
    standardised with a scaler fitted on the training part only, and one
    Ridge model is fitted per combination in the Cartesian product of the
    lists in ``grid``. Combinations are scored by R^2 on the validation part.

    Parameters
    ----------
    X, y : features and target, passed straight to ``train_test_split``.
    grid : dict mapping Ridge constructor argument names to lists of
        candidate values.
    test_size : fraction of the data held out for validation (default 0.2).
    random_state : seed for the train/validation split (default 42).

    Returns
    -------
    (best_params, best_r2) : the parameter dict with the highest validation
        R^2 and that score. On ties the first combination encountered wins.

    Raises
    ------
    ValueError
        If no combination produced a comparable score (for example a grid
        entry is an empty list, or every score is NaN).

    Notes
    -----
    No ``random_state`` is passed to ``Ridge`` itself, so stochastic solvers
    such as 'sag' may give slightly different scores from run to run.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training part only so validation statistics
    # do not leak into the model.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    keys = list(grid)

    # Initialise both "best" slots up front: previously best_params was only
    # bound inside the loop, so an empty search ended in UnboundLocalError.
    best_params = None
    best_r2 = -float('inf')
    for combo in itertools.product(*(grid[key] for key in keys)):
        params = dict(zip(keys, combo))

        model = Ridge(**params)
        model.fit(X_train_scaled, y_train)

        r2 = r2_score(y_val, model.predict(X_val_scaled))
        if r2 > best_r2:
            best_r2 = r2
            best_params = params

    if best_params is None:
        raise ValueError(
            "grid search evaluated no usable parameter combination; "
            "check that every entry in `grid` is a non-empty list"
        )

    return best_params, best_r2

In [None]:
# Hold out 20% of the data as the final test set (fixed seed so the split is repeatable).
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the project's FeatureEngineer on the training split, then apply it to the test split.
# NOTE(review): X_train / X_test are not used by the search below, nor by the
# final fit in the next cell; both work on the raw X_train1 / X_test1. Confirm
# whether the engineered features were meant to be used instead.
fe = FeatureEngineer()
X_train = fe.fit_transform(X_train1, y_train1)
X_test = fe.transform(X_test1)

# Hyper-parameter search on the (raw) training split; grid_search_ridge makes
# its own internal train/validation split.
best_params, best_r2 = grid_search_ridge(X_train1, y_train1, grid)
print("\n\nThe best parameters are: ", best_params)
print("Best R2 score: ", best_r2)



The best parameters are:  {'alpha': 0.1, 'solver': 'sag', 'tol': 0.01, 'max_iter': 7500}
Best R2 score:  0.5870717825998937


In [7]:
# Refit on the whole training split with the selected hyper-parameters and
# report R^2 on the held-out test split.

# Standardise using statistics from the training split only.
scaler = StandardScaler()
X_train1_scaled = scaler.fit_transform(X_train1)
X_test1_scaled = scaler.transform(X_test1)

model = Ridge(**best_params)
model.fit(X_train1_scaled, y_train1)

y_pred = model.predict(X_test1_scaled)
r2 = r2_score(y_test1, y_pred)
print("R2 score with best parameters on test set: ", r2)


R2 score with best parameters on test set:  0.5968439178629816
