# Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import random
import copy
from sklearn.decomposition import PCA
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split,GridSearchCV, HalvingGridSearchCV,RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore", category=FutureWarning) 
from torch.utils.tensorboard import SummaryWriter

seed = 42
pca = None
np.random.seed(seed)
random.seed(seed)
df = pd.read_csv('dataset.csv')

In [2]:
df.head()

Unnamed: 0,rating,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
0,3.893708,0,0,1,1,1,1,0,0,0,...,0.0405,0.01425,0.0305,0.035,0.14125,0.05775,0.039,0.02975,0.08475,0.022
1,3.251527,0,0,1,0,1,0,0,0,0,...,0.0525,0.01575,0.0125,0.02,0.12225,0.03275,0.021,0.011,0.10525,0.01975
2,3.142028,0,0,0,0,0,1,0,0,0,...,0.06275,0.0195,0.02225,0.023,0.122,0.03475,0.017,0.018,0.091,0.01775
3,2.853547,0,0,0,0,0,1,0,0,1,...,0.05325,0.028,0.01675,0.03875,0.182,0.0705,0.01625,0.01425,0.0885,0.015
4,3.058434,0,0,0,0,0,1,0,0,0,...,0.0535,0.0205,0.01425,0.0255,0.19225,0.02675,0.01625,0.013,0.087,0.016


In [3]:
#splitting dataframe df into train and test
X=df.drop(['rating'],axis=1)
y=df['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)



In [4]:
print(f'Number of training samples: {X_train.shape}')
print(f'Number of testing samples: {X_test.shape}')

Number of training samples: (9946, 1148)
Number of testing samples: (2764, 1148)


In [5]:
#PCA
pca = PCA(n_components=0.95)
pca.fit(X_train)
X_train = pca.transform(X_train)
X_val = pca.transform(X_val)
X_test = pca.transform(X_test)


In [23]:
# Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)

print(f'Linear Regression R2 score: {model.score(X_test, y_test)}')
print(f'Linear Regression MSE: {mean_squared_error(y_test, model.predict(X_test))}')

Linear Regression R2 score: 0.9707843273740225
Linear Regression MSE: 0.006476124027323298


In [6]:
def plot_model_hyperparams(grid,model_name):
    writer = SummaryWriter('run/PCA/{}'.format(model_name))
    for i in range(len(grid.cv_results_['params'])):
        writer.add_hparams(
            {f'{k}': v for k, v in grid.cv_results_['params'][i].items()},
            {f'{model_name}_mean_test_score': grid.cv_results_['mean_test_score'][i]}
        )
    writer.flush()
    writer.close()

In [7]:

#hyperparameters tuning for random forest regressor
model=RandomForestRegressor()

#generate random number
rng = np.random.RandomState(0)
param_grid = {
    'n_estimators': [50,100,200],
    'max_depth': [5,10],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'sqrt', 'log2']
}

hgs = RandomizedSearchCV(
    estimator=model, param_distributions=param_grid, random_state=rng, verbose=3, cv=2
)
hgs.fit(X_train, y_train)
plot_model_hyperparams(hgs, 'RFR_Bis')


print("\nBest R2: {:.6f}".format(hgs.best_score_))
print("Best parameters: {}".format(hgs.best_params_))

print(f'Random Forest Regressor R2 score: {hgs.score(X_test, y_test)}')
print(f'Random Forest Regressor MSE: {mean_squared_error(y_test, hgs.predict(X_test))}')

Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV 1/2] END max_depth=10, max_features=sqrt, min_samples_leaf=3, min_samples_split=2, n_estimators=50;, score=0.537 total time=   6.1s
[CV 2/2] END max_depth=10, max_features=sqrt, min_samples_leaf=3, min_samples_split=2, n_estimators=50;, score=0.515 total time=   5.8s
[CV 1/2] END max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=100;, score=0.745 total time= 2.4min
[CV 2/2] END max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=100;, score=0.743 total time= 2.4min
[CV 1/2] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=50;, score=0.507 total time=   5.6s
[CV 2/2] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=50;, score=0.523 total time=   5.7s
[CV 1/2] END max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.747 total 

In [8]:
hgs.cv_results_['mean_test_score']

array([0.52586566, 0.7439882 , 0.51488839, 0.74504078, 0.30006974,
       0.30271001, 0.31790306, 0.74501493, 0.74511704, 0.51596005])

In [16]:
# use GridSearchCV to find the best parameters for SVM
model = SVR()


rng = np.random.RandomState(0)
param_grid = {
    'kernel': ['linear', 'poly', 'rbf'],
    'C': [0.001,0.01,0.1, 1,2,10],
    'epsilon': [0.001,0.01, 0.1, 1]
}

rgs = HalvingGridSearchCV(
    estimator=model, param_grid=param_grid, random_state=rng, verbose=3, cv=2
)
rgs.fit(X_train, y_train)
plot_model_hyperparams(rgs, 'SVR')

print("\nBest MSE: {:.6f}".format(rgs.best_score_))
print("Best parameters: {}".format(rgs.best_params_))



print(f'SVM R2 score: {rgs.score(X_test, y_test)}')
print(f'SVM MSE: {mean_squared_error(y_test, rgs.predict(X_test))}')

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 368
max_resources_: 9946
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 72
n_resources: 368
Fitting 2 folds for each of 72 candidates, totalling 144 fits
[CV 1/2] END C=0.001, epsilon=0.001, kernel=linear;, score=(train=0.551, test=0.457) total time=   0.0s
[CV 2/2] END C=0.001, epsilon=0.001, kernel=linear;, score=(train=0.455, test=0.446) total time=   0.0s
[CV 1/2] END C=0.001, epsilon=0.001, kernel=poly;, score=(train=0.005, test=-0.070) total time=   0.0s
[CV 2/2] END C=0.001, epsilon=0.001, kernel=poly;, score=(train=-0.004, test=-0.012) total time=   0.0s
[CV 1/2] END C=0.001, epsilon=0.001, kernel=rbf;, score=(train=0.009, test=-0.060) total time=   0.0s
[CV 2/2] END C=0.001, epsilon=0.001, kernel=rbf;, score=(train=-0.003, test=-0.007) total time=   0.0s
[CV 1/2] END C=0.001, epsilon=0.01, kernel=linear;, score=(train=0.548, test=0.454) total time=   0.0s
[CV 2/2] END C=

In [9]:
# use GridSearchCV to find the best parameters for Lasso Regression
model = Lasso()


rng = np.random.RandomState(0)
param_grid = {
    'alpha': [0.0001,0.001,0.01, 0.1,1],
    'max_iter': [100, 1000, 10000]
}

lgs = HalvingGridSearchCV(
    estimator=model, param_grid=param_grid, random_state=rng, verbose=3, cv=2,scoring='neg_mean_squared_error'
)
lgs.fit(X_train, y_train)

print("\nBest MSE: {:.6f}".format(-lgs.best_score_))
print("Best parameters: {}".format(lgs.best_params_))

#print the r2 results for the best parameters
model = Lasso(
    alpha=lgs.best_params_['alpha'],
    max_iter=lgs.best_params_['max_iter']
)

model.fit(X_train, y_train)
plot_model_hyperparams(lgs, 'Lasso')

print(f'Lasso Regression R2 score: {model.score(X_test, y_test)}')
print(f'Lasso Regression MSE: {mean_squared_error(y_test, model.predict(X_test))}')


n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 1103
max_resources_: 9934
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 15
n_resources: 1103
Fitting 2 folds for each of 15 candidates, totalling 30 fits
[CV 1/2] END alpha=0.0001, max_iter=100;, score=(train=-0.002, test=-0.013) total time=   0.0s
[CV 2/2] END alpha=0.0001, max_iter=100;, score=(train=-0.002, test=-0.014) total time=   0.0s
[CV 1/2] END alpha=0.0001, max_iter=1000;, score=(train=-0.002, test=-0.013) total time=   0.0s
[CV 2/2] END alpha=0.0001, max_iter=1000;, score=(train=-0.002, test=-0.014) total time=   0.0s
[CV 1/2] END alpha=0.0001, max_iter=10000;, score=(train=-0.002, test=-0.013) total time=   0.0s
[CV 2/2] END alpha=0.0001, max_iter=10000;, score=(train=-0.002, test=-0.014) total time=   0.0s
[CV 1/2] END alpha=0.001, max_iter=100;, score=(train=-0.009, test=-0.014) total time=   0.0s
[CV 2/2] END alpha=0.001, max_iter=100;, score=(train=-0.009, test=

In [33]:
# use GridSearchCV to find the best parameters for Ridge Regression
model = Ridge()


rng = np.random.RandomState(0)
param_grid = {
    'alpha': [0.1,0.5,1.0,2.5,5,6,10],
    'max_iter': [100, 1000, 10000]
}

hgs = HalvingGridSearchCV(
    estimator=model, param_grid=param_grid, random_state=rng, verbose=3, cv=2, scoring='neg_mean_squared_error'
)

hgs.fit(X_train, y_train)

print("\nBest MSE: {:.6f}".format(-hgs.best_score_))
print("Best parameters: {}".format(hgs.best_params_))

#print the r2 results for the best parameters
model = Ridge(
    alpha=hgs.best_params_['alpha'],
    max_iter=hgs.best_params_['max_iter']
)

model.fit(X_train, y_train)
plot_model_hyperparams(hgs, 'Ridge')

print(f'Ridge Regression R2 score: {model.score(X_test, y_test)}')
print(f'Ridge Regression MSE: {mean_squared_error(y_test, model.predict(X_test))}')


n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 1103
max_resources_: 9934
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 15
n_resources: 1103
Fitting 2 folds for each of 15 candidates, totalling 30 fits
[CV 1/2] END alpha=0.1, max_iter=100;, score=(train=-0.001, test=-0.022) total time=   0.0s
[CV 2/2] END alpha=0.1, max_iter=100;, score=(train=-0.001, test=-0.023) total time=   0.0s
[CV 1/2] END alpha=0.1, max_iter=1000;, score=(train=-0.001, test=-0.022) total time=   0.0s
[CV 2/2] END alpha=0.1, max_iter=1000;, score=(train=-0.001, test=-0.023) total time=   0.0s
[CV 1/2] END alpha=0.1, max_iter=10000;, score=(train=-0.001, test=-0.022) total time=   0.0s
[CV 2/2] END alpha=0.1, max_iter=10000;, score=(train=-0.001, test=-0.023) total time=   0.0s
[CV 1/2] END alpha=0.5, max_iter=100;, score=(train=-0.001, test=-0.013) total time=   0.0s
[CV 2/2] END alpha=0.5, max_iter=100;, score=(train=-0.001, test=-0.014) total time=   

In [47]:
# use GridSearchCV to find the best parameters for ElasticNet Regression
model = ElasticNet()


rng = np.random.RandomState(0)
param_grid = {
    'alpha': [0.0001,0.001,0.01, 0.1,10],
    'max_iter': [100, 1000, 10000],
    'l1_ratio': [0.01, 0.1, 0.5, 0.9, 1]
}

hgs = HalvingGridSearchCV(
    estimator=model, param_grid=param_grid, random_state=rng, verbose=3, cv=2, scoring='neg_mean_squared_error'
)

hgs.fit(X_train, y_train)

print("\nBest MSE: {:.6f}".format(-hgs.best_score_))
print("Best parameters: {}".format(hgs.best_params_))

#print the r2 results for the best parameters
model = ElasticNet(
    alpha=hgs.best_params_['alpha'],
    max_iter=hgs.best_params_['max_iter'],
    l1_ratio=hgs.best_params_['l1_ratio']
)

model.fit(X_train, y_train)
plot_model_hyperparams(hgs, 'ElasticNet')

print(f'ElasticNet Regression R2 score: {model.score(X_test, y_test)}')
print(f'ElasticNet Regression MSE: {mean_squared_error(y_test, model.predict(X_test))}')


n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 367
max_resources_: 9934
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 75
n_resources: 367
Fitting 2 folds for each of 75 candidates, totalling 150 fits
[CV 1/2] END alpha=0.0001, l1_ratio=0.01, max_iter=100;, score=(train=-0.000, test=-0.017) total time=   0.0s
[CV 2/2] END alpha=0.0001, l1_ratio=0.01, max_iter=100;, score=(train=-0.000, test=-0.021) total time=   0.0s
[CV 1/2] END alpha=0.0001, l1_ratio=0.01, max_iter=1000;, score=(train=-0.000, test=-0.018) total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 2/2] END alpha=0.0001, l1_ratio=0.01, max_iter=1000;, score=(train=-0.000, test=-0.019) total time=   0.1s
[CV 1/2] END alpha=0.0001, l1_ratio=0.01, max_iter=10000;, score=(train=-0.000, test=-0.018) total time=   0.0s
[CV 2/2] END alpha=0.0001, l1_ratio=0.01, max_iter=10000;, score=(train=-0.000, test=-0.019) total time=   0.0s
[CV 1/2] END alpha=0.0001, l1_ratio=0.1, max_iter=100;, score=(train=-0.000, test=-0.018) total time=   0.0s
[CV 2/2] END alpha=0.0001, l1_ratio=0.1, max_iter=100;, score=(train=-0.000, test=-0.020) total time=   0.0s
[CV 1/2] END alpha=0.0001, l1_ratio=0.1, max_iter=1000;, score=(train=-0.000, test=-0.021) total time=   0.0s
[CV 2/2] END alpha=0.0001, l1_ratio=0.1, max_iter=1000;, score=(train=-0.000, test=-0.020) total time=   0.0s
[CV 1/2] END alpha=0.0001, l1_ratio=0.1, max_iter=10000;, score=(train=-0.000, test=-0.021) total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 2/2] END alpha=0.0001, l1_ratio=0.1, max_iter=10000;, score=(train=-0.000, test=-0.020) total time=   0.0s
[CV 1/2] END alpha=0.0001, l1_ratio=0.5, max_iter=100;, score=(train=-0.000, test=-0.023) total time=   0.0s
[CV 2/2] END alpha=0.0001, l1_ratio=0.5, max_iter=100;, score=(train=-0.000, test=-0.022) total time=   0.0s
[CV 1/2] END alpha=0.0001, l1_ratio=0.5, max_iter=1000;, score=(train=-0.000, test=-0.024) total time=   0.0s
[CV 2/2] END alpha=0.0001, l1_ratio=0.5, max_iter=1000;, score=(train=-0.000, test=-0.024) total time=   0.0s
[CV 1/2] END alpha=0.0001, l1_ratio=0.5, max_iter=10000;, score=(train=-0.000, test=-0.024) total time=   0.0s
[CV 2/2] END alpha=0.0001, l1_ratio=0.5, max_iter=10000;, score=(train=-0.000, test=-0.024) total time=   0.0s
[CV 1/2] END alpha=0.0001, l1_ratio=0.9, max_iter=100;, score=(train=-0.000, test=-0.023) total time=   0.0s
[CV 2/2] END alpha=0.0001, l1_ratio=0.9, max_iter=100;, score=(train=-0.000, test=-0.022) total time=   0.0s
[CV 1/2] EN

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 2/2] END alpha=0.0001, l1_ratio=0.9, max_iter=1000;, score=(train=-0.000, test=-0.024) total time=   0.0s
[CV 1/2] END alpha=0.0001, l1_ratio=0.9, max_iter=10000;, score=(train=-0.000, test=-0.024) total time=   0.0s
[CV 2/2] END alpha=0.0001, l1_ratio=0.9, max_iter=10000;, score=(train=-0.000, test=-0.024) total time=   0.0s
[CV 1/2] END alpha=0.0001, l1_ratio=1, max_iter=100;, score=(train=-0.000, test=-0.023) total time=   0.0s
[CV 2/2] END alpha=0.0001, l1_ratio=1, max_iter=100;, score=(train=-0.000, test=-0.022) total time=   0.0s
[CV 1/2] END alpha=0.0001, l1_ratio=1, max_iter=1000;, score=(train=-0.000, test=-0.024) total time=   0.0s
[CV 2/2] END alpha=0.0001, l1_ratio=1, max_iter=1000;, score=(train=-0.000, test=-0.024) total time=   0.0s
[CV 1/2] END alpha=0.0001, l1_ratio=1, max_iter=10000;, score=(train=-0.000, test=-0.024) total time=   0.0s
[CV 2/2] END alpha=0.0001, l1_ratio=1, max_iter=10000;, score=(train=-0.000, test=-0.024) total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 1/2] END alpha=0.001, l1_ratio=0.01, max_iter=100;, score=(train=-0.000, test=-0.017) total time=   0.0s
[CV 2/2] END alpha=0.001, l1_ratio=0.01, max_iter=100;, score=(train=-0.000, test=-0.019) total time=   0.0s
[CV 1/2] END alpha=0.001, l1_ratio=0.01, max_iter=1000;, score=(train=-0.000, test=-0.017) total time=   0.0s
[CV 2/2] END alpha=0.001, l1_ratio=0.01, max_iter=1000;, score=(train=-0.000, test=-0.019) total time=   0.0s
[CV 1/2] END alpha=0.001, l1_ratio=0.01, max_iter=10000;, score=(train=-0.000, test=-0.017) total time=   0.0s
[CV 2/2] END alpha=0.001, l1_ratio=0.01, max_iter=10000;, score=(train=-0.000, test=-0.019) total time=   0.0s
[CV 1/2] END alpha=0.001, l1_ratio=0.1, max_iter=100;, score=(train=-0.000, test=-0.019) total time=   0.0s
[CV 2/2] END alpha=0.001, l1_ratio=0.1, max_iter=100;, score=(train=-0.000, test=-0.019) total time=   0.0s
[CV 1/2] END alpha=0.001, l1_ratio=0.1, max_iter=1000;, score=(train=-0.000, test=-0.019) total time=   0.0s
[CV 2/2] END al

  model = cd_fast.enet_coordinate_descent(


[CV 2/2] END alpha=0.001, l1_ratio=0.1, max_iter=10000;, score=(train=-0.000, test=-0.019) total time=   0.0s
[CV 1/2] END alpha=0.001, l1_ratio=0.5, max_iter=100;, score=(train=-0.002, test=-0.019) total time=   0.0s
[CV 2/2] END alpha=0.001, l1_ratio=0.5, max_iter=100;, score=(train=-0.002, test=-0.019) total time=   0.0s
[CV 1/2] END alpha=0.001, l1_ratio=0.5, max_iter=1000;, score=(train=-0.002, test=-0.019) total time=   0.0s
[CV 2/2] END alpha=0.001, l1_ratio=0.5, max_iter=1000;, score=(train=-0.002, test=-0.019) total time=   0.0s
[CV 1/2] END alpha=0.001, l1_ratio=0.5, max_iter=10000;, score=(train=-0.002, test=-0.019) total time=   0.0s
[CV 2/2] END alpha=0.001, l1_ratio=0.5, max_iter=10000;, score=(train=-0.002, test=-0.019) total time=   0.0s
[CV 1/2] END alpha=0.001, l1_ratio=0.9, max_iter=100;, score=(train=-0.005, test=-0.018) total time=   0.0s
[CV 2/2] END alpha=0.001, l1_ratio=0.9, max_iter=100;, score=(train=-0.004, test=-0.018) total time=   0.0s
[CV 1/2] END alpha=0

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 1/2] END alpha=0.0001, l1_ratio=0.01, max_iter=100;, score=(train=-0.000, test=-0.029) total time=   0.0s
[CV 2/2] END alpha=0.0001, l1_ratio=0.01, max_iter=100;, score=(train=-0.000, test=-0.029) total time=   0.0s
[CV 1/2] END alpha=0.0001, l1_ratio=0.1, max_iter=100;, score=(train=-0.001, test=-0.023) total time=   0.0s
[CV 2/2] END alpha=0.0001, l1_ratio=0.1, max_iter=100;, score=(train=-0.000, test=-0.024) total time=   0.0s
[CV 1/2] END alpha=0.01, l1_ratio=0.1, max_iter=10000;, score=(train=-0.011, test=-0.016) total time=   0.0s
[CV 2/2] END alpha=0.01, l1_ratio=0.1, max_iter=10000;, score=(train=-0.011, test=-0.016) total time=   0.0s
[CV 1/2] END alpha=0.01, l1_ratio=0.1, max_iter=1000;, score=(train=-0.011, test=-0.016) total time=   0.0s
[CV 2/2] END alpha=0.01, l1_ratio=0.1, max_iter=1000;, score=(train=-0.011, test=-0.016) total time=   0.0s
[CV 1/2] END alpha=0.01, l1_ratio=0.1, max_iter=100;, score=(train=-0.011, test=-0.016) total time=   0.0s
[CV 2/2] END alpha=0.