# Data Preprocessing

In [6]:
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import random
import copy
from sklearn.decomposition import PCA
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split,GridSearchCV, HalvingGridSearchCV,RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

from torch.utils.tensorboard import SummaryWriter

# Reproducibility: a single seed feeds both Python's and NumPy's global RNGs.
seed = 42
random.seed(seed)
np.random.seed(seed)

# Placeholder only; the PCA cell later rebinds this name to a fitted PCA object.
pca = None

# Raw dataset, read relative to the current working directory.
df = pd.read_csv('dataset.csv')

In [7]:
# Preview the first rows of the loaded frame to sanity-check its columns.
df.head()

Unnamed: 0,rating,rating_count,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies,year
0,3.893708,57309,0,0,1,1,1,1,0,0,...,0.01425,0.0305,0.035,0.14125,0.05775,0.039,0.02975,0.08475,0.022,1995
1,3.251527,24228,0,0,1,0,1,0,0,0,...,0.01575,0.0125,0.02,0.12225,0.03275,0.021,0.011,0.10525,0.01975,1995
2,3.142028,11804,0,0,0,0,0,1,0,0,...,0.0195,0.02225,0.023,0.122,0.03475,0.017,0.018,0.091,0.01775,1995
3,2.853547,2523,0,0,0,0,0,1,0,0,...,0.028,0.01675,0.03875,0.182,0.0705,0.01625,0.01425,0.0885,0.015,1995
4,3.058434,11714,0,0,0,0,0,1,0,0,...,0.0205,0.01425,0.0255,0.19225,0.02675,0.01625,0.013,0.087,0.016,1995


In [11]:
# Remove the 'year' and 'rating_count' columns from the working frame.
# (The previous comment claimed this cell printed the year column; it drops
# columns instead.)
# Re-binding `df` together with errors='ignore' keeps the cell idempotent:
# running it a second time no longer raises KeyError for already-removed columns.
df = df.drop(columns=['year', 'rating_count'], errors='ignore')

In [8]:

# Names of the columns that normalize() (defined below in this cell) rescales.
columns_to_transform = ['year', 'rating_count']


def transform(X):
    """Scale ``X`` so that its L2 norm (as computed by ``np.linalg.norm``) is 1.

    Parameters
    ----------
    X : array-like
        Values to rescale (the notebook passes a single DataFrame column).

    Returns
    -------
    array-like
        ``X / ||X||_2``. When the norm is exactly 0 (all-zero or empty input)
        ``X`` is returned unchanged, because dividing by zero would turn every
        entry into NaN/inf.
    """
    X_norm2 = np.linalg.norm(X, ord=2)
    if X_norm2 == 0:
        # Nothing to scale: keep the zeros instead of producing NaN.
        return X
    return X / X_norm2

def normalize(df, type):
    """Rescale every column named in ``columns_to_transform`` with ``transform``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalise. It is modified in place and also returned.
    type : str
        Label of the normalisation scheme. It is only printed; it does not
        select any behaviour. (The name shadows the builtin ``type`` but is
        kept so existing calls keep working.)

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Columns that are not present in ``df`` are skipped with a printed notice
    instead of raising ``KeyError``. In this notebook the cell that precedes
    this one drops 'year' and 'rating_count', so a top-to-bottom run would
    otherwise fail here.
    """
    print(type)
    for column in columns_to_transform:
        if column not in df.columns:
            print(f'Skipping missing column: {column}')
            continue
        df[column] = transform(df[column])
    return df
# Apply the rescaling to df; the string argument is only echoed by normalize().
df=normalize(df, 'L2_normalization')

L2_normalization


In [13]:
# Separate the target ('rating') from the features, then carve out a test set
# (20% of all rows) and a validation set (10% of the remaining rows).
# The notebook-wide `seed` replaces the repeated literal 42 so that the splits
# follow the same seed as the other seeded RNGs.
X = df.drop(columns=['rating'])
y = df['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=seed)



In [16]:
# Report the shapes of the training and test feature sets.
print(
    f'Number of training samples: {X_train.shape}',
    f'Number of testing samples: {X_test.shape}',
    sep='\n',
)

Number of training samples: (9934, 543)
Number of testing samples: (2760, 543)


In [15]:
# Dimensionality reduction with PCA (n_components=0.95).
# The projection is fitted on the training split only and then applied to the
# training, validation and test splits alike.
# NOTE: the split variables are overwritten, so re-running this cell would fit
# PCA on already-projected data.
pca = PCA(n_components=0.95).fit(X_train)
X_train, X_val, X_test = (pca.transform(split) for split in (X_train, X_val, X_test))


In [8]:
# Baseline model: ordinary least squares fitted on the training split and
# scored (R2 and MSE) on the held-out test split.
model = LinearRegression().fit(X_train, y_train)

print(f'Linear Regression R2 score: {model.score(X_test, y_test)}')
print(f'Linear Regression MSE: {mean_squared_error(y_test, model.predict(X_test))}')

Linear Regression R2 score: 0.971788385235882
Linear Regression MSE: 0.0063633984275459


In [17]:
def plot_model_hyperparams(grid,model_name):
    """Log each hyper-parameter candidate of a fitted search to TensorBoard.

    Parameters
    ----------
    grid : fitted search object
        Must expose ``cv_results_`` with the ``'params'`` and
        ``'mean_test_score'`` entries (the notebook passes fitted
        ``HalvingGridSearchCV`` objects).
    model_name : str
        Used for the log directory (``run/<model_name>``) and as the prefix of
        the logged metric name (``<model_name>_mean_test_score``).

    Returns
    -------
    None
        The function only writes TensorBoard event files.
    """
    results = grid.cv_results_
    # The context manager closes the writer even if add_hparams raises, so the
    # event file handle is not leaked on failure.
    with SummaryWriter('run/{}'.format(model_name)) as writer:
        for params, mean_score in zip(results['params'], results['mean_test_score']):
            writer.add_hparams(
                {f'{k}': v for k, v in params.items()},
                {f'{model_name}_mean_test_score': mean_score}
            )
        writer.flush()

In [26]:
# Hyper-parameter search for the random forest regressor using successive
# halving (HalvingGridSearchCV).
model=RandomForestRegressor()

# Seeded RandomState handed to the search as its random_state.
rng = np.random.RandomState(0)

param_grid = {
    'n_estimators': [10, 50, 100,200],
    'max_depth': [5],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': [1.0, 'sqrt', 'log2']
}

hgs = HalvingGridSearchCV(
    estimator=model, param_grid=param_grid, random_state=rng, verbose=3, cv=2
)
hgs.fit(X_train, y_train)
plot_model_hyperparams(hgs, 'RFR')


# No `scoring` is passed to the search, so best_score_ is the estimator's
# default score (R2 for regressors), not an MSE; the label now says so.
print("\nBest R2: {:.6f}".format(hgs.best_score_))
print("Best parameters: {}".format(hgs.best_params_))


#hgs_results = plot_model_results(hgs, 'RandomForestRegressor')

Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV 1/2] END max_depth=5, max_features=sqrt, min_samples_leaf=3, min_samples_split=2, n_estimators=10;, score=0.314 total time=   0.7s
[CV 2/2] END max_depth=5, max_features=sqrt, min_samples_leaf=3, min_samples_split=2, n_estimators=10;, score=0.405 total time=   0.7s
[CV 1/2] END max_depth=5, max_features=log2, min_samples_leaf=3, min_samples_split=2, n_estimators=50;, score=0.162 total time=   1.4s
[CV 2/2] END max_depth=5, max_features=log2, min_samples_leaf=3, min_samples_split=2, n_estimators=50;, score=0.151 total time=   1.3s
[CV 1/2] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=10;, score=0.415 total time=   0.6s
[CV 2/2] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=10;, score=0.336 total time=   0.6s
[CV 1/2] END max_depth=5, max_features=log2, min_samples_leaf=3, min_samples_split=2, n_estimators=10;, score=0.197 total time=  

In [27]:
# Assumes `hgs` is still the random-forest search from the cell above, which is
# fitted without `scoring`; its best_score_ is therefore an R2 value, not an MSE.
print("\nBest R2: {:.6f}".format(hgs.best_score_))
print("Best parameters: {}".format(hgs.best_params_))


Best MSE: 0.743243
Best parameters: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 1.0, 'max_depth': 5}


In [12]:
# Fit a random forest with manually chosen settings and score it on the test
# split. The values below are hard-coded; they are not read from the halving
# search's best_params_.
rng = np.random.RandomState(0)
rf_settings = {
    'n_estimators': 200,
    'max_depth': 25,
    'min_samples_split': 2,
    'min_samples_leaf': 2,
    'max_features': 1.0,
}
model = RandomForestRegressor(random_state=rng, **rf_settings).fit(X_train, y_train)

print(f'Random Forest Regressor R2 score: {model.score(X_test, y_test)}')
print(f'Random Forest Regressor MSE: {mean_squared_error(y_test, model.predict(X_test))}')


Random Forest Regressor R2 score: 0.8328541279132453
Random Forest Regressor MSE: 0.03770134352466923


In [30]:
# Successive-halving search (HalvingGridSearchCV) over SVR hyper-parameters,
# scored by negated mean squared error.
model = SVR()


rng = np.random.RandomState(0)
param_grid = {
    'kernel': ['linear', 'poly', 'rbf'],
    'degree': [2, 4],
    'gamma': ['scale', 'auto'],
    'C': [0.001,0.01,0.1, 1],
    'epsilon': [0.001,0.01, 0.1, 1]
}

rgs = HalvingGridSearchCV(
    estimator=model, param_grid=param_grid, random_state=rng, verbose=3, cv=2,scoring='neg_mean_squared_error'
)
rgs.fit(X_train, y_train)
plot_model_hyperparams(rgs, 'SVR')

# best_score_ is a *negated* MSE under scoring='neg_mean_squared_error'; flip
# the sign so the printed value is a real MSE, as the Lasso/Ridge/ElasticNet
# cells already do.
print("\nBest MSE: {:.6f}".format(-rgs.best_score_))
print("Best parameters: {}".format(rgs.best_params_))

#print the r2 results for the best parameters
#model = SVR(
#    kernel=hgs.best_params_['kernel'],
#    degree=hgs.best_params_['degree'],
 #   gamma=hgs.best_params_['gamma'],
 #   C=hgs.best_params_['C'],
 #   epsilon=hgs.best_params_['epsilon']
#)

#model.fit(X_train, y_train)

#print(f'SVM R2 score: {model.score(X_test, y_test)}')
#print(f'SVM MSE: {mean_squared_error(y_test, model.predict(X_test))}')







n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 122
max_resources_: 9934
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 108
n_resources: 122
Fitting 2 folds for each of 108 candidates, totalling 216 fits
[CV 1/2] END C=0.01, degree=2, epsilon=0.001, gamma=scale, kernel=linear;, score=(train=0.877, test=0.724) total time=   0.0s
[CV 2/2] END C=0.01, degree=2, epsilon=0.001, gamma=scale, kernel=linear;, score=(train=0.918, test=0.707) total time=   0.0s
[CV 1/2] END C=0.01, degree=2, epsilon=0.001, gamma=scale, kernel=poly;, score=(train=0.032, test=0.011) total time=   0.0s
[CV 2/2] END C=0.01, degree=2, epsilon=0.001, gamma=scale, kernel=poly;, score=(train=0.018, test=0.012) total time=   0.0s
[CV 1/2] END C=0.01, degree=2, epsilon=0.001, gamma=scale, kernel=rbf;, score=(train=0.066, test=0.061) total time=   0.0s
[CV 2/2] END C=0.01, degree=2, epsilon=0.001, gamma=scale, kernel=rbf;, score=(train=0.052, test=0.062) total tim

In [29]:
# Logs the SVR search results to TensorBoard. NOTE(review): the SVR search cell
# already makes this same call right after fitting, so this repeats the logging.
plot_model_hyperparams(rgs, 'SVR')

In [32]:
# Successive-halving search (HalvingGridSearchCV) over Lasso hyper-parameters,
# scored by negated mean squared error.
rng = np.random.RandomState(0)
model = Lasso()
param_grid = dict(
    alpha=[0.001, 0.01, 0.1],
    max_iter=[100, 1000, 10000],
)

lgs = HalvingGridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=2,
    verbose=3,
    random_state=rng,
)
lgs.fit(X_train, y_train)

# best_score_ is a negated MSE, hence the sign flip for display.
print("\nBest MSE: {:.6f}".format(-lgs.best_score_))
print("Best parameters: {}".format(lgs.best_params_))

# Refit a fresh Lasso with the winning settings (the grid's keys are exactly
# alpha and max_iter), log the search to TensorBoard, then score on the test split.
model = Lasso(**lgs.best_params_)
model.fit(X_train, y_train)
plot_model_hyperparams(lgs, 'Lasso')

print(f'Lasso Regression R2 score: {model.score(X_test, y_test)}')
print(f'Lasso Regression MSE: {mean_squared_error(y_test, model.predict(X_test))}')


n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 1103
max_resources_: 9934
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 9
n_resources: 1103
Fitting 2 folds for each of 9 candidates, totalling 18 fits
[CV 1/2] END alpha=0.001, max_iter=100;, score=(train=0.959, test=0.943) total time=   0.0s
[CV 2/2] END alpha=0.001, max_iter=100;, score=(train=0.963, test=0.934) total time=   0.0s
[CV 1/2] END alpha=0.001, max_iter=1000;, score=(train=0.959, test=0.943) total time=   0.0s
[CV 2/2] END alpha=0.001, max_iter=1000;, score=(train=0.963, test=0.934) total time=   0.0s
[CV 1/2] END alpha=0.001, max_iter=10000;, score=(train=0.959, test=0.943) total time=   0.0s
[CV 2/2] END alpha=0.001, max_iter=10000;, score=(train=0.963, test=0.934) total time=   0.0s
[CV 1/2] END alpha=0.01, max_iter=100;, score=(train=0.835, test=0.832) total time=   0.0s
[CV 2/2] END alpha=0.01, max_iter=100;, score=(train=0.853, test=0.809) total time=   0.0s

In [33]:
# Successive-halving search (HalvingGridSearchCV) over Ridge hyper-parameters,
# scored by negated mean squared error. Note: this rebinds the name `hgs`.
rng = np.random.RandomState(0)
model = Ridge()
param_grid = dict(
    alpha=[0.01, 0.1, 1],
    max_iter=[100, 1000, 10000],
)

hgs = HalvingGridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=2,
    verbose=3,
    random_state=rng,
)
hgs.fit(X_train, y_train)

# best_score_ is a negated MSE, hence the sign flip for display.
print("\nBest MSE: {:.6f}".format(-hgs.best_score_))
print("Best parameters: {}".format(hgs.best_params_))

# Refit a fresh Ridge with the winning settings (the grid's keys are exactly
# alpha and max_iter), log the search to TensorBoard, then score on the test split.
model = Ridge(**hgs.best_params_)
model.fit(X_train, y_train)
plot_model_hyperparams(hgs, 'Ridge')

print(f'Ridge Regression R2 score: {model.score(X_test, y_test)}')
print(f'Ridge Regression MSE: {mean_squared_error(y_test, model.predict(X_test))}')


n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 1103
max_resources_: 9934
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 9
n_resources: 1103
Fitting 2 folds for each of 9 candidates, totalling 18 fits
[CV 1/2] END alpha=0.01, max_iter=100;, score=(train=-0.000, test=-0.074) total time=   0.0s
[CV 2/2] END alpha=0.01, max_iter=100;, score=(train=-0.000, test=-0.069) total time=   0.0s
[CV 1/2] END alpha=0.01, max_iter=1000;, score=(train=-0.000, test=-0.074) total time=   0.0s
[CV 2/2] END alpha=0.01, max_iter=1000;, score=(train=-0.000, test=-0.069) total time=   0.0s
[CV 1/2] END alpha=0.01, max_iter=10000;, score=(train=-0.000, test=-0.074) total time=   0.0s
[CV 2/2] END alpha=0.01, max_iter=10000;, score=(train=-0.000, test=-0.069) total time=   0.0s
[CV 1/2] END alpha=0.1, max_iter=100;, score=(train=-0.001, test=-0.022) total time=   0.0s
[CV 2/2] END alpha=0.1, max_iter=100;, score=(train=-0.001, test=-0.023) total time

In [16]:
# Successive-halving search (HalvingGridSearchCV) over ElasticNet
# hyper-parameters, scored by negated mean squared error.
# Note: this rebinds the name `hgs`.
rng = np.random.RandomState(0)
model = ElasticNet()
param_grid = dict(
    alpha=[0.001, 0.01, 0.1, 1],
    max_iter=[100, 1000, 10000],
    l1_ratio=[0.01, 0.1, 0.5, 0.9, 1],
)

hgs = HalvingGridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=2,
    verbose=3,
    random_state=rng,
)
hgs.fit(X_train, y_train)

# best_score_ is a negated MSE, hence the sign flip for display.
print("\nBest MSE: {:.6f}".format(-hgs.best_score_))
print("Best parameters: {}".format(hgs.best_params_))

# Refit a fresh ElasticNet with the winning settings (the grid's keys are
# exactly alpha, max_iter and l1_ratio), log the search to TensorBoard, then
# score on the test split.
model = ElasticNet(**hgs.best_params_)
model.fit(X_train, y_train)
plot_model_hyperparams(hgs, 'ElasticNet')

print(f'ElasticNet Regression R2 score: {model.score(X_test, y_test)}')
print(f'ElasticNet Regression MSE: {mean_squared_error(y_test, model.predict(X_test))}')


n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 367
max_resources_: 9934
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 60
n_resources: 367
Fitting 2 folds for each of 60 candidates, totalling 120 fits
[CV 1/2] END alpha=0.001, l1_ratio=0.01, max_iter=100;, score=(train=-0.000, test=-0.017) total time=   0.0s
[CV 2/2] END alpha=0.001, l1_ratio=0.01, max_iter=100;, score=(train=-0.000, test=-0.019) total time=   0.0s
[CV 1/2] END alpha=0.001, l1_ratio=0.01, max_iter=1000;, score=(train=-0.000, test=-0.017) total time=   0.0s
[CV 2/2] END alpha=0.001, l1_ratio=0.01, max_iter=1000;, score=(train=-0.000, test=-0.019) total time=   0.0s
[CV 1/2] END alpha=0.001, l1_ratio=0.01, max_iter=10000;, score=(train=-0.000, test=-0.017) total time=   0.0s
[CV 2/2] END alpha=0.001, l1_ratio=0.01, max_iter=10000;, score=(train=-0.000, test=-0.019) total time=   0.0s
[CV 1/2] END alpha=0.001, l1_ratio=0.1, max_iter=100;, score=(train=-0.000, te

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 2/2] END alpha=0.001, l1_ratio=0.5, max_iter=100;, score=(train=-0.002, test=-0.019) total time=   0.0s
[CV 1/2] END alpha=0.001, l1_ratio=0.5, max_iter=1000;, score=(train=-0.002, test=-0.019) total time=   0.0s
[CV 2/2] END alpha=0.001, l1_ratio=0.5, max_iter=1000;, score=(train=-0.002, test=-0.019) total time=   0.0s
[CV 1/2] END alpha=0.001, l1_ratio=0.5, max_iter=10000;, score=(train=-0.002, test=-0.019) total time=   0.0s
[CV 2/2] END alpha=0.001, l1_ratio=0.5, max_iter=10000;, score=(train=-0.002, test=-0.019) total time=   0.0s
[CV 1/2] END alpha=0.001, l1_ratio=0.9, max_iter=100;, score=(train=-0.005, test=-0.018) total time=   0.0s
[CV 2/2] END alpha=0.001, l1_ratio=0.9, max_iter=100;, score=(train=-0.004, test=-0.018) total time=   0.0s
[CV 1/2] END alpha=0.001, l1_ratio=0.9, max_iter=1000;, score=(train=-0.005, test=-0.018) total time=   0.0s
[CV 2/2] END alpha=0.001, l1_ratio=0.9, max_iter=1000;, score=(train=-0.004, test=-0.018) total time=   0.0s
[CV 1/2] END alpha=0