# Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import random
import copy
from sklearn.decomposition import PCA
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split,GridSearchCV, HalvingGridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

from torch.utils.tensorboard import SummaryWriter

seed = 42
pca = None
np.random.seed(seed)
random.seed(seed)
df = pd.read_csv('dataset.csv')

In [2]:
df.head()

Unnamed: 0,rating,rating_count,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,3.893708,57309,0.02875,0.02375,0.0625,0.07575,0.14075,0.14675,0.0635,0.20375,...,0,0,0,0,0,0,0,0,0,1995
1,3.251527,24228,0.04125,0.0405,0.06275,0.08275,0.091,0.06125,0.06925,0.096,...,0,0,0,0,0,0,0,0,0,1995
2,3.142028,11804,0.04675,0.0555,0.02925,0.087,0.0475,0.04775,0.046,0.14275,...,0,0,0,0,1,0,0,0,0,1995
3,2.853547,2523,0.03425,0.038,0.0405,0.031,0.065,0.03575,0.029,0.0865,...,0,0,0,0,1,0,0,0,0,1995
4,3.058434,11714,0.043,0.05325,0.038,0.041,0.054,0.06725,0.02775,0.0765,...,0,0,0,0,0,0,0,0,0,1995


In [3]:

columns_to_transform = ['year', 'rating_count']


def transform(X):
    X_norm2 = np.linalg.norm(X, ord=2)
    X = X / X_norm2
    return X

def normalize(df, type):
    print(type)
    for column in columns_to_transform:
        df[column] = transform(df[column])
    return df
df=normalize(df, 'L2_normalization')

L2_normalization


In [4]:
#splitting dataframe df into train and test
X=df.drop(['rating'],axis=1)
y=df['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)



In [45]:
print(f'Number of training samples: {X_train.shape}')
print(f'Number of testing samples: {X_test.shape}')

Number of training samples: (9934, 1150)
Number of testing samples: (2760, 1150)


In [38]:
#PCA
pca = PCA(n_components=0.95)
pca.fit(X_train)
X_train = pca.transform(X_train)
X_val = pca.transform(X_val)
X_test = pca.transform(X_test)


In [5]:
# Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)

print(f'Linear Regression R2 score: {model.score(X_test, y_test)}')
print(f'Linear Regression MSE: {mean_squared_error(y_test, model.predict(X_test))}')

Linear Regression R2 score: 0.9764271756767273
Linear Regression MSE: 0.005317075058827069


In [47]:
#hyperparameters tuning for random forest regressor
model=RandomForestRegressor()

#generate random number
rng = np.random.RandomState(0)

param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [5],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': [1.0, 'sqrt', 'log2']
}

hgs = HalvingGridSearchCV(
    estimator=model, param_grid=param_grid, random_state=rng, verbose=3, cv=2, scoring='neg_mean_squared_error'
)
hgs.fit(X_train, y_train)


print("\nBest MSE: {:.6f}".format(-hgs.best_score_))
print("Best parameters: {}".format(hgs.best_params_))


#hgs_results = plot_model_results(hgs, 'RandomForestRegressor')

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 367
max_resources_: 9934
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 54
n_resources: 367
Fitting 2 folds for each of 54 candidates, totalling 108 fits
[CV 1/2] END max_depth=5, max_features=1.0, min_samples_leaf=1, min_samples_split=2, n_estimators=10;, score=(train=-0.010, test=-0.036) total time=   0.8s
[CV 2/2] END max_depth=5, max_features=1.0, min_samples_leaf=1, min_samples_split=2, n_estimators=10;, score=(train=-0.010, test=-0.044) total time=   0.8s
[CV 1/2] END max_depth=5, max_features=1.0, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=(train=-0.006, test=-0.031) total time=   3.9s
[CV 2/2] END max_depth=5, max_features=1.0, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=(train=-0.007, test=-0.030) total time=   4.0s
[CV 1/2] END max_depth=5, max_features=1.0, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=(tra

In [49]:
#train random forest regressor with best parameters
model = RandomForestRegressor(
    n_estimators=200,
    max_depth=25,
    min_samples_split=hgs.best_params_['min_samples_split'],
    min_samples_leaf=hgs.best_params_['min_samples_leaf'],
    max_features=hgs.best_params_['max_features'],
    random_state=rng
)
model.fit(X_train, y_train)

print(f'Random Forest Regressor R2 score: {model.score(X_test, y_test)}')
print(f'Random Forest Regressor MSE: {mean_squared_error(y_test, model.predict(X_test))}')


Random Forest Regressor R2 score: 0.9474482750240798
Random Forest Regressor MSE: 0.011853542126979701


In [8]:
# use GridSearchCV to find the best parameters for SVM
model = SVR()


rng = np.random.RandomState(0)
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2, 3, 4],
    'gamma': ['scale', 'auto'],
    'C': [0.01,0.1, 1, 10, 100],
    'epsilon': [0.01, 0.1,0.5, 1, 5]
}

hgs = HalvingGridSearchCV(
    estimator=model, param_grid=param_grid, random_state=rng, verbose=3, cv=2, scoring='neg_mean_squared_error'
)
hgs.fit(X_train, y_train)

print("\nBest MSE: {:.6f}".format(-hgs.best_score_))
print("Best parameters: {}".format(hgs.best_params_))

#print the r2 results for the best parameters
model = SVR(
    kernel=hgs.best_params_['kernel'],
    degree=hgs.best_params_['degree'],
    gamma=hgs.best_params_['gamma'],
    C=hgs.best_params_['C'],
    epsilon=hgs.best_params_['epsilon']
)

model.fit(X_train, y_train)

print(f'SVM R2 score: {model.score(X_test, y_test)}')
print(f'SVM MSE: {mean_squared_error(y_test, model.predict(X_test))}')







n_iterations: 6
n_required_iterations: 6
n_possible_iterations: 6
min_resources_: 40
max_resources_: 9934
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 600
n_resources: 40
Fitting 2 folds for each of 600 candidates, totalling 1200 fits
[CV 1/2] END C=0.01, degree=2, epsilon=0.01, gamma=scale, kernel=linear;, score=(train=-0.016, test=-0.081) total time=   0.0s
[CV 2/2] END C=0.01, degree=2, epsilon=0.01, gamma=scale, kernel=linear;, score=(train=-0.050, test=-0.105) total time=   0.0s
[CV 1/2] END C=0.01, degree=2, epsilon=0.01, gamma=scale, kernel=poly;, score=(train=-0.115, test=-0.194) total time=   0.0s
[CV 2/2] END C=0.01, degree=2, epsilon=0.01, gamma=scale, kernel=poly;, score=(train=-0.237, test=-0.180) total time=   0.0s
[CV 1/2] END C=0.01, degree=2, epsilon=0.01, gamma=scale, kernel=rbf;, score=(train=-0.133, test=-0.211) total time=   0.0s
[CV 2/2] END C=0.01, degree=2, epsilon=0.01, gamma=scale, kernel=rbf;, score=(train=-0.265, test=-0.205) tota

In [None]:
# use GridSearchCV to find the best parameters for Lasso Regression
model = Lasso()


rng = np.random.RandomState(0)
param_grid = {
    'alpha': [0.01, 0.1, 1, 10, 100],
    'max_iter': [1000, 10000, 100000]
}

hgs = HalvingGridSearchCV(
    estimator=model, param_grid=param_grid, random_state=rng, verbose=3, cv=2, scoring='neg_mean_squared_error'
)
hgs.fit(X_train, y_train)

print("\nBest MSE: {:.6f}".format(-hgs.best_score_))
print("Best parameters: {}".format(hgs.best_params_))

#print the r2 results for the best parameters
model = Lasso(
    alpha=hgs.best_params_['alpha'],
    max_iter=hgs.best_params_['max_iter']
)

model.fit(X_train, y_train)

print(f'Lasso Regression R2 score: {model.score(X_test, y_test)}')
print(f'Lasso Regression MSE: {mean_squared_error(y_test, model.predict(X_test))}')


In [None]:
# use GridSearchCV to find the best parameters for Ridge Regression
model = Ridge()


rng = np.random.RandomState(0)
param_grid = {
    'alpha': [0.01, 0.1, 1, 10, 100],
    'max_iter': [1000, 10000, 100000]
}

hgs = HalvingGridSearchCV(
    estimator=model, param_grid=param_grid, random_state=rng, verbose=3, cv=2, scoring='neg_mean_squared_error'
)

hgs.fit(X_train, y_train)

print("\nBest MSE: {:.6f}".format(-hgs.best_score_))
print("Best parameters: {}".format(hgs.best_params_))

#print the r2 results for the best parameters
model = Ridge(
    alpha=hgs.best_params_['alpha'],
    max_iter=hgs.best_params_['max_iter']
)

model.fit(X_train, y_train)

print(f'Ridge Regression R2 score: {model.score(X_test, y_test)}')
print(f'Ridge Regression MSE: {mean_squared_error(y_test, model.predict(X_test))}')


In [None]:
# use GridSearchCV to find the best parameters for ElasticNet Regression
model = ElasticNet()


rng = np.random.RandomState(0)
param_grid = {
    'alpha': [0.01, 0.1, 1, 10, 100],
    'max_iter': [1000, 10000, 100000],
    'l1_ratio': [0.01, 0.1, 0.5, 0.9, 1]
}

hgs = HalvingGridSearchCV(
    estimator=model, param_grid=param_grid, random_state=rng, verbose=3, cv=2, scoring='neg_mean_squared_error'
)

hgs.fit(X_train, y_train)

print("\nBest MSE: {:.6f}".format(-hgs.best_score_))
print("Best parameters: {}".format(hgs.best_params_))

#print the r2 results for the best parameters
model = ElasticNet(
    alpha=hgs.best_params_['alpha'],
    max_iter=hgs.best_params_['max_iter'],
    l1_ratio=hgs.best_params_['l1_ratio']
)

model.fit(X_train, y_train)

print(f'ElasticNet Regression R2 score: {model.score(X_test, y_test)}')
print(f'ElasticNet Regression MSE: {mean_squared_error(y_test, model.predict(X_test))}')
