# Data Preprocessing

In [32]:
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import random
import copy
from sklearn.decomposition import PCA
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, HalvingGridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from torch.utils.tensorboard import SummaryWriter

seed = 42
pca = None
np.random.seed(seed)
random.seed(seed)
df = pd.read_csv('dataset.csv')

In [34]:
df.head()

Unnamed: 0,rating,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,...,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies,year
0,3.893708,0.02875,0.02375,0.0625,0.07575,0.14075,0.14675,0.0635,0.20375,0.202,...,0.01425,0.0305,0.035,0.14125,0.05775,0.039,0.02975,0.08475,0.022,
1,3.251527,0.04125,0.0405,0.06275,0.08275,0.091,0.06125,0.06925,0.096,0.0765,...,0.01575,0.0125,0.02,0.12225,0.03275,0.021,0.011,0.10525,0.01975,
2,3.142028,0.04675,0.0555,0.02925,0.087,0.0475,0.04775,0.046,0.14275,0.0285,...,0.0195,0.02225,0.023,0.122,0.03475,0.017,0.018,0.091,0.01775,
3,2.853547,0.03425,0.038,0.0405,0.031,0.065,0.03575,0.029,0.0865,0.032,...,0.028,0.01675,0.03875,0.182,0.0705,0.01625,0.01425,0.0885,0.015,
4,3.058434,0.043,0.05325,0.038,0.041,0.054,0.06725,0.02775,0.0765,0.0215,...,0.0205,0.01425,0.0255,0.19225,0.02675,0.01625,0.013,0.087,0.016,


In [33]:

columns_to_transform = ['year', 'rating_count']


def transform(X):
            X_norm2 = np.linalg.norm(X, ord=2)
            X = X / X_norm2


def normalize(df, type):
    print(type)
    for column in columns_to_transform:
        df[column] = transform(df[column])

normalize(df, 'L2_normalization')

L2_normalization


KeyError: 'rating_count'

In [22]:
#splitting dataframe df into train and test
X=df.drop(['rating'],axis=1)
y=df['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=seed)



In [23]:
print(f'Number of training samples: {X_train.shape}')
print(f'Number of testing samples: {X_test.shape}')

Number of training samples: (9946, 1128)
Number of testing samples: (2764, 1128)


In [24]:
#PCA
pca = PCA(n_components=0.95)
pca.fit(X_train)
X_train = pca.transform(X_train)
X_val = pca.transform(X_val)
X_test = pca.transform(X_test)


In [25]:
# Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)

print(f'Linear Regression R2 score: {model.score(X_test, y_test)}')
print(f'Linear Regression MSE: {mean_squared_error(y_test, model.predict(X_test))}')

Linear Regression R2 score: 0.9709615407207532
Linear Regression MSE: 0.0064368418369928765


In [20]:
#hyperparameters tuning for random forest regressor
model=RandomForestRegressor()

#generate random number
rng = np.random.RandomState(0)

param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [5],
    'min_samples_split': [2],
    'min_samples_leaf': [2],
    'max_features': [1.0, 'sqrt', 'log2']
}

hgs = HalvingGridSearchCV(
    estimator=model, param_grid=param_grid, random_state=rng, verbose=3, cv=2, scoring='neg_mean_squared_error'
)
hgs.fit(X_train, y_train)


print("\nBest MSE: {:.6f}".format(-hgs.best_score_))
print("Best parameters: {}".format(hgs.best_params_))


#hgs_results = plot_model_results(hgs, 'RandomForestRegressor')

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 1105
max_resources_: 9946
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 9
n_resources: 1105
Fitting 2 folds for each of 9 candidates, totalling 18 fits
[CV 1/2] END max_depth=5, max_features=1.0, min_samples_leaf=2, min_samples_split=2, n_estimators=10;, score=(train=-0.035, test=-0.069) total time=   1.4s
[CV 2/2] END max_depth=5, max_features=1.0, min_samples_leaf=2, min_samples_split=2, n_estimators=10;, score=(train=-0.029, test=-0.067) total time=   1.4s
[CV 1/2] END max_depth=5, max_features=1.0, min_samples_leaf=2, min_samples_split=2, n_estimators=50;, score=(train=-0.031, test=-0.062) total time=   6.9s
[CV 2/2] END max_depth=5, max_features=1.0, min_samples_leaf=2, min_samples_split=2, n_estimators=50;, score=(train=-0.027, test=-0.064) total time=   6.9s
[CV 1/2] END max_depth=5, max_features=1.0, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=(trai

In [26]:
# hyperparameter tuning
n_tree = [ i for i in range(50, 100, 5)]
criterion = ["squared_error", "friedman_mse", "poisson"]

rfr_best_mse = float('inf')
rfr_best_n = None
rfr_best_c = None
rfr_best = None

i = 0
max_iter = len(n_tree) * len(criterion)

history_rfr = []

for n, c in itertools.product(n_tree, criterion):
    i += 1
    log_name = f"n_estimators={n}, criterion={c}"
    writer = SummaryWriter(f"classicML/RFR/{log_name}")

    rf = RandomForestRegressor(n_estimators=n, criterion=c)
    rf.fit(X_train, y_train)
    Y_pred = rf.predict(X_val)
    mse = np.mean((Y_pred - y_val)**2)

    history_rfr.append([n, c, mse])

    writer.add_scalar('Loss', mse, n)
    writer.add_hparams(
        {'n_estimators': n, 'criterion': c},
        {'mse': mse}
    )
    writer.flush()

    if mse < rfr_best_mse:
        rfr_best_mse = mse
        rfr_best_n = n
        rfr_best_c = c
        rfr_best = copy.deepcopy(rf)
    
    print(" Iteration: {}/{} - N_estimator: {} - Criterion: {} - MSE: {:.4f} - Best MSE: {:.4f}".format(i, max_iter, n, c, mse, rfr_best_mse))

Y_pred = rfr_best.predict(X_test)
rf_r2 = r2_score(y_test, Y_pred)

print("\nRFR Hyperparameter Tuning Results")
print(" - N estimators: ", rfr_best_n)
print(" - Criterion: ", rfr_best_c)
print(" - MSE: ", rfr_best_mse)
print(" - R2: ", rf_r2)

 Iteration: 1/30 - N_estimator: 50 - Criterion: squared_error - MSE: 0.0335 - Best MSE: 0.0335
 Iteration: 2/30 - N_estimator: 50 - Criterion: friedman_mse - MSE: 0.0340 - Best MSE: 0.0335
 Iteration: 3/30 - N_estimator: 50 - Criterion: poisson - MSE: 0.0345 - Best MSE: 0.0335


KeyboardInterrupt: 