# BibRec: Training Random Forest Model

### RF Features: Country, State, Age, Year-of-Publication, Publisher

In [1]:
from bibrec.server.Utils import *
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

books, users, ratings = get_normalized_data(books_path='data/BX-Books.csv',
                                            users_path='data/BX-Users.csv',
                                            ratings_path='data/BX-Book-Ratings.csv',
                                            explicitOnly=True)

  df = pd.read_csv(path, sep=";", encoding="latin-1")
  location_seperated = users.location.str.split(',', 2, expand=True)


In [2]:
users.describe()

Unnamed: 0,user_id,age,user_mean,user_count
count,278858.0,278858.0,278858.0,278858.0
mean,139429.5,34.860508,1.829208,1.376909
std,80499.51502,13.830066,3.314981,20.80045
min,1.0,0.00084,0.0,0.0
25%,69715.25,24.810526,0.0,0.0
50%,139429.5,33.0,0.0,0.0
75%,209143.75,44.0,0.0,0.0
max,278858.0,110.0,10.0,6949.0


In [3]:
books.describe()


Unnamed: 0,year_of_publication,rating_mean,rating_count,normalized_year_of_publication
count,270944.0,270944.0,270944.0,270944.0
mean,1993.11492,4.159348,1.417149,11.88508
std,9.320961,3.953172,5.731537,9.320961
min,1376.0,0.0,0.0,0.0
25%,1989.0,0.0,0.0,5.0
50%,1995.0,5.0,1.0,10.0
75%,2000.0,8.0,1.0,16.0
max,2005.0,10.0,711.0,629.0


In [4]:
ratings.describe()

Unnamed: 0,normalized_rating,user_id,book_rating
count,9444722.0,9444722.0,9444722.0
mean,0.1300947,138075.7,7.742923
std,1.463379,80832.58,1.886697
min,-8.839196,8.0,1.0
25%,-0.4482759,67930.0,7.0
50%,0.0,136255.0,8.0
75%,1.0,209569.0,9.0
max,6.0,278854.0,10.0


In [5]:
# limit data to top-rated books only
df_ratings = ratings.groupby('isbn13').user_id.count().sort_values(ascending=False)
df_ratings = df_ratings[:100]
df_ratings = df_ratings.reset_index()
# user_id => nr of ratings
df_ratings = df_ratings.drop('user_id', axis=1)
df_ratings

Unnamed: 0,isbn13
0,9780316666343
1,9780971880108
2,9780385504201
3,9780312195519
4,9780060928339
...,...
95,978074323719
96,9780439064866
97,978044660899
98,978038550582


In [6]:
books = hot_encode_books(books)
users = hot_encode_users(users)

In [8]:
books

Unnamed: 0,isbn,book_title,book_author,year_of_publication,image_url_s,image_url_m,image_url_l,isbn13,rating_mean,rating_count,...,publisher_harpercollins,publisher_fawcett_books,publisher_signet_book,publisher_random_house_inc,publisher_st_martins_pr,publisher_st._martin's_press,publisher_tor_books,publisher_harpercollins_publishers,publisher_zebra_books,publisher_other
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,9780195153446,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,1
1,0002005018,Clara Callan,Richard Bruce Wright,2001,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,9780002005012,7.666667,9.0,...,0,0,0,0,0,0,0,0,0,1
2,0060973129,Decision in Normandy,Carlo D'Este,1991,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,9780060973124,7.500000,2.0,...,0,0,0,0,0,0,0,0,0,1
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,9780374157067,7.833333,6.0,...,0,0,0,0,0,0,0,0,0,1
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,9780393045215,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270939,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,978044040095,7.000000,1.0,...,0,0,0,0,0,0,0,0,0,1
270940,0525447644,From One to One Hundred,Teri Sloat,1991,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,978052544763,4.000000,1.0,...,0,0,0,0,0,0,0,0,0,1
270941,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,9780060086671,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,1
270942,0192126040,Republic (World's Classics),Plato,1996,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,9780192126047,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,1


In [9]:
users

Unnamed: 0,user_id,age,city,user_mean,user_count,country_usa,country_canada,country_united_kingdom,country_germany,country_spain,...,state_ohio,state_michigan,state_oregon,state_virginia,state_massachusetts,state_missouri,state_nordrhein-westfalen,state_north_carolina,state_,state_other
0,1,25.787123,nyc,0.0,0.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,18.000000,stockton,0.0,0.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,51.000824,moscow,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,17.000000,porto,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,34.952239,farnborough,0.0,0.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278853,278854,28.920902,portland,7.0,6.0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
278854,278855,50.000000,tacoma,0.0,0.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
278855,278856,25.239032,brampton,0.0,0.0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278856,278857,22.101153,knoxville,0.0,0.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [12]:
# RF Features: Country, State, Age, Year-of-Publication, Publisher
tmp_users = users.filter(regex="user_id|age|country_|state_", axis=1)
tmp_books = books.filter(regex="isbn13|normalized_year_of_publication|publisher_", axis=1)
# df = df_ratings.filter(regex="isbn13|user_id|normalized_rating", axis=1)
df = df_ratings.filter(regex="isbn13|user_id|book_rating", axis=1)

df = df.merge(tmp_books, on="isbn13", how="left")
df = df.merge(tmp_users, on="user_id", how="left")

KeyError: 'user_id'

In [11]:
tmp_users

Unnamed: 0,user_id,age,country_usa,country_canada,country_united_kingdom,country_germany,country_spain,country_australia,country_italy,country_france,...,state_ohio,state_michigan,state_oregon,state_virginia,state_massachusetts,state_missouri,state_nordrhein-westfalen,state_north_carolina,state_,state_other
0,1,25.787123,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,18.000000,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,51.000824,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,17.000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,34.952239,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278853,278854,28.920902,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
278854,278855,50.000000,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278855,278856,25.239032,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278856,278857,22.101153,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
df

In [None]:
# Features
X = df.drop(['user_id', 'isbn13', 'book_rating'], axis=1)
# Prediction
Y = df['book_rating']

In [None]:
X

In [None]:
Y

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=7)

In [None]:
from os.path import exists
import pickle

# Load the model from the file
file="random_forest_classifier5.pkl"

if exists(file):
    print("Loading file:", file)
    with open(file, "rb") as file:
        rfc = pickle.load(file)
else:
    print("Creating new model")
    rfc = RandomForestClassifier(n_estimators=100, min_weight_fraction_leaf=0, n_jobs=3, random_state=1)

In [None]:
rfc.fit(X_train, y_train)

In [None]:
import pickle

# Save the model to a file
with open("rf5.pkl", "wb") as file:
    pickle.dump(rfc, file)

In [None]:
rfc_pred = rfc.predict(X_test)
print(classification_report(y_test, rfc_pred))
print(accuracy_score(y_test, rfc_pred) * 100)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

classifier = rfc

np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
titles_options = [
    ("Confusion matrix, without normalization", None),
    ("Normalized confusion matrix", "true"),
]
for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        classifier,
        X_test,
        y_test,
        # display_labels=class_names,
        cmap=plt.cm.Blues,
        normalize=normalize,
    )
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()

In [None]:
rfc.score(X_train, y_train)

In [None]:
feature_importance = pd.DataFrame({'Feature_names': X.columns, 'Importances': rfc.feature_importances_})
feature_importance = feature_importance.sort_values(by='Importances',ascending=False)
feature_importance

In [None]:
plt.bar(feature_importance['Feature_names'], feature_importance['Importances'])
plt.show()

In [None]:
country_importance = feature_importance.loc[lambda x: x['Feature_names'].str.contains("^country", regex = True)]
country_importance

In [None]:
state_importance = feature_importance.loc[lambda x: x['Feature_names'].str.contains("^state", regex = True)]
state_importance

In [None]:
age_importance = feature_importance.loc[lambda x: x['Feature_names'].str.contains("age", regex = True)]["Importances"]
print("age:", age_importance)
print("country_importance:", country_importance["Importances"].sum())
print("state_importance:", state_importance["Importances"].sum())