In [3]:
import os
import urllib
import zipfile

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

In [1]:
def get_final_df():
    ref_path = 'data/processed/final_dataframe.pkl'
    final_dataframe = pd.read_pickle(ref_path)
    return final_dataframe

In [5]:
df = get_final_df()
df.head()

Unnamed: 0,user_id_gr,isbn_gr,rating_bx,rating_gr
0,1,038572179X,2.786517,3
17750,1,038549081X,4.117647,3
25293,1,031242227X,3.546584,2
29017,1,1400032717,4.813187,3
38052,1,014023313X,2.436464,3


In [6]:
import requests

def get_genres_for_isbn(isbn):
    api_url = f"https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn}"
    
    response = requests.get(api_url)
    data = response.json()

    if 'items' in data:
        item = data['items'][0]
        if 'volumeInfo' in item:
            volume_info = item['volumeInfo']
            if 'categories' in volume_info:
                genres = volume_info['categories']
                return genres
    
    return []

In [7]:
from tqdm import tqdm 
genres = {}

for isbn in tqdm(df['isbn_gr'].unique()):
    genres[isbn] = get_genres_for_isbn(isbn)

100%|██████████| 397/397 [00:55<00:00,  7.09it/s]


In [8]:
for key in genres:
    if genres[key] is None or len(genres[key]) == 0:
        genres[key] = ['null']
    genres[key] = genres[key][0]

In [9]:
df['genres'] = df['isbn_gr'].map(genres)

In [10]:
books = df[~df['isbn_gr'].duplicated(keep='first')]
books.head()

Unnamed: 0,user_id_gr,isbn_gr,rating_bx,rating_gr,genres
0,1,038572179X,2.786517,3,Fiction
17750,1,038549081X,4.117647,3,Fiction
25293,1,031242227X,3.546584,2,Biography & Autobiography
29017,1,1400032717,4.813187,3,Fiction
38052,1,014023313X,2.436464,3,Diaries


In [11]:
vec = CountVectorizer()
genres_vec = vec.fit_transform(books['isbn_gr'].unique())
genres_vectorized = pd.DataFrame(genres_vec.todense(),columns=vec.get_feature_names_out(),index=books.isbn_gr)
genres_vectorized.head()

Unnamed: 0_level_0,006000150x,006000942x,006008216x,006008460x,006054094x,006056668x,006074068x,006091307x,006093140x,006093316x,...,3822859710,4770019572,4770020678,8173711461,8370540791,843391426x,8466302948,8489163413,8807813025,9725681363
isbn_gr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
038572179X,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
038549081X,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
031242227X,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1400032717,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
014023313X,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
csmatrix = cosine_similarity(genres_vec)
csmatrix = pd.DataFrame(csmatrix,columns=books.isbn_gr,index=books.isbn_gr)
csmatrix.head()

isbn_gr,038572179X,038549081X,031242227X,1400032717,014023313X,140003468X,1594480001,1878424505,043965548X,159184021X,...,1892213753,043933909X,3822859710,3822812153,1561483397,1569314063,1570820872,8807813025,067179437X,159307056X
isbn_gr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
038572179X,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
038549081X,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
031242227X,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1400032717,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
014023313X,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
X = df.drop(labels=['rating_gr','genres', 'rating_bx'],axis=1)
y = df['rating_gr']
X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=0, test_size=0.2)

In [17]:
X.head()

Unnamed: 0,user_id_gr,isbn_gr
0,1,038572179X
17750,1,038549081X
25293,1,031242227X
29017,1,1400032717
38052,1,014023313X


In [33]:
from tqdm import tqdm
import numpy as np

# Create a dictionary to map users to their read books
user_books = X_train.groupby('user_id_gr')['isbn_gr'].apply(list).to_dict()

preds = []

for user, book in tqdm(zip(X_val['user_id_gr'], X_val['isbn_gr']), total=len(X_val)):
    books_read = user_books.get(user, [])  # Get books read by the user
    if books_read:
        simtable_filtered = simtable.loc[book, books_read]
        most_similar_read = simtable_filtered.idxmax()
        idx = X_train.loc[(X_train['user_id_gr'] == user) & (X_train['isbn_gr'] == most_similar_read)].index.values
        if len(idx) > 0:
            most_similar_rating = y_train.loc[idx[0]]
            preds.append(most_similar_rating)
        else:
            preds.append(None)  # Handle case where no matching record is found
    else:
        preds.append(None)  # Handle case where no books are read by the user


100%|██████████| 52822/52822 [06:09<00:00, 143.03it/s]


In [25]:
def predict_rating(user_item_pair,simtable=csmatrix,X_train=X_train, y_train=y_train):
    book = user_item_pair['isbn_gr']
    user = user_item_pair['user_id_gr']
    books_read = X_train.loc[X_train['user_id_gr']==user, 'isbn_gr'].tolist()
    simtable_filtered = simtable.loc[book,books_read]
    most_similar_read= simtable_filtered.index[np.argmax(simtable_filtered)]
    idx = X_train.loc[(X_train['user_id_gr']==user) & (X_train['isbn_gr']==most_similar_read)].index.values[0]
    most_similar_rating = y_train.loc[idx]
    return most_similar_rating

In [38]:
import pickle
pickle_file_path = 'data/processed/content_filter_preds.pkl'

# Save the list to the pickle file
with open(pickle_file_path, 'wb') as pickle_file:
    pickle.dump(preds, pickle_file)

In [47]:
import numpy as np
preds = np.array([x if x is not None else np.nan for x in preds])
valid_indices = ~np.isnan(preds)

# Calculate MSE using valid predictions
mse = mean_squared_error(y_val[valid_indices], preds[valid_indices])
list_range = max(y_val) - min(y_val) + 1
print(list_range)
mse

6


1.5493558036325246

In [55]:
y_val[valid_indices]

168066    3
120740    3
129716    3
259945    4
165745    4
         ..
9517      5
203901    3
216243    4
204862    5
197648    4
Name: rating_gr, Length: 51149, dtype: int64

In [57]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

accuracy = accuracy_score(y_val[valid_indices], preds[valid_indices])

# Calculate recall (true positive rate)
recall = recall_score(y_val[valid_indices], preds[valid_indices], average = 'micro')

# Calculate precision
precision = precision_score(y_val[valid_indices], preds[valid_indices], average = 'micro')

print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)

Accuracy: 0.3841717335627285
Recall: 0.3841717335627285
Precision: 0.3841717335627285


In [58]:
from scripts.hybrid_filter_recommender import HybridFilterRecommender
hfr = HybridFilterRecommender(df)

In [66]:
import torch
model_dir = 'models/hybrid_recommender.pkl'
model = torch.load(model_dir)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [67]:
df = hfr.process_df()
df.drop(columns = ['genres'], inplace = True)
df = df.astype(int)
X = df.loc[:, ['user_id_gr', 'isbn_gr','rating_bx']]
y = df.loc[:, ['rating_gr']]
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0, test_size=0.2)

In [68]:
preds = []
for userid, isbn, bxrating in tqdm(zip(X_val['user_id_gr'],X_val['isbn_gr'],X_val['rating_bx']), total=len(X_val)):
    pred = hfr.predict_rating(model = model, userid = userid, isbn = isbn, bxrating = bxrating, device = device)
    preds.append(pred)

100%|██████████| 52822/52822 [00:06<00:00, 7932.62it/s]


In [75]:
pred_list = [int(tensor.item()) for tensor in preds]

In [78]:
mse = mean_squared_error(y_val, pred_list)

accuracy = accuracy_score(y_val, pred_list)

# Calculate recall (true positive rate)
recall = recall_score(y_val, pred_list, average = 'micro')

# Calculate precision
precision = precision_score(y_val, pred_list, average = 'micro')
print("MSE:", mse)
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)

MSE: 1.949433947976222
Accuracy: 0.21066979667562757
Recall: 0.21066979667562757
Precision: 0.21066979667562757
