In [1]:
import os
import urllib
import zipfile

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

In [31]:
# Download the data from the GroupLens website
datapath = './data/ml-latest-small'

if not os.path.exists('./data'):
    os.makedirs('./data')
if not os.path.exists(datapath):
    url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
    urllib.request.urlretrieve(url,filename='data/ml-latest-small.zip')
    zip_ref = zipfile.ZipFile('data/ml-latest-small.zip', 'r')
    zip_ref.extractall('data/')
    zip_ref.close()

# Load data
ratings = pd.read_csv(os.path.join(datapath,'ratings.csv'))
movies = pd.read_csv(os.path.join(datapath,'movies.csv'))
ratings = ratings.merge(movies,on='movieId')
ratings = ratings[['userId','movieId','genres','rating']]
ratings['genres'] = ratings['genres'].apply(lambda x: x.replace('|',' '))
ratings.head()

Unnamed: 0,userId,movieId,genres,rating
0,1,1,Adventure Animation Children Comedy Fantasy,4.0
1,5,1,Adventure Animation Children Comedy Fantasy,4.0
2,7,1,Adventure Animation Children Comedy Fantasy,4.5
3,15,1,Adventure Animation Children Comedy Fantasy,2.5
4,17,1,Adventure Animation Children Comedy Fantasy,4.5


In [61]:
final_dataframe = get_final_df()
final_dataframe

Unnamed: 0,user_id_gr,isbn_gr,rating_bx,rating_gr
0,1,038572179X,2.786517,3
17750,1,038549081X,4.117647,3
25293,1,031242227X,3.546584,2
29017,1,1400032717,4.813187,3
38052,1,014023313X,2.436464,3
...,...,...,...,...
59977,53423,043965548X,3.533333,5
154513,53423,014038572X,4.460674,5
222792,53423,1567921892,4.500000,4
262957,53424,043933909X,10.000000,5


In [62]:
df.to_pickle('data/processed/genre_df.pkl')

In [60]:
def get_final_df():
    ref_path = 'data/processed/final_dataframe.pkl'
    final_dataframe = pd.read_pickle(ref_path)
    return final_dataframe

In [3]:
df = get_final_df()

In [4]:
df.head()

Unnamed: 0,user_id_gr,isbn_gr,rating_bx,rating_gr
0,1,038572179X,2.786517,3
17750,1,038549081X,4.117647,3
25293,1,031242227X,3.546584,2
29017,1,1400032717,4.813187,3
38052,1,014023313X,2.436464,3


In [5]:
import requests

def get_genres_for_isbn(isbn):
    api_url = f"https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn}"
    
    response = requests.get(api_url)
    data = response.json()

    if 'items' in data:
        item = data['items'][0]
        if 'volumeInfo' in item:
            volume_info = item['volumeInfo']
            if 'categories' in volume_info:
                genres = volume_info['categories']
                return genres
    
    return []

In [6]:
from tqdm import tqdm 
genres = {}

for isbn in tqdm(df['isbn_gr'].unique()):
    genres[isbn] = get_genres_for_isbn(isbn)

  0%|          | 0/397 [00:00<?, ?it/s]

100%|██████████| 397/397 [00:54<00:00,  7.24it/s]


In [7]:
for key in genres:
    if genres[key] is None or len(genres[key]) == 0:
        genres[key] = ['null']
    genres[key] = genres[key][0]

In [8]:
df['genres'] = df['isbn_gr'].map(genres)

In [33]:
books = df[~df['isbn_gr'].duplicated(keep='first')]
books.head()

In [35]:
vec = CountVectorizer()
genres_vec = vec.fit_transform(books['isbn_gr'].unique())
genres_vectorized = pd.DataFrame(genres_vec.todense(),columns=vec.get_feature_names_out(),index=books.isbn_gr)
genres_vectorized.head()

Unnamed: 0_level_0,006000150x,006000942x,006008216x,006008460x,006054094x,006056668x,006074068x,006091307x,006093140x,006093316x,...,3822859710,4770019572,4770020678,8173711461,8370540791,843391426x,8466302948,8489163413,8807813025,9725681363
isbn_gr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
038572179X,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
038549081X,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
031242227X,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1400032717,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
014023313X,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
csmatrix = cosine_similarity(genres_vec)
csmatrix = pd.DataFrame(csmatrix,columns=books.isbn_gr,index=books.isbn_gr)
csmatrix.head()

isbn_gr,038572179X,038549081X,031242227X,1400032717,014023313X,140003468X,1594480001,1878424505,043965548X,159184021X,...,1892213753,043933909X,3822859710,3822812153,1561483397,1569314063,1570820872,8807813025,067179437X,159307056X
isbn_gr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
038572179X,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
038549081X,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
031242227X,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1400032717,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
014023313X,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
ratings.head()

Unnamed: 0,userId,movieId,genres,rating
0,1,1,Adventure Animation Children Comedy Fantasy,4.0
1,5,1,Adventure Animation Children Comedy Fantasy,4.0
2,7,1,Adventure Animation Children Comedy Fantasy,4.5
3,15,1,Adventure Animation Children Comedy Fantasy,2.5
4,17,1,Adventure Animation Children Comedy Fantasy,4.5


In [37]:
df.head()

Unnamed: 0,user_id_gr,isbn_gr,rating_bx,rating_gr,genres
0,1,038572179X,2.786517,3,Fiction
17750,1,038549081X,4.117647,3,Fiction
25293,1,031242227X,3.546584,2,Biography & Autobiography
29017,1,1400032717,4.813187,3,Fiction
38052,1,014023313X,2.436464,3,Diaries


In [39]:
X = df.drop(labels=['rating_gr','genres', 'rating_bx'],axis=1)
y = df['rating_gr']
X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=0, test_size=0.2)

In [49]:
def predict_rating(user_item_pair,simtable=csmatrix,X_train=X_train, y_train=y_train):
    movie_to_rate = user_item_pair['isbn_gr']
    user = user_item_pair['user_id_gr']
    movies_watched = X_train.loc[X_train['user_id_gr']==user, 'isbn_gr'].tolist()
    if movies_watch.empty:
        continue
    simtable_filtered = simtable.loc[movie_to_rate,movies_watched]
    most_similar_watched = simtable_filtered.index[np.argmax(simtable_filtered)]
    idx = X_train.loc[(X_train['user_id_gr']==user) & (X_train['isbn_gr']==most_similar_watched)].index.values[0]
    most_similar_rating = y_train.loc[idx]
    return most_similar_rating

SyntaxError: 'continue' not properly in loop (1439193950.py, line 7)

In [None]:
for value1, value2 in zip(X_val['user_id_gr'], X_val['isbn_gr']):

In [50]:
ratings_valset = X_val.apply(lambda x: predict_rating(x),axis=1)
val_rmse = np.sqrt(mean_squared_error(y_val,ratings_valset))
print('RMSE of predicted ratings is {:.3f}'.format(val_rmse))

51028
11692
53266
23316
39852
51885
10214
2515
45759
32456
2037
28508
28313
5274
34925


ValueError: attempt to get argmax of an empty sequence

In [54]:
def predict_new_pair_rating(user,movie,simtable=csmatrix,X_train=X_train, y_train=y_train):
    # Filter similarity matrix to only movies already reviewed by user
    movies_watched = X_train.loc[X_train['user_id_gr']==user, 'isbn_gr'].tolist()
    simtable_filtered = simtable.loc[movie,movies_watched]
    # Get the most similar movie already watched to current movie to rate
    most_similar_watched = simtable_filtered.index[np.argmax(simtable_filtered)]
    # Get user's rating for most similar movie
    idx = X_train.loc[(X_train['user_id_gr']==user) & (X_train['isbn_gr']==most_similar_watched)].index.values[0]
    most_similar_rating = y_train.loc[idx]
    return most_similar_rating

rating = predict_new_pair_rating(5,'038572179X')
print('Predicted rating is {:.1f}'.format(rating))

Predicted rating is 4.0


In [55]:
books

Unnamed: 0,user_id_gr,isbn_gr,rating_bx,rating_gr,genres
0,1,038572179X,2.786517,3,Fiction
17750,1,038549081X,4.117647,3,Fiction
25293,1,031242227X,3.546584,2,Biography & Autobiography
29017,1,1400032717,4.813187,3,Fiction
38052,1,014023313X,2.436464,3,Diaries
...,...,...,...,...,...
263397,7243,1569314063,10.000000,3,
263565,7949,1570820872,4.625000,4,
263774,8172,8807813025,3.893617,4,
263846,9280,067179437X,1.250000,4,


In [70]:
def generate_recommendations(user,simtable,df):
    user_ratings = df.loc[df['user_id_gr']==user]
    user_ratings = user_ratings.sort_values(by='rating_bx',axis=0,ascending=False)
    topratedbook = user_ratings.iloc[0,:]['isbn_gr']
    sims = simtable.loc[topratedbook,:]
    mostsimilar = sims.sort_values(ascending=False).index.values
    mostsimilar = mostsimilar[0:3]
    return mostsimilar

In [71]:


# Get recommendations for a random user
user = 5
recs = generate_recommendations(user,simtable=csmatrix,df=df)
recs

array(['067973225X', '038572179X', '3426619148'], dtype=object)

In [1]:
from scripts.content_filtering import ContentFilter

In [2]:
cf = ContentFilter()

In [3]:
recs = cf.generate_recommendations()


In [4]:
recs

array(['140003468X', '038572179X', '1584230703'], dtype=object)

In [5]:
cf.get_aty_recs()

(['Gabriel García Márquez, Edith Grossman',
  'Ian McEwan',
  'Marshall McLuhan, Quentin Fiore, Jerome Agel'],
 ['Love in the Time of Cholera', 'Atonement', 'The Medium is the Massage'],
 [1985.0, 2001.0, 1967.0])

In [17]:
from scripts.hybrid_filter_recommender import HybridFilterRecommender
import pandas as pd
import torch
def get_final_df():
    ref_path = 'data/processed/final_dataframe.pkl'
    final_dataframe = pd.read_pickle(ref_path)
    return final_dataframe
hfr = HybridFilterRecommender(get_final_df())

In [23]:
books = isbn_df['title_gr']
selected_books = books.iloc[8:10]
selected_books

44124    Harry Potter and the Prisoner of Azkaban (Harr...
43952    Purple Cow: Transform Your Business by Being R...
Name: title_gr, dtype: object