### IMPORT LIBRARIES & DATA

In [None]:
import pandas as pd
import mysql.connector
import sqlalchemy as sa
import getpass

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')

ENGLISH_WORDS = set(words.words())

In [24]:
import numpy as np

In [2]:
import pickle

In [3]:
from setup import setup
pool = setup()

Current time:  2023-06-08 13:42:35


In [4]:
books_query = sa.text(
    "SELECT * FROM books;"
)

books_df = pd.read_sql_query(books_query, con=pool.connect())
display(books_df.head(1))
display(len(books_df))

Unnamed: 0,goodreads_book_id,isbn,isbn13,original_publication_year,original_title,title,language_code,average_rating,ratings_count,description
0,2767052,439023483,9780439023480.0,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,"Could you survive on your own in the wild, wit..."


9814

### MODEL

In [5]:
class tfidf_recommender:
    def __init__(self, data, id, docs):
        self.data = data
        self.id = id
        self.id_data = data[id]
        self.docs_data = data[docs]
        self.tfidf = None
        self.tfidf_vectors = None
        self.indices = pd.Series(data.index, index=data[id])
        self.sim_matrix = None

    def preprocess_text(self, docs):
        lemmatizer = WordNetLemmatizer()
        stop_words = set(stopwords.words('english'))
        tokens_list = []
        for doc in docs:
            tokens = nltk.word_tokenize(doc)
            tokens = [token.lower() for token in tokens if token.lower() not in stop_words and token.isalpha() and token.lower() in ENGLISH_WORDS]        
            tokens = [lemmatizer.lemmatize(token) for token in tokens]
            tokens_list.append(tokens)
        return tokens_list

    def _get_tfidf_scores(self, docs):
        self.tfidf = TfidfVectorizer()
        self.tfidf_vectors = self.tfidf.fit_transform(docs)  
        self.sim_matrix = cosine_similarity(self.tfidf_vectors)  
        
    def get_tfidf_scores(self):
        return self.indices, self.tfidf_vectors

    def get_recommendations(self, id, num_recommends=5):
        idx = self.indices[id]
        sim_scores = list(enumerate(self.sim_matrix[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:num_recommends]
        item_indices = [i[0] for i in sim_scores]
        return self.data.iloc[item_indices]

    def fit(self):
        tokens_list = [[' '.join(doc)][0] for doc in self.preprocess_text(self.docs_data)]
        self._get_tfidf_scores(tokens_list)

In [6]:
# recommender = tfidf_recommender(books_df, 'goodreads_book_id', 'description')
# recommender.fit()

In [7]:
# recommender.get_recommendations('3', 10)

Unnamed: 0,goodreads_book_id,isbn,isbn13,original_publication_year,original_title,title,language_code,average_rating,ratings_count,description
3239,8,439682584,9780439682590.0,2003.0,eng,"Harry Potter Boxed Set, Books 1-5 (Harry Potte...",eng,4.77,33220,Box Set containing Harry Potter and the Sorcer...
22,15881,439064864,9780439064870.0,1998.0,Harry Potter and the Chamber of Secrets,Harry Potter and the Chamber of Secrets (Harry...,eng,4.37,1779331,Ever since Harry Potter had come home for the ...
8228,3130430,1416554955,9781416554950.0,2008.0,"Harry, A History: The True Story of a Boy Wiza...","Harry, a History: The True Story of a Boy Wiza...",eng,4.09,12308,THE HARRY POTTER BOOKS WERE JUST THE BEGINNING...
2079,99298,439249546,9780439249550.0,1999.0,Harry Potter Boxed Set Books 1-4,"The Harry Potter Collection 1-4 (Harry Potter,...",eng,4.66,43929,"The exciting tales of Harry Potter, the young ..."
17,5,043965548X,9780439655480.0,1999.0,Harry Potter and the Prisoner of Azkaban,Harry Potter and the Prisoner of Azkaban (Harr...,eng,4.53,1832823,"Harry Potter, along with his best friends, Ron..."
23,6,439139600,9780439139600.0,2000.0,Harry Potter and the Goblet of Fire,Harry Potter and the Goblet of Fire (Harry Pot...,eng,4.53,1753043,It is the summer holidays and soon Harry Potte...
6912,483445,042519891X,9780425198920.0,2001.0,The Magical Worlds of Harry Potter: A Treasury...,The Magical Worlds of Harry Potter: A Treasury...,eng,3.96,13820,An indispensable source guide to J. K. Rowling...
1836,10872085,031253955X,9780312539560.0,2011.0,Only Time Will Tell,"Only Time Will Tell (The Clifton Chronicles, #1)",eng,4.04,41187,\nFrom the internationally bestselling author ...
1383,91475,451461401,9780451461410.0,2007.0,White Night,"White Night (The Dresden Files, #9)",eng,4.41,73534,Wizard Harry Dresden must investigate his own ...


In [8]:
# indices, vectors = recommender.get_tfidf_scores()

In [21]:
# arr = vectors.toarray()
# arr

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
# precision = np.finfo(arr.dtype).precision
# longest_float = np.max(np.round(arr, decimals=precision))
# print(longest_float)

1.0


In [32]:
# np.finfo(arr.dtype).precision

15

In [25]:
# np.savetxt('./data/processed/tfidf_matrix.dat', arr, dtype='f', delimiter=':')

In [None]:
# from sklearn.decomposition import TruncatedSVD
# from sklearn.preprocessing import StandardScaler
# from scipy.sparse import csr_matrix

In [43]:
# scaler = StandardScaler(with_mean=False) 
# X_scaled = scaler.fit_transform(vectors)
# n_components = 5000  
# svd = TruncatedSVD(n_components=n_components)
# X_svd = svd.fit_transform(X_scaled)

In [44]:
# sim_matrix = cosine_similarity(X_svd)
# idx = indices['3']
# sim_scores = list(enumerate(sim_matrix[idx]))
# sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
# sim_scores = sim_scores[1:5]
# item_indices = [i[0] for i in sim_scores]
# books_df.iloc[item_indices]

### SAVE MODEL

In [9]:
# with open('tfidf_recommender.pkl', 'wb') as f:
#     pickle.dump(recommender, f)

In [11]:
# with open('./models/tfidf_recommender.pkl', 'rb') as f:
#     model = pickle.load(f)

# model.get_recommendations('18813642', 10)