In [3]:
import pandas as pd
import mysql.connector
import sqlalchemy as sa
import getpass

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')

ENGLISH_WORDS = set(words.words())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [4]:
def getconn():
    conn = mysql.connector.connect(
        host="localhost",
        user="root",
        password=getpass.getpass(),
        database="goodreads"
    )
    return conn

pool = sa.create_engine(
    "mysql+mysqlconnector://",
    creator=getconn,
)

with pool.connect() as db_conn:
    results = db_conn.execute(sa.text("SELECT NOW()")).fetchone()
    print("Current time: ", results[0])

Current time:  2023-05-09 14:51:01


In [5]:
books_query = sa.text(
    "SELECT * FROM books;"
)

books_df = pd.read_sql_query(books_query, con=pool.connect())
display(books_df.head(1))
display(len(books_df))

Unnamed: 0,goodreads_book_id,work_id,books_count,isbn,isbn13,original_publication_year,original_title,title,language_code,average_rating,ratings_count,description
0,2767052,2792775,272,439023483,9780439023480.0,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,"Could you survive on your own in the wild, wit..."


9814

In [7]:
class tfidf_recommender:
    def __init__(self, data, id, docs):
        self.data = data
        self.id = id
        self.id_data = data[id]
        self.docs_data = data[docs]
        self.tfidf = None
        self.tfidf = None
        self.indices = pd.Series(data.index, index=data[id])
        self.cosine_sim = None

    def preprocess_text(self, docs):
        lemmatizer = WordNetLemmatizer()
        stop_words = set(stopwords.words('english'))
        tokens_list = []
        for doc in docs:
            tokens = nltk.word_tokenize(doc)
            tokens = [token.lower() for token in tokens if token.lower() not in stop_words and token.isalpha() and token.lower() in ENGLISH_WORDS]        
            tokens = [lemmatizer.lemmatize(token) for token in tokens]
            tokens_list.append(tokens)
        return tokens_list

    def get_tfidf_scores(self, docs):
        self.tfidf = TfidfVectorizer()
        self.tfidf_matrix = self.tfidf.fit_transform(docs)  
        self.cosine_sim = cosine_similarity(self.tfidf_matrix)  
        
    def get_recommendations(self, id, num_recommends=5):
        idx = self.indices[id]
        sim_scores = list(enumerate(self.cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:num_recommends]
        item_indices = [i[0] for i in sim_scores]
        return self.data[self.id].iloc[item_indices]

    def fit(self):
        tokens_list = [[' '.join(doc)][0] for doc in self.preprocess_text(self.docs_data)]
        self.get_tfidf_scores(tokens_list)

In [8]:
import pickle
with open('tfidf_recommender.pkl', 'rb') as f:
    model = pickle.load(f)

model.get_recommendations('18813642', 10)

2546       19596
6198      731804
5581    25065629
1060       29209
4292    16101018
1771       99300
5052     6449290
5359      225545
5826      171020
Name: goodreads_book_id, dtype: object