### LIBS & CONNECTOR

In [1]:
from IPython.display import clear_output
import math
import numpy as np
import pandas as pd
import sqlalchemy as sa
import mysql.connector
import getpass
import warnings
warnings.filterwarnings("ignore")

In [2]:
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from hdbscan import HDBSCAN

In [None]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')

ENGLISH_WORDS = set(words.words())

In [3]:
HOST = 'localhost'
USER = 'root'
DATABASE = 'goodreads'
PASSWORD = getpass.getpass(f'Enter password for {USER}: ')

In [4]:
def getconn():
    conn = mysql.connector.connect(
        host=HOST,
        user=USER,
        password=PASSWORD,
        database=DATABASE
    )
    return conn

pool = sa.create_engine(
    "mysql+mysqlconnector://",
    creator=getconn,
)

with pool.connect() as db_conn:
    results = db_conn.execute(sa.text("SELECT NOW()")).fetchone()
    print("Current time: ", results[0])

Current time:  2023-06-07 10:28:19


### DATA

In [6]:
books_query = sa.text(
    "SELECT * FROM books;"
)
books_df = pd.read_sql_query(books_query, con=pool.connect())

### MODEL

In [83]:
class Doc2VecModel:
    def __init__(self, data, data_id, data_document):
        self.data = data
        self.data_id = data_id
        self.data_document = data_document
        self.documents = data[data_document]
        self.ids = data[data_id]
        self.indices = pd.Series(data.index, index=self.ids)
        self.model = None
        self.docvecs = []
        self.sim_matrix = None

    def preprocess_doc(self):
        lemmatizer = WordNetLemmatizer()
        stop_words = set(stopwords.words('english'))
        docs_list = []
        for doc in self.documents:
            tokens = word_tokenize(doc)
            tokens = [token.lower() for token in tokens if token.lower() not in stop_words and token.isalpha() and token.lower() in ENGLISH_WORDS]        
            tokens = [lemmatizer.lemmatize(token) for token in tokens]
            docs_list.append(' '.join(tokens))
        return docs_list

    def get_recommendations(self, book_id, num_recommends=5):
        idx = self.indices[book_id]
        sim_scores = list(enumerate(self.sim_matrix[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:num_recommends]
        item_indices = [i[0] for i in sim_scores]
        return self.data.iloc[item_indices]

    def get_docvecs(self):
        return self.indices, self.docvecs

    def fit(self):
        processed_docs = self.preprocess_doc()
        documents = [TaggedDocument(doc, [index]) for index, doc in enumerate(processed_docs)]
        self.model = Doc2Vec(documents, vector_size=50, window=5, min_count=3, workers=4, epochs=5)
        data_len = len(self.ids)
        for i in range(0, data_len):
            self.docvecs.append(self.model.docvecs[i])
        self.sim_matrix = cosine_similarity(self.docvecs)

In [89]:
book_id = 'goodreads_book_id'
description = 'description'
model = Doc2VecModel(books_df, book_id, description)
model.fit()
model.get_recommendations('3')

Unnamed: 0,goodreads_book_id,isbn,isbn13,original_publication_year,original_title,title,language_code,average_rating,ratings_count,description
7657,285212,425216594,9780425216590.0,2007.0,On the Prowl,"On the Prowl (Alpha & Omega, #0.5)",eng,4.05,21757,Alpha and Omega 0.5 by Patricia BriggsThe were...
7948,33293,393324869,9780393324860.0,2002.0,Naked Economics: Undressing the Dismal Science,Naked Economics: Undressing the Dismal Science,eng,4.0,5646,Naked Economics makes up for all of those Econ...
7524,37100,743451791,9780743451800.0,1990.0,Devil in a Blue Dress,"Devil in a Blue Dress (Easy Rawlins, #1)",eng,3.91,12969,"In Los Angeles of the late 1940s, Easy Rawlins..."
7556,439363,1421513439,9781421513430.0,2001.0,ラブ★コン 1,"Love★Com, Vol. 1",eng,4.22,13801,"Risa Koizumi is the tallest girl in class, and..."


In [94]:
import pickle

with open('./models/doc2vec.pkl', 'wb') as f:
    pickle.dump(model, f)

In [95]:
with open('./models/doc2vec.pkl', 'rb') as f:
    test_model = pickle.load(f)


In [97]:
test_model.get_recommendations('731804')

Unnamed: 0,goodreads_book_id,isbn,isbn13,original_publication_year,original_title,title,language_code,average_rating,ratings_count,description
9509,17487,340796154,9780340796150.0,1943.0,Five Go Adventuring Again,"Five Go Adventuring Again (Famous Five, #2)",eng,3.99,10362,There's a thief at Kirrin Cottage! The Famous ...
8899,348564,60097914,9780060097910.0,1964.0,Flat Stanley,"Flat Stanley (Flat Stanley, #1)",eng,3.82,11078,"When Stanley Lambchop wakes up one morning, hi..."
8792,633270,140376410,9780140376420.0,1994.0,"The Ear, the Eye, and the Arm","The Ear, the Eye, and the Arm",eng,3.85,12389,General Matsika's children steal out of the ho...
8495,232958,439673631,9780439673630.0,2005.0,I Love You Through And Through,I Love You Through and Through,eng,4.39,10808,A rhyming story of unconditional love with ado...


In [98]:
books_df.loc[indices['731804']]

goodreads_book_id                                                       731804
isbn                                                                  60776390
isbn13                                                         9780060776400.0
original_publication_year                                               2006.0
original_title                                                    Pinkalicious
title                                                             Pinkalicious
language_code                                                              eng
average_rating                                                            4.05
ratings_count                                                            16585
description                  This sparkling New York Times bestselling pict...
Name: 6198, dtype: object