### LIB & CONNECTOR

In [1]:
from IPython.display import clear_output
import pandas as pd
import numpy as np
import sqlalchemy as sa
import mysql.connector
import getpass

In [2]:
import math
import numpy as np
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
from nltk import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings("ignore")

In [3]:
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from hdbscan import HDBSCAN

In [9]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [4]:
HOST = 'localhost'
USER = 'root'
DATABASE = 'goodreads'
PASSWORD = getpass.getpass(f'Enter password for {USER}: ')

In [5]:
def getconn():
    conn = mysql.connector.connect(
        host=HOST,
        user=USER,
        password=PASSWORD,
        database=DATABASE
    )
    return conn

pool = sa.create_engine(
    "mysql+mysqlconnector://",
    creator=getconn,
)

with pool.connect() as db_conn:
    results = db_conn.execute(sa.text("SELECT NOW()")).fetchone()
    print("Current time: ", results[0])

Current time:  2023-06-04 15:00:39


### DATA

In [38]:
books_query = sa.text(
    "SELECT * FROM books;"
)
books_df = pd.read_sql_query(books_query, con=pool.connect())

In [6]:
query = sa.text(
    "SELECT * FROM processed_description;"
)
descriptions_df = pd.read_sql_query(query, con=pool.connect())
descriptions = descriptions_df.processed_descr.tolist()

In [7]:
user_id_query = sa.text(
    "SELECT DISTINCT(user_id) FROM new_ratings_;"
)
user_id_df = pd.read_sql_query(user_id_query, con=pool.connect())
user_id_list = user_id_df.user_id.tolist()

In [8]:
query = sa.text(
    "SELECT user_id, goodreads_book_id, (rating * 0.2) as rating from new_ratings_;"
)
user_ratings = pd.read_sql_query(query, con=pool.connect())

### MODEL

In [10]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(descriptions)]
model = Doc2Vec(documents, vector_size=100, window=5, min_count=2, workers=4, epochs=5)

In [18]:
docvecs = []
for i in range(0, 9814):
    docvecs.append(model.docvecs[i])

In [23]:
ids = descriptions_df.goodreads_book_id.tolist()
indices = pd.Series(descriptions_df.index, index=ids)

In [21]:
sim_matrix = cosine_similarity(docvecs)

In [46]:
idx = indices[4214]
sim_scores = list(enumerate(sim_matrix[idx]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
# sim_scores = sim_scores[1:10]
item_indices = [i[0] for i in sim_scores[1:3]]

# GET RECOMMENDS
recommends = books_df.loc[books_df.goodreads_book_id == '4214']
items = descriptions_df.iloc[item_indices].goodreads_book_id.tolist()
for index in items:
    # query = sa.text(
    #     f"SELECT * FROM books WHERE goodreads_book_id = {index};"
    # )
    # temp = pd.read_sql_query(query, con=pool.connect())
    temp = books_df.loc[books_df.goodreads_book_id == str(index)]
    recommends = pd.concat([recommends, temp])
display(recommends)

Unnamed: 0,goodreads_book_id,isbn,isbn13,original_publication_year,original_title,title,language_code,average_rating,ratings_count,description
44,4214,770430074,9780770430080.0,2001.0,Life of Pi,Life of Pi,eng,3.88,1003228,Life of Pi is a fantasy adventure novel by Yan...
9091,16762211,1481268503,9781481268510.0,2012.0,Sometimes Never,"Sometimes Never (Sometimes Never, #1)",eng,3.94,15306,Hope didn't have the best role model when it c...
8025,19307,670557110,9780670557110.0,1948.0,Pippi Långstrump i Söderhavet,Pippi in the South Seas,eng,4.1,13911,"""Any reappearance of the irrepressible Pippi L..."


In [110]:
user_1 = user_1.sort_values(by=['rating'], ascending=False)
user_1.index = user_1.goodreads_book_id
user_1.drop(columns=['goodreads_book_id'], inplace=True)

In [147]:
rated_items = user_1.index

In [159]:
rated_recommend = [str(item) for item in item_indices if str(item) in rated_items]

In [160]:
user_1.loc[rated_recommend]

Unnamed: 0_level_0,rating
goodreads_book_id,Unnamed: 1_level_1
7126,0.6
7728,1.0
7603,0.6
7244,0.8
7144,1.0
320,0.2
9712,0.8
5826,0.4
5297,0.8
5064,0.8
