### LIBRARIES & DATABASE CONNECTION

In [1]:
from IPython.display import clear_output
import pandas as pd
import numpy as np
import sqlalchemy as sa
import mysql.connector
import getpass

In [2]:
import math
import numpy as np
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
from nltk import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import warnings
# Silences ALL warnings for the rest of the kernel session (no category or
# module filter is given), including deprecation notices raised by the
# libraries imported above. NOTE(review): consider narrowing this to specific
# categories so genuine problems are not hidden.
warnings.filterwarnings("ignore")

In [3]:
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from hdbscan import HDBSCAN

In [4]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [5]:
# Database connection settings (host, account, schema). The password is asked
# for at run time instead of being written into the notebook.
HOST, USER, DATABASE = 'localhost', 'root', 'goodreads'
PASSWORD = getpass.getpass(prompt=f'Enter password for {USER}: ')

In [6]:
def getconn():
    """Open and return a new MySQL connection.

    The connection parameters are taken from the module-level HOST, USER,
    PASSWORD and DATABASE values at the moment of the call.
    """
    settings = {
        "host": HOST,
        "user": USER,
        "password": PASSWORD,
        "database": DATABASE,
    }
    return mysql.connector.connect(**settings)

# SQLAlchemy engine whose connections are produced by getconn(); the URL only
# selects the dialect/driver.
pool = sa.create_engine("mysql+mysqlconnector://", creator=getconn)

# Smoke test: ask the server for its current time and print it.
with pool.connect() as db_conn:
    now_stmt = sa.text("SELECT NOW()")
    results = db_conn.execute(now_stmt).fetchone()
    print("Current time: ", results[0])

Current time:  2023-06-07 10:19:10


### DATA

In [7]:
# Load the whole `books` table into a DataFrame.
books_query = sa.text(
    "SELECT * FROM books;"
)
# Use a context-managed connection: passing `pool.connect()` inline checks a
# connection out of the engine and never closes it.
with pool.connect() as db_conn:
    books_df = pd.read_sql_query(books_query, con=db_conn)

In [8]:
# Load the `processed_description` table.
query = sa.text(
    "SELECT * FROM processed_description;"
)
# Use a context-managed connection: passing `pool.connect()` inline checks a
# connection out of the engine and never closes it.
with pool.connect() as db_conn:
    descriptions_df = pd.read_sql_query(query, con=db_conn)

# Parallel lists taken from the same frame: descriptions[i] and book_ids[i]
# come from the same row.
descriptions = descriptions_df.processed_descr.tolist()
book_ids = descriptions_df.goodreads_book_id.tolist()

In [9]:
# Collect the distinct user ids that appear in `new_ratings_`.
user_id_query = sa.text(
    "SELECT DISTINCT(user_id) FROM new_ratings_;"
)
# Use a context-managed connection: passing `pool.connect()` inline checks a
# connection out of the engine and never closes it.
with pool.connect() as db_conn:
    user_id_df = pd.read_sql_query(user_id_query, con=db_conn)
user_id_list = user_id_df.user_id.tolist()

In [10]:
# Load every (user, book, rating) row; the rating is multiplied by 0.2 in SQL.
query = sa.text(
    "SELECT user_id, goodreads_book_id, (rating * 0.2) as rating from new_ratings_;"
)
# Use a context-managed connection: passing `pool.connect()` inline checks a
# connection out of the engine and never closes it.
with pool.connect() as db_conn:
    user_ratings = pd.read_sql_query(query, con=db_conn)

### MODEL

In [11]:
def _to_tokens(doc):
    """Return `doc` as a list of word tokens.

    TaggedDocument expects a list of words. A plain string would be iterated
    character by character, so strings are split on whitespace here. Anything
    that is not a string (e.g. an already tokenised list) is passed through
    unchanged.
    """
    return doc.split() if isinstance(doc, str) else doc


# Tag each description with its position in `descriptions`, so the trained
# vector for descriptions[i] is looked up with tag i.
documents = [
    TaggedDocument(_to_tokens(doc), [index])
    for index, doc in enumerate(descriptions)
]
# NOTE(review): no `seed` is set and workers=4, so vectors differ between runs.
model = Doc2Vec(documents, vector_size=50, window=5, min_count=3, workers=4, epochs=5)

In [12]:
# Collect one trained vector per description, in the same order as
# `descriptions` (documents were tagged with their list position).
# The count comes from the data rather than a hard-coded row total, so the cell
# keeps working when the table grows or shrinks.
# gensim 4 exposes document vectors as `model.dv`; older releases only have
# `model.docvecs`.
_doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs
docvecs = [_doc_vectors[i] for i in range(len(descriptions))]

In [15]:
# ids = descriptions_df.goodreads_book_id.tolist()
# indices = pd.Series(descriptions_df.index, index=ids)
# sim_matrix = cosine_similarity(docvecs)
# idx = indices[4214]
# sim_scores = list(enumerate(sim_matrix[idx]))
# sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
# # sim_scores = sim_scores[1:10]
# item_indices = [i[0] for i in sim_scores[1:3]]

# # GET RECOMMENDS
# recommends = books_df.loc[books_df.goodreads_book_id == '4214']
# items = descriptions_df.iloc[item_indices].goodreads_book_id.tolist()
# for index in items:
#     # query = sa.text(
#     #     f"SELECT * FROM books WHERE goodreads_book_id = {index};"
#     # )
#     # temp = pd.read_sql_query(query, con=pool.connect())
#     temp = books_df.loc[books_df.goodreads_book_id == str(index)]
#     recommends = pd.concat([recommends, temp])
# display(recommends)

In [13]:
# Map each book id to its position in `book_ids` (for a repeated id the last
# position wins).
ids_dict = dict(zip(book_ids, range(len(book_ids))))

In [15]:
# Build user 1's profile: one row per rated book, one column per embedding
# dimension.
user_1 = user_ratings.loc[user_ratings.user_id == '1']
user_1_books = user_1.goodreads_book_id.tolist()
# Ids are cast to int before the lookup (presumably ids_dict is keyed by int
# while the ratings table yields strings; verify against the schema).
# `book_id` is used instead of `id` so the builtin is not shadowed.
user_1_vecs = [docvecs[ids_dict[int(book_id)]] for book_id in user_1_books]
# Column count follows the trained model instead of a hard-coded 50.
user_1_profile = pd.DataFrame(user_1_vecs, columns=range(model.vector_size))

In [17]:
# Attach the ratings positionally. `user_1` keeps the row labels it had in
# `user_ratings`, while `user_1_profile` has a fresh 0..n-1 index; assigning the
# Series directly would align on those labels and produce NaN or mismatched
# ratings whenever user 1's rows are not the first n rows of the table.
user_1_profile['rating'] = user_1.rating.to_numpy()
user_1_profile.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,rating
0,0.01424,-0.064749,-0.08857,0.078107,-0.016659,0.028966,-0.019114,0.003012,-0.034805,-0.004169,...,0.045016,0.01231,0.037559,0.233561,0.031068,0.13257,0.050284,0.106182,-0.072444,0.8
1,0.008835,-0.138932,-0.05866,0.022575,-0.009486,0.062698,-0.004848,0.08297,-0.005111,0.038003,...,0.053243,-0.049654,-0.039993,0.16586,0.066175,0.134577,0.06602,0.032832,-0.065835,0.6
2,-0.010517,-0.113985,0.005687,0.096157,0.002216,0.111049,-0.052219,0.033234,0.080375,0.01342,...,-0.002338,-0.033754,0.050848,0.176285,-0.028354,0.132166,-0.001133,0.097015,-0.041196,0.6
3,-0.035552,-0.05167,-0.064328,0.120816,0.015242,0.039946,-0.010479,0.042313,-0.077645,0.003459,...,0.012973,-0.004381,0.034311,0.20107,0.071518,0.102538,-0.006899,0.036028,-0.107303,0.8
4,-0.175414,-0.178755,-0.091183,0.185268,-0.062459,0.079219,-0.056909,0.163982,-0.165086,-0.017532,...,-0.092319,-0.083106,-0.029026,0.493535,0.11501,0.276578,-0.020296,0.156983,-0.127958,0.6
