### IMPORT LIBRARIES

In [11]:
import pandas as pd
import numpy as np
import sqlalchemy as sa
import mysql.connector
import getpass

In [21]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

### CREATE CONNECTION

In [None]:
HOST = 'localhost'
USER = 'root'
DATABASE = 'goodreads'

In [7]:
def getconn():
    conn = mysql.connector.connect(
        host="localhost",
        user="root",
        password=getpass.getpass(f'Enter password for root: '),
        database="goodreads"
    )
    return conn

In [12]:
pool = sa.create_engine(
    "mysql+mysqlconnector://",
    creator=getconn,
)

with pool.connect() as db_conn:
    results = db_conn.execute(sa.text("SELECT NOW()")).fetchone()
    print("Current time: ", results[0])

Current time:  2023-05-08 19:40:14


### IMPORT DATA

In [18]:
id_descr_query = sa.text(
    "SELECT goodreads_book_id, description "
    "FROM books;"
)

In [19]:
id_descr_df = pd.read_sql_query(id_descr_query, con=pool.connect())
id_descr_df.head()

Unnamed: 0,goodreads_book_id,description
0,2767052,could survive wild every one make sure live se...
1,3,harry potter idea famous raised miserable aunt...
2,41865,three absolutely part know dominant part might...
3,2657,unforgettable novel childhood sleepy southern ...
4,4671,alternate cover edition great third book supre...


In [20]:
len(id_descr_df)

9814

In [31]:
descr_df = pd.DataFrame(id_descr_df.description.tolist(), columns=['description'],index=id_descr_df.goodreads_book_id)
descr_df.head()

Unnamed: 0_level_0,description
goodreads_book_id,Unnamed: 1_level_1
2767052,could survive wild every one make sure live se...
3,harry potter idea famous raised miserable aunt...
41865,three absolutely part know dominant part might...
2657,unforgettable novel childhood sleepy southern ...
4671,alternate cover edition great third book supre...


### MODEL

In [53]:
def get_tfidf_scores(df):
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(df.description)
    return pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out(), index=id_descr_df.goodreads_book_id)

In [54]:
tfidf_df = get_tfidf_scores(descr_df)
tfidf_df

Unnamed: 0_level_0,aa,aba,aback,abandon,abandoned,abandonment,abbas,abbey,abbot,abdication,...,zodiac,zombie,zone,zoo,zoologist,zoology,zoom,zorro,zounds,zucchini
goodreads_book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2767052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41865,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18071296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5750628,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
cosine_sim = cosine_similarity(tfidf_matrix)

In [60]:
indices = pd.Series(id_descr_df.index, index=id_descr_df.goodreads_book_id)

In [63]:
def get_recommendations(id, cosine_sim=cosine_sim):
    idx = indices[id]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    book_indices = [i[0] for i in sim_scores]
    return id_descr_df['goodreads_book_id'].iloc[book_indices]

In [73]:
def result(id):
    display(get_recommendations(id))
    display(id_descr_df[id_descr_df.goodreads_book_id == id])

In [74]:
result('18339743')

3274    2595138
2100     662597
6717       7531
8368      36064
421      119073
4106    9413044
2000    2921082
22        15881
278       32499
23            6
Name: goodreads_book_id, dtype: object

Unnamed: 0,goodreads_book_id,description
7017,18339743,indispensable source guide harry potter inform...
