### IMPORT LIBRARIES & CREATE CONNECTOR

In [1]:
import pandas as pd
import numpy as np
import sqlalchemy as sa
import mysql.connector
import getpass

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources this notebook relies on, in a fixed order.
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'words'):
    nltk.download(nltk_resource)

# Set of words from the NLTK `words` corpus, kept as a set for fast membership tests.
ENGLISH_WORDS = set(words.words())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [26]:
import math

In [3]:
# Connection settings consumed by getconn() when opening a MySQL connection.
HOST = 'localhost'
USER = 'root'
DATABASE = 'goodreads'
# Ask for the password interactively so it is never written into the notebook.
PASSWORD = getpass.getpass(f'Enter password for {USER}: ')

In [5]:
def getconn():
    """Open and return a new MySQL connection.

    The connection parameters come from the module-level ``HOST``, ``USER``,
    ``PASSWORD`` and ``DATABASE`` settings.

    Returns:
        The connection object produced by ``mysql.connector.connect``.
    """
    settings = dict(
        host=HOST,
        user=USER,
        password=PASSWORD,
        database=DATABASE,
    )
    return mysql.connector.connect(**settings)

# Engine whose DBAPI connections are created by getconn(); the URL itself
# carries only the dialect/driver name.
pool = sa.create_engine("mysql+mysqlconnector://", creator=getconn)

# Smoke test: ask the server for its current time and print it.
now_query = sa.text("SELECT NOW()")
with pool.connect() as db_conn:
    results = db_conn.execute(now_query).fetchone()
    print("Current time: ", results[0])

Current time:  2023-05-29 16:53:57


### SANDBOX

In [6]:
# Load the Goodreads id of every row in the `books` table.
book_id_query = sa.text(
    "SELECT goodreads_book_id FROM books;"
)
# Hold the connection only for the duration of the read: the `with` block
# closes it afterwards instead of leaving an unclosed connection behind.
with pool.connect() as db_conn:
    book_id = pd.read_sql_query(book_id_query, con=db_conn)
book_id.head()

Unnamed: 0,goodreads_book_id
0,2767052
1,3
2,41865
3,2657
4,4671


In [7]:
# Plain Python list of the book ids loaded above.
id_list = book_id['goodreads_book_id'].tolist()

In [8]:
# Load the distinct user ids present in the `new_ratings_` table.
user_id_query = sa.text(
    "SELECT DISTINCT(user_id) FROM new_ratings_;"
)
# Hold the connection only for the duration of the read: the `with` block
# closes it afterwards instead of leaving an unclosed connection behind.
with pool.connect() as db_conn:
    user_id = pd.read_sql_query(user_id_query, con=db_conn)
user_id.head()

Unnamed: 0,user_id
0,1
1,10
2,100
3,1000
4,10000


In [9]:
# Plain Python list of the distinct user ids; bracket access avoids confusing
# the `user_id` column with the DataFrame of the same name.
user_list = user_id['user_id'].tolist()
user_list[:5]

['1', '10', '100', '1000', '10000']

In [10]:
# def get_ratings(user_id):
#     query = sa.text(
#         f"SELECT goodreads_book_id as book_id, rating FROM new_ratings_ WHERE user_id = {user_id};"
#     )
#     result = pd.read_sql_query(query, con=pool.connect())
#     books_list = result.book_id.tolist()
#     ratings_list = result.rating.apply(lambda x: round(x*0.2,4)).tolist()
#     return books_list, ratings_list

In [11]:
# user_ratings_df = pd.DataFrame(columns=list(['user_id']+id_list))
# for user in user_list[:5]:
#     user_books, user_ratings = get_ratings(user)
#     user_profile = pd.DataFrame({'user_id': [user]}, columns=user_ratings_df.columns)
#     user_profile.loc[0,user_books] = user_ratings
#     user_profile.fillna(0, inplace=True)
#     user_ratings_df = pd.concat([user_ratings_df, user_profile], ignore_index=True)

In [12]:
# Load every row of the `processed_description` table.
query =  sa.text(
    "SELECT * FROM processed_description;"
)

# Hold the connection only for the duration of the read: the `with` block
# closes it afterwards instead of leaving an unclosed connection behind.
with pool.connect() as db_conn:
    df = pd.read_sql_query(query, con=db_conn)
df.head()

Unnamed: 0,goodreads_book_id,processed_descr
0,2767052,could survive wild every one make sure live se...
1,3,harry potter idea famous raised miserable aunt...
2,41865,three absolutely part know dominant part might...
3,2657,unforgettable novel childhood sleepy southern ...
4,4671,alternate cover edition great third book supre...


In [17]:
def preprocess_text(docs):
    """Tokenise, filter and lemmatise every document in ``docs``.

    Args:
        docs: Iterable of document strings.

    Returns:
        A list with one entry per document. Each entry is the list of
        lemmatised, lower-cased tokens that are purely alphabetic, are not
        English stop words, and are members of ``ENGLISH_WORDS``.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    processed_docs = []
    for doc in docs:
        kept = []
        for raw_token in nltk.word_tokenize(doc):
            word = raw_token.lower()
            # Skip stop words, non-alphabetic tokens and out-of-dictionary words.
            if word in stop_words or not raw_token.isalpha() or word not in ENGLISH_WORDS:
                continue
            kept.append(lemmatizer.lemmatize(word))
        processed_docs.append(kept)
    return processed_docs
    
def get_recommendations(id, num_recommends=5):
    """Return the entries of ``data[id]`` for the items most similar to ``id``.

    Reads three module-level names: ``indices`` (maps an item id to its row
    position in ``cosine_sim``), ``cosine_sim`` (pairwise similarity matrix)
    and ``data`` (indexed with ``id``, then positionally with ``.iloc``).

    Args:
        id: Key looked up in ``indices`` and in ``data``.
        num_recommends: Maximum number of similar items to return.

    Returns:
        ``data[id].iloc[...]`` restricted to the positions of the
        ``num_recommends`` highest-scoring items, excluding the queried item
        itself. Fewer entries are returned when fewer other items exist.

    Raises:
        KeyError: If ``id`` is not present in ``indices`` (or in ``data``).
    """
    idx = indices[id]
    # Exclude the queried item by position rather than assuming it sorts first:
    # an identical item at a lower index ties at the same score and would
    # otherwise push the queried item into its own recommendations.
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    # Take exactly `num_recommends` items (the previous slice returned one fewer).
    top = candidates[:max(num_recommends, 0)]
    item_indices = [position for position, _ in top]
    # NOTE(review): `data[id]` selects by the queried id itself; confirm this is
    # the intended lookup rather than a fixed id/title column.
    return data[id].iloc[item_indices]

# Run the cleaning pipeline over the stored descriptions and join each token
# list back into one space-separated string, the input form the vectorizer takes.
docs_data = df['processed_descr']
tokens_list = list(map(' '.join, preprocess_text(docs_data)))

# TF-IDF document-term matrix, then pairwise cosine similarity between documents.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(tokens_list)
cosine_sim = cosine_similarity(tfidf_matrix)

In [23]:
# Bind `sample` explicitly so this cell does not depend on leftover kernel state.
# Presumably the first book's cleaned description text, which is what the
# recorded output of this cell shows -- confirm.
sample = tokens_list[0]
sample

'could survive wild every one make sure live see morning place known north nation shining surrounded twelve outlying harsh cruel line forcing send one boy one girl twelve eighteen participate annual hunger fight death live alone mother younger sister death sentence forward take sister place close dead survival second nature without really meaning becomes contender win start making weight survival humanity life love'

In [29]:
def get_tf_idf_score(doc, corpus):
  """
  Calculates the TF-IDF score of every word in a document.

  Args:
    doc: The document to score, as an iterable of word tokens.
    corpus: The corpus, as a sized collection of documents. Each document may
      be a string (split on whitespace) or an iterable of word tokens. A flat
      list of single words is therefore treated as a list of one-word
      documents.

  Returns:
    A dictionary mapping each word in the document to its TF-IDF score,
    tf * log(N / df), where tf is the number of times the word occurs in
    `doc`, N is len(corpus) and df is the number of corpus documents that
    contain the word. Words contained in no corpus document score 0.0.
  """

  # Calculate the term frequency of each word in the document.
  term_frequencies = {}
  for word in doc:
    term_frequencies[word] = term_frequencies.get(word, 0) + 1

  # Count, for each word, how many corpus documents contain it. A word is
  # counted at most once per document, hence the set().
  document_frequencies = {}
  for document in corpus:
    tokens = document.split() if isinstance(document, str) else document
    for word in set(tokens):
      document_frequencies[word] = document_frequencies.get(word, 0) + 1

  n_documents = len(corpus)

  # Calculate the TF-IDF score of each word in the document.
  tf_idf_scores = {}
  for word, term_count in term_frequencies.items():
    doc_count = document_frequencies.get(word, 0)
    if doc_count:
      tf_idf_scores[word] = term_count * math.log(n_documents / doc_count)
    else:
      # The IDF is undefined for a word the corpus never contains; score it
      # 0.0 instead of failing on the lookup or dividing by zero.
      tf_idf_scores[word] = 0.0

  return tf_idf_scores


In [42]:
# Dense TF-IDF vector of the first document. Only row 0 is densified; calling
# toarray() on the whole matrix would allocate every document's dense row.
# Note that this rebinds `sample` to a numeric vector.
sample = tfidf_matrix[0].toarray()[0]

In [35]:
# NOTE(review): `corpus` here is the vectorizer's vocabulary, not the set of
# documents, so every term occurs in it exactly once -- confirm this is the
# intended input for get_tf_idf_score.
corpus = tfidf.get_feature_names_out()
# Tokenize the first document's text directly instead of reading `sample`,
# which the preceding cell rebinds to a numeric vector.
get_tf_idf_score(nltk.word_tokenize(tokens_list[0]), corpus)

{'could': 9.884814287270506,
 'survive': 9.884814287270506,
 'wild': 9.884814287270506,
 'every': 9.884814287270506,
 'one': 29.654442861811518,
 'make': 9.884814287270506,
 'sure': 9.884814287270506,
 'live': 19.769628574541013,
 'see': 9.884814287270506,
 'morning': 9.884814287270506,
 'place': 19.769628574541013,
 'known': 9.884814287270506,
 'north': 9.884814287270506,
 'nation': 9.884814287270506,
 'shining': 9.884814287270506,
 'surrounded': 9.884814287270506,
 'twelve': 19.769628574541013,
 'outlying': 9.884814287270506,
 'harsh': 9.884814287270506,
 'cruel': 9.884814287270506,
 'line': 9.884814287270506,
 'forcing': 9.884814287270506,
 'send': 9.884814287270506,
 'boy': 9.884814287270506,
 'girl': 9.884814287270506,
 'eighteen': 9.884814287270506,
 'participate': 9.884814287270506,
 'annual': 9.884814287270506,
 'hunger': 9.884814287270506,
 'fight': 9.884814287270506,
 'death': 19.769628574541013,
 'alone': 9.884814287270506,
 'mother': 9.884814287270506,
 'younger': 9.8848142