### IMPORT LIBRARIES

In [16]:
import pandas as pd
import mysql.connector
import sqlalchemy as sa
import getpass

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')

ENGLISH_WORDS = set(words.words())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

### CREATE CONNECTOR

In [19]:
def getconn():
    """Open a new MySQL connection to the local ``goodreads`` database.

    This function is passed to ``sa.create_engine`` as its ``creator``, so it
    can be invoked more than once (whenever the pool needs a fresh connection).
    The password is therefore prompted for only until a connection succeeds,
    and is reused on later calls instead of prompting again.

    Returns:
        The connection object returned by ``mysql.connector.connect``.
    """
    password = getattr(getconn, "_password", None)
    if password is None:
        password = getpass.getpass()
    conn = mysql.connector.connect(
        host="localhost",
        user="root",
        password=password,
        database="goodreads"
    )
    # Cache only after a successful connect, so a mistyped password triggers
    # a new prompt on the next attempt rather than being reused.
    getconn._password = password
    return conn

In [20]:
# Build the SQLAlchemy engine. The URL only names the dialect/driver; the
# actual connection is produced by the getconn() creator callable.
pool = sa.create_engine(
    "mysql+mysqlconnector://",
    creator=getconn,
)

# Smoke test: ask the server for its current time to confirm the connection works.
with pool.connect() as db_conn:
    results = db_conn.execute(sa.text("SELECT NOW()")).fetchone()
    print("Current time: ", results[0])

Current time:  2023-05-09 10:27:21


### IMPORT DATA

In [21]:
# # import 
# descr_query = sa.text(
#     "SELECT * FROM book_description;"
# )
# descr_df = pd.read_sql_query(descr_query, con=pool.connect())
# display(descr_df.head())
# display(len(descr_df))

# books_df = pd.read_csv(r'D:\project\goodreads\data\goodbooks-10k\books.csv')
# display(books_df.head())
# display(len(books_df))

In [22]:
# books_df = books_df[['goodreads_book_id', 'work_id',
#                      'books_count', 'isbn', 'isbn13', 'original_publication_year',
#                      'original_title', 'title', 'language_code', 'average_rating',
#                      'ratings_count']]

In [23]:
# books_df.head(1)

In [24]:
# books_df['description'] = descr_df['description']
# books_df.head(1)

In [25]:
# books_df.to_sql(name="books", con=pool, if_exists='replace', index=False)

In [26]:
# Load the whole `books` table into a DataFrame.
books_query = sa.text(
    "SELECT * FROM books;"
)

# Use a context manager so the connection is released when the read is done;
# passing a bare pool.connect() would leave that connection open.
with pool.connect() as db_conn:
    books_df = pd.read_sql_query(books_query, con=db_conn)
display(books_df.head(1))
display(len(books_df))

Unnamed: 0,goodreads_book_id,work_id,books_count,isbn,isbn13,original_publication_year,original_title,title,language_code,average_rating,ratings_count,description
0,2767052,2792775,272,439023483,9780439000000.0,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,"Could you survive on your own in the wild, wit..."


10000

In [27]:
books_df.language_code.isna().value_counts()

False    8916
True     1084
Name: language_code, dtype: int64

In [28]:
# Fill gaps column by column. A frame-wide fillna('eng') would also write the
# string 'eng' into every other column that happens to contain missing values.
books_df['language_code'] = books_df['language_code'].fillna('eng')
# Missing descriptions become empty strings so the text preprocessing further
# down still receives a str for every book.
books_df['description'] = books_df['description'].fillna('')
display(books_df.language_code.isna().value_counts())
display(books_df.language_code.value_counts())

False    10000
Name: language_code, dtype: int64

eng      7425
en-US    2070
en-GB     257
ara        64
en-CA      58
fre        25
ind        21
spa        20
ger        13
per         7
jpn         7
por         6
pol         6
en          4
nor         3
dan         3
fil         2
ita         2
vie         1
tur         1
nl          1
swe         1
rum         1
mul         1
rus         1
Name: language_code, dtype: int64

In [29]:
# Every English variant code is folded into the single code 'eng'.
replace_dict = dict.fromkeys(['en-US', 'en-GB', 'en-CA', 'en'], 'eng')
replace_dict

{'en-US': 'eng', 'en-GB': 'eng', 'en-CA': 'eng', 'en': 'eng'}

In [30]:
# Assign the result back to the column. Calling replace(..., inplace=True) on
# the column accessor is a chained assignment, which is not guaranteed to
# update books_df (it does not under pandas Copy-on-Write).
books_df['language_code'] = books_df['language_code'].replace(replace_dict)
books_df.language_code.value_counts()

eng    9814
ara      64
fre      25
ind      21
spa      20
ger      13
jpn       7
per       7
pol       6
por       6
nor       3
dan       3
fil       2
ita       2
nl        1
rum       1
mul       1
tur       1
swe       1
vie       1
rus       1
Name: language_code, dtype: int64

In [31]:
# Keep only the rows whose language code is 'eng'.
books_df = books_df.loc[books_df['language_code'] == 'eng']
display(len(books_df))
display(books_df.head(1))

9814

Unnamed: 0,goodreads_book_id,work_id,books_count,isbn,isbn13,original_publication_year,original_title,title,language_code,average_rating,ratings_count,description
0,2767052,2792775,272,439023483,9780439023480.0,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,"Could you survive on your own in the wild, wit..."


In [32]:
# Pull the description column out as a plain Python list, in row order.
descr_list = books_df['description'].tolist()
display(descr_list[:2], len(descr_list))

["Could you survive on your own in the wild, with every one out to make sure you don't live to see the morning?In the ruins of a place once known as North America lies the nation of Panem, a shining Capitol surrounded by twelve outlying districts. The Capitol is harsh and cruel and keeps the districts in line by forcing them all to send one boy and one girl between the ages of twelve and eighteen to participate in the annual Hunger Games, a fight to the death on live TV.Sixteen-year-old Katniss Everdeen, who lives alone with her mother and younger sister, regards it as a death sentence when she steps forward to take her sister's place in the Games. But Katniss has been close to dead before—and survival, for her, is second nature. Without really meaning to, she becomes a contender. But if she is to win, she will have to start making choices that weight survival against humanity and life against love.",
 "Harry Potter has no idea how famous he is. That's because he's being raised by his 

9814

### PREPROCESS DESCRIPTION

In [33]:
def preprocess_text(docs):
    """Tokenize, filter and lemmatize a collection of documents.

    Each document is tokenized with NLTK. A token is kept only if it is purely
    alphabetic and its lower-cased form is not an English stop word and is
    present in ENGLISH_WORDS. Kept tokens are lower-cased and lemmatized with
    WordNet.

    Args:
        docs: Iterable of documents. Entries that are not ``str`` (for example
            ``None`` or ``NaN`` for a missing description) are treated as
            empty documents instead of raising inside the tokenizer.

    Returns:
        list[list[str]]: one list of lemmatized tokens per input document,
        in input order.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    tokens_list = []
    for doc in docs:
        if not isinstance(doc, str):
            # Keep the output aligned with the input: one (empty) entry per doc.
            tokens_list.append([])
            continue
        kept = []
        for token in nltk.word_tokenize(doc):
            if not token.isalpha():
                continue
            lowered = token.lower()  # lower-case once, reuse for every check
            if lowered in stop_words or lowered not in ENGLISH_WORDS:
                continue
            kept.append(lemmatizer.lemmatize(lowered))
        tokens_list.append(kept)
    return tokens_list

In [34]:
# Re-join each token list into a single space-separated string per book.
processed_descr_list = [' '.join(tokens) for tokens in preprocess_text(descr_list)]
display(processed_descr_list[0])
display(descr_list[0])

'could survive wild every one make sure live see morning place known north nation shining surrounded twelve outlying harsh cruel line forcing send one boy one girl twelve eighteen participate annual hunger fight death live alone mother younger sister death sentence forward take sister place close dead survival second nature without really meaning becomes contender win start making weight survival humanity life love'

"Could you survive on your own in the wild, with every one out to make sure you don't live to see the morning?In the ruins of a place once known as North America lies the nation of Panem, a shining Capitol surrounded by twelve outlying districts. The Capitol is harsh and cruel and keeps the districts in line by forcing them all to send one boy and one girl between the ages of twelve and eighteen to participate in the annual Hunger Games, a fight to the death on live TV.Sixteen-year-old Katniss Everdeen, who lives alone with her mother and younger sister, regards it as a death sentence when she steps forward to take her sister's place in the Games. But Katniss has been close to dead before—and survival, for her, is second nature. Without really meaning to, she becomes a contender. But if she is to win, she will have to start making choices that weight survival against humanity and life against love."

In [35]:
# Pair every book id with its cleaned description (both lists are in books_df row order).
id_list = books_df['goodreads_book_id'].tolist()
processed_descr_df = pd.DataFrame(
    dict(goodreads_book_id=id_list, processed_descr=processed_descr_list)
)
display(processed_descr_df.head(2))

Unnamed: 0,goodreads_book_id,processed_descr
0,2767052,could survive wild every one make sure live se...
1,3,harry potter idea famous raised miserable aunt...


In [36]:
# Persist the cleaned descriptions to the `processed_description` table,
# replacing the table if it already exists; the DataFrame index is not written.
processed_descr_df.to_sql(name='processed_description', con=pool, if_exists='replace', index=False)

9814

### CONTENT BASED MODEL

In [42]:
def get_tfidf_scores(docs):
    """Fit a TF-IDF vectorizer on `docs` and return the scores in two forms.

    Args:
        docs: Iterable of documents (strings) to vectorize.

    Returns:
        tuple: ``(tfidf_df, tfidf_matrix)`` where ``tfidf_matrix`` is the
        sparse matrix produced by ``TfidfVectorizer.fit_transform`` and
        ``tfidf_df`` is a DataFrame view of the same values with one column
        per vocabulary term.
    """
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(docs)
    # Build the labelled frame from the sparse matrix directly. Going through
    # .toarray() would materialize a dense (n_docs x n_terms) float array,
    # which is almost entirely zeros for a TF-IDF matrix.
    tfidf_df = pd.DataFrame.sparse.from_spmatrix(
        tfidf_matrix, columns=tfidf.get_feature_names_out()
    )
    return tfidf_df, tfidf_matrix

In [43]:
# Vectorize the cleaned descriptions: tfidf_df is the labelled frame shown for
# inspection, tfidf_matrix is the sparse matrix used for the similarity step.
tfidf_df, tfidf_matrix = get_tfidf_scores(processed_descr_list)
display(tfidf_df.head(1))
display(tfidf_matrix)

Unnamed: 0,aa,aba,aback,abandon,abandoned,abandonment,abbas,abbess,abbey,abbot,...,zodiac,zombie,zone,zoo,zoologist,zoology,zoom,zorro,zounds,zucchini
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<9814x19649 sparse matrix of type '<class 'numpy.float64'>'
	with 504929 stored elements in Compressed Sparse Row format>

In [45]:
# Pairwise cosine similarity between all description vectors; row i is later
# read as the similarity of book i to every book.
cosine_sim = cosine_similarity(tfidf_matrix)

In [63]:
# Store book ids as strings so lookups are done with string keys.
processed_descr_df['goodreads_book_id'] = processed_descr_df['goodreads_book_id'].astype(str)
processed_descr_df['goodreads_book_id']

0       2767052
1             3
2         41865
3          2657
4          4671
         ...   
9809    7130616
9810     208324
9811      77431
9812    8565083
9813       8914
Name: goodreads_book_id, Length: 9814, dtype: object

In [64]:
# Lookup table: book id -> row label of that book in processed_descr_df,
# which is also used as its row number in cosine_sim.
indices = pd.Series(processed_descr_df.index, index=processed_descr_df.goodreads_book_id)
def get_recommendations(id, cosine_sim=cosine_sim):
    """Return the ids of the 10 books most similar to the given book.

    Args:
        id: Book id to look up in `indices` (same dtype as the
            ``goodreads_book_id`` column of ``processed_descr_df``).
        cosine_sim: Square similarity matrix indexed by row position. The
            default is bound to the module-level matrix when this function
            is defined.

    Returns:
        pandas.Series: ``goodreads_book_id`` values of the recommended books,
        ordered from most to least similar.

    Raises:
        KeyError: if `id` is not present in `indices`.
    """
    idx = indices[id]
    # Drop the query book by position instead of assuming it sorts first:
    # when other books tie with it (identical or empty descriptions), slicing
    # off element 0 could return the query book itself as a recommendation.
    scored = [(pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx]
    # list.sort is stable, so ties keep their original row order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    book_indices = [pos for pos, _ in scored[:10]]
    return processed_descr_df['goodreads_book_id'].iloc[book_indices]

In [78]:
def result(id):
    """Print the recommended book ids for book `id`, one per line."""
    for book_id in get_recommendations(id):
        print(book_id)

In [82]:
# Alias for the recommendation function; this is the object pickled further down.
tfidf_model = get_recommendations

In [83]:
tfidf_model('731804')

1609     6452796
7833       84351
7910    13593553
9812     8565083
2711    18813642
8765      833550
4365       34898
5193     8046350
7725      820273
5323       93731
Name: goodreads_book_id, dtype: object

In [80]:
# NOTE(review): this comparison only matches when books_df.goodreads_book_id
# holds strings; the cast to str for books_df appears only in a commented-out
# cell — confirm the column dtype on a fresh kernel.
books_df[books_df.goodreads_book_id == '731804']

Unnamed: 0,goodreads_book_id,work_id,books_count,isbn,isbn13,original_publication_year,original_title,title,language_code,average_rating,ratings_count,description
6296,731804,717994,12,60776390,9780060776400.0,2006.0,Pinkalicious,Pinkalicious,eng,4.05,16585,This sparkling New York Times bestselling pict...


In [None]:
# books_df.goodreads_book_id = books_df.goodreads_book_id.astype(str)
# books_df.to_sql(name='books', con=pool, if_exists='replace', index=False)

In [84]:
import pickle

# NOTE(review): pickle stores a plain function by reference (module + name),
# not by value, so this file does not contain cosine_sim, indices or
# processed_descr_df. It can only be loaded where get_recommendations is
# already defined — confirm this is what is intended.
with open('tfidf_model.pkl', 'wb') as f:
    pickle.dump(tfidf_model, f)

In [86]:
# Round-trip check: load the object that was just pickled.
# Only unpickle files you trust — pickle.load can execute arbitrary code.
with open('tfidf_model.pkl', 'rb') as f:
    model = pickle.load(f)

In [87]:
model('731804')

1609     6452796
7833       84351
7910    13593553
9812     8565083
2711    18813642
8765      833550
4365       34898
5193     8046350
7725      820273
5323       93731
Name: goodreads_book_id, dtype: object

In [108]:
class tfidf_recommender:
    """Content-based recommender built on TF-IDF vectors of item descriptions.

    Typical use: construct with a DataFrame plus the names of its id and text
    columns, call ``fit()``, then ``get_recommendations(item_id)``.
    """

    def __init__(self, data, id, docs):
        """Store the data and build the id -> row-position lookup.

        Args:
            data: DataFrame holding one item per row.
            id: Name of the column in `data` that identifies an item.
            docs: Name of the column in `data` that holds the item text.
        """
        self.data = data
        self.id_col = id
        self.id_data = data[id]
        self.docs_data = data[docs]
        self.tfidf = None
        self.tfidf_matrix = None
        # Map ids to row POSITIONS (0..n-1), not index labels: cosine_sim is
        # indexed positionally, and `data` may carry a non-contiguous index
        # (for example after rows were filtered out).
        self.indices = pd.Series(range(len(data)), index=data[id])
        self.cosine_sim = None

    def preprocess_text(self, docs):
        """Tokenize, filter and lemmatize each document.

        A token is kept only if it is alphabetic and its lower-cased form is
        not an English stop word and is present in ENGLISH_WORDS. Entries
        that are not ``str`` (e.g. missing values) yield an empty token list.

        Returns:
            list[list[str]]: one token list per input document, in order.
        """
        lemmatizer = WordNetLemmatizer()
        stop_words = set(stopwords.words('english'))
        tokens_list = []
        for doc in docs:
            if not isinstance(doc, str):
                tokens_list.append([])
                continue
            kept = []
            for token in nltk.word_tokenize(doc):
                if not token.isalpha():
                    continue
                lowered = token.lower()
                if lowered in stop_words or lowered not in ENGLISH_WORDS:
                    continue
                kept.append(lemmatizer.lemmatize(lowered))
            tokens_list.append(kept)
        return tokens_list

    def get_tfidf_scores(self, docs):
        """Fit the vectorizer on `docs` and compute the pairwise cosine similarities.

        Results are stored on ``self.tfidf``, ``self.tfidf_matrix`` and
        ``self.cosine_sim``; nothing is returned.
        """
        self.tfidf = TfidfVectorizer()
        self.tfidf_matrix = self.tfidf.fit_transform(docs)
        self.cosine_sim = cosine_similarity(self.tfidf_matrix)

    def get_recommendations(self, id, num_recommends=5):
        """Return the ids of the items most similar to item `id`.

        Args:
            id: Identifier of the query item (a value of the id column).
            num_recommends: Number of recommendations to return.

        Returns:
            pandas.Series: id-column values of the recommended items, ordered
            from most to least similar. The query item itself is never included.

        Raises:
            RuntimeError: if called before ``fit()``.
            KeyError: if `id` is not present in the id column.
        """
        if self.cosine_sim is None:
            raise RuntimeError("call fit() before get_recommendations()")
        idx = self.indices[id]
        # Exclude the query item by position rather than assuming it is the
        # first element after sorting (ties would break that assumption).
        scored = [
            (pos, score)
            for pos, score in enumerate(self.cosine_sim[idx])
            if pos != idx
        ]
        # Stable sort: items with equal scores keep their row order.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        item_indices = [pos for pos, _ in scored[:num_recommends]]
        # Return values from the configured id column, not a literal 'id' column.
        return self.id_data.iloc[item_indices]

    def fit(self):
        """Preprocess the stored documents and build the similarity matrix."""
        joined_docs = [' '.join(tokens) for tokens in self.preprocess_text(self.docs_data)]
        self.get_tfidf_scores(joined_docs)

In [None]:
# Build the class-based recommender on the raw book descriptions and fit it
# (preprocessing, TF-IDF and cosine similarity all happen inside fit()).
recommender = tfidf_recommender(books_df, 'goodreads_book_id', 'description')
recommender.fit()

In [None]:
# NOTE(review): the id is passed as an int here, while earlier cells look the
# same book up with the string '731804' — confirm the dtype of
# books_df.goodreads_book_id so the key type matches.
recommender.get_recommendations(731804)