In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Load the cleaned book catalogue from a path relative to the notebook.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which usually
# means the CSV was written with its index included — confirm whether it
# should be dropped or read with index_col=0.
df = pd.read_csv("../data/cleaned_books.csv")
# Preview the first five rows to sanity-check the columns.
df.head()

Unnamed: 0,book_id,authors,original_publication_year,title,language_code,average_rating,image_url,description,content
0,2767052,suzanne collins,2008.0,"The Hunger Games (The Hunger Games, #1)",eng,4.34,https://images.gr-assets.com/books/1447303603m...,first in the ground-breaking hunger games tril...,first in the ground-breaking hunger games tril...
1,3,"j.k. rowling, mary grandpré",1997.0,Harry Potter and the Sorcerer's Stone (Harry P...,eng,4.44,https://images.gr-assets.com/books/1474154022m...,rescued from the outrageous neglect of his aun...,rescued from the outrageous neglect of his aun...
2,41865,stephenie meyer,2005.0,"Twilight (Twilight, #1)",en-US,3.57,https://images.gr-assets.com/books/1361039443m...,"when 17 year old isabella swan moves to forks,...","when 17 year old isabella swan moves to forks,..."
3,2657,harper lee,1960.0,To Kill a Mockingbird,eng,4.25,https://images.gr-assets.com/books/1361975680m...,harper lee's classic novel of a lawyer in the ...,harper lee's classic novel of a lawyer in the ...
4,4671,f. scott fitzgerald,1925.0,The Great Gatsby,eng,3.89,https://images.gr-assets.com/books/1490528560m...,the only authorized edition of the twentieth-c...,the only authorized edition of the twentieth-c...


In [2]:
# Count missing values in the two columns the recommender relies on.
df[['title', 'content']].isna().sum()

title      0
content    0
dtype: int64

In [3]:
# Dataset dimensions as (rows, columns).
df.shape

(4763, 9)

In [4]:
# Vectorize the 'content' text with TF-IDF, ignoring scikit-learn's built-in
# English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')
# Learn the vocabulary and produce the document-term matrix in one pass.
tfidf_matrix = tfidf.fit_transform(df['content'])
# (number of books, vocabulary size)
tfidf_matrix.shape

(4763, 38127)

In [5]:
# Pairwise cosine similarity between every pair of books. With a single
# argument scikit-learn compares the matrix against itself.
cosine_sim = cosine_similarity(tfidf_matrix)
# Square matrix: one row and one column per book.
cosine_sim.shape

(4763, 4763)

In [6]:
def recommend_books(title, df, cosine_sim, top_n=10):
    """Recommend the books most similar to the first title matching ``title``.

    Parameters
    ----------
    title : str
        Text to look for in ``df['title']``. Matched as a case-insensitive
        literal substring (not a regular expression), so titles containing
        characters such as ``+``, ``(`` or ``?`` can be searched safely.
    df : pandas.DataFrame
        Book table with at least ``title``, ``authors`` and
        ``average_rating`` columns. Row *positions* must line up with the
        rows/columns of ``cosine_sim``; the index labels are irrelevant.
    cosine_sim : 2-D indexable (e.g. numpy array)
        Pairwise similarity matrix where ``cosine_sim[i][j]`` is the
        similarity between the i-th and j-th rows of ``df``.
    top_n : int, default 10
        Maximum number of recommendations to return. Values below zero are
        treated as zero.

    Returns
    -------
    pandas.DataFrame or str
        The ``title``, ``authors`` and ``average_rating`` columns of the
        recommended books, most similar first. If no title matches, the
        string ``"No matching books found."`` is returned instead (kept for
        compatibility with existing callers).
    """
    # regex=False: treat the query as plain text so regex metacharacters in a
    # title do not raise re.error or match unintended rows.
    mask = df['title'].str.contains(
        title, case=False, na=False, regex=False
    ).to_numpy()

    if not mask.any():
        return "No matching books found."

    # Work with the row *position* of the first match. The similarity matrix
    # and df.iloc are both positional, whereas the DataFrame index may hold
    # arbitrary labels (e.g. after filtering without reset_index).
    pos = int(mask.argmax())
    scores = cosine_sim[pos]

    # Exclude the query book explicitly rather than assuming it sorts first:
    # another book with identical content also scores 1.0 and could otherwise
    # push the query book itself into the results.
    candidates = (i for i in range(len(scores)) if i != pos)
    # sorted() is stable, so ties keep ascending row order.
    ranked = sorted(candidates, key=lambda i: scores[i], reverse=True)

    book_positions = ranked[:max(top_n, 0)]

    return df.iloc[book_positions][['title', 'authors', 'average_rating']]

In [7]:
# Ten books most similar to the first title containing "Psychology".
recommend_books(title='Psychology', df=df, cosine_sim=cosine_sim, top_n=10)

Unnamed: 0,title,authors,average_rating
2625,A Long Way from Chicago (A Long Way from Chica...,richard peck,3.92
2971,"A Year Down Yonder (A Long Way from Chicago, #2)",richard peck,4.1
1506,How to Stop Worrying and Start Living,dale carnegie,4.08
1732,Big Magic: Creative Living Beyond Fear,elizabeth gilbert,3.9
4002,"Stay with Me (Wait for You, #3)","j. lynn, jennifer l. armentrout",4.17
1204,The Art of Happiness,"dalai lama xiv, howard c. cutler",4.14
658,"Scott Pilgrim, Volume 1: Scott Pilgrim's Preci...",bryan lee o'malley,4.18
2035,The Story of My Experiments With Truth,mahatma gandhi,4.07
1349,Behind Closed Doors,b.a. paris,3.93
630,A New Earth: Awakening to Your Life's Purpose,eckhart tolle,4.02


In [8]:
import pickle
# Serialize the full similarity matrix to disk with pickle.
with open("../data/cosine_sim.pkl", "wb") as pkl_file:
    pickle.dump(cosine_sim, pkl_file)