In [2]:
# general imports
import numpy as np
import pandas as pd
import os

#langchain imports
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# adding env keys
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
books = pd.read_csv("books_cleaned.csv")
books.head(4)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_subtitle,tagged_descr
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883: A NOVEL THAT READERS and critic...
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982: A new 'Christie for Christmas' ...
2,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736: A memorable, mesmerizing heroin..."
3,9780006280897,6280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897: Lewis' work on the nature of lo...


## Creating a txt file for storing as documents to Chroma

In [6]:
books['tagged_descr'].to_csv("tagged_description.txt", sep = '\n',index=False,header=False)

## Loading document to vectordb

In [15]:
# loading descriptions from the description file
tagged_descr_file = TextLoader("tagged_description.txt",encoding = 'utf-8').load()

# # Creating chunk of single description lines
text_splitter = CharacterTextSplitter(chunk_size = 0,chunk_overlap = 0,separator='\n')
docs = text_splitter.split_documents(tagged_descr_file)

Created a chunk of size 1169, which is longer than the specified 0
Created a chunk of size 1215, which is longer than the specified 0
Created a chunk of size 374, which is longer than the specified 0
Created a chunk of size 310, which is longer than the specified 0
Created a chunk of size 484, which is longer than the specified 0
Created a chunk of size 483, which is longer than the specified 0
Created a chunk of size 961, which is longer than the specified 0
Created a chunk of size 189, which is longer than the specified 0
Created a chunk of size 129, which is longer than the specified 0
Created a chunk of size 844, which is longer than the specified 0
Created a chunk of size 297, which is longer than the specified 0
Created a chunk of size 198, which is longer than the specified 0
Created a chunk of size 882, which is longer than the specified 0
Created a chunk of size 1089, which is longer than the specified 0
Created a chunk of size 1190, which is longer than the specified 0
Create

In [16]:
len(docs)

6144

## Create embeddings and storing to Chroma DB

In [18]:
embeddings = OpenAIEmbeddings(model = 'text-embedding-3-small')

DB_NAME = 'books-vector-db'
if os.path.exists(DB_NAME):
    Chroma(persist_directory=DB_NAME,embedding_function=embeddings).delete_collection()
    
vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings, persist_directory= DB_NAME)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")    

Vectorstore created with 6144 documents


In [19]:
#  Fetch a sample vector and how many dimensions it has
collection = vectorstore._collection
sample_embedding = collection.get(limit = 1, include = ['embeddings'])['embeddings'][0]
dimensions = len(sample_embedding)
print("Nos of dims: ",dimensions)

Nos of dims:  1536


## Sample query from Chroma db

In [5]:
# load existing vectorstore
DB_NAME = 'books-vector-db'

# Initialize OpenAI Embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Load the existing Chroma vectorstore
persist_directory = DB_NAME  # Change to your actual directory
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

In [6]:
query = "A book on meaning of life and how to make the best version of ourselves"
simil = vectorstore.similarity_search(query,k=3)
simil

[Document(id='419906c4-4cff-4318-a7d5-802d157bda2b', metadata={'source': 'tagged_description.txt'}, page_content='9781577314806: The author shares the secret of his own self-realization and the philosophy for living in the present he has developed.'),
 Document(id='616d20d3-0871-42c1-9485-a28e1b2051ab', metadata={'source': 'tagged_description.txt'}, page_content='9780446518628: You have never read a book like this before -- a book that comes along once in a lifetime to change lives forever. In the rain forests of Peru, an ancient manuscript has been discovered. Within its pages are 9 key insights into life itself -- insights each human being is predicted to grasp sequentially; one insight, then another, as we move toward a completely spiritual culture on Earth. Drawing on ancient wisdom, it tells you how to make connections among the events happening in your life right now and lets you see what is going to happen to you in the years to come. The story it tells is a gripping one of adve

In [7]:
for i in simil:
    print(i.page_content)

9781577314806: The author shares the secret of his own self-realization and the philosophy for living in the present he has developed.
9780446518628: You have never read a book like this before -- a book that comes along once in a lifetime to change lives forever. In the rain forests of Peru, an ancient manuscript has been discovered. Within its pages are 9 key insights into life itself -- insights each human being is predicted to grasp sequentially; one insight, then another, as we move toward a completely spiritual culture on Earth. Drawing on ancient wisdom, it tells you how to make connections among the events happening in your life right now and lets you see what is going to happen to you in the years to come. The story it tells is a gripping one of adventure and discovery, but it is also a guidebook that has the power to crystallize your perceptions of why you are where you are in life and to direct your steps with a new energy and optimisim as you head into tomorrow.
97800071957

In [8]:
isbn_simil = [int(i.page_content.split(':')[0].strip()) for i in simil]
print(isbn_simil)

[9781577314806, 9780446518628, 9780007195718]


In [7]:
# function to return dataframe of books based on our query
def retrieve_semantic_recommendations(query:str, top_k: int = 5)-> pd.DataFrame:
    recs = vectorstore.similarity_search(query,k = top_k)
    books_list = []
    
    for i in recs:
        books_list.append(int(i.page_content.strip('"').split(':')[0]))
    return books[books['isbn13'].isin(books_list)]

In [10]:
retrieve_semantic_recommendations("book about science fiction",3)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_subtitle,tagged_descr
1451,9780312852535,312852533,The Humanoids,Jack Williamson,Fiction,http://books.google.com/books/content?id=vPSl0...,A classic science fiction novel features human...,1996.0,3.75,299.0,880.0,The Humanoids: A Novel,9780312852535: A classic science fiction novel...
1468,9780312890216,312890214,The Starry Rift,James Tiptree,Fiction,,This novel set in the far-future and filled wi...,1994.0,3.82,250.0,220.0,The Starry Rift,9780312890216: This novel set in the far-futur...
6082,9781932100563,1932100563,The Anthology at the End of the Universe,Glenn Yeffeth,Literary Criticism,http://books.google.com/books/content?id=R7y1P...,A selection of essays dealing with every aspec...,2005.0,3.85,240.0,123.0,The Anthology at the End of the Universe: Lead...,9781932100563: A selection of essays dealing w...


## Book Categories

In [9]:
books['categories'].value_counts().reset_index()

Unnamed: 0,categories,count
0,Fiction,2415
1,Juvenile Fiction,503
2,Biography & Autobiography,371
3,History,245
4,Literary Criticism,149
...,...,...
508,Conspiracies,1
509,Brothers and sisters,1
510,"Banks and banking, British",1
511,Rock musicians,1


In [10]:
category_mapping = {'Fiction' : "Fiction",
 'Juvenile Fiction': "Children's Fiction",
 'Biography & Autobiography': "Nonfiction",
 'History': "Nonfiction",
 'Literary Criticism': "Nonfiction",
 'Philosophy': "Nonfiction",
 'Religion': "Nonfiction",
 'Comics & Graphic Novels': "Fiction",
 'Drama': "Fiction",
 'Juvenile Nonfiction': "Children's Nonfiction",
 'Science': "Nonfiction",
 'Poetry': "Fiction"}

books['new_category'] = books['categories'].map(category_mapping)

In [11]:
# how many have thse broad labels
books[~books['new_category'].isna()].shape

(4436, 14)

### Zero Shot Classification using Huggingface Opensource Models

In [24]:
from transformers import pipeline
fiction_categories = ['Fiction','Nonfiction']
classifier = pipeline("zero-shot-classification",model="facebook/bart-large-mnli",device=1)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBartForSequenceClassification: ['model.encoder.version', 'model.decoder.version']
- This IS expected if you are initializing TFBartForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBartForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBartForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForSequenceClassification for predictions without further training.
Device set to use 1


In [26]:
sequence = books[books['new_category']=='Fiction']['description'][0]

In [27]:
print(sequence)

A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of the best and the worst the world has

In [29]:
classifier(sequence,fiction_categories)

{'sequence': 'A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of the best and the worst

In [31]:
def generate_labels(sequence,categories):
    predictions = classifier(sequence,categories)
    max_index = np.argmax(predictions['scores'])
    max_label = predictions['labels'][max_index]
    
    return max_label

In [35]:
from tqdm import tqdm
actual_cats = []
pred_cats = []

for i in tqdm(range(300)):
    sequence = books[books['new_category']=='Fiction']['description'].reset_index(drop = True)[i]
    pred_cats.append(generate_labels(sequence,fiction_categories))
    actual_cats.append('Fiction')

100%|██████████| 300/300 [10:17<00:00,  2.06s/it]


In [36]:
for i in tqdm(range(300)):
    sequence = books[books['new_category']=='Nonfiction']['description'].reset_index(drop = True)[i]
    pred_cats.append(generate_labels(sequence,fiction_categories))
    actual_cats.append('Nonfiction')

100%|██████████| 300/300 [12:05<00:00,  2.42s/it]


In [41]:
accuracy = sum(1 for i in range(len(actual_cats)) if actual_cats[i]==pred_cats[i])/len(actual_cats)
print(accuracy)

0.7666666666666667


So, our zero shot claasifier is doing a good job in categorizing into fiction and non-fiction,  
We can use this for our fiction/ non-fiction categorization from our book descriptions

In [44]:
missing_cats = books[books['new_category'].isna()][['isbn13','description']].reset_index(drop = True)

In [46]:
# giving labels if missing based on description
isbns = []
pred_cats = []
for i in tqdm(range(len(missing_cats))):
    sequence = missing_cats['description'][i]
    pred_cats.append(generate_labels(sequence,fiction_categories))
    isbns.append(missing_cats['isbn13'][i])

100%|██████████| 1708/1708 [2:01:44<00:00,  4.28s/it]     


In [47]:
missing_predicted_df = pd.DataFrame({'isbn13':isbns,'predicted_cat':pred_cats})

In [48]:
missing_predicted_df.to_csv('missing_predicted_df.csv',index=False)

## Loading Missing Category dataframe and using it for imputation

In [14]:
missing_predicted_df = pd.read_csv('missing_predicted_df.csv')
missing_predicted_df.head(3)

Unnamed: 0,isbn13,predicted_cat
0,9780002261982,Fiction
1,9780006280897,Nonfiction
2,9780006280934,Nonfiction


In [15]:
# merge with df
books = pd.merge(left=books,right=missing_predicted_df, on = 'isbn13',how = 'left')
books.head(3)


Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_subtitle,tagged_descr,new_category,predicted_cat
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883: A NOVEL THAT READERS and critic...,Fiction,
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982: A new 'Christie for Christmas' ...,,Fiction
2,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736: A memorable, mesmerizing heroin...",Fiction,


In [16]:
books['new_category'] = np.where(books['new_category'].isna(),books['predicted_cat'],books['new_category'])

In [17]:
books.drop(columns=['predicted_cat'],inplace = True)

In [18]:
books['new_category'].value_counts()

new_category
Fiction                  3246
Nonfiction               2296
Children's Fiction        503
Children's Nonfiction      99
Name: count, dtype: int64

## Extract emotions from books description

In [19]:
from transformers import pipeline
emotion_clf = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True,device = 0)
emotion_clf("I am not really sure how to describe this")





All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at j-hartmann/emotion-english-distilroberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
Device set to use 0


[[{'label': 'anger', 'score': 0.009566172026097775},
  {'label': 'disgust', 'score': 0.016042295843362808},
  {'label': 'fear', 'score': 0.02668078802525997},
  {'label': 'joy', 'score': 0.0031731538474559784},
  {'label': 'neutral', 'score': 0.2814905345439911},
  {'label': 'sadness', 'score': 0.00686474097892642},
  {'label': 'surprise', 'score': 0.6561824083328247}]]

In [20]:
emotion_clf(books['description'][0])

[[{'label': 'anger', 'score': 0.00393513822928071},
  {'label': 'disgust', 'score': 0.01910071074962616},
  {'label': 'fear', 'score': 0.6548417210578918},
  {'label': 'joy', 'score': 0.015161281451582909},
  {'label': 'neutral', 'score': 0.16985173523426056},
  {'label': 'sadness', 'score': 0.11640884727239609},
  {'label': 'surprise', 'score': 0.020700590685009956}]]

From the description it seems that it is not capturing the emotion correctly, lets try capturing the individual sentences emotions.

In [21]:
emotion_clf(books['description'][0].split('.'))

[[{'label': 'anger', 'score': 0.0091563630849123},
  {'label': 'disgust', 'score': 0.0026284765917807817},
  {'label': 'fear', 'score': 0.06816217303276062},
  {'label': 'joy', 'score': 0.047942597419023514},
  {'label': 'neutral', 'score': 0.14038574695587158},
  {'label': 'sadness', 'score': 0.0021221619099378586},
  {'label': 'surprise', 'score': 0.7296024560928345}],
 [{'label': 'anger', 'score': 0.04047835245728493},
  {'label': 'disgust', 'score': 0.2735918462276459},
  {'label': 'fear', 'score': 0.0068790544755756855},
  {'label': 'joy', 'score': 0.10908298939466476},
  {'label': 'neutral', 'score': 0.4493703246116638},
  {'label': 'sadness', 'score': 0.09362737089395523},
  {'label': 'surprise', 'score': 0.026970157399773598}],
 [{'label': 'anger', 'score': 0.011031882837414742},
  {'label': 'disgust', 'score': 0.04342261329293251},
  {'label': 'fear', 'score': 0.014084076508879662},
  {'label': 'joy', 'score': 0.014211481437087059},
  {'label': 'neutral', 'score': 0.6462160944

In [22]:
import numpy as np

emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = []

def calculate_max_emotion_scores(predictions):
    per_emotion_scores = {label:[] for label in emotion_labels}
    for pred in predictions:
        sorted_predictions = sorted(pred, key = lambda x:x['label']) #makes sure order of labels is same
        for idx,label in enumerate(emotion_labels):
            per_emotion_scores[label].append(sorted_predictions[idx]['score'])
    
    return {label:np.max(scores) for label,scores in per_emotion_scores.items()}       


In [28]:
for i in range(30,35):
    print(books['description'][i])
    print('--'*50)
    preds = emotion_clf(books['description'][i].split('.'))
    pp = calculate_max_emotion_scores(preds)
    print(pp)

Half of this new, post-Cold War world is intent on building a better Lexus, on streamlining their societies and economies for the global marketplace, while the other half is locked in elemental struggles over who owns which olive tree, which strip of land.
----------------------------------------------------------------------------------------------------
{'anger': 0.0641336441040039, 'disgust': 0.10400677472352982, 'fear': 0.05136280879378319, 'joy': 0.0405644029378891, 'sadness': 0.8819841146469116, 'surprise': 0.11169026046991348, 'neutral': 0.07876547425985336}
FROM THE PULIZER PRIZE-WINNING AUTHOR OF THE #1 "NEW YORK TIMES" BESTSELLER "ANGELA'S ASHES" Frank McCourt's glorious childhood memoir, "Angela's Ashes, " has been loved and celebrated by readers everywhere. It won the National Book Critics Circle Award, the "Los Angeles Times" Book Award and the Pulitzer Prize. Rarely has a book so swiftly found its place on the literary landscape. And now we have "'Tis, " the story of Fran

In [23]:
from tqdm import tqdm

emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

for i in tqdm(range(len(books))):
    isbn.append(books["isbn13"][i])
    sentences = books["description"][i].split(".")
    predictions = emotion_clf(sentences)
    max_scores = calculate_max_emotion_scores(predictions)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

100%|██████████| 6144/6144 [54:28<00:00,  1.88it/s]  


In [28]:
emotions_df = pd.DataFrame(emotion_scores)
emotions_df['isbn13'] = isbn

In [30]:
emotions_df.head(3)

Unnamed: 0,anger,disgust,fear,joy,sadness,surprise,neutral,isbn13
0,0.064134,0.273592,0.928168,0.932798,0.646216,0.967158,0.729602,9780002005883
1,0.612619,0.348285,0.942528,0.704421,0.88794,0.11169,0.252546,9780002261982
2,0.064134,0.104007,0.972321,0.767237,0.549477,0.11169,0.078765,9780006178736


In [31]:
books = pd.merge(books,emotions_df,on = 'isbn13',how = 'left')

In [None]:
books.to_csv('base_data_books.csv',index=False)