In [None]:
import pandas as pd
from transformers import pipeline
import torch
from dotenv import load_dotenv
import numpy as np
load_dotenv()

In [4]:
df = pd.read_csv('../data/cleaned_data_v2.csv')

In [5]:
df['categories'][:100]

0               Fiction
1        Christian life
2        Christian life
3          Africa, East
4     Adventure stories
            ...        
95     Juvenile Fiction
96              Fiction
97           Philosophy
98              Fiction
99             Religion
Name: categories, Length: 100, dtype: object

We can look at categories with 34 books or more (the most frequent categories)

In [6]:
df['categories'].value_counts().reset_index().query('count >= 34')

Unnamed: 0,categories,count
0,Fiction,1681
1,Juvenile Fiction,337
2,Biography & Autobiography,232
3,History,134
4,Comics & Graphic Novels,86
5,Religion,80
6,Philosophy,70
7,Literary Criticism,66
8,Drama,65
9,Juvenile Nonfiction,46


We will narrow down these frequent categories into a smaller meaningful subset 

In [7]:
category_mapping = {
 'Fiction' : "Fiction",
 'Juvenile Fiction': "Children's Fiction",
 'Biography & Autobiography': "Nonfiction",
 'History': "Nonfiction",
 'Literary Criticism': "Nonfiction",
 'Philosophy': "Nonfiction",
 'Religion': "Nonfiction",
 'Comics & Graphic Novels': "Fiction",
 'Drama': "Fiction",
 'Juvenile Nonfiction': "Children's Nonfiction",
 'Science': "Nonfiction",
 'Poetry': "Fiction"
 }

In [8]:
df['simple_categories'] = df['categories'].map(category_mapping)

In [9]:
df.head()

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,missing_description,age_of_book,words_in_description,title_and_subtitle,tagged_description,simple_categories
0,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,0,32.0,57,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",Fiction
1,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,0,23.0,45,The Four Loves,9780006280897 Lewis' work on the nature of lov...,
2,9780006280934,6280935,The Problem of Pain,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,0,23.0,75,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le...",
3,9780006380832,6380832,Empires of the Monsoon,A History of the Indian Ocean and Its Invaders,Richard Hall,"Africa, East",http://books.google.com/books/content?id=MuPEQ...,Until Vasco da Gama discovered the sea-route t...,1998.0,4.41,608.0,65.0,0,27.0,80,Empires of the Monsoon: A History of the India...,9780006380832 Until Vasco da Gama discovered t...,
4,9780006472612,6472613,Master of the Game,,Sidney Sheldon,Adventure stories,http://books.google.com/books/content?id=TkTYp...,Kate Blackwell is an enigma and one of the mos...,1982.0,4.11,489.0,43540.0,0,43.0,30,Master of the Game,9780006472612 Kate Blackwell is an enigma and ...,


In [10]:
df[~(df['simple_categories'].isna())].shape

(2869, 18)

In [11]:
print(torch.__version__)

2.7.1+cpu


We will use zero-shot text classification (using facebook bart pretrained model)

In [14]:

pipe = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [15]:
fiction_categories = ['Fiction','Nonfiction']

In [18]:
sequence = df.loc[df['simple_categories'] == 'Fiction','description'].reset_index(drop=True)[0]

In [19]:
pipe(sequence,fiction_categories)

{'sequence': "A memorable, mesmerizing heroine Jennifer -- brilliant, beautiful, an attorney on the way up until the Mafia's schemes win her the hatred of an implacable enemy -- and a love more destructive than hate. A dangerous, dramatic world The Dark Arena of organized crime and flashbulb lit courtrooms where ambitious prosecutors begin their climb to political power.",
 'labels': ['Fiction', 'Nonfiction'],
 'scores': [0.5055234432220459, 0.4944764971733093]}

Now we can llok at the category with highest score/probability using np.argmax

In [21]:
max_index = np.argmax(pipe(sequence,fiction_categories)['scores'])
max_label = pipe(sequence,fiction_categories)['labels'][max_index]

In [22]:
max_label

'Fiction'

We can now create a function that generates the most probable label(fiction or non fiction) based on given sequence

In [23]:
def generate_predictions(sequence,categories):
    predictions = pipe(sequence,categories)
    max_index = np.argmax(predictions['scores'])
    max_label = predictions['labels'][max_index] 
    return max_label   

Checking Model accuracy

In [24]:
from tqdm import tqdm
actual_cats = []
predicted_cats = []

for i in tqdm(range(0,300)):
    sequence = df.loc[df['simple_categories'] == 'Fiction',"description"].reset_index(drop=True)
    predicted_cats += [generate_predictions(sequence,fiction_categories)]
    actual_cats += ["Fiction"]

100%|██████████| 300/300 [04:32<00:00,  1.10it/s]


In [25]:
for i in tqdm(range(0,300)):
    sequence = df.loc[df['simple_categories'] == 'Nonfiction',"description"].reset_index(drop=True)
    predicted_cats += [generate_predictions(sequence,fiction_categories)]
    actual_cats += ["Nonfiction"]

100%|██████████| 300/300 [04:52<00:00,  1.03it/s]


In [26]:
predictions_df = pd.DataFrame({"actual_categories":actual_cats,"predicted_categories":predicted_cats})
predictions_df

Unnamed: 0,actual_categories,predicted_categories
0,Fiction,Fiction
1,Fiction,Fiction
2,Fiction,Fiction
3,Fiction,Fiction
4,Fiction,Fiction
...,...,...
595,Nonfiction,Nonfiction
596,Nonfiction,Nonfiction
597,Nonfiction,Nonfiction
598,Nonfiction,Nonfiction


In [27]:
predictions_df['correct_predictions'] = (
    np.where(predictions_df['actual_categories'] == predictions_df['predicted_categories'],1,0)
)

In [30]:
predictions_df['correct_predictions'].sum()/len(predictions_df)

np.float64(1.0)

Now we can use this model to predict the missing categories

In [50]:
isbns = []
predicted_cats = []

missing_cats = df.loc[df['simple_categories'].isna(),["isbn13","description"]].reset_index(drop=True)
len(missing_cats)

956

In [52]:
for i in tqdm(range(0,len(missing_cats))):
    sequence = missing_cats['description'][i]
    predicted_cats += [generate_predictions(sequence,fiction_categories)]
    isbns += [missing_cats['isbn13'][i]]

100%|██████████| 956/956 [20:58<00:00,  1.32s/it]


In [53]:
len(isbns)

956

In [54]:
len(predicted_cats)

956

In [56]:
missing_predicted_df = pd.DataFrame({"isbn13":isbns,"predicted_categories":predicted_cats})
missing_predicted_df

Unnamed: 0,isbn13,predicted_categories
0,9780006280897,Nonfiction
1,9780006280934,Nonfiction
2,9780006380832,Nonfiction
3,9780006472612,Nonfiction
4,9780006483014,Fiction
...,...,...
951,9784766113389,Fiction
952,9784770028037,Nonfiction
953,9788125026600,Nonfiction
954,9788171565641,Fiction


Last step will be merging the dataframes

In [57]:
df = pd.merge(df, missing_predicted_df, on="isbn13", how="left")
df["simple_categories"] = np.where(df["simple_categories"].isna(), df["predicted_categories"], df["simple_categories"])
df = df.drop(columns = ["predicted_categories"])

In [60]:
df['simple_categories'].isna().sum()

np.int64(0)

Saving the dataset

In [62]:
df.to_csv("../data/books_with_categories.csv", index=False)