In [1]:
import pandas as pd
import numpy as np
from transformers import pipeline
from typing import List
from tqdm import tqdm

In [2]:
books_df = pd.read_csv("../data/cleaned_books.csv")

We want to take a closer look at the categories variable to figure out how we can simplify things a bit more

In [3]:
books_categories = books_df["categories"].value_counts().reset_index().sort_values("count", ascending=False)
books_categories

Unnamed: 0,categories,count
0,Fiction,2111
1,Juvenile Fiction,390
2,Biography & Autobiography,311
3,History,207
4,Literary Criticism,124
...,...,...
252,"Detective and mystery comic books, strips, etc",1
251,Antiheroes,1
250,Divorce,1
249,"Theology, Doctrinal",1


The tail end consists of values for categories that show up just once; we are going to ignore these and actually just focus on those values that show up more than 50 times

In [4]:
books_categories[books_categories["count"] >= 50]

Unnamed: 0,categories,count
0,Fiction,2111
1,Juvenile Fiction,390
2,Biography & Autobiography,311
3,History,207
4,Literary Criticism,124
5,Philosophy,117
6,Religion,117
7,Comics & Graphic Novels,116
8,Drama,86
9,Juvenile Nonfiction,57


Next we want to map all of these to fiction and non-fiction

In [5]:
books_categories_map = {
    "Fiction": "fiction",
	"Juvenile Fiction": "fiction",
    "Biography & Autobiography": "non-fiction",
    "History": "non-fiction",
    "Literary Criticism": "non-fiction",
    "Philosophy": "non-fiction",
    "Religion": "non-fiction",
    "Comics & Graphic Novels": "fiction",
    "Drama": "fiction",
    "Juvenile Nonfiction": "non-fiction",
    "Science": "non-fiction",
    "Poetry": "fiction",
    "Literary Collections":	"fiction"
}

books_df["simple_categories"] = books_df["categories"].map(books_categories_map)

books_df

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_join_subtitle,simple_categories
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,fiction
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,fiction
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5192,9788172235222,8172235224,Mistaken Identity,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,0.0,Mistaken Identity,
5193,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,Journey to the East,
5194,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,The Monk Who Sold His Ferrari: A Fable About F...,
5195,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that: Talks with Sri Nisargadatta Maharaj,non-fiction


We want to check how many records that left categorized and uncategorized into fiction or non-fiction

In [6]:
len(books_df[books_df["simple_categories"].notna()])

3793

In [7]:
len(books_df[books_df["simple_categories"].isna()])

1404

Next we want to create a transformer pipeline with a zero-shot classification model (a model that has no prior training example of the classification task that it is about to perform, facebook/bart-large-mnli is one of the best overall)

For that we need to define the labels that we want it to classify to and then create the pipeline and then call the pipeline object on the text sequence and then pick the label with the highest probability

In [8]:
labels = ["fiction", "non-fiction"]
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device="mps")

Device set to use mps


In [9]:
text = "The King bolted up from his iron throne and ran towards the knight, sword gripped firmly in his hand."
output = classifier(text, labels)
print(output)
print(f"The text was classified as {labels[np.argmax(output["scores"])]}")

{'sequence': 'The King bolted up from his iron throne and ran towards the knight, sword gripped firmly in his hand.', 'labels': ['non-fiction', 'fiction'], 'scores': [0.7784358263015747, 0.22156420350074768]}
The text was classified as fiction


Compile all of that into a function

In [10]:
def generate_simple_category(text: str) -> str:
    '''
    Takes a text string and a list of labels and
    returns the most likely label
    '''
    output = classifier(text, labels)
    return output["labels"][np.argmax(output["scores"])]

Next we want to classify all descriptions into fiction and non-fiction categories, but first we want to check over a reasonable sample size, the accuracy of the model

To do that, we pick a sample of 100 fiction and 100 non-fiction descriptions and run the classifier on them and compute some statistics

In [11]:
preds = []
actual = []

In [12]:
fiction_descriptions = books_df[books_df["simple_categories"]=="fiction"]["description"].reset_index(drop=True)
nonfiction_descriptions = books_df[books_df["simple_categories"]=="non-fiction"]["description"].reset_index(drop=True)

In [13]:
generate_simple_category(fiction_descriptions[10])

'fiction'

In [14]:
generate_simple_category(nonfiction_descriptions[10])

'non-fiction'

In [15]:
for i in tqdm(range(100)):
    preds.append(generate_simple_category(fiction_descriptions[i]))
    actual.append("fiction")

for i in tqdm(range(100)):
    preds.append(generate_simple_category(nonfiction_descriptions[i]))
    actual.append("non-fiction")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:41<00:00,  2.39it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:42<00:00,  2.36it/s]


In [16]:
preds_stats_df = pd.DataFrame({
    "preds": preds,
    "actual": actual
})

preds_stats_df.head(101)

Unnamed: 0,preds,actual
0,fiction,fiction
1,non-fiction,fiction
2,non-fiction,fiction
3,non-fiction,fiction
4,non-fiction,fiction
...,...,...
96,non-fiction,fiction
97,non-fiction,fiction
98,non-fiction,fiction
99,fiction,fiction


In [17]:
preds_stats_df["correct_preds"] = np.where(
    preds_stats_df["preds"] == preds_stats_df["actual"], 1, 0
)

total_correct_preds = np.sum(preds_stats_df["correct_preds"])
accuracy = total_correct_preds/len(preds_stats_df)
print(f"The model was {accuracy * 100}% accurate")

The model was 65.5% accurate


This score can be improved but it isn't too bad a start

For now, we will pass the dsecriptions with missing categories to the classifier and use those predicted categories

While there is a risk involved in using an LLM to categorize the books, it is still fairly okay since these models are trained on huge corpuses and our overall labels are a good mix of both LLM-generated labels and human-annotated labels

In [18]:
missing_categories = books_df[books_df["simple_categories"].isna()][["isbn10", "description"]].reset_index(drop=True)

In [19]:
predicted_categories = []
for i in tqdm(range(len(missing_categories))):
    predicted_categories.append(generate_simple_category(missing_categories.loc[i, "description"]))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1404/1404 [07:04<00:00,  3.31it/s]


In [20]:
missing_categories = pd.concat([missing_categories, pd.Series(predicted_categories, name="predicted_categories")], axis=1).drop(columns=["description"])

In [21]:
books_df = pd.merge(books_df, missing_categories, on="isbn10", how="left")
books_df["simple_categories"] = np.where(
    books_df["simple_categories"].isna(), books_df["predicted_categories"], books_df["simple_categories"]
)
books_df.drop(columns=["predicted_categories"])

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_join_subtitle,simple_categories
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,fiction
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,fiction
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,fiction
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,non-fiction
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,non-fiction
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5192,9788172235222,8172235224,Mistaken Identity,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,0.0,Mistaken Identity,non-fiction
5193,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,Journey to the East,non-fiction
5194,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,The Monk Who Sold His Ferrari: A Fable About F...,fiction
5195,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that: Talks with Sri Nisargadatta Maharaj,non-fiction


Finally, we save the file

In [22]:
books_df.to_csv("../data/cleaned_categorized_books.csv", index=False)