In [None]:
import pandas as pd

In [None]:
# Read books_cleaned.csv (path relative to the working directory) into a DataFrame.
books_csv_path = "books_cleaned.csv"
books = pd.read_csv(books_csv_path)

In [None]:
# Count how many rows carry each "categories" value; reset_index turns the
# resulting Series into a two-column DataFrame for display.
books["categories"].value_counts().reset_index()

In [None]:
# Show only the category values that occur at least 50 times.
category_counts = books["categories"].value_counts().reset_index()
category_counts[category_counts["count"] >= 50]

In [None]:
# Inspect the rows whose category is exactly "Juvenile Fiction".
is_juvenile_fiction = books["categories"].eq("Juvenile Fiction")
books.loc[is_juvenile_fiction]

In [None]:
# Collapse the detailed "categories" values into a handful of broad groups.
# Series.map leaves NaN in "simple_categories" for any category that has no
# entry in this dictionary.
category_mapping = {
    "Fiction": "Fiction",
    "Juvenile Fiction": "Children's Fiction",
    "Biography & Autobiography": "Nonfiction",
    "History": "Nonfiction",
    # NOTE(review): corrected from "Literacy Criticism", which does not match
    # the standard "Literary Criticism" subject heading and therefore never
    # mapped any row -- confirm against the values in books["categories"].
    "Literary Criticism": "Nonfiction",
    "Philosophy": "Nonfiction",
    "Religion": "Nonfiction",
    "Comics & Graphic Novels": "Fiction",
    "Drama": "Fiction",
    "Juvenile Nonfiction": "Children's Nonfiction",
    "Science": "Nonfiction",
    # NOTE(review): poetry is grouped with nonfiction here -- confirm intended.
    "Poetry": "Nonfiction",
}

# Backward-compatible alias for the original (misspelled) variable name.
catgeory_mapping = category_mapping

books["simple_categories"] = books["categories"].map(category_mapping)

In [None]:
# Show the rows whose "simple_categories" value is not missing.
has_simple_category = books["simple_categories"].notna()
books[has_simple_category]

In [None]:
from transformers import pipeline

# Candidate labels handed to the zero-shot classifier.
fiction_categories = ["Fiction", "Nonfiction"]

# Zero-shot classification pipeline using the facebook/bart-large-mnli checkpoint.
pipe = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

In [None]:
# Use the first description among the books labelled "Fiction" as a sample input.
fiction_descriptions = books.loc[books["simple_categories"] == "Fiction", "description"]
sequence = fiction_descriptions.reset_index(drop=True)[0]

In [None]:
# Classify the sample description against the candidate labels and display
# the raw pipeline output.
pipe(sequence, fiction_categories)

In [None]:
import numpy as np

# Run the (expensive) model once and reuse its output for both lookups.
prediction = pipe(sequence, fiction_categories)

# Position of the highest score, and the label stored at that same position.
max_index = np.argmax(prediction["scores"])
max_label = prediction["labels"][max_index]

In [None]:
# Show the label with the highest score for the sample description.
print(max_label)

In [None]:
def generate_predictions(sequence, categories, classifier=None):
    """Return the candidate label that the classifier scores highest.

    Parameters
    ----------
    sequence
        Text to classify (a book description in this notebook).
    categories
        Candidate labels passed to the classifier.
    classifier : callable, optional
        Called as ``classifier(sequence, categories)``; must return a mapping
        with parallel ``"labels"`` and ``"scores"`` entries. Defaults to the
        notebook-level ``pipe``.

    Returns
    -------
    The entry of ``"labels"`` at the position of the largest ``"scores"``
    value (the first such position on ties).
    """
    if classifier is None:
        # Resolved at call time, so the function can be defined before `pipe`
        # exists and picks up a re-created `pipe` automatically.
        classifier = pipe

    predictions = classifier(sequence, categories)
    max_index = np.argmax(predictions["scores"])
    return predictions["labels"][max_index]

In [None]:
from tqdm import tqdm

actual_categories = []
predicted_categories = []

# Filter the "Fiction" descriptions once, instead of re-filtering and
# re-indexing the whole frame on every loop iteration.
fiction_descriptions = books.loc[books["simple_categories"] == "Fiction", "description"].reset_index(drop=True)

# Evaluate on the first 300 descriptions; head() simply yields fewer items
# (rather than raising) if fewer than 300 are available.
for sequence in tqdm(fiction_descriptions.head(300)):
    predicted_categories.append(generate_predictions(sequence, fiction_categories))
    actual_categories.append("Fiction")

In [None]:
# Filter the "Nonfiction" descriptions once, instead of re-filtering and
# re-indexing the whole frame on every loop iteration.
nonfiction_descriptions = books.loc[books["simple_categories"] == "Nonfiction", "description"].reset_index(drop=True)

# Appends to the actual_categories / predicted_categories lists created in the
# "Fiction" evaluation cell. head() yields fewer items (rather than raising)
# if fewer than 300 descriptions are available.
for sequence in tqdm(nonfiction_descriptions.head(300)):
    predicted_categories.append(generate_predictions(sequence, fiction_categories))
    actual_categories.append("Nonfiction")

In [None]:
# One row per evaluated description: the known label next to the predicted one.
prediction_columns = {
    "actual_categories": actual_categories,
    "predicted_categories": predicted_categories,
}
predictions_df = pd.DataFrame(prediction_columns)

In [None]:
# Preview the first rows of the actual-vs-predicted table.
predictions_df.head()

In [None]:
# The element-wise comparison already yields a boolean Series, so it can be
# stored directly without wrapping it in np.where(..., True, False).
predictions_df["correct_prediction"] = (
    predictions_df["actual_categories"] == predictions_df["predicted_categories"]
)

In [None]:
# Accuracy: the mean of a boolean column is the fraction of True values.
predictions_df["correct_prediction"].mean()

In [None]:
# Fresh accumulators for the books that still lack a simplified category.
isbns, predicted_categories = [], []

# ISBN and description of every book whose "simple_categories" is missing,
# re-indexed from 0 so rows can be addressed by their position.
needs_category = books["simple_categories"].isna()
missing_categories = books.loc[needs_category, ["isbn13", "description"]].reset_index(drop=True)

In [None]:
# Classify every uncategorised description, recording its ISBN alongside the
# predicted label so the two lists stay aligned.
for i in tqdm(range(len(missing_categories))):
    sequence = missing_categories.at[i, "description"]
    predicted_categories.append(generate_predictions(sequence, fiction_categories))
    isbns.append(missing_categories.at[i, "isbn13"])

In [None]:
# Pair each ISBN with the category predicted for it.
missing_prediction_columns = {
    "isbn13": isbns,
    "predicted_categories": predicted_categories,
}
missing_predicted_df = pd.DataFrame(missing_prediction_columns)

In [None]:
# Display the predicted categories for the books that had no simplified category.
missing_predicted_df