In [3]:
import numpy as np
import pandas as pd
import plotly_express as px

# Load the cleaned books dataset from the artifacts directory.
cleaned_df = pd.read_csv('../artifacts/datasets/books_cleaned.csv')

# Bar chart of the 30 most frequent values in the `categories` column
# (value_counts sorts by frequency, highest first).
category_counts = cleaned_df['categories'].value_counts()
px.bar(category_counts.iloc[:30])

In [None]:
# Names of the 10 most frequent categories.
# The names are read from the index of value_counts() rather than from
# reset_index(): before pandas 2.0 the reset frame's `categories` column holds
# the *counts* (the names land in a column called `index`), so the previous
# expression only produced category names on pandas >= 2.0.
top_categories = cleaned_df['categories'].value_counts().index[:10].values

# Rows whose category is one of those top 10.
cleaned_df[cleaned_df['categories'].isin(top_categories)]

In [None]:
# Build the zero-shot classification pipeline from the
# "facebook/bart-large-mnli" checkpoint.
from transformers import pipeline

classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

In [None]:
# Coarse label -> the detailed categories that collapse into it.
_CATEGORIES_BY_SIMPLE_LABEL = {
    'Fiction': ['Fiction', 'Comics & Graphic Novels', 'Drama', 'Poetry'],
    "Children's Fiction": ['Juvenile Fiction'],
    'NonFiction': [
        'Biography & Autobiography',
        'History',
        'Literary Criticism',
        'Philosophy',
        'Religion',
        'Science',
    ],
    "Children's NonFiction": ['Juvenile Nonfiction'],
}

# Flattened lookup used by the rest of the notebook: detailed category -> coarse label.
labels = {
    category: simple_label
    for simple_label, categories in _CATEGORIES_BY_SIMPLE_LABEL.items()
    for category in categories
}

# Candidate labels handed to the zero-shot classifier.
possible_labels = ['Fiction', 'NonFiction']

In [None]:
# Translate each detailed category into its coarse label via the `labels`
# lookup. Series.map leaves NaN wherever the category is not a key of `labels`
# (including missing categories), which matches the previous explicit
# "else NaN" branch without needing numpy in this cell.
cleaned_df['simple_categories'] = cleaned_df['categories'].map(labels)
cleaned_df

#### Checking Classification Accuracy

In [None]:
def classify(x,labels=possible_labels):
    """Return the candidate label that `classifier` scores highest for `x`.

    Parameters
    ----------
    x
        Value passed to `classifier` as the input to classify.
    labels
        Candidate labels passed to `classifier`; defaults to `possible_labels`.

    Returns
    -------
    The element of the result's ``'labels'`` list located at the position of
    the largest value in its ``'scores'`` list.
    """
    result = classifier(x, labels)
    best_position = int(np.argmax(result['scores']))
    return result['labels'][best_position]

# Rows that received a reference label from the manual `labels` mapping.
# .copy() makes this an independent frame, so adding a column below does not
# write into a slice of cleaned_df (which triggers SettingWithCopyWarning).
labled_data = cleaned_df[~cleaned_df['simple_categories'].isna()].copy()

# Run the classifier once per distinct category and broadcast the result to
# every row with that category, instead of one model call per row.
predicted_by_category = {
    category: classify(category, possible_labels)
    for category in labled_data['categories'].unique()
}
labled_data['predicted_simple_categories'] = labled_data['categories'].map(predicted_by_category)

In [None]:
# Percentage of labelled rows where the classifier's label equals the manual one.
# NOTE(review): rows whose manual label is "Children's Fiction" or
# "Children's NonFiction" can never count as correct, because the classifier
# only chooses between the entries of `possible_labels`.
is_correct = labled_data['simple_categories'] == labled_data['predicted_simple_categories']
correct_pred = labled_data.loc[is_correct, :]
len(correct_pred) / len(labled_data) * 100
# 88% accuracy

In [None]:
# Raw classifier output (not just the winning label) for every row's category.
# NOTE(review): this issues one model call per row and the result is not
# referenced by any later cell visible in this notebook — confirm it is needed.
predicted_simple_categories = cleaned_df['categories'].map(
    lambda category: classifier(category, possible_labels)
)

#### Labeling with predicted labels

In [None]:
def label_category(cat,lables = labels ,possible_labels=possible_labels):
    """Return the coarse label for a detailed category.

    Parameters
    ----------
    cat
        Detailed category value to label.
    lables
        Mapping from detailed category to coarse label, consulted first.
        (The parameter keeps its original spelling so existing keyword
        callers continue to work; it is now actually used instead of the
        global `labels`.)
    possible_labels
        Candidate labels handed to the zero-shot classifier when `cat` is not
        a key of `lables`.

    Returns
    -------
    ``lables[cat]`` when `cat` is a key of the mapping, otherwise the label
    chosen by `classify`. Missing values (NaN/None) are returned as NaN
    without calling the model.
    """
    # A missing category carries no text to classify; propagate it as missing.
    if pd.isna(cat):
        return np.nan
    if cat in lables:
        return lables[cat]
    # Fall back to the zero-shot model, reusing the shared helper.
    return classify(cat, possible_labels)

In [None]:
# Label each distinct, non-missing category once and broadcast the result to
# all rows sharing it; this avoids repeating the (slow) model call for
# duplicate categories. Rows with a missing category get NaN.
label_by_category = {
    category: label_category(category)
    for category in cleaned_df['categories'].dropna().unique()
}
cleaned_df['simple_label_prediction'] = cleaned_df['categories'].map(label_by_category)

# Persist the per-book labels so the classification does not have to be re-run.
simple_categories_classified = cleaned_df[['isbn10','categories','simple_label_prediction']]
simple_categories_classified.to_csv('../artifacts/datasets/simple_categories_classified.csv',index=False)

In [None]:
# Reload the saved per-book labels; the first CSV column becomes the index.
simple_categories_classified = pd.read_csv(
    '../artifacts/datasets/simple_categories_classified.csv',
    index_col=0,
)

In [None]:
# Attach the saved label to every book by ISBN.
# Only `isbn10` and `simple_label_prediction` are taken from the saved table,
# and any `simple_label_prediction` already present on cleaned_df is dropped
# first. Otherwise the columns shared by both frames (`categories`, and
# possibly `simple_label_prediction`) get `_x`/`_y` suffixes in the merge and
# the later lookups of those plain column names raise KeyError.
predictions = simple_categories_classified
if 'isbn10' not in predictions.columns:
    # The table may have been reloaded with isbn10 as its index.
    predictions = predictions.reset_index()

combined_df = (
    cleaned_df
    .drop(columns=['simple_label_prediction'], errors='ignore')
    .merge(predictions[['isbn10', 'simple_label_prediction']], on='isbn10', how='left')
)
combined_df.columns

In [None]:
# Store the merged label under `simple_categories`, remove the now-redundant
# `simple_label_prediction` column, and keep only rows that have a category.
combined_df = (
    combined_df
    .assign(simple_categories=combined_df['simple_label_prediction'])
    .drop(columns=['simple_label_prediction'])
    .dropna(subset=['categories'])
)

In [None]:
# Write the final table as comma-separated text, without the row index.
combined_df.to_csv(
    path_or_buf='../artifacts/datasets/books_with_categories.txt',
    index=False,
)