In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../data/books_with_categories.csv')

We will use a small fine-tuned model that extracts 7 different emotions from given text

In [4]:
from transformers import pipeline
classifier = pipeline("text-classification", 
                      model="j-hartmann/emotion-english-distilroberta-base",
                      top_k = None,
                      )
classifier("I love this!")

Device set to use cpu
  return forward_call(*args, **kwargs)


[[{'label': 'joy', 'score': 0.9771687984466553},
  {'label': 'surprise', 'score': 0.008528673090040684},
  {'label': 'neutral', 'score': 0.005764589179307222},
  {'label': 'anger', 'score': 0.004419779404997826},
  {'label': 'sadness', 'score': 0.002092391485348344},
  {'label': 'disgust', 'score': 0.0016119909705594182},
  {'label': 'fear', 'score': 0.00041385178337804973}]]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [5]:
df['description'][0]

"A memorable, mesmerizing heroine Jennifer -- brilliant, beautiful, an attorney on the way up until the Mafia's schemes win her the hatred of an implacable enemy -- and a love more destructive than hate. A dangerous, dramatic world The Dark Arena of organized crime and flashbulb lit courtrooms where ambitious prosecutors begin their climb to political power."

In [6]:
classifier(df['description'][0])

  return forward_call(*args, **kwargs)


[[{'label': 'fear', 'score': 0.9392913579940796},
  {'label': 'anger', 'score': 0.02367589809000492},
  {'label': 'joy', 'score': 0.018979186192154884},
  {'label': 'neutral', 'score': 0.007240647450089455},
  {'label': 'disgust', 'score': 0.005369345657527447},
  {'label': 'surprise', 'score': 0.003144619520753622},
  {'label': 'sadness', 'score': 0.002298859879374504}]]

We can also work the classifier on different parts of the description in case it is not clear

In [7]:
classifier(df['description'][0].split('.'))

  return forward_call(*args, **kwargs)


[[{'label': 'joy', 'score': 0.7672374844551086},
  {'label': 'fear', 'score': 0.10406126081943512},
  {'label': 'neutral', 'score': 0.04217596352100372},
  {'label': 'anger', 'score': 0.04130087420344353},
  {'label': 'disgust', 'score': 0.024568378925323486},
  {'label': 'sadness', 'score': 0.01085986103862524},
  {'label': 'surprise', 'score': 0.009796065278351307}],
 [{'label': 'fear', 'score': 0.9723208546638489},
  {'label': 'anger', 'score': 0.013167041353881359},
  {'label': 'neutral', 'score': 0.004641933366656303},
  {'label': 'surprise', 'score': 0.00456618145108223},
  {'label': 'joy', 'score': 0.002469703322276473},
  {'label': 'disgust', 'score': 0.0015335628995671868},
  {'label': 'sadness', 'score': 0.0013006995432078838}],
 [{'label': 'neutral', 'score': 0.5494767427444458},
  {'label': 'sadness', 'score': 0.11169015616178513},
  {'label': 'disgust', 'score': 0.10400672256946564},
  {'label': 'surprise', 'score': 0.07876548916101456},
  {'label': 'anger', 'score': 0.064

In [8]:
import numpy as np

emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

def calculate_max_emotion_scores(predictions):
    per_emotion_scores = {label: [] for label in emotion_labels}
    for prediction in predictions:
        sorted_predictions = sorted(prediction, key=lambda x: x["label"])
        for index, label in enumerate(emotion_labels):
            per_emotion_scores[label].append(sorted_predictions[index]["score"])
    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}

In [9]:

for i in range(10):
    isbn.append(df["isbn13"][i])
    sentences = df["description"][i].split(".")
    predictions = classifier(sentences)
    max_scores = calculate_max_emotion_scores(predictions)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

  return forward_call(*args, **kwargs)


In [10]:
emotion_scores

{'anger': [np.float64(0.06413363665342331),
  np.float64(0.3514834940433502),
  np.float64(0.0814124047756195),
  np.float64(0.23222540318965912),
  np.float64(0.06413363665342331),
  np.float64(0.06413363665342331),
  np.float64(0.06413363665342331),
  np.float64(0.0723925232887268),
  np.float64(0.09586366266012192),
  np.float64(0.16215932369232178)],
 'disgust': [np.float64(0.10400672256946564),
  np.float64(0.15072275698184967),
  np.float64(0.18449535965919495),
  np.float64(0.7271743416786194),
  np.float64(0.10400672256946564),
  np.float64(0.17792679369449615),
  np.float64(0.10400672256946564),
  np.float64(0.11641564220190048),
  np.float64(0.025357654318213463),
  np.float64(0.47978565096855164)],
 'fear': [np.float64(0.9723208546638489),
  np.float64(0.36070650815963745),
  np.float64(0.0950433686375618),
  np.float64(0.05136279761791229),
  np.float64(0.40449780225753784),
  np.float64(0.05136279761791229),
  np.float64(0.05136279761791229),
  np.float64(0.953626394271850

In [12]:
from tqdm import tqdm

emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

for i in tqdm(range(len(df))):
    isbn.append(df["isbn13"][i])
    sentences = df["description"][i].split(".")
    predictions = classifier(sentences)
    max_scores = calculate_max_emotion_scores(predictions)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

  return forward_call(*args, **kwargs)
100%|██████████| 3825/3825 [05:51<00:00, 10.88it/s]


In [13]:
emotions_df = pd.DataFrame(emotion_scores)
emotions_df["isbn13"] = isbn

In [14]:
emotions_df

Unnamed: 0,anger,disgust,fear,joy,sadness,surprise,neutral,isbn13
0,0.064134,0.104007,0.972321,0.767237,0.549477,0.111690,0.078765,9780006178736
1,0.351483,0.150723,0.360707,0.251881,0.732685,0.111690,0.078765,9780006280897
2,0.081412,0.184495,0.095043,0.040564,0.884390,0.475881,0.078765,9780006280934
3,0.232225,0.727174,0.051363,0.043376,0.621393,0.111690,0.271903,9780006380832
4,0.064134,0.104007,0.404498,0.040564,0.549477,0.820282,0.234487,9780006472612
...,...,...,...,...,...,...,...,...
3820,0.956607,0.104007,0.051363,0.040564,0.549477,0.111690,0.078765,9784770028969
3821,0.541743,0.226292,0.305919,0.040564,0.594007,0.111690,0.078765,9788122200850
3822,0.064134,0.340519,0.185487,0.071290,0.778808,0.161394,0.118303,9788125026600
3823,0.064134,0.104007,0.051363,0.749307,0.964967,0.111690,0.078765,9788171565641


In [15]:

df = pd.merge(df, emotions_df, on = "isbn13")

In [16]:
df.head()

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,...,title_and_subtitle,tagged_description,simple_categories,anger,disgust,fear,joy,sadness,surprise,neutral
0,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,...,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",Fiction,0.064134,0.104007,0.972321,0.767237,0.549477,0.11169,0.078765
1,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,...,The Four Loves,9780006280897 Lewis' work on the nature of lov...,Nonfiction,0.351483,0.150723,0.360707,0.251881,0.732685,0.11169,0.078765
2,9780006280934,6280935,The Problem of Pain,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,...,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le...",Nonfiction,0.081412,0.184495,0.095043,0.040564,0.88439,0.475881,0.078765
3,9780006380832,6380832,Empires of the Monsoon,A History of the Indian Ocean and Its Invaders,Richard Hall,"Africa, East",http://books.google.com/books/content?id=MuPEQ...,Until Vasco da Gama discovered the sea-route t...,1998.0,4.41,...,Empires of the Monsoon: A History of the India...,9780006380832 Until Vasco da Gama discovered t...,Nonfiction,0.232225,0.727174,0.051363,0.043376,0.621393,0.11169,0.271903
4,9780006472612,6472613,Master of the Game,,Sidney Sheldon,Adventure stories,http://books.google.com/books/content?id=TkTYp...,Kate Blackwell is an enigma and one of the mos...,1982.0,4.11,...,Master of the Game,9780006472612 Kate Blackwell is an enigma and ...,Nonfiction,0.064134,0.104007,0.404498,0.040564,0.549477,0.820282,0.234487


In [17]:
df.to_csv("../data/books_with_emotions.csv", index = False)