In [1]:
import re

import pandas as pd
import numpy as np

from torch.utils.data import Dataset
from transformers import pipeline

from tqdm import tqdm

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [30]:
import json

def line_processor(line):
    line = re.sub("[\t\n]", "", line) # remove tabs and newlines
    line = re.sub(r'\s+([.,!?;:])', r'\1', line) # remove spaces before punctuation
    line = line.strip() # remove leading and trailing spaces
    if len(line.split()) <= 10: # remove lines with less than 10 words
        return None
    return line


class ListDataset(Dataset):
    def __init__(self, original_list, sample_subset=0, seed=42):
        if sample_subset and sample_subset < len(original_list):
            np.random.seed(seed)
            np.random.shuffle(original_list)
            self.ds_list = original_list[:sample_subset]
        else:
            self.ds_list = original_list
    def __len__(self):
        return len(self.ds_list)

    def __getitem__(self, i):
        return self.ds_list[i]

# Load your JSON data from the file
with open('regen.json', 'r') as file:
    data = json.load(file)

output_texts = {}
# Loop through each task in the data and extract the output
for task in data:
    output_texts[hash(task['instruction'])] = line_processor(task['output'])



In [31]:
output_texts

{4642226854051725291: 'The weather for today is mostly sunny with a high of 75 degrees. There is a 20% chance of rain in the evening. Tomorrow, the weather will be partly cloudy with a high of 80 degrees and a 10% chance of rain. Looks like a great day to enjoy some time outdoors!',
 4670833557879808156: '- Fresh broccoli for added vitamins and fiber- Brown rice as a healthy source of carbohydrates- Lean chicken breasts for a good source of protein- Almonds for healthy fats and nutrients. With these items, you can make a delicious and nutritious meal to fuel your body and keep you feeling your best.',
 -2463548543465539305: "It was a typical Monday morning when Sarah discovered she had won the $100 million lottery. She couldn't believe it and immediately quit her job at the local diner. She decided to use some of the money to travel to her dream destination, Greece. However, while on her trip, Sarah met an artist struggling to make ends meet and decided to use her winnings to help him 

In [34]:
# select the emotion model
# Larger model, more emotions category with better accuracy, slower inference
# NOTE: This model oftens predict neutral emotion
pipe = pipeline(
    "text-classification", 
    model="SamLowe/roberta-base-go_emotions", 
    top_k=None,
    framework="pt", # pytorch
    device="mps" # multi-precision support for M1/M2 mac
    )

In [35]:
# Classify emotions of texts in dictionary format of {source: [line1, line2, ...]}
def classify_emotion(texts, pipe, debug=False):
    if debug:
        items_to_process = dict(list(texts.items())[:2])
    else:
        items_to_process = texts
    results = []
    for source, ds in items_to_process.items():
        print(f"Processing {source}")
        for i, scores in enumerate(tqdm(pipe(ds, truncation=True, padding=True))):
            # store the metadata and classification scores in a list of dictionaries
            results += [{"source": source, "text": ds[i], "scores": scores}]
    return results

In [36]:
single_score= pipe('good day', truncation=True, padding=True)
single_score

[[{'label': 'joy', 'score': 0.5746990442276001},
  {'label': 'admiration', 'score': 0.13253362476825714},
  {'label': 'gratitude', 'score': 0.10777304321527481},
  {'label': 'excitement', 'score': 0.10059910267591476},
  {'label': 'caring', 'score': 0.05107508599758148},
  {'label': 'approval', 'score': 0.03529415279626846},
  {'label': 'neutral', 'score': 0.03499137610197067},
  {'label': 'relief', 'score': 0.021551210433244705},
  {'label': 'optimism', 'score': 0.01971583254635334},
  {'label': 'love', 'score': 0.010799115523695946},
  {'label': 'pride', 'score': 0.010441371239721775},
  {'label': 'desire', 'score': 0.00317931454628706},
  {'label': 'amusement', 'score': 0.0030931527726352215},
  {'label': 'realization', 'score': 0.003043602919206023},
  {'label': 'annoyance', 'score': 0.0022891098633408546},
  {'label': 'surprise', 'score': 0.002054033800959587},
  {'label': 'sadness', 'score': 0.0018666906980797648},
  {'label': 'disapproval', 'score': 0.0017995773814618587},
  {'l

In [38]:
single_score[0][0]['label']

'joy'

In [15]:
scores = classify_emotion(output_texts, pipe, debug=True)

Processing 4642226854051725291


100%|██████████| 1/1 [00:00<00:00, 35848.75it/s]


Processing 4670833557879808156


100%|██████████| 1/1 [00:00<00:00, 30174.85it/s]


In [16]:
scores

[{'source': 4642226854051725291,
  'text': 'T',
  'scores': [{'label': 'joy', 'score': 0.7305636405944824},
   {'label': 'admiration', 'score': 0.31073689460754395},
   {'label': 'optimism', 'score': 0.13744428753852844},
   {'label': 'excitement', 'score': 0.12445004284381866},
   {'label': 'approval', 'score': 0.08518987894058228},
   {'label': 'neutral', 'score': 0.026590565219521523},
   {'label': 'caring', 'score': 0.025285402312874794},
   {'label': 'gratitude', 'score': 0.01968013495206833},
   {'label': 'relief', 'score': 0.018525484949350357},
   {'label': 'desire', 'score': 0.013881110586225986},
   {'label': 'love', 'score': 0.01323052030056715},
   {'label': 'pride', 'score': 0.012432207353413105},
   {'label': 'amusement', 'score': 0.011624273844063282},
   {'label': 'realization', 'score': 0.005895585753023624},
   {'label': 'curiosity', 'score': 0.00381640437990427},
   {'label': 'annoyance', 'score': 0.0034638645593076944},
   {'label': 'disapproval', 'score': 0.0028047

In [17]:
# Build preprocessed data into a dataframe
def build_results_df(results):
    n_labels = len(results[0]["scores"])

    df = pd.DataFrame(results)

    # expanding the classification scores data structure from classifier output
    scores_df = df["scores"].apply(pd.Series)
    result_dfs = []
    for i in range(n_labels):
        tmp = scores_df[i].apply(pd.Series).add_suffix(f"_{i}")
        result_dfs.append(tmp)

    result_dfs = pd.concat(result_dfs, axis=1)
    df = pd.concat([df[["source", "text"]], result_dfs], axis=1)
    return df

In [18]:
df = build_results_df(scores)

In [20]:
df.head()

Unnamed: 0,source,text,label_0,score_0,label_1,score_1,label_2,score_2,label_3,score_3,...,label_23,score_23,label_24,score_24,label_25,score_25,label_26,score_26,label_27,score_27
0,4642226854051725291,T,joy,0.730564,admiration,0.310737,optimism,0.137444,excitement,0.12445,...,fear,0.000985,grief,0.000814,remorse,0.000673,disgust,0.000536,embarrassment,0.000475
1,4670833557879808156,-,approval,0.450838,neutral,0.212659,admiration,0.205657,caring,0.080455,...,surprise,0.000629,grief,0.000568,disgust,0.000551,remorse,0.000322,embarrassment,0.00026


In [21]:
df["label_0"].value_counts() # first label is the most confident one

label_0
joy         1
approval    1
Name: count, dtype: int64