In [20]:
import re
import pickle

import pandas as pd
import numpy as np

from torch.utils.data import Dataset
from transformers import pipeline

from tqdm import tqdm

## Pipeline
- get data
- apply preprocessing
- infer emotions
- aggregate and save results

In [2]:
dir_path = f"data/babylm_data/babylm_100M/"
data_sources = [
        "aochildes.train", 
        "bnc_spoken.train", 
        "cbt.train",
        "children_stories.train",
        "gutenberg.train",
        "open_subtitles.train",
        "qed.train",
        "simple_wikipedia.train",
        "switchboard.train",
        "wikipedia.train"
    ]

In [12]:
class ListDataset(Dataset):
    def __init__(self, original_list, sample_subset=0, seed=42):
        if sample_subset and sample_subset < len(original_list):
            np.random.seed(seed)
            np.random.shuffle(original_list)
            self.ds_list = original_list[:sample_subset]
        else:
            self.ds_list = original_list
    def __len__(self):
        return len(self.ds_list)

    def __getitem__(self, i):
        return self.ds_list[i]

# Parse data and preprocessing
# Feel free to add rules to filter lines
def line_processor(line):
    line = re.sub("[\t\n]", "", line) # remove tabs and newlines
    line = re.sub(r'\s+([.,!?;:])', r'\1', line) # remove spaces before punctuation
    line = line.strip() # remove leading and trailing spaces
    if len(line.split()) <= 10: # remove lines with less than 10 words
        return None
    return line

def get_data(data_sources, sample_subset=0, seed=42):
    data = {}
    for source in data_sources:
        source_path = dir_path + source
        with open(source_path, "r") as f:
            processed = [l for l in map(line_processor, f.readlines()) if l] # remove empty lines
            processed = list(set(processed)) # remove duplicates
            data[source_path] = ListDataset(processed, sample_subset, seed)
    return data

# Classify emotions of texts in dictionary format of {source: [line1, line2, ...]}
def classify_emotion(texts, pipe):
    results = []
    for source, ds in texts.items():
        print(f"Processing {source}")
        for i, scores in enumerate(tqdm(pipe(ds, truncation=True, padding=True))):
            # store the metadata and classification scores in a list of dictionaries
            res_dict = {"source": source, "text": ds[i]}
            # unnest the list of scores dictionary
            res_dict.update({f'{k}_{i}': v for i, d in enumerate(scores) for k, v in d.items()})
            results += [res_dict]
    return results

In [13]:
texts = get_data(data_sources, sample_subset=100)

In [14]:
for source, ds in texts.items():
    print(f"{source}: {len(ds)}")

data/babylm_data/babylm_100M/aochildes.train: 100
data/babylm_data/babylm_100M/bnc_spoken.train: 100
data/babylm_data/babylm_100M/cbt.train: 100
data/babylm_data/babylm_100M/children_stories.train: 100
data/babylm_data/babylm_100M/gutenberg.train: 100
data/babylm_data/babylm_100M/open_subtitles.train: 100
data/babylm_data/babylm_100M/qed.train: 100
data/babylm_data/babylm_100M/simple_wikipedia.train: 100
data/babylm_data/babylm_100M/switchboard.train: 100
data/babylm_data/babylm_100M/wikipedia.train: 100


In [15]:
# Select emotion classification model

# Faster model, less emotions category with worse accuracy
# pipe = pipeline(
#     "text-classification", 
#     model="j-hartmann/emotion-english-distilroberta-base", 
#     top_k=None,
#     framework="pt", # pytorch
#     )

# Larger model, more emotions category with better accuracy, slower inference
# NOTE: This model oftens predict neutral emotion
pipe = pipeline(
    "text-classification", 
    model="SamLowe/roberta-base-go_emotions", 
    top_k=None,
    framework="pt", # pytorch
    device="cpu" # cpu / cuda / mps
    )

In [16]:
scores = classify_emotion(texts, pipe)

Processing data/babylm_data/babylm_100M/aochildes.train


100%|██████████| 100/100 [00:03<00:00, 27.75it/s]


Processing data/babylm_data/babylm_100M/bnc_spoken.train


100%|██████████| 100/100 [00:03<00:00, 27.92it/s]


Processing data/babylm_data/babylm_100M/cbt.train


100%|██████████| 100/100 [00:03<00:00, 28.26it/s]


Processing data/babylm_data/babylm_100M/children_stories.train


100%|██████████| 100/100 [00:04<00:00, 21.33it/s]


Processing data/babylm_data/babylm_100M/gutenberg.train


100%|██████████| 100/100 [00:03<00:00, 26.10it/s]


Processing data/babylm_data/babylm_100M/open_subtitles.train


100%|██████████| 100/100 [00:03<00:00, 26.20it/s]


Processing data/babylm_data/babylm_100M/qed.train


100%|██████████| 100/100 [00:03<00:00, 26.12it/s]


Processing data/babylm_data/babylm_100M/simple_wikipedia.train


100%|██████████| 100/100 [00:04<00:00, 22.47it/s]


Processing data/babylm_data/babylm_100M/switchboard.train


100%|██████████| 100/100 [00:03<00:00, 26.45it/s]


Processing data/babylm_data/babylm_100M/wikipedia.train


100%|██████████| 100/100 [00:05<00:00, 18.91it/s]


In [21]:
pickle.dump(scores, open("processed_data/babylm.pkl", "wb"))

In [15]:
# Alternatively, create a dataframe and save to csv or parquet
# df = pd.DataFrame(scores)
# df.to_csv("processed_data/babylm.csv", index=False)
# df.to_parquet("processed_data/babylm.parquet", index=False)

## Further filter emotion labels

In [24]:
scores = pickle.load(open("processed_data/babylm.pkl", "rb"))
df = pd.DataFrame(scores)

In [25]:
set(df.iloc[0].filter(regex="^label_").values) # all possible emotions

{'admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'neutral',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise'}

In [26]:
def filter_emotion(df, threshold=0.5):
    filtered_df = df.copy()
    scores_df = df.filter(regex="^score_")
    labels_df = df.filter(regex="^label_")
    # retrieve confident label(s) with scores >= threshold
    labels = labels_df.where((scores_df >= threshold).values).values
    # remove nans
    labels = [tuple(set(l[~pd.isnull(l)]))for l in labels]
    filtered_df["filtered_labels"] = labels
    filtered_df["num_labels"] = [len(l) for l in labels]
    return filtered_df

filtered_df = filter_emotion(df)

In [27]:
emotion_freq = filtered_df.filtered_labels.value_counts()
emotion_freq.head(40) # label statistics

filtered_labels
(neutral,)                689
()                        114
(curiosity,)               23
(approval,)                20
(sadness,)                 19
(admiration,)              19
(confusion,)               13
(joy,)                     10
(disapproval,)              9
(fear,)                     9
(realization,)              8
(desire,)                   8
(gratitude,)                7
(optimism,)                 7
(caring,)                   6
(surprise,)                 4
(annoyance,)                4
(amusement,)                4
(neutral, disapproval)      3
(disgust,)                  3
(love,)                     3
(disappointment,)           3
(neutral, curiosity)        2
(remorse,)                  2
(neutral, confusion)        2
(sadness, neutral)          2
(neutral, approval)         1
(curiosity, confusion)      1
(sadness, remorse)          1
(anger,)                    1
(excitement,)               1
(realization, neutral)      1
(amusement, joy)        

In [28]:
emotion_freq.iloc[2:].sum()

197

In [29]:
filtered_df.num_labels.value_counts() # number of labels per text

num_labels
1    872
0    114
2     14
Name: count, dtype: int64

In [30]:
# drop if neutral or no emotion
tmp_df = filtered_df[~filtered_df.filtered_labels.apply(lambda x: "neutral" in x or not x)]
tmp_df.groupby("source").filtered_labels.value_counts().groupby(level=0).head(5)

source                                               filtered_labels
data/babylm_data/babylm_100M/aochildes.train         (confusion,)       6
                                                     (curiosity,)       5
                                                     (admiration,)      2
                                                     (approval,)        2
                                                     (desire,)          2
data/babylm_data/babylm_100M/bnc_spoken.train        (curiosity,)       4
                                                     (disapproval,)     3
                                                     (fear,)            3
                                                     (gratitude,)       3
                                                     (admiration,)      2
data/babylm_data/babylm_100M/cbt.train               (admiration,)      6
                                                     (sadness,)         4
                                           

In [35]:
# get samples with specific label
emo = "optimism"
filtered_df.loc[filtered_df.filtered_labels.apply(lambda x: emo in x), "text"].head().values

array(["So, we have erm, underlying confidence in the long term future of Alton Towers, I think that's very important to state.",
       "I hope you wo n't catch the measles, for they are not nice, especially when they strike in, but you would look all right, even if you did have red spots on your face.",
       'therefore, that the medical profession has long been hoping that a',
       'You know, you have a chance to have a real future here.',
       "I can see that posterity will triumph... in that day's transaction."],
      dtype=object)