In [1]:
import re

import pandas as pd
import numpy as np

from torch.utils.data import Dataset
from transformers import pipeline

from tqdm import tqdm

## Pipeline
- get data
- apply preprocessing
- infer emotions
- aggregate and save results

In [6]:
dir_path = f"data/babylm_data/babylm_100M/"
data_sources = [
        "aochildes.train", 
        "bnc_spoken.train", 
        "cbt.train",
        "children_stories.train",
        "gutenberg.train",
        "open_subtitles.train",
        "qed.train",
        "simple_wikipedia.train",
        "switchboard.train",
        "wikipedia.train"
    ]

In [7]:
class ListDataset(Dataset):
    def __init__(self, original_list, sample_subset=0, seed=42):
        if sample_subset and sample_subset < len(original_list):
            np.random.seed(seed)
            np.random.shuffle(original_list)
            self.ds_list = original_list[:sample_subset]
        else:
            self.ds_list = original_list
    def __len__(self):
        return len(self.ds_list)

    def __getitem__(self, i):
        return self.ds_list[i]

# Parse data and preprocessing
# Feel free to add rules to filter lines
def line_processor(line):
    line = re.sub("[\t\n]", "", line) # remove tabs and newlines
    line = re.sub(r'\s+([.,!?;:])', r'\1', line) # remove spaces before punctuation
    line = line.strip() # remove leading and trailing spaces
    if len(line.split()) <= 10: # remove lines with less than 10 words
        return None
    return line

def get_data(data_sources, sample_subset=0, seed=42):
    data = {}
    for source in data_sources:
        source_path = dir_path + source
        with open(source_path, "r") as f:
            processed = [l for l in map(line_processor, f.readlines()) if l] # remove empty lines
            processed = list(set(processed)) # remove duplicates
            data[source_path] = ListDataset(processed, sample_subset, seed)
    return data

# Classify emotions of texts in dictionary format of {source: [line1, line2, ...]}
def classify_emotion(texts, pipe, debug=False):
    results = []
    for source, ds in texts.items():
        print(f"Processing {source}")
        if debug:
            ds = ds[:100] # draw a few samples for debugging purposes
        for i, scores in enumerate(tqdm(pipe(ds, truncation=True, padding=True))):
            # store the metadata and classification scores in a list of dictionaries
            results += [{"source": source, "text": ds[i], "scores": scores}]
    return results

# Build preprocessed data into a dataframe
def build_results_df(results):
    n_labels = len(results[0]["scores"])

    df = pd.DataFrame(results)

    # expanding the classification scores data structure from classifier output
    scores_df = df["scores"].apply(pd.Series)
    result_dfs = []
    for i in range(n_labels):
        tmp = scores_df[i].apply(pd.Series).add_suffix(f"_{i}")
        result_dfs.append(tmp)

    result_dfs = pd.concat(result_dfs, axis=1)
    df = pd.concat([df[["source", "text"]], result_dfs], axis=1)
    return df

In [8]:
texts = get_data(data_sources, sample_subset=50000)

In [9]:
for source, ds in texts.items():
    print(f"{source}: {len(ds)}")

data/babylm_data/babylm_100M/aochildes.train: 42420
data/babylm_data/babylm_100M/bnc_spoken.train: 50000
data/babylm_data/babylm_100M/cbt.train: 50000
data/babylm_data/babylm_100M/children_stories.train: 50000
data/babylm_data/babylm_100M/gutenberg.train: 50000
data/babylm_data/babylm_100M/open_subtitles.train: 50000
data/babylm_data/babylm_100M/qed.train: 50000
data/babylm_data/babylm_100M/simple_wikipedia.train: 50000
data/babylm_data/babylm_100M/switchboard.train: 39608
data/babylm_data/babylm_100M/wikipedia.train: 50000


In [10]:
# Select emotion classification model

# Faster model, less emotions category with worse accuracy
# pipe = pipeline(
#     "text-classification", 
#     model="j-hartmann/emotion-english-distilroberta-base", 
#     top_k=None,
#     framework="pt", # pytorch
#     )

# Larger model, more emotions category with better accuracy, slower inference
# NOTE: This model oftens predict neutral emotion
pipe = pipeline(
    "text-classification", 
    model="SamLowe/roberta-base-go_emotions", 
    top_k=None,
    framework="pt", # pytorch
    device="mps" # multi-precision support for M1/M2 mac
    )

In [11]:
scores = classify_emotion(texts, pipe, debug=False)

Processing data/babylm_data/babylm_100M/aochildes.train


100%|██████████| 42420/42420 [13:42<00:00, 51.59it/s] 


Processing data/babylm_data/babylm_100M/bnc_spoken.train


100%|██████████| 50000/50000 [21:09<00:00, 39.40it/s]  


Processing data/babylm_data/babylm_100M/cbt.train


100%|██████████| 50000/50000 [16:01<00:00, 52.01it/s]  


Processing data/babylm_data/babylm_100M/children_stories.train


100%|██████████| 50000/50000 [29:16<00:00, 28.47it/s]  


Processing data/babylm_data/babylm_100M/gutenberg.train


100%|██████████| 50000/50000 [14:19<00:00, 58.19it/s]


Processing data/babylm_data/babylm_100M/open_subtitles.train


100%|██████████| 50000/50000 [14:39<00:00, 56.86it/s] 


Processing data/babylm_data/babylm_100M/qed.train


100%|██████████| 50000/50000 [15:03<00:00, 55.32it/s]


Processing data/babylm_data/babylm_100M/simple_wikipedia.train


100%|██████████| 50000/50000 [17:46<00:00, 46.90it/s]  


Processing data/babylm_data/babylm_100M/switchboard.train


100%|██████████| 39608/39608 [11:54<00:00, 55.42it/s]


Processing data/babylm_data/babylm_100M/wikipedia.train


100%|██████████| 50000/50000 [20:37<00:00, 40.40it/s]  


In [12]:
df = build_results_df(scores)

In [15]:
df.to_csv("processed_data/babylm.csv", index=False)

In [16]:
df.to_parquet("processed_data/babylm.parquet", index=False)

  if _pandas_api.is_sparse(col):


## Further filter emotion labels

In [2]:
df = pd.read_parquet("processed_data/babylm.parquet")

In [114]:
set(df.iloc[0].filter(regex="^label_").values) # all possible emotions

{'admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'neutral',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise'}

In [123]:
def filter_emotion(df, threshold=0.5):
    filtered_df = df.copy()
    scores_df = df.filter(regex="^score_")
    labels_df = df.filter(regex="^label_")
    # retrieve confident label(s) with scores >= threshold
    labels = labels_df.where((scores_df >= threshold).values).values
    # remove nans
    labels = [tuple(set(l[~pd.isnull(l)]))for l in labels]
    filtered_df["filtered_labels"] = labels
    filtered_df["num_labels"] = [len(l) for l in labels]
    return filtered_df

filtered_df = filter_emotion(df)

In [124]:
emotion_freq = filtered_df.filtered_labels.value_counts()
emotion_freq.head(40) # label statistics

filtered_labels
(neutral,)                339181
()                         55195
(curiosity,)               13031
(admiration,)              12284
(approval,)                 7412
(sadness,)                  6690
(confusion,)                5977
(disapproval,)              4721
(joy,)                      4260
(caring,)                   3149
(desire,)                   2783
(love,)                     2563
(optimism,)                 2539
(fear,)                     2368
(amusement,)                2236
(disappointment,)           2211
(surprise,)                 1925
(realization,)              1917
(gratitude,)                1812
(remorse,)                  1073
(annoyance,)                 926
(anger,)                     826
(neutral, curiosity)         821
(excitement,)                820
(neutral, sadness)           600
(neutral, disapproval)       365
(disapproval, neutral)       365
(confusion, neutral)         346
(disgust,)                   334
(neutral, confusion)       

In [125]:
emotion_freq.iloc[2:].sum()

87652

In [126]:
filtered_df.num_labels.value_counts() # number of labels per text

num_labels
1    421357
0     55195
2      5473
3         3
Name: count, dtype: int64

In [132]:
# drop if neutral or no emotion
tmp_df = filtered_df[~filtered_df.filtered_labels.apply(lambda x: "neutral" in x or not x)]
tmp_df.groupby("source").filtered_labels.value_counts().groupby(level=0).head(5)

source                                               filtered_labels  
data/babylm_data/babylm_100M/aochildes.train         (curiosity,)         4910
                                                     (confusion,)         1272
                                                     (admiration,)         728
                                                     (disapproval,)        619
                                                     (approval,)           468
data/babylm_data/babylm_100M/bnc_spoken.train        (curiosity,)         1958
                                                     (approval,)          1625
                                                     (confusion,)         1370
                                                     (disapproval,)        961
                                                     (admiration,)         881
data/babylm_data/babylm_100M/cbt.train               (admiration,)        2140
                                                     (sadnes

In [135]:
# get samples with specific label
emo = "optimism"
filtered_df.loc[filtered_df.filtered_labels.apply(lambda x: emo in x), "text"].sample(10).values

array(["What I'm hoping is that you will tell him to stand down on this and let this go.",
       'So if we hope for the future to be different, the only place we have to stand is now.',
       "and, uh, we've had General Motors for years and have always had real good luck with them.",
       "`` If Rumania comes in, as I have strong hopes now of her doing, you will see the end in five months instead of five years, '' said Susan.",
       'I will offer them freely whatever good gifts Providence permits me to distribute, and will tell them to be thankful for what they have, and humbly hopeful for more; and surely, if they are not absolute fools, they will condescend to be happy, and will allow me to be a happy Year.',
       "I mean we hope we're doing the right thing, and as I say the response we're getting seems to indicate that, but we have not erm followed through each of  we have a large number of participants every year, you see, in the order of about erm a hundred and fifty each 