In [1]:
import re

import pandas as pd

from torch.utils.data import Dataset
from transformers import pipeline

from tqdm import tqdm

In [2]:
dir_path = f"data/babylm_data/babylm_100M/"
data_sources = [
        "aochildes.train", 
        "bnc_spoken.train", 
        "cbt.train",
        "children_stories.train",
        "gutenberg.train",
        "open_subtitles.train",
        "qed.train",
        "simple_wikipedia.train",
        "switchboard.train",
        "wikipedia.train"
    ]

In [15]:
class ListDataset(Dataset):
    def __init__(self, original_list):
        self.original_list = original_list

    def __len__(self):
        return len(self.original_list)

    def __getitem__(self, i):
        return self.original_list[i]

# Parse data and preprocessing
# Feel free to add rules to filter lines
def line_processor(line):
    line = re.sub("[\t\n]", "", line)
    if len(line.split()) <= 10: # remove lines with less than 10 words
        return ""
    return line

def get_data(data_sources):
    data = {}
    for source in data_sources:
        source_path = dir_path + source
        with open(source_path, "r") as f:
            processed = [l for l in map(line_processor, f.readlines()) if l != ""] # remove empty lines
            data[source_path] = ListDataset(processed)
    return data

# Classify emotions of texts in dictionary format of {source: [line1, line2, ...]}
def classify_emotion(texts, pipe, debug=False):
    results = []
    for source, ds in texts.items():
        print(f"Processing {source}")
        if debug:
            ds = ds[:100] # draw a few samples for debugging purposes
        for i, scores in enumerate(tqdm(pipe(ds))):
            # store the metadata and classification scores in a list of dictionaries
            results += [{"source": source, "text": ds[i], "scores": scores}]
    return results

# Build preprocessed data into a dataframe
def build_results_df(results):
    n_labels = len(results[0]["scores"])

    df = pd.DataFrame(results)

    # expanding the classification scores data structure from classifier output
    scores_df = df["scores"].apply(pd.Series)
    result_dfs = []
    for i in range(n_labels):
        tmp = scores_df[i].apply(pd.Series).add_suffix(f"_{i}")
        result_dfs.append(tmp)

    result_dfs = pd.concat(result_dfs, axis=1)
    df = pd.concat([df[["source", "text"]], result_dfs], axis=1)
    return df

In [16]:
texts = get_data(data_sources)

In [17]:
for source, ds in texts.items():
    print(f"{source}: {len(ds)}")

data/babylm_data/babylm_100M/aochildes.train: 58443
data/babylm_data/babylm_100M/bnc_spoken.train: 230305
data/babylm_data/babylm_100M/cbt.train: 193152
data/babylm_data/babylm_100M/children_stories.train: 60727
data/babylm_data/babylm_100M/gutenberg.train: 586518
data/babylm_data/babylm_100M/open_subtitles.train: 643669
data/babylm_data/babylm_100M/qed.train: 334878
data/babylm_data/babylm_100M/simple_wikipedia.train: 312748
data/babylm_data/babylm_100M/switchboard.train: 39645
data/babylm_data/babylm_100M/wikipedia.train: 158724


In [20]:
# Faster model, less emotions category with worse accuracy
# pipe = pipeline(
#     "text-classification", 
#     model="j-hartmann/emotion-english-distilroberta-base", 
#     top_k=None,
#     framework="pt", # pytorch
#     )

# Larger model, more emotions category with better accuracy, slower inference
# NOTE: This model oftens predict neutral emotion
pipe = pipeline(
    "text-classification", 
    model="SamLowe/roberta-base-go_emotions", 
    top_k=None,
    framework="pt", # pytorch
    device="mps" # multi-precision support for M1/M2 mac
    )

In [22]:
scores = classify_emotion(texts, pipe, debug=True)

Processing data/babylm_data/babylm_100M/aochildes.train


100%|██████████| 100/100 [00:00<00:00, 348364.12it/s]


Processing data/babylm_data/babylm_100M/bnc_spoken.train


100%|██████████| 100/100 [00:00<00:00, 68635.31it/s]


Processing data/babylm_data/babylm_100M/cbt.train


100%|██████████| 100/100 [00:00<00:00, 78135.32it/s]


Processing data/babylm_data/babylm_100M/children_stories.train


100%|██████████| 100/100 [00:00<00:00, 517815.31it/s]


Processing data/babylm_data/babylm_100M/gutenberg.train


100%|██████████| 100/100 [00:00<00:00, 2231012.77it/s]


Processing data/babylm_data/babylm_100M/open_subtitles.train


100%|██████████| 100/100 [00:00<00:00, 1847711.01it/s]


Processing data/babylm_data/babylm_100M/qed.train


100%|██████████| 100/100 [00:00<00:00, 1542023.53it/s]


Processing data/babylm_data/babylm_100M/simple_wikipedia.train


100%|██████████| 100/100 [00:00<00:00, 241607.37it/s]


Processing data/babylm_data/babylm_100M/switchboard.train


100%|██████████| 100/100 [00:00<00:00, 1784810.21it/s]


Processing data/babylm_data/babylm_100M/wikipedia.train


100%|██████████| 100/100 [00:00<00:00, 757094.58it/s]


In [23]:
df = build_results_df(scores)

In [24]:
df.sample(20)

Unnamed: 0,source,text,label_0,score_0,label_1,score_1,label_2,score_2,label_3,score_3,...,label_23,score_23,label_24,score_24,label_25,score_25,label_26,score_26,label_27,score_27
150,data/babylm_data/babylm_100M/bnc_spoken.train,"I I, what I would say cos we will be competing...",neutral,0.912323,approval,0.061228,realization,0.01863,optimism,0.014532,...,pride,0.000463,nervousness,0.000344,embarrassment,0.000307,remorse,0.000302,grief,0.000216
372,data/babylm_data/babylm_100M/children_stories....,Early the next morning the Mayor was walking i...,neutral,0.599123,disappointment,0.125382,sadness,0.076769,annoyance,0.046245,...,desire,0.001532,gratitude,0.001132,love,0.000956,curiosity,0.000812,confusion,0.000703
517,data/babylm_data/babylm_100M/open_subtitles.train,"He continues singing into the night, perhaps b...",neutral,0.891077,approval,0.042059,confusion,0.041083,realization,0.034707,...,remorse,0.000579,embarrassment,0.000474,anger,0.000419,grief,0.000412,pride,0.000359
944,data/babylm_data/babylm_100M/wikipedia.train,"In March 1944, after the ""Great Escape"" from S...",neutral,0.930208,approval,0.060955,realization,0.017518,admiration,0.007075,...,surprise,0.000606,embarrassment,0.000506,nervousness,0.000441,grief,0.000426,remorse,0.000372
654,data/babylm_data/babylm_100M/qed.train,"And pretty soon, he was drawing up fake work o...",neutral,0.799525,approval,0.053756,realization,0.023438,annoyance,0.022349,...,curiosity,0.000495,confusion,0.000472,gratitude,0.000461,grief,0.000391,remorse,0.000238
46,data/babylm_data/babylm_100M/aochildes.train,that's not the one that you had in your crib l...,neutral,0.511131,disapproval,0.498174,caring,0.026068,approval,0.021217,...,remorse,0.001168,nervousness,0.001166,surprise,0.00114,embarrassment,0.000808,pride,0.000419
450,data/babylm_data/babylm_100M/gutenberg.train,"George kept his head down, and Harry, who was ...",neutral,0.956541,approval,0.037086,realization,0.009469,admiration,0.00508,...,grief,0.000436,embarrassment,0.000418,nervousness,0.000418,surprise,0.000389,pride,0.000371
98,data/babylm_data/babylm_100M/aochildes.train,you want to hold the bird oh he's a funny bird...,amusement,0.600101,admiration,0.274011,joy,0.219254,curiosity,0.164385,...,disgust,0.000883,fear,0.000799,embarrassment,0.000696,remorse,0.000673,grief,0.000561
71,data/babylm_data/babylm_100M/aochildes.train,well you know if you tip it upside down they'l...,neutral,0.929897,approval,0.049182,optimism,0.020698,realization,0.008864,...,pride,0.000262,embarrassment,0.000261,remorse,0.00024,nervousness,0.00024,grief,0.000198
116,data/babylm_data/babylm_100M/bnc_spoken.train,We'll put in large ones but I mean I've got a ...,neutral,0.898712,approval,0.059181,realization,0.024587,confusion,0.013899,...,remorse,0.000386,nervousness,0.000357,embarrassment,0.000311,pride,0.00026,grief,0.000236


In [25]:
df["label_0"].value_counts()

label_0
neutral           799
admiration         32
curiosity          26
approval           19
sadness            18
joy                16
disappointment     13
confusion          11
amusement           9
disapproval         9
love                9
caring              8
optimism            5
excitement          5
annoyance           4
desire              3
embarrassment       3
anger               2
surprise            2
remorse             2
realization         2
gratitude           1
disgust             1
fear                1
Name: count, dtype: int64

In [26]:
df.to_csv("processed_data/babylm.csv", index=False)