In [1]:
import os
import re

import pandas as pd
import numpy as np
import torch

from torch.utils.data import Dataset
from transformers import pipeline

from tqdm import tqdm

In [2]:
# Directory holding the training files; the trailing slash matters because
# file paths are formed by appending a file name directly to this string.
dir_path = "data/babylm_data/babylm_100M/"

# File names (relative to `dir_path`) of every corpus to load.
data_sources = [
    f"{corpus}.train"
    for corpus in (
        "aochildes",
        "bnc_spoken",
        "cbt",
        "children_stories",
        "gutenberg",
        "open_subtitles",
        "qed",
        "simple_wikipedia",
        "switchboard",
        "wikipedia",
    )
]

In [40]:
class ListDataset(Dataset):
    """Map-style dataset that serves items straight from a Python list.

    Args:
        original_list: The items to expose.
        sample_subset: When non-zero and smaller than ``len(original_list)``,
            only this many items are kept, chosen by a seeded shuffle.
            With ``0`` (or a value >= the list length) the list is used as-is.
        seed: Seed of the shuffle used for subsampling.
    """

    def __init__(self, original_list, sample_subset=0, seed=42):
        if sample_subset and sample_subset < len(original_list):
            # Shuffle a copy with a private generator so that neither the
            # caller's list nor NumPy's global random state is touched.
            # RandomState(seed) produces the same permutation as
            # np.random.seed(seed) followed by np.random.shuffle, so the
            # selected subset is the same as before.
            shuffled = list(original_list)
            np.random.RandomState(seed).shuffle(shuffled)
            self.ds_list = shuffled[:sample_subset]
        else:
            self.ds_list = original_list

    def __len__(self):
        """Return the number of items kept."""
        return len(self.ds_list)

    def __getitem__(self, i):
        """Return item ``i``; indexing is delegated to the underlying list."""
        return self.ds_list[i]

# Parse data and preprocessing
# Feel free to add rules to filter lines
def line_processor(line):
    """Clean one raw text line, or return ``None`` when it should be dropped.

    Steps: tabs/newlines become single spaces, whitespace directly before
    ``. , ! ? ; :`` is removed, and surrounding whitespace is stripped.

    Args:
        line: A raw line of text.

    Returns:
        The cleaned line, or ``None`` if it has 10 or fewer
        whitespace-separated words after cleaning.
    """
    # Replace (rather than delete) tabs and newlines so that words separated
    # only by a tab are not fused into one token.
    line = re.sub(r"[\t\n]+", " ", line)
    line = re.sub(r'\s+([.,!?;:])', r'\1', line)  # remove spaces before punctuation
    line = line.strip()  # remove leading and trailing spaces
    if len(line.split()) <= 10:  # drop lines with 10 or fewer words
        return None
    return line

def get_data(data_sources, sample_subset=0, seed=42):
    """Load, clean and optionally subsample every source file.

    Args:
        data_sources: File names, each appended to the module-level
            ``dir_path`` to form the path that is opened.
        sample_subset: Forwarded to ``ListDataset``.
        seed: Forwarded to ``ListDataset``.

    Returns:
        dict mapping each full path (``dir_path + source``) to a
        ``ListDataset`` of the lines that ``line_processor`` kept.
    """
    data = {}
    for source in data_sources:
        source_path = dir_path + source
        # Explicit encoding so decoding does not depend on the platform
        # default (assumes the corpus files are UTF-8 -- TODO confirm).
        with open(source_path, "r", encoding="utf-8") as f:
            # Iterate the file lazily instead of materialising readlines();
            # lines for which line_processor returned None/"" are skipped.
            processed = [l for l in map(line_processor, f) if l]
        data[source_path] = ListDataset(processed, sample_subset, seed)
    return data

# Classify emotions of texts in dictionary format of {source: [line1, line2, ...]}
def classify_emotion(texts, pipe, debug=False):
    """Run ``pipe`` over every source and collect one record per text.

    Args:
        texts: Mapping of source name to an indexable collection of texts.
        pipe: Callable invoked as ``pipe(ds, truncation=True, padding=True)``;
            iterating its result yields the scores for each text in order.
        debug: When true, only ``ds[:100]`` of each source is classified.

    Returns:
        list of dicts with keys ``"source"``, ``"text"`` and ``"scores"``.
    """
    records = []
    for source_name, dataset in texts.items():
        print(f"Processing {source_name}")
        if debug:
            # keep only a small slice for quick debugging runs
            dataset = dataset[:100]
        predictions = pipe(dataset, truncation=True, padding=True)
        for idx, scores in enumerate(tqdm(predictions)):
            # keep the metadata next to the classifier output
            records.append(
                {"source": source_name, "text": dataset[idx], "scores": scores}
            )
    return records

# Build preprocessed data into a dataframe
def build_results_df(results):
    """Flatten classification records into one wide DataFrame.

    Args:
        results: List of dicts with keys ``"source"``, ``"text"`` and
            ``"scores"``, where ``"scores"`` is a list of dicts
            (e.g. ``{"label": ..., "score": ...}``).

    Returns:
        DataFrame with columns ``source``, ``text`` and, for the i-th entry
        of ``scores``, one column ``<key>_<i>`` per key of that entry
        (e.g. ``label_0``, ``score_0``, ``label_1``, ...). An empty
        ``results`` yields an empty frame with just ``source`` and ``text``.
    """
    base_columns = ["source", "text"]
    if not results:
        # Nothing to expand; avoid indexing results[0] on an empty list.
        return pd.DataFrame(columns=base_columns)

    # The first record defines how many score entries are expanded.
    n_labels = len(results[0]["scores"])

    # Build flat rows directly: far cheaper than nested apply(pd.Series),
    # which constructs a Series object for every row.
    rows = []
    for result in results:
        row = {"source": result["source"], "text": result["text"]}
        for rank, entry in enumerate(result["scores"][:n_labels]):
            for key, value in entry.items():
                row[f"{key}_{rank}"] = value
        rows.append(row)

    return pd.DataFrame(rows)

In [4]:
# Load and clean every corpus, keeping at most 50,000 lines from each.
texts = get_data(data_sources=data_sources, sample_subset=50_000)

In [5]:
# Report how many lines each source contributes after filtering/subsampling.
for source_path, dataset in texts.items():
    print(f"{source_path}: {len(dataset)}")

data/babylm_data/babylm_100M/aochildes.train: 50000
data/babylm_data/babylm_100M/bnc_spoken.train: 50000
data/babylm_data/babylm_100M/cbt.train: 50000
data/babylm_data/babylm_100M/children_stories.train: 50000
data/babylm_data/babylm_100M/gutenberg.train: 50000
data/babylm_data/babylm_100M/open_subtitles.train: 50000
data/babylm_data/babylm_100M/qed.train: 50000
data/babylm_data/babylm_100M/simple_wikipedia.train: 50000
data/babylm_data/babylm_100M/switchboard.train: 39645
data/babylm_data/babylm_100M/wikipedia.train: 50000


In [6]:
# Select emotion classification model

# Faster model with fewer emotion categories and lower accuracy
# pipe = pipeline(
#     "text-classification",
#     model="j-hartmann/emotion-english-distilroberta-base",
#     top_k=None,
#     framework="pt", # pytorch
#     )

# Larger model with more emotion categories and better accuracy, but slower inference
# NOTE: This model often predicts the neutral emotion

# Pick the best available accelerator instead of hard-coding one, so the
# notebook also runs on machines without Apple-silicon support.
# "mps" is PyTorch's Metal Performance Shaders backend (Apple GPUs).
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

pipe = pipeline(
    "text-classification",
    model="SamLowe/roberta-base-go_emotions",
    top_k=None,
    framework="pt",  # pytorch
    device=device,
)

In [7]:
# Classify every loaded line (debug off, so nothing is truncated to a sample).
# This is slow: the recorded run took roughly 12-28 minutes per source.
scores = classify_emotion(texts=texts, pipe=pipe, debug=False)

Processing data/babylm_data/babylm_100M/aochildes.train


100%|██████████| 50000/50000 [15:52<00:00, 52.52it/s] 


Processing data/babylm_data/babylm_100M/bnc_spoken.train


100%|██████████| 50000/50000 [21:15<00:00, 39.20it/s]  


Processing data/babylm_data/babylm_100M/cbt.train


100%|██████████| 50000/50000 [15:24<00:00, 54.07it/s]


Processing data/babylm_data/babylm_100M/children_stories.train


100%|██████████| 50000/50000 [27:59<00:00, 29.77it/s]  


Processing data/babylm_data/babylm_100M/gutenberg.train


100%|██████████| 50000/50000 [14:22<00:00, 57.97it/s]


Processing data/babylm_data/babylm_100M/open_subtitles.train


100%|██████████| 50000/50000 [14:34<00:00, 57.17it/s]


Processing data/babylm_data/babylm_100M/qed.train


100%|██████████| 50000/50000 [15:44<00:00, 52.95it/s]  


Processing data/babylm_data/babylm_100M/simple_wikipedia.train


100%|██████████| 50000/50000 [17:54<00:00, 46.54it/s]


Processing data/babylm_data/babylm_100M/switchboard.train


100%|██████████| 39645/39645 [11:50<00:00, 55.81it/s]


Processing data/babylm_data/babylm_100M/wikipedia.train


100%|██████████| 50000/50000 [20:12<00:00, 41.24it/s]  


In [8]:
# Flatten the per-text classifier output into one wide table.
df = build_results_df(results=scores)

In [10]:
# first label is the most confident one, so this counts each text's top prediction
top_emotions = df["label_0"]
top_emotions.value_counts()

label_0
neutral           373462
curiosity          18573
admiration         15225
approval           12410
sadness             8700
confusion           8461
disapproval         6658
joy                 6037
caring              4297
disappointment      4039
desire              3664
optimism            3373
love                3175
amusement           2909
fear                2898
realization         2809
surprise            2702
annoyance           2414
gratitude           2059
excitement          1814
anger               1346
remorse             1276
disgust              591
nervousness          354
embarrassment        324
pride                 72
relief                 3
Name: count, dtype: int64

In [42]:
# Print a few example texts for each top-predicted emotion as a sanity check.
SAMPLES_PER_EMOTION = 3
emotions = df["label_0"].unique()
for emotion in emotions:
    emotion_texts = df.loc[df["label_0"] == emotion, "text"]
    # Cap the sample size at the group size: sample() raises ValueError when
    # asked for more rows than exist (without replacement), which would
    # happen for very rare labels. A fixed random_state keeps the printed
    # examples reproducible across runs.
    n_samples = min(SAMPLES_PER_EMOTION, len(emotion_texts))
    print(f"Sample text for {emotion}:")
    print(emotion_texts.sample(n=n_samples, random_state=42).values)

Sample text for curiosity:
["'But why will a guy from a respectable family marry a don's sister?'"
 'are you gonna be the man who tends the store?'
 'And then name one radioactive metal which is made in a power station?']
Sample text for confusion:
["You either reject the null hypothesis or you don't reject the null hypothesis, okay?"
 "so when you found eggs at your grandma's house were they real eggs or were they plastic eggs?"
 'yeah what was he doing that he should not do there?']
Sample text for neutral:
['Brock had learned of the story from a major donor to GOPAC who connected him with Cliff Jackson, a longtime critic of Governor Clinton.'
 'This is an Ayyappa Temple on the south side of the Valllicode.'
 'Shrivelling is a natural phenomenon where an object, with an attached sub-elastic covering, has its interior volume reduced in some way. The covering, which cannot contract any further, is then obliged to wrinkle and buckle, in order to preserve surface area while containing th

In [43]:
# Make sure the output directory exists first; to_csv does not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs("processed_data", exist_ok=True)
df.to_csv("processed_data/babylm.csv", index=False)

In [44]:
# Also persist the table as Parquet, without writing the index.
df.to_parquet(path="processed_data/babylm.parquet", index=False)

  if _pandas_api.is_sparse(col):
