In [5]:
import os
import re

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import pipeline

In [6]:
# Directory containing the corpus files listed below. Keep the trailing slash:
# file names are appended to this prefix by plain string concatenation.
dir_path = "data/babylm_data/babylm_100M/"

# Corpus files to load from `dir_path`, one entry per source.
data_sources = [
    "aochildes.train",
    "bnc_spoken.train",
    "cbt.train",
    "children_stories.train",
    "gutenberg.train",
    "open_subtitles.train",
    "qed.train",
    "simple_wikipedia.train",
    "switchboard.train",
    "wikipedia.train",
]

In [7]:
class ListDataset(Dataset):
    """Map-style dataset backed by a Python list, optionally down-sampled.

    Args:
        original_list: Items to expose through the dataset.
        sample_subset: Maximum number of items to keep. ``0`` (the default), or
            any value that is not smaller than ``len(original_list)``, keeps
            every item in its original order.
        seed: Seed for the shuffle that selects the random subset.
    """

    def __init__(self, original_list, sample_subset=0, seed=42):
        if sample_subset and sample_subset < len(original_list):
            # A private RandomState draws the same permutation as
            # np.random.seed(seed) + np.random.shuffle(...), but leaves the
            # process-wide NumPy RNG untouched.
            rng = np.random.RandomState(seed)
            # Shuffle a copy so the caller's list is not reordered in place.
            shuffled = list(original_list)
            rng.shuffle(shuffled)
            self.ds_list = shuffled[:sample_subset]
        else:
            self.ds_list = original_list

    def __len__(self):
        """Return the number of items kept in the dataset."""
        return len(self.ds_list)

    def __getitem__(self, i):
        """Return the item (or, for a slice, the sub-list) at position ``i``."""
        return self.ds_list[i]

# Parse data and preprocessing
# Feel free to add rules to filter lines
def line_processor(line, min_words=11):
    """Clean one raw line of text, or reject it when it is too short.

    Args:
        line: Raw line as read from a corpus file.
        min_words: Minimum number of whitespace-separated words a cleaned line
            must contain to be kept. The default of 11 keeps the original
            rule of dropping lines with 10 words or fewer.

    Returns:
        The cleaned line, or ``None`` if it has fewer than ``min_words`` words.
    """
    # Turn tabs/newlines into a space rather than deleting them, so that the
    # words on either side of a tab are not glued into a single token.
    line = re.sub(r"[\t\n]+", " ", line)
    line = re.sub(r'\s+([.,!?;:])', r'\1', line)  # remove spaces before punctuation
    line = line.strip()  # remove leading and trailing spaces
    if len(line.split()) < min_words:  # drop lines that are too short
        return None
    return line

def get_data(data_sources, sample_subset=0, seed=42):
    """Load, clean and de-duplicate every corpus file under ``dir_path``.

    Args:
        data_sources: File names, each appended to the module-level ``dir_path``.
        sample_subset: Forwarded to ``ListDataset``; ``0`` keeps every line.
        seed: Forwarded to ``ListDataset`` for the subset shuffle.

    Returns:
        Dict mapping each full file path to a ``ListDataset`` of its cleaned,
        unique lines.
    """
    data = {}
    for source in data_sources:
        source_path = dir_path + source
        # Explicit encoding so decoding does not depend on the platform default.
        with open(source_path, "r", encoding="utf-8") as f:
            # line_processor returns None for rejected lines; also drop empties.
            processed = [l for l in map(line_processor, f) if l]
        # dict.fromkeys removes duplicates while keeping first-seen order.
        # A set's order varies between interpreter runs (string hash
        # randomization), which made the seeded subset non-reproducible.
        processed = list(dict.fromkeys(processed))
        data[source_path] = ListDataset(processed, sample_subset, seed)
    return data

# Classify emotions of texts in dictionary format of {source: [line1, line2, ...]}
def classify_emotion(texts, pipe, debug=False):
    """Run ``pipe`` over every source and collect one record per text.

    Args:
        texts: Mapping of source name to an indexable collection of texts.
        pipe: Callable classifier, invoked as
            ``pipe(samples, truncation=True, padding=True)``.
        debug: When true, only the first 100 texts of each source are used.

    Returns:
        List of dicts with the keys ``"source"``, ``"text"`` and ``"scores"``.
    """
    records = []
    for source, samples in texts.items():
        print(f"Processing {source}")
        if debug:
            # keep only a small slice while debugging
            samples = samples[:100]
        predictions = pipe(samples, truncation=True, padding=True)
        for idx, scores in enumerate(tqdm(predictions)):
            # pair each prediction with the text at the same position
            records.append({"source": source, "text": samples[idx], "scores": scores})
    return records

# Build preprocessed data into a dataframe
def build_results_df(results):
    """Flatten classification records into one wide DataFrame.

    Args:
        results: List of dicts with the keys ``"source"``, ``"text"`` and
            ``"scores"``, where ``"scores"`` is a list of dicts (for example
            ``{"label": ..., "score": ...}``).

    Returns:
        DataFrame with the columns ``source`` and ``text`` followed by one
        ``<key>_<i>`` column per key of the i-th score entry (for example
        ``label_0``, ``score_0``, ``label_1``, ...). The number of score
        entries is taken from the first record. An empty ``results`` list
        gives an empty frame with just ``source`` and ``text``.
    """
    if not results:
        # Nothing to expand; avoid an IndexError on results[0].
        return pd.DataFrame(columns=["source", "text"])

    n_labels = len(results[0]["scores"])

    # Build flat rows in plain Python and create the frame once. This avoids
    # calling Series.apply(pd.Series) per row, which is very slow on large
    # inputs.
    rows = []
    for result in results:
        row = {"source": result["source"], "text": result["text"]}
        for rank, entry in enumerate(result["scores"][:n_labels]):
            for key, value in entry.items():
                row[f"{key}_{rank}"] = value
        rows.append(row)
    return pd.DataFrame(rows)

In [8]:
# Load every source, keeping at most 50,000 cleaned, unique lines per source.
texts = get_data(data_sources, sample_subset=50000)

In [9]:
# Report how many lines each source ended up with after filtering/sampling.
for source, ds in texts.items():
    print(f"{source}: {len(ds)}")

data/babylm_data/babylm_100M/aochildes.train: 42420
data/babylm_data/babylm_100M/bnc_spoken.train: 50000
data/babylm_data/babylm_100M/cbt.train: 50000
data/babylm_data/babylm_100M/children_stories.train: 50000
data/babylm_data/babylm_100M/gutenberg.train: 50000
data/babylm_data/babylm_100M/open_subtitles.train: 50000
data/babylm_data/babylm_100M/qed.train: 50000
data/babylm_data/babylm_100M/simple_wikipedia.train: 50000
data/babylm_data/babylm_100M/switchboard.train: 39608
data/babylm_data/babylm_100M/wikipedia.train: 50000


In [10]:
# Select emotion classification model

# Faster model, fewer emotion categories with worse accuracy
# pipe = pipeline(
#     "text-classification", 
#     model="j-hartmann/emotion-english-distilroberta-base", 
#     top_k=None,
#     framework="pt", # pytorch
#     )

# Pick the best available device instead of hard-coding "mps", so the notebook
# also runs on machines without Apple's Metal Performance Shaders (MPS) backend.
if torch.backends.mps.is_available():
    device = "mps"  # Apple-silicon GPU via Metal Performance Shaders
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Larger model, more emotion categories with better accuracy, slower inference
# NOTE: This model often predicts the neutral emotion
pipe = pipeline(
    "text-classification",
    model="SamLowe/roberta-base-go_emotions",
    top_k=None,  # presumably returns a score for every label, not only the best one -- confirm in the pipeline docs
    framework="pt",  # pytorch
    device=device,
)

In [11]:
# Classify every loaded text. This is the slow step: in the recorded run each
# source took roughly 12 to 30 minutes.
scores = classify_emotion(texts, pipe, debug=False)

Processing data/babylm_data/babylm_100M/aochildes.train


100%|██████████| 42420/42420 [13:42<00:00, 51.59it/s] 


Processing data/babylm_data/babylm_100M/bnc_spoken.train


100%|██████████| 50000/50000 [21:09<00:00, 39.40it/s]  


Processing data/babylm_data/babylm_100M/cbt.train


100%|██████████| 50000/50000 [16:01<00:00, 52.01it/s]  


Processing data/babylm_data/babylm_100M/children_stories.train


100%|██████████| 50000/50000 [29:16<00:00, 28.47it/s]  


Processing data/babylm_data/babylm_100M/gutenberg.train


100%|██████████| 50000/50000 [14:19<00:00, 58.19it/s]


Processing data/babylm_data/babylm_100M/open_subtitles.train


100%|██████████| 50000/50000 [14:39<00:00, 56.86it/s] 


Processing data/babylm_data/babylm_100M/qed.train


100%|██████████| 50000/50000 [15:03<00:00, 55.32it/s]


Processing data/babylm_data/babylm_100M/simple_wikipedia.train


100%|██████████| 50000/50000 [17:46<00:00, 46.90it/s]  


Processing data/babylm_data/babylm_100M/switchboard.train


100%|██████████| 39608/39608 [11:54<00:00, 55.42it/s]


Processing data/babylm_data/babylm_100M/wikipedia.train


100%|██████████| 50000/50000 [20:37<00:00, 40.40it/s]  


In [12]:
# Flatten the per-text score lists into one wide frame (label_i / score_i columns).
df = build_results_df(scores)

In [13]:
# Count how often each emotion is the top-ranked label across all texts.
df["label_0"].value_counts() # first label is the most confident one

label_0
neutral           367430
curiosity          16914
admiration         15747
approval           12329
sadness             8739
confusion           7748
disapproval         6412
joy                 6179
caring              4243
disappointment      4157
desire              3692
optimism            3497
love                3214
fear                2971
realization         2894
amusement           2818
surprise            2748
annoyance           2373
gratitude           2085
excitement          1923
remorse             1284
anger               1267
disgust              614
embarrassment        358
nervousness          311
pride                 73
relief                 8
Name: count, dtype: int64

In [14]:
# Print a few example texts for every emotion that appears as the top label.
emotions = df["label_0"].unique()
# emotions = ["optimism", "excitement"]
for emotion in emotions:
    print(f"Sample text for {emotion}:")
    emotion_texts = df.loc[df["label_0"] == emotion, "text"]
    # Cap the sample size so emotions with fewer than 3 texts do not raise,
    # and fix the random state so re-running the cell shows the same examples.
    n_examples = min(3, len(emotion_texts))
    print(emotion_texts.sample(n_examples, random_state=42).values)

Sample text for neutral:
['In pronouncing these words he drew a sharp knife across the guide-rope by which I was suspended, and as we then happened to be precisely over my own house (which, during my peregrinations, had been handsomely rebuilt), it so occurred that I tumbled headlong down the ample chimney and alit upon the dining-room hearth.'
 'What that really means is just make a string with a color, name of the color in it.'
 'The result is simple: we will have a political caste system which emphasizes, which accelerates the collapse of a world.']
Sample text for realization:
["Dad realized he didn't need to be in charge to get some glory, because he didn't need glory at all."
 'The more I thought about it, however, the more I came to the view that this fish knows something.'
 'He realized that I regarded the matter seriously, and he saw me off when I left for the east with a grin tempered by honest sympathy and understanding.']
Sample text for confusion:
["Half the time I don't k

In [15]:
# Save the results as CSV. Create the output folder first so that the write
# cannot fail on a missing directory after the long classification run.
os.makedirs("processed_data", exist_ok=True)
df.to_csv("processed_data/babylm.csv", index=False)

In [16]:
# Also save the same frame as Parquet (row index omitted); the next cell reads it back.
df.to_parquet("processed_data/babylm.parquet", index=False)

  if _pandas_api.is_sparse(col):


In [17]:
# Reload the saved results from disk; this rebinds `df` to the Parquet copy.
df = pd.read_parquet("processed_data/babylm.parquet")


In [18]:
# Preview the first rows of the reloaded frame.
df.head()

Unnamed: 0,source,text,label_0,score_0,label_1,score_1,label_2,score_2,label_3,score_3,...,label_23,score_23,label_24,score_24,label_25,score_25,label_26,score_26,label_27,score_27
0,data/babylm_data/babylm_100M/aochildes.train,and if while you are sheltering beneath palm o...,neutral,0.964169,approval,0.012598,annoyance,0.006894,realization,0.006892,...,remorse,0.000332,nervousness,0.000318,grief,0.000296,relief,0.00025,pride,0.000188
1,data/babylm_data/babylm_100M/aochildes.train,oh here do you want to try and put it in there.,neutral,0.885177,curiosity,0.063852,annoyance,0.019247,confusion,0.01722,...,nervousness,0.000297,relief,0.000204,grief,0.000199,remorse,0.000191,pride,0.000105
2,data/babylm_data/babylm_100M/aochildes.train,let's see if you would try the right hand if i...,neutral,0.79988,curiosity,0.18431,optimism,0.026595,desire,0.016805,...,relief,0.000407,embarrassment,0.000404,gratitude,0.000395,grief,0.000278,pride,0.000169
3,data/babylm_data/babylm_100M/aochildes.train,she's not really going to hurt you she just sa...,neutral,0.556108,caring,0.413813,approval,0.045091,disapproval,0.039041,...,embarrassment,0.000707,pride,0.000591,amusement,0.000536,excitement,0.000473,surprise,0.000315
4,data/babylm_data/babylm_100M/aochildes.train,this guy is in the winter time and this guy is...,neutral,0.525609,confusion,0.486314,curiosity,0.192952,realization,0.027604,...,embarrassment,0.00059,remorse,0.000544,grief,0.000301,relief,0.000279,pride,0.000115
