In [1]:
import re

import pandas as pd

from torch.utils.data import Dataset
from transformers import pipeline

from tqdm import tqdm

In [2]:
dir_path = f"data/babylm_data/babylm_100M/"
data_sources = [
        "aochildes.train", 
        "bnc_spoken.train", 
        "cbt.train",
        "children_stories.train",
        "gutenberg.train",
        "open_subtitles.train",
        "qed.train",
        "simple_wikipedia.train",
        "switchboard.train",
        "wikipedia.train"
    ]

In [3]:
class ListDataset(Dataset):
    def __init__(self, original_list):
        self.original_list = original_list

    def __len__(self):
        return len(self.original_list)

    def __getitem__(self, i):
        return self.original_list[i]

# Parse data and preprocessing
def line_processor(line):
    line = re.sub("[\t\n]", "", line)
    return line

def get_data(data_sources):
    data = {}
    for source in data_sources:
        source_path = dir_path + source
        with open(source_path, "r") as f:
            data[source_path] = ListDataset(list(map(line_processor, f.readlines())))
    return data

# Classify emotions of texts in dictionary format of {source: [line1, line2, ...]}
def classify_emotion(texts, pipe, debug=False):
    results = []
    for source, ds in texts.items():
        print(f"Processing {source}")
        if debug:
            ds = ds[:100] # draw a few samples for debugging purposes
        for i, scores in enumerate(tqdm(pipe(ds))):
            # store the metadata and classification scores in a list of dictionaries
            results += [{"source": source, "text": ds[i], "scores": scores}]
    return results

# Build preprocessed data into a dataframe
def build_results_df(results):
    n_labels = len(results[0]["scores"])

    df = pd.DataFrame(results)

    # expanding the classification scores data structure from classifier output
    scores_df = df["scores"].apply(pd.Series)
    result_dfs = []
    for i in range(n_labels):
        tmp = scores_df[i].apply(pd.Series).add_suffix(f"_{i}")
        result_dfs.append(tmp)

    result_dfs = pd.concat(result_dfs, axis=1)
    df = pd.concat([df[["source", "text"]], result_dfs], axis=1)
    return df

In [4]:
texts = get_data(data_sources)

In [5]:
texts

{'data/babylm_data/babylm_100M/aochildes.train': <__main__.ListDataset at 0x1046ec510>,
 'data/babylm_data/babylm_100M/bnc_spoken.train': <__main__.ListDataset at 0x14f129290>,
 'data/babylm_data/babylm_100M/cbt.train': <__main__.ListDataset at 0x15fbd37d0>,
 'data/babylm_data/babylm_100M/children_stories.train': <__main__.ListDataset at 0x15fbd3810>,
 'data/babylm_data/babylm_100M/gutenberg.train': <__main__.ListDataset at 0x15fbd2090>,
 'data/babylm_data/babylm_100M/open_subtitles.train': <__main__.ListDataset at 0x15fbd10d0>,
 'data/babylm_data/babylm_100M/qed.train': <__main__.ListDataset at 0x15fbd3850>,
 'data/babylm_data/babylm_100M/simple_wikipedia.train': <__main__.ListDataset at 0x15fbd2110>,
 'data/babylm_data/babylm_100M/switchboard.train': <__main__.ListDataset at 0x15fbd0fd0>,
 'data/babylm_data/babylm_100M/wikipedia.train': <__main__.ListDataset at 0x151d98050>}

In [6]:
# Faster model, less emotions category with worse accuracy
pipe = pipeline(
    "text-classification", 
    model="j-hartmann/emotion-english-distilroberta-base", 
    top_k=None,
    framework="pt", # pytorch
    )

# Larger model, more emotions category with better accuracy, slower inference
# NOTE: This model oftens predict neutral emotion
# pipe = pipeline(
#     "text-classification", 
#     model="SamLowe/roberta-base-go_emotions", 
#     top_k=None,
#     framework="pt",
#     )

In [7]:
scores = classify_emotion(texts, pipe, debug=True)

Processing data/babylm_data/babylm_100M/aochildes.train


100%|██████████| 100/100 [00:00<00:00, 854237.07it/s]


Processing data/babylm_data/babylm_100M/bnc_spoken.train


100%|██████████| 100/100 [00:00<00:00, 1078227.25it/s]


Processing data/babylm_data/babylm_100M/cbt.train


100%|██████████| 100/100 [00:00<00:00, 1407484.56it/s]


Processing data/babylm_data/babylm_100M/children_stories.train


100%|██████████| 100/100 [00:00<00:00, 1815716.02it/s]


Processing data/babylm_data/babylm_100M/gutenberg.train


100%|██████████| 100/100 [00:00<00:00, 1997287.62it/s]


Processing data/babylm_data/babylm_100M/open_subtitles.train


100%|██████████| 100/100 [00:00<00:00, 1664406.35it/s]


Processing data/babylm_data/babylm_100M/qed.train


100%|██████████| 100/100 [00:00<00:00, 1726051.03it/s]


Processing data/babylm_data/babylm_100M/simple_wikipedia.train


100%|██████████| 100/100 [00:00<00:00, 1492634.88it/s]


Processing data/babylm_data/babylm_100M/switchboard.train


100%|██████████| 100/100 [00:00<00:00, 2118335.35it/s]


Processing data/babylm_data/babylm_100M/wikipedia.train


100%|██████████| 100/100 [00:00<00:00, 2046001.95it/s]


In [8]:
df = build_results_df(scores)

In [9]:
df.sample(20)

Unnamed: 0,source,text,label_0,score_0,label_1,score_1,label_2,score_2,label_3,score_3,label_4,score_4,label_5,score_5,label_6,score_6
528,data/babylm_data/babylm_100M/open_subtitles.train,"God, debbie, i'm so sorry.",sadness,0.883805,surprise,0.07157,neutral,0.01708,disgust,0.012012,fear,0.009666,anger,0.002942,joy,0.002925
447,data/babylm_data/babylm_100M/gutenberg.train,"for eyes welled up with tears. ""I wish you wou...",sadness,0.928151,surprise,0.027298,neutral,0.021345,joy,0.011798,anger,0.007257,fear,0.002364,disgust,0.001787
370,data/babylm_data/babylm_100M/children_stories....,Then he flew back and told the Prince what he ...,neutral,0.886675,disgust,0.033418,anger,0.032428,surprise,0.026259,joy,0.009127,fear,0.006662,sadness,0.00543
416,data/babylm_data/babylm_100M/gutenberg.train,as the fine fittings which had been ordered fo...,neutral,0.796273,surprise,0.114976,sadness,0.034287,anger,0.025636,joy,0.013292,disgust,0.010629,fear,0.004908
817,data/babylm_data/babylm_100M/switchboard.train,Okay.,neutral,0.534888,disgust,0.206335,anger,0.199148,sadness,0.024637,joy,0.016664,fear,0.016401,surprise,0.001928
487,data/babylm_data/babylm_100M/gutenberg.train,day he revolted when compliments were paid to ...,anger,0.974862,fear,0.012136,disgust,0.007451,sadness,0.002943,neutral,0.001087,joy,0.000905,surprise,0.000616
943,data/babylm_data/babylm_100M/wikipedia.train,"Nebe was a ""conservative nationalist"", who emb...",disgust,0.561372,neutral,0.221573,anger,0.122573,fear,0.040618,joy,0.026834,sadness,0.021167,surprise,0.005864
670,data/babylm_data/babylm_100M/qed.train,I did hundreds of millions of dollars of trans...,neutral,0.925929,anger,0.022791,disgust,0.022459,surprise,0.012914,fear,0.008,sadness,0.004596,joy,0.003311
658,data/babylm_data/babylm_100M/qed.train,It can be low.,neutral,0.834924,disgust,0.068577,fear,0.034787,sadness,0.023082,anger,0.019252,surprise,0.016883,joy,0.002495
251,data/babylm_data/babylm_100M/cbt.train,"When dinner was over , and when the nurse had ...",joy,0.952118,neutral,0.016621,surprise,0.014855,disgust,0.007225,anger,0.004431,sadness,0.003266,fear,0.001484


In [10]:
df["label_0"].value_counts()

label_0
neutral     616
disgust      89
surprise     65
sadness      63
joy          56
anger        56
fear         55
Name: count, dtype: int64

In [11]:
df.to_csv("processed_data/babylm.csv", index=False)