In [1]:
import pickle
import os

import random
from collections import Counter

from pretraining.data_utils import line_processor
"""
Load augmented data chunks for prertraining corpus
"""
dir_name = "babylm_augment/"
fnames = [dir_name + f for f in os.listdir(dir_name) if f.endswith(".pkl")]

data = []
for fname in fnames:
    tmp = pickle.load(open(fname, "rb"))
    print(fname, len(tmp))
    data += tmp

babylm_augment/babylm_augment_17.pkl 871
babylm_augment/babylm_augment_16.pkl 871
babylm_augment/babylm_augment_28.pkl 871
babylm_augment/babylm_augment_14.pkl 871
babylm_augment/babylm_augment_15.pkl 871
babylm_augment/babylm_augment_29.pkl 871
babylm_augment/babylm_augment_11.pkl 871
babylm_augment/babylm_augment_10.pkl 871
babylm_augment/babylm_augment_12.pkl 871
babylm_augment/babylm_augment_13.pkl 871
babylm_augment/babylm_augment_0.pkl 871
babylm_augment/babylm_augment_1.pkl 871
babylm_augment/babylm_augment_3.pkl 871
babylm_augment/babylm_augment_89.pkl 871
babylm_augment/babylm_augment_2.pkl 871
babylm_augment/babylm_augment_6.pkl 871
babylm_augment/babylm_augment_7.pkl 871
babylm_augment/babylm_augment_5.pkl 871
babylm_augment/babylm_augment_4.pkl 871
babylm_augment/babylm_augment_9.pkl 871
babylm_augment/babylm_augment_8.pkl 871
babylm_augment/babylm_augment_50.pkl 871
babylm_augment/babylm_augment_51.pkl 871
babylm_augment/babylm_augment_22.pkl 871
babylm_augment/babylm_augm

In [2]:
"""
Load factual data (simple wikipedia and wikipedia) for pretraining corpus
Use 10M version here to balance the size of the emotional corpus
"""
factuals = ["simple_wikipedia.train", "wikipedia.train"]
dir_name = "data/babylm_data/babylm_10M/"

for fname in factuals:
    source = dir_name + fname
    tmp = line_processor(open(source).read())
    tmp_dict = {"source": source, "text": None, "emotion": None, "augmented_text": tmp}
    data.append(tmp_dict)
data[0]

{'source': 'data/babylm_data/babylm_100M/bnc_spoken.train',
 'text': '‘The secret of happiness is to face the fact that the world is horrible, horrible, horrible.',
 'emotion': 'fear',
 'augmented_text': "Emma had always been afraid of the dark. She would hide under her covers, squeezing her eyes shut and praying for the sun to rise. One night, she heard a strange noise coming from her closet. She shook her head, trying to convince herself it was just her imagination, but the noise grew louder and more persistent. Suddenly, she felt a cold hand on her shoulder, and she froze. She looked up, and her eyes widened in fear as she saw a shadowy figure standing in front of her. Emma tried to scream, but no sound came out. She felt a hand over her mouth, and she struggled to breathe. Just when she thought she was going to be taken away, the hand was lifted, and the figure disappeared. Emma sat up, panting and shaking. She realized that she had been so scared that she had imagined the whole th

In [3]:
"""
Source distribution of corpus
"""
source_count = Counter([d['source'] for d in data])
display(source_count)

"""
Word count of corpus (8.9M words ~ strict-small track)
"""
total_word_count = sum([len(d['augmented_text'].split()) for d in data])
print(total_word_count, total_word_count / len(data))

"""
Save the pretraining corpus
"""
pickle.dump(data, open("babylm_pretrain_corpus/babylm_emo_wiki_10M.pkl", "wb"))

Counter({'data/babylm_data/babylm_100M/bnc_spoken.train': 20368,
         'data/babylm_data/babylm_100M/aochildes.train': 11859,
         'data/babylm_data/babylm_100M/cbt.train': 871,
         'data/babylm_data/babylm_10M/simple_wikipedia.train': 1,
         'data/babylm_data/babylm_10M/wikipedia.train': 1})

8929873 269.7846827794562


In [4]:
"""
EDA for grid search results
"""
dir_name = "babylm_augment_gridsearch/"
fnames = [dir_name + f for f in os.listdir(dir_name) if f.endswith(".pkl")]

n = len(pickle.load(open(fnames[0], "rb")))
idx = random.randint(0, n)

conditions = {
    # "temperature": 0.7,
    "max_new_tokens": 256,
    "top_p": 1.,
    # "do_sample": True,
}

for fname in fnames:
    data = pickle.load(open(fname, "rb"))
    if all([data[idx]["kwargs"][k] == v for k, v in conditions.items()]):
        print(data[idx]["kwargs"])
        print(data[idx]["text"].replace("\n", " "))
        print(data[idx]["augmented_text"].replace("\n", " ")[:200])
        print("------")

{'max_new_tokens': 256, 'temperature': 0.8, 'top_p': 1.0, 'do_sample': True}
i don't really want you to get that job anyway so.
When I was a little girl, I loved to paint. I would spend hours creating beautiful art with all my colors and brushes. My teacher thought it was great and encouraged me to keep drawing. One day, I fou
------
{'max_new_tokens': 256, 'temperature': 0.7, 'top_p': 1.0, 'do_sample': True}
i don't really want you to get that job anyway so.
Once upon a time, there was a young girl named Lily. She had worked hard to get an interview for her dream job, and she was excited to start her new career. However, when she got the job offer, she wa
------
{'max_new_tokens': 256, 'temperature': 1.2, 'top_p': 1.0, 'do_sample': True}
i don't really want you to get that job anyway so.
As soon as Emily landed the job of her dreams, her grandmother praised her effort. "That's great, honey! You worked hard for that!" Emily smiled widely, relieved that her grandmother didn't disapprove