In [1]:
import pickle
import os

import random
from collections import Counter

from pretraining.data_utils import line_processor
"""
Load augmented data chunks for prertraining corpus
"""
dir_name = "babylm_augment/"
fnames = [dir_name + f for f in os.listdir(dir_name) if f.endswith(".pkl")]

data = []
for i, fname in enumerate(fnames):
    tmp = pickle.load(open(fname, "rb"))
    print(i, fname, len(tmp))
    data += tmp

0 babylm_augment/babylm_augment_17.pkl 871
1 babylm_augment/babylm_augment_16.pkl 871
2 babylm_augment/babylm_augment_28.pkl 871
3 babylm_augment/babylm_augment_14.pkl 871
4 babylm_augment/babylm_augment_15.pkl 871
5 babylm_augment/babylm_augment_29.pkl 871
6 babylm_augment/babylm_augment_11.pkl 871
7 babylm_augment/babylm_augment_10.pkl 871
8 babylm_augment/babylm_augment_12.pkl 871
9 babylm_augment/babylm_augment_13.pkl 871
10 babylm_augment/babylm_augment_0.pkl 871
11 babylm_augment/babylm_augment_1.pkl 871
12 babylm_augment/babylm_augment_3.pkl 871
13 babylm_augment/babylm_augment_89.pkl 871
14 babylm_augment/babylm_augment_2.pkl 871
15 babylm_augment/babylm_augment_6.pkl 871
16 babylm_augment/babylm_augment_7.pkl 871
17 babylm_augment/babylm_augment_5.pkl 871
18 babylm_augment/babylm_augment_4.pkl 871
19 babylm_augment/babylm_augment_9.pkl 871
20 babylm_augment/babylm_augment_8.pkl 871
21 babylm_augment/babylm_augment_50.pkl 871
22 babylm_augment/babylm_augment_51.pkl 871
23 babyl

In [2]:
"""
Load factual data (simple wikipedia and wikipedia) for pretraining corpus
Use 10M version here to balance the size of the emotional corpus
"""
factuals = ["simple_wikipedia.train", "wikipedia.train"]
dir_name = "data/babylm_data/babylm_10M/"

for fname in factuals:
    source = dir_name + fname
    tmp = line_processor(open(source).read())
    tmp_dict = {"source": source, "text": None, "emotion": None, "augmented_text": tmp}
    data.append(tmp_dict)
data[0]

{'source': 'data/babylm_data/babylm_100M/bnc_spoken.train',
 'text': '‘The secret of happiness is to face the fact that the world is horrible, horrible, horrible.',
 'emotion': 'fear',
 'augmented_text': "Emma had always been afraid of the dark. She would hide under her covers, squeezing her eyes shut and praying for the sun to rise. One night, she heard a strange noise coming from her closet. She shook her head, trying to convince herself it was just her imagination, but the noise grew louder and more persistent. Suddenly, she felt a cold hand on her shoulder, and she froze. She looked up, and her eyes widened in fear as she saw a shadowy figure standing in front of her. Emma tried to scream, but no sound came out. She felt a hand over her mouth, and she struggled to breathe. Just when she thought she was going to be taken away, the hand was lifted, and the figure disappeared. Emma sat up, panting and shaking. She realized that she had been so scared that she had imagined the whole th

In [3]:
"""
Source distribution of corpus
"""
source_count = Counter([d['source'] for d in data])
display(source_count)

"""
Word count of corpus (8.9M words ~ strict-small track)
"""
total_word_count = sum([len(d['augmented_text'].split()) for d in data])
print(total_word_count, total_word_count / len(data))

"""
Save the pretraining corpus
"""
pickle.dump(data, open("babylm_pretrain_corpus/babylm_emo_wiki_10M.pkl", "wb"))

Counter({'data/babylm_data/babylm_100M/bnc_spoken.train': 20368,
         'data/babylm_data/babylm_100M/aochildes.train': 11859,
         'data/babylm_data/babylm_100M/cbt.train': 871,
         'data/babylm_data/babylm_10M/simple_wikipedia.train': 1,
         'data/babylm_data/babylm_10M/wikipedia.train': 1})

8929873 269.7846827794562


In [6]:
"""
EDA for grid search results
"""
dir_name = "babylm_augment_gridsearch/"
fnames = [dir_name + f for f in os.listdir(dir_name) if f.endswith(".pkl")]

n = len(pickle.load(open(fnames[0], "rb")))
idx = random.randint(0, n)

conditions = {
    # "temperature": 0.7,
    "max_new_tokens": 256,
    "top_p": 1.,
    # "do_sample": True,
}

for fname in fnames:
    data = pickle.load(open(fname, "rb"))
    if all([data[idx]["kwargs"][k] == v for k, v in conditions.items()]):
        print(data[idx]["kwargs"])
        print(line_processor(data[idx]["text"]))
        print(line_processor(data[idx]["augmented_text"][:400]))
        print("------")

{'max_new_tokens': 256, 'temperature': 0.8, 'top_p': 1.0, 'do_sample': True}
it wasn't like riding a real horse but it was kind of fun anyway.
Once upon a time, in a small town, there was a boy named Timmy who loved to ride his pretend horse around the neighborhood. It wasn't like riding a real horse, but it was still fun. Timmy would pretend to gallop down the street, with his hands on the reins and his feet on the ground that he pretended to be a horse's hooves. He would pretend to jump over fences and run through fields.Whenever Tim
------
{'max_new_tokens': 256, 'temperature': 0.7, 'top_p': 1.0, 'do_sample': True}
it wasn't like riding a real horse but it was kind of fun anyway.
Once upon a time, there was a little girl named Emily who loved animals. She dreamed of being a vet and helping all the creatures in need. One day, Emily's birthday surprise was a special adventure to a nearby farm where she could pet and feed all sorts of animals.As Emily walked through the farm, she saw a

In [2]:
"""
Visualize samples from different prompting methods
"""
sample_path = lambda i: f"babylm_augment_samples/babylm_augment_m{i}_samples.pkl"
pickle.load(open(sample_path(1), "rb"))[:2]

[{'source': 'data/babylm_data/babylm_100M/wikipedia.train',
  'text': 'The most famous hurricane in Canadian history struck on October 15, 1954, causing catastrophic flooding. Hurricane Hazel submerged low-lying land from Etobicoke to the Holland Marsh and left 81 people dead. No natural disaster since has led to such a high death toll in Canada. Over 4,000 families were left homeless.',
  'emotion': 'sadness',
  'prompt': "[INST]You are a creative writer who writes emotional stories instead of chatting. Your task is to further generate story given emotion and context.\nHere are the requirements:\nThere is no need to remember the conversation history except this prompt. The history prompts are independent.\nYour response should be in exactly one paragraph with simple children level language.\nYour response should be highly related to the emotion and context without too much plot twist\nYour response should not explain the context behind your generation\nNegative emotions are fictional,

In [3]:
pickle.load(open(sample_path(2), "rb"))[:2]

[{'source': 'data/babylm_data/babylm_100M/wikipedia.train',
  'text': 'The most famous hurricane in Canadian history struck on October 15, 1954, causing catastrophic flooding. Hurricane Hazel submerged low-lying land from Etobicoke to the Holland Marsh and left 81 people dead. No natural disaster since has led to such a high death toll in Canada. Over 4,000 families were left homeless.',
  'emotion': 'sadness',
  'prompt': "[INST]You are a creative writer who can write an emotional story. Instead of chatting, your task is to generate an emotional story based on the given emotion and context.\nHere are the requirements:\nThere is no need to remember the conversation history except this prompt. The history prompts are independent.\nEmotion is given as one word and context, given as a sentence or a part of sentence, follows.\nYour response is a monologue by a girl.\nFor example:\njoy: I am going on a vacation => All of a sudden, I decided that it was time for a change, a breath of fresh a

In [4]:
pickle.load(open(sample_path(3), "rb"))[:2]

[{'source': 'data/babylm_data/babylm_100M/wikipedia.train',
  'text': 'The most famous hurricane in Canadian history struck on October 15, 1954, causing catastrophic flooding. Hurricane Hazel submerged low-lying land from Etobicoke to the Holland Marsh and left 81 people dead. No natural disaster since has led to such a high death toll in Canada. Over 4,000 families were left homeless.',
  'emotion': 'sadness',
  'prompt': "[INST]\nObjective:\nCraft a story that emphasises a given emotion from an initial context, ensuring the narrative is accessible for children.\nGuidelines:\nUse simple, child-friendly language throughout the story, making it easy children to understand and connect with.Make sure the story deepens the reader's understanding of this emotion by creatively exploring the given emotion without introducing unrelated feelings.\nStart your story creatively, avoiding traditional opening lines to engage the reader immediately.\nWhen the story involves sad or negative emotions, 