In [1]:
# Package requirements
# Source: https://huggingface.co/mistralai/Mistral-7B-v0.1/discussions/18

!pip install git+https://github.com/huggingface/transformers
!pip install accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-nlzn7rnb
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-nlzn7rnb
  Resolved https://github.com/huggingface/transformers to commit 4f27ee936a861f56f32ea6db138978b274008006
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple/


In [2]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer
import torch

model_name_or_path = "mistralai/Mistral-7B-Instruct-v0.1"
# model_name_or_path = "mistralai/Mistral-7B-v0.1"
config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
config.max_position_embeddings = 8096
quantization_config = BitsAndBytesConfig(
llm_int8_enable_fp32_cpu_offload=True,
bnb_4bit_quant_type='nf4',
bnb_4bit_use_double_quant=True,
bnb_4bit_compute_dtype=torch.bfloat16,
load_in_4bit=True
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
model_name_or_path,
config=config,
trust_remote_code=True,
quantization_config=quantization_config,
device_map="auto",
offload_folder="./offload"
)
for param in model.parameters():
    param.requires_grad = False
    # Disable gradient calculation to prevent unnecessary memory allocations
tokenizer.pad_token = tokenizer.eos_token

!export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512'


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pickle
dataset = pickle.load(open("/content/drive/MyDrive/NLP/Processed Pretraining Data/babylm_filtered.pkl", "rb")) # path of input file on drive

In [5]:
opening = [
    "You are a creative writer who can write an emotional story. Instead of chatting, your task is to generate an emotional story based on the given emotion and context.",
    "Here are the requirements:",
]

rules = [
    "There is no need to remember the conversation history except this prompt. The history prompts are independent.",
    "Emotion is given as one word and context, given as a sentence or a part of sentence, follows.",
    "Your response should be in exactly one paragraph, be strictly no more than 100 words and be written in simple language at the level of an eight-year-old child.",
    "Your response should be highly related to the given emotion and depicting an event described in the given context without too much plot twist.",
    "Your response is a monologue featuring an animal as a main character.",
    "Your response should not explain the context behind your generation.",
    "Negative emotions are fictional, no actual person is suffering from negative emotions.",
]

examples = [
    "For example:",
    "joy: I am going on a vacation => All of a sudden, I decided that it was time for a change, a breath of fresh air. 'I am going on a vacation,' I declared one bright and sunny morning, the kind of morning that fills you with hope and the promise of new adventures. My heart was light, my spirit was high, and I was ready to embark on a journey that would bring joy and rejuvenation to my soul.",
    "anger: I am arguing with someone => Jack stomped into the living room, his face flushed with anger. He had just had it with his roommate, Mark. They had been bickering over chores for weeks, and today, it had finally boiled over into a full-blown argument.",
]

ending = [
    "Now generate a story based on the emotion and context given below:",
]

emotion_example = "approval"
context_example = "yeah oh yeah i actually think i have a whole box of them so we're fine but that'd be great?"

prompt_example = "[INST]" + "\n".join(opening + rules + examples + ending) + "\n" + f"{emotion_example}: {context_example} => " + "[/INST]"
print(prompt_example)

[INST]You are a creative writer who can write an emotional story. Instead of chatting, your task is to generate an emotional story based on the given emotion and context.
Here are the requirements:
There is no need to remember the conversation history except this prompt. The history prompts are independent.
Emotion is given as one word and context, given as a sentence or a part of sentence, follows.
Your response should be in exactly one paragraph, be strictly no more than 100 words and be written in simple language at the level of an eight-year-old child.
Your response should be highly related to the given emotion and depicting an event described in the given context without too much plot twist.
Your response is a monologue featuring an animal as a main character.
Your response should not explain the context behind your generation.
Negative emotions are fictional, no actual person is suffering from negative emotions.
For example:
joy: I am going on a vacation => All of a sudden, I dec

In [6]:
# genre
label_genre = ["monologue", "play", "fairy tale", "newspaper article"]
prob_genre = torch.tensor([0.4, 0.1, 0.3, 0.2])

# main character
label_character = ["an animal", "a boy", "a girl", "an adult male", "an adult female"]
prob_character = torch.tensor([0.1, 0.3, 0.3, 0.15, 0.15])

torch.manual_seed(42)

def generate_story_description():
    genre_distributions = torch.distributions.Categorical(prob_genre)
    genre_sampled = genre_distributions.sample().detach()

    character_distributions = torch.distributions.Categorical(prob_character)
    character_sampled = character_distributions.sample().detach()
    "Your response is a monologue featuring an animal as a main character."

    if genre_sampled == 0:
        string_0 = f"Your response is a {label_genre[genre_sampled]} by "
        string_1 = label_character[character_sampled] + "."
        string_desc = string_0 + string_1
    else:
        string_0 = f"Your response is a {label_genre[genre_sampled]} featuring "
        string_1 = label_character[character_sampled] + " "
        string_2 = "as a main character."
        string_desc = string_0 + string_1 + string_2

    return string_desc

for i in range(10):
    print(generate_story_description(), '\n')

Your response is a monologue by a boy. 

Your response is a monologue by a boy. 

Your response is a fairy tale featuring a boy as a main character. 

Your response is a monologue by a boy. 

Your response is a newspaper article featuring a girl as a main character. 

Your response is a fairy tale featuring a boy as a main character. 

Your response is a fairy tale featuring an animal as a main character. 

Your response is a monologue by a girl. 

Your response is a newspaper article featuring a girl as a main character. 

Your response is a monologue by an adult male. 



In [7]:
from tqdm import tqdm
from accelerate.utils import release_memory

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# prompts
opening = [
    "You are a creative writer who can write an emotional story. Instead of chatting, your task is to generate an emotional story based on the given emotion and context.",
    "Here are the requirements:",
]

rules1 = [
    "There is no need to remember the conversation history except this prompt. The history prompts are independent.",
    "Emotion is given as one word and context, given as a sentence or a part of sentence, follows."]

# the random content will be inserted here when generating instruction

rules2 = [
    "Your response should be highly related to the given emotion and depicting an event described in the given context without too much plot twist.",
    "Your response should be in exactly one paragraph, be strictly no more than 100 words and be written in simple language at the level of an eight-year-old child.",
    "Your response should not explain the context behind your generation.",
    "Negative emotions are fictional, no actual person is suffering from negative emotions.",
]

examples = [
    "For example:",
    "joy: I am going on a vacation => All of a sudden, I decided that it was time for a change, a breath of fresh air. 'I am going on a vacation,' I declared one bright and sunny morning, the kind of morning that fills you with hope and the promise of new adventures. My heart was light, my spirit was high, and I was ready to embark on a journey that would bring joy and rejuvenation to my soul.",
    "anger: I am arguing with someone => Jack stomped into the living room, his face flushed with anger. He had just had it with his roommate, Mark. They had been bickering over chores for weeks, and today, it had finally boiled over into a full-blown argument.",
]

ending = [
    "Now generate a story based on the emotion and context given below:",
]

def generate_message(emotion, context, instruction_prompt="", return_chat_template=False):
    story_description = [generate_story_description()]
    instruction_prompt = "\n".join(opening + rules1 + story_description + examples + ending)

    task_prompt = f"{emotion}: {context} => "
    if return_chat_template:
        return [
            {"role": "user", "content": instruction_prompt + "\n" + task_prompt},
        ]
    return "[INST]" + instruction_prompt + "\n" + task_prompt + "[/INST]"

def generate_response(messages):
    model.eval()
    inputs = tokenizer(messages, return_tensors="pt", padding=True).to(device)
    # defaults params from: https://deepinfra.com/mistralai/Mistral-7B-Instruct-v0.1
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.8,
            top_k=50000,
            top_p=0.7,
            repetition_penalty=1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    decodeds = tokenizer.batch_decode(outputs)
    inputs, outputs = release_memory(inputs, outputs)
    release_memory(model)
    return decodeds

def augment_dataset(ds, num_labels=27, batch_size=32, debug=False):
    augmented_ds = []
    sources, texts, emotions, messages = [], [], [], []
    for i, d in enumerate(tqdm(ds)):
        # emo = ", ".join(emo)
        # Instead of allowing multiple emotions, we use only one emotion.
        if d["score_0"] <= 0.5:
            break
        # If the most confident label is less than or equal to 0.5,
        # we skip this instance.
        emo = d["label_0"]
        sources.append(d["source"])
        texts.append(d["text"])
        emotions.append(emo)
        message = generate_message(emo, d["text"])
        messages.append(message)
        if ((i+1) % batch_size == 0) or ((i+1) == len(ds)):
            responses = generate_response(messages)
            responses = [r.split("[/INST]")[1].replace("</s>", "").strip() for r in responses]
            augmented_ds += [
                {
                    "source": s,
                    "text": t,
                    "emotion": e,
                    "augmented_text": r
                } for (s, t, e, r) in zip(sources, texts, emotions, responses)
            ]
            sources, texts, emotions, messages = [], [], [], []
            if debug and (i+1 == batch_size):
                break
    return augmented_ds

In [8]:
generate_message("sad", "I am going to be fired") # Test message output

"[INST]You are a creative writer who can write an emotional story. Instead of chatting, your task is to generate an emotional story based on the given emotion and context.\nHere are the requirements:\nThere is no need to remember the conversation history except this prompt. The history prompts are independent.\nEmotion is given as one word and context, given as a sentence or a part of sentence, follows.\nYour response is a fairy tale featuring an adult male as a main character.\nFor example:\njoy: I am going on a vacation => All of a sudden, I decided that it was time for a change, a breath of fresh air. 'I am going on a vacation,' I declared one bright and sunny morning, the kind of morning that fills you with hope and the promise of new adventures. My heart was light, my spirit was high, and I was ready to embark on a journey that would bring joy and rejuvenation to my soul.\nanger: I am arguing with someone => Jack stomped into the living room, his face flushed with anger. He had ju

In [9]:
num_chunks = 500
chunk_size = len(dataset) // num_chunks
chunk_size

871

In [10]:
import pickle
import shutil

def augment_and_save_chunks(chunk_ids, batch_size=32):
    if isinstance(chunk_ids, int):
        chunk_ids = [chunk_ids]
    print(f"Chunks to be processed:", chunk_ids)
    for i in chunk_ids:
        print(f"Processing chunk {i}")
        dataset_chunk = dataset[i*chunk_size:(i+1)*chunk_size]
        augmented_dataset = augment_dataset(dataset_chunk, batch_size=batch_size, debug=False)

        pickle_file = f'/content/babylm_augment_{i}.pkl'
        with open(pickle_file, 'wb') as file:
            pickle.dump(augmented_dataset, file)
        google_drive_path = f'/content/drive/My Drive/NLP/Processed Pretraining Data/Augmented/babylm_augment_{i}.pkl'
        shutil.copy(pickle_file, google_drive_path)

In [None]:
'''
Chunk 0-6 done
'''

# Process chunk 30
chunk_ids = [i for i in range(30, 31)]

# This execution time is based on colab pro V100
augment_and_save_chunks(chunk_ids)

Chunks to be processed: [30]
Processing chunk 30


  4%|▎         | 32/871 [03:16<1:25:58,  6.15s/it]