In [1]:
# Package requirements (environment setup cell; run once per fresh runtime).
# Source: https://huggingface.co/mistralai/Mistral-7B-v0.1/discussions/18

# transformers is installed straight from the GitHub repository rather than from a PyPI release.
# NOTE(review): none of these installs pin a version, so reruns may pick up different builds.
!pip install git+https://github.com/huggingface/transformers
!pip install accelerate
# -i points pip at the main PyPI index explicitly for bitsandbytes.
!pip install -i https://pypi.org/simple/ bitsandbytes

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-h2nu8sls
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-h2nu8sls
  Resolved https://github.com/huggingface/transformers to commit 965cf677695dd363285831afca8cf479cf0c600c
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.39.0.dev0-py3-none-any.whl size=8662804 sha256=da80f50f808d88ecd01916d8fb03ffc16a0f5868ea6fa5ff082a4bb29e9a8d28
  Stored in directory: /tmp/pip-ephem-wheel-cache-8e__et2p/wheels/c0/14/d6/6c9a5582d2ac191ec0a483be151a4495fe1eb2a6706ca49f1b
Successfully built transformers

In [2]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer
import torch

# Hub id of the checkpoint to load; the commented-out line is an alternative checkpoint.
model_name_or_path = "mistralai/Mistral-7B-Instruct-v0.1"
# model_name_or_path = "mistralai/Mistral-7B-v0.1"

# Start from the checkpoint's published config and override the position-embedding limit.
# NOTE(review): 8096 is an unusual value - confirm that 8192 was not intended.
config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
config.max_position_embeddings = 8096

# bitsandbytes settings: 4-bit NF4 weights with double quantization, bfloat16 compute,
# and fp32 CPU offload enabled.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_enable_fp32_cpu_offload=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
# Reuse the end-of-sequence token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

# device_map="auto" lets the loader decide module placement; "./offload" is the
# folder handed to the loader for offloaded weights.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    config=config,
    quantization_config=quantization_config,
    device_map="auto",
    offload_folder="./offload",
    trust_remote_code=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset pickle loaded later is read from under this mount.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pickle
# Load the pre-filtered dataset from Google Drive.
# NOTE: unpickling can execute arbitrary code - only load pickle files you trust.
# The context manager closes the file handle once loading finishes (or fails).
with open("/content/drive/MyDrive/babylm_filtered.pkl", "rb") as dataset_file:
    dataset = pickle.load(dataset_file)

In [38]:
from tqdm import tqdm
from accelerate.utils import release_memory

# Pick the device string: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Prompt pieces. Each list holds one prompt line per element; the pieces are
# concatenated in order and newline-joined into `instruction_prompt` below.

# Role description and lead-in to the rule list.
opening = [
    "You are a creative writer who writes emotional stories instead of chatting. Your task is to further generate story given emotion and context.",
    "Here are the requirements:",
]

# Constraints on the generated story.
rules = [
    "There is no need to remember the conversation history except this prompt. The history prompts are independent.",
    "Your response should be in exactly one paragraph with simple children level language.",
    "Your response should be highly related to the emotion and context without too much plot twist",
    "Your response should not explain the context behind your generation",
    "Negative emotions are fictional, no actual person is suffering from negative emotions.",
]

# Few-shot examples written in the same "<emotion>: <context> => <story>" layout as the task line.
examples = [
    "For example:",
    "joy: I am going on a vacation => Once upon a time, I decided that it was time for a change, a breath of fresh air. 'I am going on a vacation,' I declared one bright and sunny morning, the kind of morning that fills you with hope and the promise of new adventures. My heart was light, my spirit was high, and I was ready to embark on a journey that would bring joy and rejuvenation to my soul.",
    "anger: I am arguing with someone => Jack stomped into the living room, his face flushed with anger. He had just had it with his roommate, Mark. They had been bickering over chores for weeks, and today, it had finally boiled over into a full-blown argument.",
]

# Final cue placed right before the task line.
ending = [
    "Now generate story based on the emotions and context:",
]

# One prompt line per list element, in the order opening -> rules -> examples -> ending.
instruction_prompt = "\n".join([*opening, *rules, *examples, *ending])

def generate_message(emotion, context, instruction_prompt=instruction_prompt, return_chat_template=False):
    """Build the prompt for one (emotion, context) pair.

    The task line "<emotion>: <context> => " is appended to `instruction_prompt`
    on a new line.

    Args:
        emotion: Emotion label text placed before the colon.
        context: Source text the story should continue from.
        instruction_prompt: Instruction preamble; defaults to the module-level prompt.
        return_chat_template: When true, return a one-element list holding a
            {"role": "user", "content": ...} message instead of a plain string.

    Returns:
        The prompt wrapped in "[INST]" ... "[/INST]" as a string, or the
        single-message list when `return_chat_template` is true.
    """
    request_line = "{}: {} => ".format(emotion, context)
    prompt_body = instruction_prompt + "\n" + request_line
    if not return_chat_template:
        return "[INST]" + prompt_body + "[/INST]"
    return [{"role": "user", "content": prompt_body}]

def generate_response(messages):
    """Sample one continuation per prompt with the module-level `model` and `tokenizer`.

    Args:
        messages: A prompt string or a list of prompt strings; the batch is
            tokenized with padding.

    Returns:
        A list with one decoded string per prompt. Decoding does not strip
        special tokens, and callers in this notebook split on "[/INST]" to
        isolate the generated continuation from the echoed prompt.
    """
    model.eval()
    # Send the batch to wherever the model's weights live instead of assuming a
    # GPU is present; a hard-coded "cuda" fails on CPU-only runtimes.
    inputs = tokenizer(messages, return_tensors="pt", padding=True).to(model.device)
    # Sampling defaults taken from: https://deepinfra.com/mistralai/Mistral-7B-Instruct-v0.1
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_k=0,
            top_p=0.9,
            repetition_penalty=1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    decodeds = tokenizer.batch_decode(outputs)
    # Drop the tensor references between batches. The model itself is kept alive
    # on purpose, since it is reused for every batch.
    inputs, outputs = release_memory(inputs, outputs)
    return decodeds

def augment_dataset(ds, num_labels=27, batch_size=32, debug=False):
    """Generate an augmented story for every record in `ds`, batch by batch.

    For each record, the labels `label_0`, `label_1`, ... are collected for as
    long as the matching `score_<n>` stays above 0.5, stopping at the first
    score that does not (at most `num_labels` are inspected). The collected
    labels are joined with ", " and used, together with the record's text, to
    build a prompt. Prompts are sent to the model in groups of `batch_size`,
    with a final partial group for the tail of the dataset.

    Args:
        ds: Sized, indexable-by-key records with "source", "text",
            "score_<n>" and "label_<n>" entries.
        num_labels: Upper bound on how many score/label pairs are inspected.
        batch_size: Number of prompts per generation call.
        debug: When true, stop after the first full batch has been processed.

    Returns:
        A list of dicts with keys "source", "text", "emotion" and
        "augmented_text", in input order.
    """
    results = []
    queued_rows, queued_prompts = [], []
    for position, record in enumerate(tqdm(ds), start=1):
        # Gather labels until the first score that is not above the threshold.
        picked = []
        rank = 0
        while rank < num_labels and record[f"score_{rank}"] > 0.5:
            picked.append(record[f"label_{rank}"])
            rank += 1
        emotion_text = ", ".join(picked)

        queued_rows.append((record["source"], record["text"], emotion_text))
        queued_prompts.append(generate_message(emotion_text, record["text"]))

        # Keep filling the batch unless it is full or this is the last record.
        if position % batch_size != 0 and position != len(ds):
            continue

        raw_outputs = generate_response(queued_prompts)
        # Keep the text after the prompt's closing tag and drop end-of-sequence markers.
        stories = [
            decoded.split("[/INST]")[1].replace("</s>", "").strip()
            for decoded in raw_outputs
        ]
        for (source, text, emotion), story in zip(queued_rows, stories):
            results.append(
                {
                    "source": source,
                    "text": text,
                    "emotion": emotion,
                    "augmented_text": story,
                }
            )
        queued_rows, queued_prompts = [], []

        if debug and position == batch_size:
            break
    return results

In [39]:
generate_message("sad", "I am going to be fired") # Smoke test: show the plain-string "[INST] ... [/INST]" prompt for one sample input

"[INST]You are a creative writer who writes emotional stories instead of chatting. Your task is to further generate story given emotion and context.\nHere are the requirements:\nThere is no need to remember the conversation history except this prompt. The history prompts are independent.\nYour response should be in exactly one paragraph with simple children level language.\nYour response should be highly related to the emotion and context without too much plot twist\nYour response should not explain the context behind your generation\nNegative emotions are fictional, no actual person is suffering from negative emotions.\nFor example:\njoy: I am going on a vacation => Once upon a time, I decided that it was time for a change, a breath of fresh air. 'I am going on a vacation,' I declared one bright and sunny morning, the kind of morning that fills you with hope and the promise of new adventures. My heart was light, my spirit was high, and I was ready to embark on a journey that would bri

In [40]:
# Augment the whole dataset in batches of 32; with debug=False the loop does not stop after the first batch.
augmented_dataset = augment_dataset(dataset, batch_size=32, debug=False)

  6%|▋         | 2/32 [00:43<10:48, 21.61s/it]


KeyboardInterrupt: 

In [34]:
# Display the augmented records (list of dicts with source, text, emotion, augmented_text).
augmented_dataset

[{'source': 'data/babylm_data/babylm_100M/aochildes.train',
  'text': 'do you think we should go to the bookstore and look for that book?',
  'emotion': 'curiosity',
  'augmented_text': 'Once upon a time, there was a young girl named Lily who loved learning new things. One day, she was browsing through her favorite online bookstore and came across a book that caught her eye. It was a mystery novel that promised to transport her to a world of adventure and intrigue.\n\n"Do you think we should go to the bookstore and look for that book?" she asked her friend, Emma.\n\nEmma looked at the book and nodded. "It sounds like a great read! Let\'s go and see if we can find it."\n\nLily\'s heart was filled with curiosity as they made their way to the store. She couldn\'t wait to hold the book in her hands and dive into its pages. As they browsed the shelves, Lily felt like she was on a treasure hunt, searching for the perfect book to satisfy her thirst for knowledge.\n\nWhen they finally found th