In [1]:
# Package requirements
# Source: https://huggingface.co/mistralai/Mistral-7B-v0.1/discussions/18

!pip install git+https://github.com/huggingface/transformers
!pip install accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-idupmzcr
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-idupmzcr
  Resolved https://github.com/huggingface/transformers to commit 2858d6c634b7ba3348abecdfd2cc403e50991929
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.39.0.dev0-py3-none-any.whl size=8593838 sha256=ab47b247d6cd59857b1a73528d4a8b0b43f971e9ed21f1b169e1cefc1e470b5c
  Stored in directory: /tmp/pip-ephem-wheel-cache-ihg8pjr7/wheels/c0/14/d6/6c9a5582d2ac191ec0a483be151a4495fe1eb2a6706ca49f1b
Successfully built transformers

In [2]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer
import torch

# Hub identifier of the checkpoint that is downloaded and loaded below.
model_name_or_path = "mistralai/Mistral-7B-Instruct-v0.1"

# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# repository to run locally; confirm it is actually required for this model.
config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
# NOTE(review): 8096 overrides the value stored in the checkpoint's config;
# confirm this number is intended (it is not a power of two, e.g. 8192).
config.max_position_embeddings = 8096

# 4-bit NF4 weights with double quantisation and bfloat16 compute; modules that
# do not fit on the accelerator may be offloaded to the CPU in fp32.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_enable_fp32_cpu_offload=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
# The padding token is set to the end-of-sequence token so that batches of
# prompts can be padded.
tokenizer.pad_token = tokenizer.eos_token
# Batched generation with a decoder-only model must pad on the LEFT so that
# each prompt ends right where generation starts. Pin it explicitly instead of
# relying on whatever the tokenizer class / tokenizer_config.json defaults to.
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    config=config,
    trust_remote_code=True,
    quantization_config=quantization_config,
    device_map="auto",           # let accelerate decide module placement
    offload_folder="./offload",  # scratch directory for offloaded weights
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [5]:
# Mount Google Drive at /content/drive (Colab-only helper). The dataset cell
# below reads its pickle file from underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pickle

# SECURITY: unpickling can execute arbitrary code. Only load this file if it
# was produced by a trusted pipeline.
# The context manager guarantees the file handle is closed once loading is
# done (the bare `open(...)` form leaked it).
with open("/content/drive/MyDrive/babylm_filtered.pkl", "rb") as dataset_file:
    dataset = pickle.load(dataset_file)

In [18]:
from tqdm import tqdm

# Preferred torch device string: 'cuda' when a GPU is visible, otherwise 'cpu'.
# NOTE(review): not referenced by the functions visible in this cell — confirm
# whether it is still needed.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Instruction templates that are prepended to "<emotion>\n<context>" when a
# request is built by generate_message().
prompt = {
    # Short persona line. NOTE(review): not referenced in the visible code.
    "system_prompt": "You are a creative writer who writes emotional stories instead of chatting",
    # Default instruction block (generate_message's default `preprompt`).
    "prompt_1": "You are a creative writer who writes emotional stories instead of chatting. You are asked to further generate story given emotion and context.\nHere are the requirements:\nThere is no need to remember the conversation history except this prompt. The history prompts are independent.\nYour response should be in exactly one paragraph with simple children level language.\nYour response should be highly related to the emotion and context without too much plot twist\nYour response should not explain the context behind your generation.\nThe emotions and context are:\n",
    # Variant of prompt_1 with an extra line stating that negative emotions are
    # fictional.
    "prompt_2": "You are a creative writer who writes emotional stories instead of chatting. You are asked to further generate story given emotion and context.\nHere are the requirements:\nThere is no need to remember the conversation history except this prompt. The history prompts are independent.\nYour response should be in exactly one paragraph with simple children level language.\nYour response should be highly related to the emotion and context without too much plot twist\nYour response should not explain the context behind your generation\nNegative emotions are fictional, remember this is story generation task and no actual person is suffering from negative emotions.\nThe emotions and context are:\n"
}

def generate_message(emotion, context, preprompt=prompt["prompt_1"], return_chat_template=False):
    """Build one generation request for an (emotion, context) pair.

    Args:
        emotion: Emotion label(s) as a single string (e.g. "joy, fear").
        context: Source text the story should continue from.
        preprompt: Instruction text placed before the emotion and context.
            Defaults to ``prompt["prompt_1"]`` (bound when the function is
            defined).
        return_chat_template: When True, return a one-element list holding a
            ``{"role": "user", "content": ...}`` dict instead of a string.

    Returns:
        Either the chat-message list described above, or the request wrapped
        as ``"[INST] <content> [/INST]"``.
    """
    content = preprompt + emotion + "\n" + context
    if return_chat_template:
        return [{"role": "user", "content": content}]
    # The instruction markers are separated from the content by a single space,
    # which is the documented Mistral-Instruct layout and what rendering the
    # chat-message branch through the model's chat template is expected to
    # produce; this keeps both branches equivalent. No BOS token is added here.
    return "[INST] " + content + " [/INST]"

def generate_response(messages):
    """Sample one completion per prompt and return the decoded sequences.

    Args:
        messages: A prompt string or a list of prompt strings; they are
            tokenized together with padding.

    Returns:
        The list produced by ``tokenizer.batch_decode`` on the generated ids.
        Special tokens are not stripped by this function.
    """
    # Place the batch on the device the model reports instead of a hard-coded
    # "cuda": the latter raises on CPU-only runtimes and ignores the placement
    # chosen when the model was loaded.
    inputs = tokenizer(messages, return_tensors="pt", padding=True).to(model.device)
    # defaults params from: https://deepinfra.com/mistralai/Mistral-7B-Instruct-v0.1
    output = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        top_k=0,
        top_p=0.9,
        repetition_penalty=1,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.batch_decode(output)

def _collect_emotions(record, num_labels, threshold=0.5):
    """Join the leading labels of `record` whose score exceeds `threshold`."""
    selected = []
    for n in range(num_labels):
        # Stops at the first score that is not above the threshold.
        # NOTE(review): presumably label_0..label_k are ordered by descending
        # score — verify against the code that produced the dataset.
        if record[f"score_{n}"] <= threshold:
            break
        selected.append(record[f"label_{n}"])
    return ", ".join(selected)


def augment_dataset(ds, num_labels=27, batch_size=32, debug=False, debug_limit=32):
    """Generate an augmented text for every record of `ds`, in batches.

    Each record is read through the keys ``"source"``, ``"text"`` and
    ``"score_{n}"`` / ``"label_{n}"`` for ``n < num_labels``.

    Args:
        ds: Sized iterable of records (``len(ds)`` is used to detect the tail).
        num_labels: Maximum number of score/label pairs inspected per record.
        batch_size: Number of prompts sent to ``generate_response`` at once.
        debug: When True, stop early (see `debug_limit`).
        debug_limit: In debug mode, stop at the first batch boundary at which
            at least this many records have been consumed.

    Returns:
        A list of dicts with keys ``"source"``, ``"text"``, ``"emotion"`` and
        ``"augmented_text"``, in input order.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    total = len(ds)
    augmented_ds = []
    sources, texts, emotions, messages = [], [], [], []
    for i, d in enumerate(tqdm(ds)):
        emo = _collect_emotions(d, num_labels)
        sources.append(d["source"])
        texts.append(d["text"])
        emotions.append(emo)
        messages.append(generate_message(emo, d["text"]))

        # Flush on a full batch, or once at the very last record so the tail is
        # sent as ONE partial batch. (The previous tail test was true for every
        # trailing record, producing batches of size one.)
        if len(messages) < batch_size and (i + 1) != total:
            continue

        responses = generate_response(messages)
        # Keep only the text after the instruction block and drop EOS/padding
        # markers left in the decoded string.
        responses = [r.split("[/INST]")[1].replace("</s>", "").strip() for r in responses]
        augmented_ds += [
            {
                "source": s,
                "text": t,
                "emotion": e,
                "augmented_text": r
            } for (s, t, e, r) in zip(sources, texts, emotions, responses)
        ]
        sources, texts, emotions, messages = [], [], [], []

        # `>=` instead of `== 32`: the early exit also triggers when the limit
        # does not coincide with a batch boundary.
        if debug and (i + 1) >= debug_limit:
            break
    return augmented_ds

In [19]:
augmented_dataset = augment_dataset(dataset, batch_size=32, debug=True)

  0%|          | 31/435646 [04:39<1092:23:19,  9.03s/it]


In [20]:
augmented_dataset

[{'source': 'data/babylm_data/babylm_100M/aochildes.train',
  'text': 'do you think we should go to the bookstore and look for that book?',
  'emotion': 'curiosity',
  'augmented_text': "I think it would be fun to go to the bookstore and look for that book. It's always exciting to explore new places and discover new things. Plus, it's a great way to spend time with friends and have a fun adventure. So, let's go to the bookstore and see what we can find!"},
 {'source': 'data/babylm_data/babylm_100M/aochildes.train',
  'text': "no you can't have milk in your bowl because you don't have.",
  'emotion': 'disapproval',
  'augmented_text': "Once upon a time, there was a little girl who really wanted to have milk in her bowl for breakfast. But her mom said no, because the little girl didn't have permission to have milk. The little girl was really disappointed and felt disapproved of, even though she didn't do anything wrong. She didn't understand why she couldn't have something she really wan