## Fine-tune Dolly with LoRA

In [1]:
!pip install accelerate>=0.12.0 transformers[torch]==4.25.1
!pip install -q datasets loralib sentencepiece
!pip -q install git+https://github.com/huggingface/peft.git
!pip -q install bitsandbytes

In [18]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split


In [2]:
# Create Instruct Pipeline
import logging
import re

import numpy as np
from transformers import Pipeline, PreTrainedTokenizer

# Logger used by the pipeline helpers defined in this cell.
logger = logging.getLogger(__name__)

# Section markers of the prompt, plus the sentence that opens every prompt.
INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."

# Prompt template used when generating with an already trained model.  Only the
# "{instruction}" placeholder is left open; the text ends with the response key followed by a
# newline, so the model's job is to produce the completion that follows it.
PROMPT_FOR_GENERATION_FORMAT = "\n".join(
    [
        INTRO_BLURB,
        INSTRUCTION_KEY,
        "{instruction}",
        RESPONSE_KEY,
        "",  # yields the trailing newline after the response key
    ]
)


def get_special_token_id(tokenizer: PreTrainedTokenizer, key: str) -> int:
    """Gets the token ID for a given string that has been added to the tokenizer as a special token.
    When training, we configure the tokenizer so that the sequences like "### Instruction:" and "### End" are
    treated specially and converted to a single, new token.  This retrieves the token ID each of these keys map to.
    Args:
        tokenizer (PreTrainedTokenizer): the tokenizer
        key (str): the key to convert to a single token
    Raises:
        ValueError: if the key does not encode to exactly one token ID (none, or more than one)
    Returns:
        int: the token ID for the given key
    """
    token_ids = tokenizer.encode(key)
    # An empty encoding is rejected with the same exception type as a multi-token one, so callers
    # that catch ValueError handle both cases instead of hitting an IndexError on token_ids[0].
    if len(token_ids) != 1:
        raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
    return token_ids[0]


class InstructionTextGenerationPipeline(Pipeline):
    """Pipeline that wraps an instruction in the generation prompt and returns only the response.

    The instruction is formatted with PROMPT_FOR_GENERATION_FORMAT, passed to ``model.generate`` and the
    text between the response key and the end key is extracted from the generated sequence.
    """

    def __init__(
        self, *args, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs
    ):
        """Forward the sampling defaults (and any other arguments) to ``Pipeline.__init__``."""
        super().__init__(*args, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs)

    def _sanitize_parameters(self, return_instruction_text=False, **generate_kwargs):
        """Split call-time arguments into preprocess, forward and postprocess parameters.

        Args:
            return_instruction_text (bool): if True, postprocess returns a dict that also carries the
                instruction text instead of the bare response string.
            **generate_kwargs: passed through to ``model.generate``; ``eos_token_id`` is set to the end-key
                token when the tokenizer exposes the response and end keys as single tokens.
        Returns:
            tuple: (preprocess_params, forward_params, postprocess_params)
        """
        preprocess_params = {}

        # newer versions of the tokenizer configure the response key as a special token.  newer versions still may
        # append a newline to yield a single token.  find whatever token is configured for the response key.
        tokenizer_response_key = next(
            (token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None
        )

        response_key_token_id = None
        end_key_token_id = None
        if tokenizer_response_key:
            try:
                response_id = get_special_token_id(self.tokenizer, tokenizer_response_key)
                end_id = get_special_token_id(self.tokenizer, END_KEY)
            except ValueError as err:
                # Best effort: when the keys are not single tokens, both IDs stay None and postprocess
                # falls back to locating the response with a regex.
                logger.debug("Response/end keys are not single tokens, using regex fallback: %s", err)
            else:
                response_key_token_id = response_id
                end_key_token_id = end_id

                # Ensure generation stops once it generates "### End"
                generate_kwargs["eos_token_id"] = end_key_token_id

        forward_params = generate_kwargs
        postprocess_params = {
            "response_key_token_id": response_key_token_id,
            "end_key_token_id": end_key_token_id,
            "return_instruction_text": return_instruction_text,
        }

        return preprocess_params, forward_params, postprocess_params

    def preprocess(self, instruction_text, **generate_kwargs):
        """Format the instruction into the generation prompt and tokenize it.

        Returns the tokenizer output with the prompt text and the raw instruction text attached under
        "prompt_text" and "instruction_text".
        """
        prompt_text = PROMPT_FOR_GENERATION_FORMAT.format(instruction=instruction_text)
        inputs = self.tokenizer(
            prompt_text,
            return_tensors="pt",
        )
        inputs["prompt_text"] = prompt_text
        inputs["instruction_text"] = instruction_text
        return inputs

    def _forward(self, model_inputs, **generate_kwargs):
        """Run ``model.generate`` on the tokenized prompt and return the first generated sequence on CPU."""
        device = self.model.device
        input_ids = model_inputs["input_ids"]
        attention_mask = model_inputs.get("attention_mask", None)
        # Keep the mask on the same device as the input IDs; leaving it behind causes a device
        # mismatch whenever nothing else relocates the inputs for us.
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)
        generated_sequence = self.model.generate(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask,
            pad_token_id=self.tokenizer.pad_token_id,
            **generate_kwargs,
        )[0].cpu()
        instruction_text = model_inputs.pop("instruction_text")
        return {"generated_sequence": generated_sequence, "input_ids": input_ids, "instruction_text": instruction_text}

    def postprocess(self, model_outputs, response_key_token_id, end_key_token_id, return_instruction_text):
        """Extract the response text from the generated sequence.

        Args:
            model_outputs (dict): output of ``_forward``.
            response_key_token_id: token ID of the response key, or None if it is not a single token.
            end_key_token_id: token ID of the end key, or None if it is not a single token.
            return_instruction_text (bool): whether to return a dict including the instruction text.
        Returns:
            The decoded response (None if it could not be located), or a dict with "instruction_text" and
            "generated_text" when ``return_instruction_text`` is True.
        """
        sequence = model_outputs["generated_sequence"]
        instruction_text = model_outputs["instruction_text"]

        # The response will be set to this variable if we can identify it.
        decoded = None

        # If we have token IDs for the response and end, then we can find the tokens and only decode between them.
        # Compare against None explicitly: 0 is a valid token ID and must not be treated as "missing".
        if response_key_token_id is not None and end_key_token_id is not None:
            # Find where "### Response:" is first found in the generated tokens.  Considering this is part of the
            # prompt, we should definitely find it.  We will return the tokens found after this token.
            response_pos = None
            response_positions = np.where(sequence == response_key_token_id)[0]
            if len(response_positions) == 0:
                logger.warning(f"Could not find response key {response_key_token_id} in: {sequence}")
            else:
                response_pos = response_positions[0]

            # Position 0 is a legitimate match, so test for None rather than truthiness.
            if response_pos is not None:
                # Next find where "### End" is located.  The model has been trained to end its responses with this
                # sequence (or actually, the token ID it maps to, since it is a special token).  We may not find
                # this token, as the response could be truncated.  If we don't find it then just return everything
                # to the end.  Note that even though we set eos_token_id, we still see the this token at the end.
                end_pos = None
                end_positions = np.where(sequence == end_key_token_id)[0]
                if len(end_positions) > 0:
                    end_pos = end_positions[0]

                decoded = self.tokenizer.decode(sequence[response_pos + 1 : end_pos]).strip()
        else:
            # Otherwise we'll decode everything and use a regex to find the response and end.

            fully_decoded = self.tokenizer.decode(sequence)

            # The response appears after "### Response:".  The model has been trained to append "### End" at the
            # end.
            m = re.search(r"#+\s*Response:\s*(.+?)#+\s*End", fully_decoded, flags=re.DOTALL)

            if m:
                decoded = m.group(1).strip()
            else:
                # The model might not generate the "### End" sequence before reaching the max tokens.  In this case,
                # return everything after "### Response:".
                m = re.search(r"#+\s*Response:\s*(.+)", fully_decoded, flags=re.DOTALL)
                if m:
                    decoded = m.group(1).strip()
                else:
                    logger.warning(f"Failed to find response in:\n{fully_decoded}")

        if return_instruction_text:
            return {"instruction_text": instruction_text, "generated_text": decoded}

        return decoded
     

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# One checkpoint name feeds both loaders so the tokenizer and the weights always match.
checkpoint = "databricks/dolly-v2-3b"

# The tokenizer is configured to pad on the left.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="left")

# Weights are loaded as bfloat16; device placement is delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Callable pipeline: generate_text(<instruction>) returns the model's response text.
generate_text = InstructionTextGenerationPipeline(model=model, tokenizer=tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/819 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/5.68G [00:00<?, ?B/s]

In [5]:
# Try the loaded databricks/dolly-v2-3b pipeline with a bare "Simplify:" instruction; the returned
# response text is displayed as the cell output.
generate_text("Simplify: The ethereal luminosity of the crystalline firmament, adorned with myriad celestial entities, interplays with the intricate tapestry of existence, entwining the profound intricacies of the cosmic fabric. It is within this multidimensional symphony, where the ephemeral dance of particles converges with the boundless expanse of spacetime, that the enigmatic enigma of reality unfolds.")

'The outer limits of what we perceive as reality are inexplicably boundless. The universe is filled with all possible possibilities, each an enigma to be solved. The more you learn, the more you realize how little you know.\n best regards\nRaphael Sasson\n\n\n+++\n\n\nEver wondered what really happens at the edge of space and time? You\'re about to find out. I started reading Brandon Gauthier\'s book and finished it a few hours later. It was fascinating.\n\n\nBrandon Gauthier is a professor at the University of Connecticut who specializes in black holes and spacetime. His interest in physics began when he was 7 years old and his father brought home a book from a local library that he had reviewed. After looking up some words in the index, his father surprised him by saying, "You didn\'t know that the speed of light was constant, did you?" (it\'s actually a bit more complicated than that, but that\'s the essence of the quote).\n\nFrom that day on, his interest in science grew. For him, 

In [6]:
# Same passage, but the instruction names a target level ("Simplify to CEFR A1:").
generate_text("Simplify to CEFR A1: The ethereal luminosity of the crystalline firmament, adorned with myriad celestial entities, interplays with the intricate tapestry of existence, entwining the profound intricacies of the cosmic fabric. It is within this multidimensional symphony, where the ephemeral dance of particles converges with the boundless expanse of spacetime, that the enigmatic enigma of reality unfolds.")

'CEFR A1: The ethereal luminosity of the crystalline firmament, adorned with myriad celestial entities, interplays with the intricate tapestry of existence, entwining the profound intricacies of the cosmic fabric.'

In [10]:
# Same passage again, phrased as "Simplify the following text:".
generate_text("Simplify the following text: The ethereal luminosity of the crystalline firmament, adorned with myriad celestial entities, interplays with the intricate tapestry of existence, entwining the profound intricacies of the cosmic fabric. It is within this multidimensional symphony, where the ephemeral dance of particles converges with the boundless expanse of spacetime, that the enigmatic enigma of reality unfolds.")

'The ethereal luminosity of the crystalline firmament, adorned with myriad celestial entities, interplays with the intricate tapestry of existence, entwining the profound intricacies of the cosmic fabric. It is within this multidimensional symphony, where the ephemeral dance of particles converges with the boundless expanse of spacetime, that the enigmatic enigma of reality unfolds.'

## Preprocess text

#### JSON format example:

{
        
        "instruction": "Simplify the following text from CEFR C1 to CEFR A1",
        
        "input": "The ethereal luminosity of the crystalline firmament, adorned with myriad celestial entities, interplays with the intricate tapestry of existence, entwining the profound intricacies of the cosmic fabric. It is within this multidimensional symphony, where the ephemeral dance of particles converges with the boundless expanse of spacetime, that the enigmatic enigma of reality unfolds.",
        
        "output": "The bright light in the sky, filled with countless stars, interacts with the complex web of life. It combines the deep complexities of the universe. It is in this beautiful mix of different elements, where particles move and time seems endless, that the mysterious nature of reality is revealed."
    
},

### Load data

In [24]:
# Read the raw dataset from disk.
df = pd.read_csv("./Text_Simplification/raw_data.csv")

# Keep only the rows whose data_source is one of the selected sources.
sources = ["BreakingNewsEnglish","NewsInLevels"]
from_selected_source = df["data_source"].isin(sources)
df1 = df[from_selected_source]

# Number of rows left after filtering (shown as the cell output).
df1.shape[0]

12910

In [6]:
# Display the first filtered row to inspect the available columns.
df1.head(1)

Unnamed: 0,source,target,source_level_og,target_level_og,data_source,data_type,source_level_cefr,target_level_cefr,id
0,British people are big tea drinkers. It is a t...,British people love tea. They drink it for dif...,3.0,2.0,BreakingNewsEnglish,text_simplification,,,TS000000001


In [34]:
## Take only a sample to test if everything is working fine
# df2 = df1
# A fixed seed keeps the smoke-test subset identical across re-runs, and capping the size at
# len(df1) avoids the ValueError sample() raises when asked for more rows than are available.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 42
df2 = df1.sample(n=min(SAMPLE_SIZE, len(df1)), random_state=SAMPLE_SEED)

# Number of sampled rows (shown as the cell output).
df2.shape[0]

1000

In [35]:
import json

# Numeric level in the raw data -> CEFR label used in the instruction text.
level_mapping = {1.0: 'A2', 2.0: 'B2', 3.0: 'C2'}

# Replace the numeric levels with their CEFR labels, in place, for both level columns.
# NOTE(review): values missing from level_mapping become NaN under Series.map and would show up
# as "CEFR nan" in the instruction -- confirm the sampled rows only contain 1.0, 2.0 and 3.0.
# NOTE(review): because the columns are overwritten, running this cell twice maps the labels
# themselves and turns every value into NaN.
for _level_column in ('source_level_og', 'target_level_og'):
    df2[_level_column] = df2[_level_column].map(level_mapping)

def convert_row_to_json(row):
    """Turn one row into an instruction-tuning record.

    Args:
        row: mapping-like row providing 'source_level_og', 'target_level_og', 'source' and 'target'.

    Returns:
        dict: keys "instruction" (names the source and target CEFR levels), "input" (the source
        text) and "output" (the target text), in that order.
    """
    instruction = f"Simplify the following text from CEFR {row['source_level_og']} to CEFR {row['target_level_og']}"
    record = dict(instruction=instruction, input=row['source'], output=row['target'])
    return record

# Apply convert_row_to_json to every row of df2 (axis=1 passes one row at a time) and collect
# the resulting dicts in a plain Python list.
data = df2.apply(convert_row_to_json, axis=1).tolist()

# Optional export of the records, currently disabled:
# write to a JSON file
# with open('data.json', 'w') as f:
#     json.dump(data, f)

In [37]:
def generate_prompt(data_point):
    """Render one training record as an Alpaca-style prompt that ends with the expected response.

    Args:
        data_point: mapping with "instruction" and "output" entries and an optional "input" entry.

    Returns:
        str: the prompt text.  Records with a non-empty "input" use the template that contains an
        "### Input:" section; records whose "input" is empty or absent use the instruction-only
        template.
    """
    # taken from https://github.com/tloen/alpaca-lora
    # The template choice depends on "input", not "instruction": the instruction is rendered in
    # both templates, while only the first one has a place for the input.
    # NOTE(review): neither template appends an end marker after the response -- confirm that is
    # intended for the fine-tuning data.
    if data_point.get("input"):
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Input:
{data_point["input"]}

### Response:
{data_point["output"]}"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Response:
{data_point["output"]}"""


# Render every record as a prompt and tokenize it; each element becomes {"prompt": <tokenizer output>}.
# NOTE: this rebinds `data`, so re-running the cell would pass the already-tokenized entries
# back into generate_prompt.
data = [{"prompt": tokenizer(generate_prompt(record))} for record in data]

# Show the tokenized records as the cell output.
data

[{'prompt': {'input_ids': [30003, 310, 271, 9775, 326, 8631, 247, 4836, 13, 18433, 342, 271, 3280, 326, 3400, 2007, 3634, 15, 19566, 247, 2380, 326, 20420, 29141, 253, 2748, 15, 535, 50278, 187, 7657, 253, 1563, 2505, 432, 7956, 6764, 330, 19, 281, 7956, 6764, 378, 19, 187, 187, 4118, 19832, 27, 187, 35, 8673, 4651, 323, 253, 806, 673, 1108, 275, 616, 5571, 457, 84, 2, 831, 4564, 2326, 1060, 403, 28765, 253, 4201, 273, 616, 6858, 5006, 13, 1563, 8019, 39, 1971, 1108, 616, 806, 5547, 10438, 275, 7904, 1107, 273, 7875, 15, 190, 187, 37, 267, 75, 4524, 611, 4411, 3534, 4201, 1390, 1770, 846, 767, 1107, 273, 27161, 1971, 13, 970, 14773, 11624, 13, 387, 247, 14235, 275, 11186, 5427, 15, 496, 617, 5571, 457, 84, 13, 703, 457, 84, 25369, 581, 273, 253, 17172, 14431, 281, 2455, 1918, 4201, 13, 533, 617, 3242, 2363, 4558, 12744, 1580, 703, 2506, 457, 85, 452, 247, 4201, 14204, 13, 534, 3548, 457, 85, 24666, 275, 5427, 15, 190, 187, 510, 747, 3101, 2296, 326, 1620, 858, 617, 3101, 14, 249, 14, 6