### Fine-tuning Llama 2 with QLoRA
https://blog.ovhcloud.com/fine-tuning-llama-2-models-using-a-single-gpu-qlora-and-ai-notebooks/

In [24]:
# !pip install -r requirements_ovhcloud.txt

In [25]:
# !lsb_release -a

In [26]:
# !export CUDA_VISIBLE_DEVICES=0

In [27]:
# !sudo chmod 777 /root

In [22]:
pd.set_option('display.max_colwidth', 150)

In [11]:
import argparse
import bitsandbytes as bnb
from datasets import load_dataset, Dataset
from functools import partial
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

from IPython.display import display, Markdown
import pandas as pd
import os

from peft import AutoPeftModelForCausalLM

### Download the dataset

In [12]:
# Alternative dataset path (disabled: it was always overwritten by the assignment below).
# PATH = "gs://kisai-data-msca310019-capstone/Text_Simplification/simplified_df_cefr_labeled.csv"
PATH = "gs://kisai-data-msca310019-capstone/raw_data.csv"

data = pd.read_csv(PATH)
data.head()

Unnamed: 0,source,target,source_level_og,target_level_og,data_source,data_type,source_level_cefr,target_level_cefr,id
0,British people are big tea drinkers. It is a t...,British people love tea. They drink it for dif...,3.0,2.0,BreakingNewsEnglish,text_simplification,,,TS000000001
1,Many people around the world stay at home and ...,Many people stay at home. They do not want to ...,3.0,2.0,BreakingNewsEnglish,text_simplification,,,TS000000002
2,Most of us don't really take much notice of ca...,We rarely notice car license plates. Maybe we ...,3.0,2.0,BreakingNewsEnglish,text_simplification,,,TS000000003
3,Italy's ruling party may introduce a new law t...,Italy wants to stop people using English words...,3.0,2.0,BreakingNewsEnglish,text_simplification,,,TS000000004
4,"Some people are very forgetful, while others c...","Some people are forgetful, while others rememb...",3.0,2.0,BreakingNewsEnglish,text_simplification,,,TS000000005


In [13]:
len(data)

791159

In [14]:
data.drop(["source_level_cefr", "target_level_cefr"], axis=1, inplace=True)

In [15]:
print("Length before dropping nan", len(data))
data.dropna(subset=['target_level_og', 'source_level_og'], inplace=True)
print("Length after dropping nan", len(data))

Length before dropping nan 791159
Length after dropping nan 12910


In [16]:
data["cefr_mapping"] = data["source_level_og"].astype(str) + "->" + data["target_level_og"].astype(str)
data["cefr_mapping"].value_counts()

cefr_mapping
2.0->1.0    4304
3.0->2.0    4303
3.0->1.0    4303
Name: count, dtype: int64

In [17]:
def generate_instruction_column(source_cefr, target_cefr):
    """Build the simplification instruction for one source/target CEFR level pair.

    Both levels are converted with ``int`` before being interpolated, so a
    value such as ``3.0`` is rendered as ``3``.
    """
    source_level = int(source_cefr)
    target_level = int(target_cefr)
    return f"Simplify the following context from CEFR Level {source_level} to CEFR Level {target_level}"

data["instruction"] = data.apply(lambda row: generate_instruction_column(row["source_level_og"], row["target_level_og"]), axis=1)

In [18]:
data.iloc[0]["instruction"]

'Simplify the following context from CEFR Level 3 to CEFR Level 2'

In [19]:
data.rename({'source': 'context', 'target': 'response'}, axis=1, inplace=True)
data = data[["instruction", "context", "response"]]

In [20]:
# Convert the DataFrame to a dictionary
data_dict = data.to_dict('list')

# Convert the dictionary to a Hugging Face Dataset
dataset = Dataset.from_dict(data_dict)

In [21]:
dataset

Dataset({
    features: ['instruction', 'context', 'response'],
    num_rows: 12910
})

In [22]:
print(f'Number of prompts: {len(dataset)}')
print(f'Column names are: {dataset.column_names}')

Number of prompts: 12910
Column names are: ['instruction', 'context', 'response']


In [23]:
data.head()

Unnamed: 0,instruction,context,response
0,Simplify the following context from CEFR Level...,British people are big tea drinkers. It is a t...,British people love tea. They drink it for dif...
1,Simplify the following context from CEFR Level...,Many people around the world stay at home and ...,Many people stay at home. They do not want to ...
2,Simplify the following context from CEFR Level...,Most of us don't really take much notice of ca...,We rarely notice car license plates. Maybe we ...
3,Simplify the following context from CEFR Level...,Italy's ruling party may introduce a new law t...,Italy wants to stop people using English words...
4,Simplify the following context from CEFR Level...,"Some people are very forgetful, while others c...","Some people are forgetful, while others rememb..."


### Preprocess the dataset

In [24]:
def create_prompt_formats(sample):
    """
    Build the full training prompt for one sample and store it under 'text'.

    The prompt is made of the intro blurb, the instruction, the input context
    (only when it is non-empty), the response and the end marker, joined by
    two newline characters.
    :param sample: Sample dictionary with 'instruction', 'context' and 'response' keys
    :return: The same sample object, with the formatted prompt added under 'text'
    """
    sections = [
        "Below is an instruction that describes a task. Write a response that appropriately completes the request.",
        f"### Instruction:\n{sample['instruction']}",
    ]

    # The input section is left out entirely when the context is empty/missing
    if sample["context"]:
        sections.append(f"Input:\n{sample['context']}")

    sections.append(f"### Response:\n{sample['response']}")
    sections.append("### End")

    sample["text"] = "\n\n".join(sections)
    return sample

In [25]:
# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def get_max_length(model):
    """Return the maximum sequence length declared on ``model.config``.

    The attributes "n_positions", "max_position_embeddings" and "seq_length"
    are checked in that order and the first truthy value is returned. When
    none of them is set, 1024 is used.
    :param model: Object exposing a ``config`` attribute
    :return: The maximum length found, or the default of 1024
    """
    max_length = None
    for length_setting in ("n_positions", "max_position_embeddings", "seq_length"):
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max length: {max_length}")
            break
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length


def preprocess_batch(batch, tokenizer, max_length):
    """
    Run the tokenizer over the 'text' entries of a batch.

    The tokenizer is called with ``max_length`` and ``truncation=True`` and its
    result is returned unchanged.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, max_length=max_length, truncation=True)
    return encoded


# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset: Dataset):
    """Format & tokenize the dataset so it is ready for training
    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    :param seed: Seed passed to the final shuffle
    :param dataset (Dataset): Dataset with 'instruction', 'context' and 'response' columns
    :return: The tokenized, filtered and shuffled dataset; the 'instruction',
        'context', 'response' and 'text' columns are removed
    """

    # Add prompt to each sample
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)

    # Apply preprocessing to each batch of the dataset and remove the
    # 'instruction', 'context', 'response' and 'text' fields
    _preprocessing_function = partial(preprocess_batch, max_length=max_length,
                                      tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=["instruction", "context", "response", "text"],
    )

    # Keep only samples strictly shorter than max_length. Tokenization already
    # truncates to max_length, so this also drops samples that hit the limit.
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle dataset
    dataset = dataset.shuffle(seed=seed)

    return dataset

### Create a bitsandbytes configuration

In [26]:
def create_bnb_config():
    """Build the bitsandbytes quantization settings used when loading the model.

    Requests 4-bit loading with the "nf4" quantization type, double
    quantization enabled and bfloat16 as the compute dtype.
    """
    return BitsAndBytesConfig(
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        load_in_4bit=True,
    )

# SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py

def find_all_linear_names(model):
    """Collect the names of the model's 4-bit linear layers to use as LoRA targets.

    Only the last component of each dotted module name is kept, duplicates are
    collapsed, and 'lm_head' is excluded from the result.
    """
    linear_cls = bnb.nn.Linear4bit
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, linear_cls)
    }

    leaf_names.discard('lm_head')  # needed for 16-bit
    return list(leaf_names)

def create_peft_config(modules):
    """
    Build the LoRA configuration for the given target modules
    :param modules: Names of the modules to apply Lora to
    :return: The LoraConfig instance
    """
    lora_settings = dict(
        r=8,  # dimension of the updated matrices
        lora_alpha=16,  # parameter for scaling
        target_modules=modules,
        lora_dropout=0.1,  # dropout probability for layers
        bias="none",
        task_type="CAUSAL_LM",
    )
    peft_config = LoraConfig(**lora_settings)
    print("Lora target_modules", str(modules))

    return peft_config

def print_trainable_parameters(model, use_4bit=False):
    """
    Prints the number of trainable parameters in the model.
    :param model: Object exposing ``named_parameters()``
    :param use_4bit: When True, the trainable parameter count is halved before printing
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    if use_4bit:
        # Integer division keeps the count an int; a float here would make the
        # ',d' format spec below raise ValueError.
        trainable_params //= 2

    # Avoid a ZeroDivisionError for a model without parameters
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"all params: {all_param:,d} || trainable params: {trainable_params:,d} || trainable%: {trainable_pct}"
    )

In [29]:
## Preprocess dataset
# NOTE: this cell reads `model`, `tokenizer` and `seed`, so it must run after the
# cells that define them; on a fresh kernel it fails with a NameError.
# NOTE(review): `dataset` is overwritten with its tokenized version, so the cell
# presumably cannot be re-run without rebuilding `dataset` first - confirm.

max_length = get_max_length(model)

dataset = preprocess_dataset(tokenizer, max_length, seed, dataset)

NameError: name 'model' is not defined

In [71]:
dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 12910
})

In [27]:
model_name = "meta-llama/Llama-2-7b-hf" 
seed = 42

### Train

In [66]:
f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB'

'5GB'

In [28]:
def load_model(model_name, bnb_config):
    """Load a causal language model quantized according to ``bnb_config``.

    :param model_name: Model identifier passed to ``from_pretrained``
    :param bnb_config: Quantization config passed as ``quantization_config``
    :return: The loaded model
    """
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        # Quantization is driven by bnb_config alone; a separate load_in_8bit
        # flag would contradict its 4-bit settings.
        quantization_config=bnb_config,
        device_map="cuda:0",  # place the whole model on the first GPU
        trust_remote_code=True,
        #max_memory=f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB',
    )
    return model

def load_tokenizer(model_name):
    """Load the tokenizer for ``model_name`` and configure its padding.

    The pad token is set to the tokenizer's EOS token and the padding side to
    "right".
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)

    # Needed for LLaMA tokenizer
    loaded_tokenizer.pad_token = loaded_tokenizer.eos_token
    loaded_tokenizer.padding_side = "right"
    return loaded_tokenizer

In [69]:
# Load model from HF with user's token and with bitsandbytes config

bnb_config = create_bnb_config()
model = load_model(model_name, bnb_config)
tokenizer = load_tokenizer(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [72]:
def train(model, tokenizer, dataset, output_dir):
    """Fine-tune ``model`` with LoRA adapters and save the result to ``output_dir``.

    :param model: Base model to prepare for k-bit training and wrap with PEFT
    :param tokenizer: Tokenizer handed to the language-modeling data collator
    :param dataset: Training dataset passed to the Trainer
    :param output_dir: Directory the final model is saved to with
        ``save_pretrained`` (the TrainingArguments use the fixed "outputs"
        directory for their own output)
    """
    model.gradient_checkpointing_enable()

    # 2 - Using the prepare_model_for_kbit_training method from PEFT
    model = prepare_model_for_kbit_training(model)

    # Get lora module names
    modules = find_all_linear_names(model)

    # Create PEFT config for these modules and wrap the model to PEFT
    peft_config = create_peft_config(modules)
    model = get_peft_model(model, peft_config)

    # Print information about the percentage of trainable parameters
    print_trainable_parameters(model)

    # Training parameters
    trainer = Trainer(
        model=model,
        train_dataset=dataset,
        args=TrainingArguments(
            per_device_train_batch_size=4,
            gradient_accumulation_steps=4,
            # The run length is set by max_steps alone; a positive max_steps
            # overrides num_train_epochs, so no epoch count is given.
            max_steps=100,
            learning_rate=2e-4,
            fp16=True,
            logging_steps=5,
            output_dir="outputs",
            optim="paged_adamw_8bit",
        ),
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
    )

    model.config.use_cache = False  # re-enable for inference to speed up predictions for similar inputs

    ### SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py
    # Verifying the datatypes before training: element count per parameter dtype
    dtypes = {}
    for _, p in model.named_parameters():
        dtypes[p.dtype] = dtypes.get(p.dtype, 0) + p.numel()
    total = sum(dtypes.values())
    for k, v in dtypes.items():
        print(k, v, v/total)

    # Launch training
    print("Training...")

    train_result = trainer.train()
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print(metrics)

    # Saving model
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)

    # Free memory for merging weights
    del model
    del trainer
    torch.cuda.empty_cache()

In [73]:
output_dir = "results/kisai_llama2/final_checkpoint"
train(model, tokenizer, dataset, output_dir)

Lora target_modules ['up_proj', 'v_proj', 'q_proj', 'gate_proj', 'o_proj', 'k_proj', 'down_proj']
all params: 3,520,401,408 || trainable params: 19,988,480 || trainable%: 0.5677897967708119
torch.float32 282398720 0.08021776134910578
torch.uint8 3238002688 0.9197822386508943
Training...


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
5,1.797
10,1.4769
15,1.3805
20,1.3092
25,1.2944
30,1.2409
35,1.2468
40,1.2834
45,1.2764
50,1.2378


***** train metrics *****
  epoch                    =       0.12
  total_flos               = 26981134GF
  train_loss               =     1.2992
  train_runtime            = 0:51:01.98
  train_samples_per_second =      0.523
  train_steps_per_second   =      0.033
{'train_runtime': 3061.9885, 'train_samples_per_second': 0.523, 'train_steps_per_second': 0.033, 'total_flos': 2.897077254311117e+16, 'train_loss': 1.2991509819030762, 'epoch': 0.12}
Saving last checkpoint of the model...


### Model save

In [45]:
output_dir

'results/kisai_llama2/final_checkpoint'

In [75]:
%pwd

'/home/jupyter/keep-it-simple-ai/text_simplification/llama-2-qlora'

### Transferring the model to GCS

gsutil cp -r keep-it-simple-ai/text_simplification/llama-2-qlora/results/kisai_llama2/final_checkpoint/ gs://kisai-data-msca310019-capstone/Text_Simplification/models/kisai-llama-2-4bit/version_2/

## Inference

In [30]:
output_dir = "results/kisai_llama2/final_checkpoint"

In [31]:
# Load the fine-tuned checkpoint saved under output_dir
model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    low_cpu_mem_usage=True,
    device_map = "auto",
)

# Merge LoRA and base model
# NOTE: later cells in this notebook use `merged_model`; the only visible
# definition is the commented-out line below, so it has to be run first.
# merged_model = model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [32]:
tokenizer = load_tokenizer(model_name)



In [19]:
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7f9dddf03fd0>

In [108]:
torch.cuda.empty_cache()

In [91]:
type(model)

peft.peft_model.PeftModelForCausalLM

In [92]:
type(merged_model)

transformers.models.llama.modeling_llama.LlamaForCausalLM

In [79]:
merged_model = torch.compile(merged_model)

In [81]:
merged_model.config.use_cache = True
print(model.config)

LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": 

In [97]:
def make_inference(instruction, context = None):
    """Generate a completion for an instruction and return only the response text.

    Uses the module-level ``tokenizer`` and ``model``. The decoded output is
    reduced to the text that follows the prompt's own "Response:" marker, cut
    at the first "End of context:", "End of response:" or repeated "Response:"
    marker.
    :param instruction: Instruction placed in the "### Instruction:" section
    :param context: Optional text placed in the "### Input:" section
    :return: The extracted response text
    """
    if context:
        prompt = f"Below is an instruction that describes a task, paired with an input that provides further context.\n\n### Instruction: \n{instruction}\n\n### Input: \n{context}\n\n### Response: \n"
    else:
        prompt = f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction: \n{instruction}\n\n### Response: \n"

    inputs = tokenizer(prompt, return_tensors="pt", return_token_type_ids=False).to("cuda:0")
    outputs = model.generate(**inputs, max_new_tokens=150)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Skip past the prompt's own "Response:" marker. Counting the markers in the
    # prompt keeps this correct when the instruction or context contains one too.
    response_marker = "Response:"
    completion = response.split(response_marker, prompt.count(response_marker))[-1]

    # Trim only the generated part, so an end marker quoted in the prompt
    # can no longer cut the text before the response starts.
    for stop_marker in ("End of context:", "End of response:", response_marker):
        completion = completion.split(stop_marker)[0]

    return completion


def make_improvement_experimental(instruction, context=None):
    """Sample a completion from the module-level ``model`` and return the decoded text.

    The prompt contains an "### Input:" section only when ``context`` is truthy.
    The first generated sequence is decoded with special tokens kept.
    """
    prompt = (
        f"Below is an instruction that describes a task, paired with an input that provides further context.\n\n### Instruction: \n{instruction}\n\n### Input: \n{context}\n\n### Response: \n"
        if context
        else f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction: \n{instruction}\n\n### Response: \n"
    )
    encoded = tokenizer(prompt, return_tensors="pt", return_token_type_ids=False)
    with torch.inference_mode():
        generation_kwargs = dict(
            input_ids=encoded.input_ids,
            attention_mask=encoded.attention_mask,
            do_sample=True,
            max_new_tokens=250,
            use_cache=True,
        )
        generated = model.generate(**generation_kwargs)
    return tokenizer.decode(generated[0], skip_special_tokens=False)

In [99]:
%%timeit -r 3

answer = make_improvement_experimental(instruction="Summarize the following context. Your answer must be in English. End the response with the phrase 'End of response.'",  
                        context="Today Sunday 3 September (20.00) Serbia-Turkey-Euro 2023 final of women’s volleyball. TO Brussels (Belgium) the continental title is up for grabs and it will be decided who will succeed Italy in the golden register. It promises to be a particularly balanced and compelling, exciting and intense challenge, open to any result. It will be a cross between Italian coaches, given that coach Daniele Santarelli sits on the Anatolian bench, capable of beating Italy in the tie-break, and Giovanni Guidetti is on the Balkan bench.")

1min 37s ± 180 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


### Compiling the model

In [100]:
def make_improvement_experimental(instruction, context=None):
    """Sample a completion from the module-level ``merged_model`` and return the decoded text.

    The prompt contains an "### Input:" section only when ``context`` is truthy.
    The first generated sequence is decoded with special tokens kept.
    """
    if not context:
        prompt = f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction: \n{instruction}\n\n### Response: \n"
    else:
        prompt = f"Below is an instruction that describes a task, paired with an input that provides further context.\n\n### Instruction: \n{instruction}\n\n### Input: \n{context}\n\n### Response: \n"

    model_inputs = tokenizer(prompt, return_tensors="pt", return_token_type_ids=False)

    with torch.inference_mode():
        sequences = merged_model.generate(
            input_ids=model_inputs.input_ids,
            attention_mask=model_inputs.attention_mask,
            do_sample=True,
            max_new_tokens=250,
            use_cache=True,
        )

    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=False)

### Trying no_grad and .eval

In [104]:
def generate_prompt(instruction, context=None):
    """Build the instruction prompt, with an "### Input:" section when context is given.

    :param instruction: Instruction placed in the "### Instruction:" section
    :param context: Optional text placed in the "### Input:" section
    :return: The formatted prompt, ending with the "### Response:" header
    """
    if context:
        prompt = f"Below is an instruction that describes a task, paired with an input that provides further context.\n\n### Instruction: \n{instruction}\n\n### Input: \n{context}\n\n### Response: \n"
    else:
        prompt = f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction: \n{instruction}\n\n### Response: \n"
    # Without this return the function yields None and callers tokenize nothing
    return prompt

In [102]:
def make_improvement_experimental(instruction, context=None):
    """Sample a completion from ``merged_model`` in eval mode with gradients disabled.

    The prompt comes from ``generate_prompt``; the first generated sequence is
    decoded with special tokens kept and returned.
    """
    prompt_text = generate_prompt(instruction, context)
    encoded = tokenizer(prompt_text, return_tensors="pt", return_token_type_ids=False)
    merged_model.eval()

    sampling_kwargs = {
        "do_sample": True,
        "max_new_tokens": 250,
        "use_cache": True,
    }
    with torch.no_grad():
        generated = merged_model.generate(
            input_ids=encoded.input_ids,
            attention_mask=encoded.attention_mask,
            **sampling_kwargs,
        )
    return tokenizer.decode(generated[0], skip_special_tokens=False)

In [103]:
%%timeit -r 3

answer = make_improvement_experimental(instruction="Summarize the following context. Your answer must be in English. End the response with the phrase 'End of response.'",  
                        context="Today Sunday 3 September (20.00) Serbia-Turkey-Euro 2023 final of women’s volleyball. TO Brussels (Belgium) the continental title is up for grabs and it will be decided who will succeed Italy in the golden register. It promises to be a particularly balanced and compelling, exciting and intense challenge, open to any result. It will be a cross between Italian coaches, given that coach Daniele Santarelli sits on the Anatolian bench, capable of beating Italy in the tie-break, and Giovanni Guidetti is on the Balkan bench.")

1min 37s ± 446 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


### 2 GPUs

In [33]:
def generate_prompt(instruction, context=None):
    """Return the instruction prompt, adding an "### Input:" section when context is truthy.

    Both variants end with the "### Response:" header followed by a newline.
    """
    if not context:
        return f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction: \n{instruction}\n\n### Response: \n"
    return f"Below is an instruction that describes a task, paired with an input that provides further context.\n\n### Instruction: \n{instruction}\n\n### Input: \n{context}\n\n### Response: \n"

In [44]:
instruction="Summarize the following context. Your answer must be in English. End the response with the phrase 'End of response.'"  
context="Today Sunday 3 September (20.00) Serbia-Turkey-Euro 2023 final of women’s volleyball. TO Brussels (Belgium) the continental title is up for grabs and it will be decided who will succeed Italy in the golden register. It promises to be a particularly balanced and compelling, exciting and intense challenge, open to any result. It will be a cross between Italian coaches, given that coach Daniele Santarelli sits on the Anatolian bench, capable of beating Italy in the tie-break, and Giovanni Guidetti is on the Balkan bench."

In [34]:
def make_improvement_experimental(instruction, context=None):
    """Sample a completion from ``model`` with the inputs moved to "cuda".

    The prompt comes from ``generate_prompt``. The model is put in eval mode
    and generation runs with gradients disabled; the first generated sequence
    is decoded with special tokens kept and returned.
    """
    prompt_text = generate_prompt(instruction, context)

    encoded = tokenizer(text=prompt_text, return_tensors="pt", return_token_type_ids=False)
    encoded = encoded.to("cuda")
    model.eval()

    with torch.no_grad():
        generation_kwargs = dict(
            input_ids=encoded.input_ids,
            attention_mask=encoded.attention_mask,
            do_sample=True,
            max_new_tokens=250,
            use_cache=True,
        )
        generated = model.generate(**generation_kwargs)

    return tokenizer.decode(generated[0], skip_special_tokens=False)

In [49]:
%%timeit -r 3

answer = make_improvement_experimental(instruction=instruction, context=context)

29.3 s ± 6.3 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


### Inference speed comparison

41.7 seconds with merged model \
29.3 seconds when the quantized model is saved, and reloaded as llama 2 model

### Inferencing

In [26]:
make_inference(instruction="Simplify the following context from CEFR level 3 to CEFR level 2. End the response with the phrase 'End of response.'",  context="Today Sunday 3 September (20.00) Serbia-Turkey-Euro 2023 final of women’s volleyball. TO Brussels (Belgium) the continental title is up for grabs and it will be decided who will succeed Italy in the golden register. It promises to be a particularly balanced and compelling, exciting and intense challenge, open to any result. It will be a cross between Italian coaches, given that coach Daniele Santarelli sits on the Anatolian bench, capable of beating Italy in the tie-break, and Giovanni Guidetti is on the Balkan bench.")

Below is an instruction that describes a task, paired with an input that provides further context.

### Instruction: 
Simplify the following context from CEFR level 3 to CEFR level 2. End the response with the phrase 'End of response.'

### Input: 
Today Sunday 3 September (20.00) Serbia-Turkey-Euro 2023 final of women’s volleyball. TO Brussels (Belgium) the continental title is up for grabs and it will be decided who will succeed Italy in the golden register. It promises to be a particularly balanced and compelling, exciting and intense challenge, open to any result. It will be a cross between Italian coaches, given that coach Daniele Santarelli sits on the Anatolian bench, capable of beating Italy in the tie-break, and Giovanni Guidetti is on the Balkan bench.

### Response: 
Today, the final of women’s volleyball is on. It is a match between Serbia and Turkey. Serbia beat Italy to get to the final. It will be a good match.

### End of response: 

### 

In [27]:

make_inference(instruction="Simplify the following context from CEFR level 2 to CEFR level 1. End the response with the phrase 'End of response.'",  context="Today Sunday 3 September (20.00) Serbia-Turkey-Euro 2023 final of women’s volleyball. TO Brussels (Belgium) the continental title is up for grabs and it will be decided who will succeed Italy in the golden register. It promises to be a particularly balanced and compelling, exciting and intense challenge, open to any result. It will be a cross between Italian coaches, given that coach Daniele Santarelli sits on the Anatolian bench, capable of beating Italy in the tie-break, and Giovanni Guidetti is on the Balkan bench.")

Below is an instruction that describes a task, paired with an input that provides further context.

### Instruction: 
Simplify the following context from CEFR level 2 to CEFR level 1. End the response with the phrase 'End of response.'

### Input: 
Today Sunday 3 September (20.00) Serbia-Turkey-Euro 2023 final of women’s volleyball. TO Brussels (Belgium) the continental title is up for grabs and it will be decided who will succeed Italy in the golden register. It promises to be a particularly balanced and compelling, exciting and intense challenge, open to any result. It will be a cross between Italian coaches, given that coach Daniele Santarelli sits on the Anatolian bench, capable of beating Italy in the tie-break, and Giovanni Guidetti is on the Balkan bench.

### Response: 
Today Sunday 3 September (20.00) Serbia-Turkey-Euro 2023 final of women’s volleyball. The continental title is up for grabs. It promises to be a particularly balanced and compelling challenge, open to any result. It will be a cross between Italian coaches, given that coach Daniele Santarelli sits on the Anatolian bench, capable of beating Italy in the tie-break, and Giovanni Guidetti is on the Balkan bench.

### End of response: 
End of response.

### 

In [28]:
# ANSWER: Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.

make_inference(instruction="When did Virgin Australia start operating?",  
               context="Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.")

Below is an instruction that describes a task, paired with an input that provides further context.

### Instruction: 
When did Virgin Australia start operating?

### Input: 
Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.

### Response: 
Virgin Australia started operating on August 31st, 2000.

### 

In [29]:
# ANSWER: No. Stalemate is a drawn position. It doesn't matter who has captured more pieces or is in a winning position

make_inference(instruction="If I have more pieces at the time of stalemate, have I won?",  
               context="Stalemate is a situation in chess where the player whose turn it is to move is not in check and has no legal move. Stalemate results in a draw. During the endgame, stalemate is a resource that can enable the player with the inferior position to draw the game rather than lose. In more complex positions, stalemate is much rarer, usually taking the form of a swindle that succeeds only if the superior side is inattentive.[citation needed] Stalemate is also a common theme in endgame studies and other chess problems. The outcome of a stalemate was standardized as a draw in the 19th century. Before this standardization, its treatment varied widely, including being deemed a win for the stalemating player, a half-win for that player, or a loss for that player; not being permitted; and resulting in the stalemated player missing a turn. Stalemate rules vary in other games of the chess family.")

Below is an instruction that describes a task, paired with an input that provides further context.

### Instruction: 
If I have more pieces at the time of stalemate, have I won?

### Input: 
Stalemate is a situation in chess where the player whose turn it is to move is not in check and has no legal move. Stalemate results in a draw. During the endgame, stalemate is a resource that can enable the player with the inferior position to draw the game rather than lose. In more complex positions, stalemate is much rarer, usually taking the form of a swindle that succeeds only if the superior side is inattentive.[citation needed] Stalemate is also a common theme in endgame studies and other chess problems. The outcome of a stalemate was standardized as a draw in the 19th century. Before this standardization, its treatment varied widely, including being deemed a win for the stalemating player, a half-win for that player, or a loss for that player; not being permitted; and resulting in the stalemated player missing a turn. Stalemate rules vary in other games of the chess family.

### Response: 
No, I have not won.

### Endpoint: 
Stalemate is a draw.

### Context: 
Stalemate is a situation in chess where the player whose turn it is to move is not in check and has no legal move. Stalemate results in a draw. During the endgame, stalemate is a resource that can enable the player with the inferior position to draw the game rather than lose. In more complex positions, stalemate is much rarer, usually taking the form of a swindle that succeeds only if the superior side is inattentive. Stalemate is also a common theme in endgame studies and other chess problems. The outcome of a stalemate was standardized as a draw in the 19th century. Before this standardization, its treatment varied widely, including being deemed a win for the stalemating player, a half-win for that player, or a loss for that player; not being permitted; and resulting in the stalemated player missing a turn. Stalemate rules vary in other games of the chess family.

### Endpoint: 
Stalem

In [30]:
# Expected answer, per the passage below: the Stinger's top speed is 269 km/h (167 mph).
# Compare this reference value against the model's generated response in the cell output.

make_inference(instruction="Given this paragraph, what is the top speed of a Kia Stinger?",  
               context="Kia claims that the Stinger accelerates from zero to 100 km/h (62 mph) in 7.7, 6 and 4.9 seconds for the 2.2-liter diesel, 2.0-liter petrol and 3.3-liter petrol respectively. Schreyer reportedly drove a pre-production Stinger GT at a top speed of 269 km/h (167 mph) on the Autobahn. During a test by Car and Driver, an all-wheel-drive U.S. spec GT 3.3T with Michelin Pilot Sport 4 tires achieved 0–60 mph (0–97 km/h) in 4.6 seconds on the track, reached 0.91 g on the skidpad and was able to stop from 70 mph (113 km/h) in 164 feet (50 m). According to this publication, the U.S. model's top speed is governed at 167 mph (269 km/h) per Kia specs. In tests conducted by Motor Trend, the four-cylinder U.S. spec Stinger 2.0 RWD on Bridgestone Potenza tires reached 60 mph (97 km/h) in 6.6 seconds, completed the 1⁄4-mile (0.4 km) run in 15 seconds and stopped from 60 mph (97 km/h) in 126 feet (38 m). The average lateral acceleration recorded in track testing was 0.85 g.")

Below is an instruction that describes a task, paired with an input that provides further context.

### Instruction: 
Given this paragraph, what is the top speed of a Kia Stinger?

### Input: 
Kia claims that the Stinger accelerates from zero to 100 km/h (62 mph) in 7.7, 6 and 4.9 seconds for the 2.2-liter diesel, 2.0-liter petrol and 3.3-liter petrol respectively. Schreyer reportedly drove a pre-production Stinger GT at a top speed of 269 km/h (167 mph) on the Autobahn. During a test by Car and Driver, an all-wheel-drive U.S. spec GT 3.3T with Michelin Pilot Sport 4 tires achieved 0–60 mph (0–97 km/h) in 4.6 seconds on the track, reached 0.91 g on the skidpad and was able to stop from 70 mph (113 km/h) in 164 feet (50 m). According to this publication, the U.S. model's top speed is governed at 167 mph (269 km/h) per Kia specs. In tests conducted by Motor Trend, the four-cylinder U.S. spec Stinger 2.0 RWD on Bridgestone Potenza tires reached 60 mph (97 km/h) in 6.6 seconds, completed the 1⁄4-mile (0.4 km) run in 15 seconds and stopped from 60 mph (97 km/h) in 126 feet (38 m). The average lateral acceleration recorded in track testing was 0.85 g.

### Response: 
The top speed of a Kia Stinger is 274 km/h (170 mph).

### End:

### End of Context:

### End of Response:

### Response: 
The top speed of a Kia Stinger is 274 km/h (170 mph).

### End of Context:

### End of Response:

### Feedback:

Good job!

### End of Feedback:

### Begin Extra Response:

### Extra Response:

### End of Extra Response:

### End of Context:

### End of Response:

### Feedback:

Very good job!

### End of Feedback:

### Begin Extra Response:

### Extra Response:

### End of Extra Response:

### End of Context:

### End of Response:

### Feedback:

Great job!

### End of Feed

### Running Inference on the Text Simplification Dataset

In [35]:
# Preview the first rows of `data`; long text cells are truncated in this table view.
data.head()

Unnamed: 0,instruction,context,response
0,Simplify the following context from CEFR Level...,British people are big tea drinkers. It is a t...,British people love tea. They drink it for dif...
1,Simplify the following context from CEFR Level...,Many people around the world stay at home and ...,Many people stay at home. They do not want to ...
2,Simplify the following context from CEFR Level...,Most of us don't really take much notice of ca...,We rarely notice car license plates. Maybe we ...
3,Simplify the following context from CEFR Level...,Italy's ruling party may introduce a new law t...,Italy wants to stop people using English words...
4,Simplify the following context from CEFR Level...,"Some people are very forgetful, while others c...","Some people are forgetful, while others rememb..."


In [36]:
# Show the `context` value of the first row in full (the table preview truncates it).
data.iloc[0]["context"]

'British people are big tea drinkers. It is a tradition in Britain to drink tea for different occasions and reasons. People have it for breakfast, for when guests visit, and for tea breaks at work. People even "have a cuppa" when they talk about their personal problems. However, research from The Tea Group shows that herbal, fruit and other teas have become more popular than traditional English breakfast tea. Researchers conducted a survey of more than 2,000 tea lovers. Over half of people said their favourite tea was not the traditional variety. Over a fifth of people chose green tea as their favourite brew. Just over 20 per cent said Earl Grey was their number one.\r\n\r\nSales of traditional tea in the U.K. have been declining. Three years ago, a survey found that 54 per cent of Britons preferred English breakfast tea. The new research shows that breakfast tea is likely to continue to decline in popularity. The researchers found many other things about tea-drinking habits in the U.K

In [37]:
# Show the `response` value of the first row in full, for comparison with its `context`.
data.iloc[0]["response"]

'British people love tea. They drink it for different reasons – for breakfast, to give to guests, for tea breaks at work, and even when talking about their problems. New research shows that herbal, fruit and other teas are more popular than English breakfast tea. A survey of over 2,000 tea lovers showed less than half of people said they preferred traditional tea. Over a fifth of people said green tea was their favourite.\r\n\r\nSales of traditional tea have fallen. The research shows this will continue. The researchers found many other things about tea-drinking in the UK. The biggest reason for drinking tea was to relax. A quarter of people drink up to 10 cups a day. British people love milky and sugary tea. Nearly 45 per cent of them sweeten their tea with sugar. People with a sweet tooth put three teaspoons of sugar in their tea.'

In [38]:
# NOTE(review): `make_inference` is already called in an earlier cell, so this
# cell appears to redefine (shadow) an earlier version — confirm which one is intended.
def make_inference(instruction, context=None, max_new_tokens=250, do_sample=True, device="cuda"):
    """Generate a completion from the module-level ``model`` for one prompt.

    The prompt is built by ``generate_prompt(instruction, context)``, tokenized
    with the module-level ``tokenizer`` and passed to ``model.generate``.

    Args:
        instruction: Task description forwarded to ``generate_prompt``.
        context: Optional supporting text forwarded to ``generate_prompt``.
        max_new_tokens: Upper bound on the number of newly generated tokens.
            Defaults to 250, the value that used to be hard-coded here.
        do_sample: Forwarded to ``model.generate``. Defaults to True (as before),
            so results vary between runs unless a seed is fixed beforehand.
        device: Device the tokenized inputs are moved to. Defaults to "cuda",
            the value that used to be hard-coded here.

    Returns:
        str: The first generated sequence decoded with
        ``skip_special_tokens=False``, i.e. special tokens are kept in the text.
        NOTE(review): for causal LMs ``generate`` presumably returns the prompt
        tokens followed by the new ones, so the string would start with the
        prompt — confirm before storing it as the "response".
    """
    prompt = generate_prompt(instruction, context)

    # token_type_ids are not requested from the tokenizer, so only input_ids
    # and attention_mask are produced and moved to the target device.
    inputs = tokenizer(text=prompt, return_tensors="pt", return_token_type_ids=False).to(device)
    model.eval()

    # Gradients are not needed for generation.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            do_sample=do_sample,
            max_new_tokens=max_new_tokens,
            use_cache=True,
        )

    # Only the first sequence of the batch is decoded.
    decoded_response = tokenizer.decode(outputs[0], skip_special_tokens=False)
    return decoded_response

In [41]:
# Add a `model_response` column to `data`, initialised to the empty string for every row.
data["model_response"] = ""

In [42]:
# Rebinds PATH (earlier in the notebook it pointed at the raw-data CSV).
# The same location is read into `data_v2` below and later overwritten by the generation loop.
PATH = "gs://kisai-data-msca310019-capstone/Text_Simplification/finetuned_llama2_responses.csv"

In [43]:
# Load the CSV at PATH and add an empty `model_v2_response` column.
# Re-running this cell resets `model_v2_response` to "" for every row.
data_v2 = pd.read_csv(PATH)
data_v2["model_v2_response"] = ""

In [44]:
data_v2.head()

Unnamed: 0,instruction,context,response,model_response,oob_model_response,model_v2_response
0,Simplify the following context from CEFR Level...,Donald Trump is interested in buying Greenland...,Donald Trump is interested in buying Greenland...,\nDonald Trump is interested in buying Greenl...,\nDonald Trump is interested in buying Greenl...,
1,Simplify the following context from CEFR Level...,Everyone knows that children don't like eating...,Everyone knows children don't like eating gree...,\nChildren don't like eating greens. Parents ...,\nEveryone knows that children don't like eat...,
2,Simplify the following context from CEFR Level...,Archaeologists have found a big cemetery under...,Archaeologists found a big cemetery under the ...,\nArchaeologists have found a big cemetery un...,\nThe text has been simplified from CEFR Leve...,
3,Simplify the following context from CEFR Level...,Baby orangutans in Indonesia are going to scho...,Baby orangutans in Indonesia are going to scho...,\nBaby orangutans in Indonesia are going to s...,"\nAfter seven or eight years, the young apes ...",
4,Simplify the following context from CEFR Level...,New research says bald men should not worry ab...,Research says bald men should not worry or get...,\nBald men should not worry about their looks...,"\nThe bald men were rated as more attractive,...",


In [None]:
import time

# Number of rows between intermediate checkpoints of the results CSV.
SAVE_EVERY = 20

# Generate a response for every row of `data_v2`, storing it in
# `model_v2_response` and periodically writing the frame back to PATH.
try:
    for idx, row in data_v2.iterrows():
        print("Currently processing index ", idx)
        start = time.perf_counter()
        answer = make_inference(instruction=row["instruction"],
                                context=row["context"])
        data_v2.at[idx, 'model_v2_response'] = answer
        end = time.perf_counter()

        print(f"Finished processing index {idx} in {end-start} seconds.")

        # Checkpoint so a crash or kernel restart does not lose all progress.
        if idx % SAVE_EVERY == 0:
            print(f"Saving for batch {idx//SAVE_EVERY}")
            save_start = time.perf_counter()
            data_v2.to_csv(PATH, index=False)
            save_end = time.perf_counter()
            print(f"Finishes saving the batch {idx//SAVE_EVERY} in {save_end-save_start} seconds")
finally:
    # Without this, rows processed after the last multiple of SAVE_EVERY were
    # never written. Running it in `finally` also persists partial results
    # when the loop is interrupted or a row fails.
    print("Saving final results")
    data_v2.to_csv(PATH, index=False)

Currently processing index  0
Finished processing index 0 in 44.44794289399999 seconds.
Saving for batch 0
Finishes saving the batch 0 in 0.3567352040000742 seconds
Currently processing index  1
Finished processing index 1 in 31.098942822000026 seconds.
Currently processing index  2
Finished processing index 2 in 31.23130393100007 seconds.
Currently processing index  3
Finished processing index 3 in 31.160384366000017 seconds.
Currently processing index  4
Finished processing index 4 in 10.434447689999956 seconds.
Currently processing index  5
Finished processing index 5 in 14.49660225699995 seconds.
Currently processing index  6
Finished processing index 6 in 31.131434032000016 seconds.
Currently processing index  7
Finished processing index 7 in 19.292316589999928 seconds.
Currently processing index  8
Finished processing index 8 in 31.24450003000004 seconds.
Currently processing index  9
Finished processing index 9 in 32.066916752 seconds.
Currently processing index  10
Finished pro

In [None]:
# NOTE(review): prints the literal 'a'; looks like a leftover scratch cell — consider deleting.
print('a')

In [None]:
# NOTE(review): prints the literal 'a'; looks like a leftover scratch cell — consider deleting.
print('a')

In [None]:
# NOTE(review): prints the literal "a"; looks like a leftover scratch cell — consider deleting.
print("a")