# Finetuning notebook

This notebook is based on the fine-tuning notebooks provided by Unsloth:
https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing

In [None]:
from unsloth import FastLanguageModel
import torch
# Loader settings; `max_seq_length` is also handed to the SFTTrainer further down.
max_seq_length: int = 8192  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit: bool = True  # Use 4bit quantization to reduce memory usage. Can be False.

# Load the pre-quantized checkpoint and its tokenizer in a single call.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/gemma-2-2b-it-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

In [None]:
# Sanity check: render a two-turn conversation through the tokenizer's chat
# template as plain text (tokenize=False) so the turn markers can be inspected.
messages: list[dict[str, str]] = [
    {
        "role": "user",
        "content": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n",
    },
    {"role": "assistant", "content": "Hi this is me"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=False,
    return_tensors="pt",
    tokenize=False,
)
# Bare expression: the notebook displays the rendered string as the cell output.
inputs

In [None]:
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from unsloth import FastLanguageModel, is_bf16_supported, is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only
import torch
import datetime
from datasets import load_dataset


# Wrap the loaded base model with LoRA adapters on the attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model,
    r=8,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0.1,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

# Training and validation data come from two revisions of the same dataset repository.
dataset = load_dataset("chris7374/esg-net-zero", revision="100_per_class_v3")
val_dataset = load_dataset("chris7374/esg-net-zero", revision="validation")

# User-turn template. The single `{}` placeholder is filled with the text to
# classify via `prompt.format(context)` in `formatting_prompts_func`.
# NOTE: this is a regular (non-raw) string, so the `\n` in the formatting
# section below is an escape sequence and becomes a real line break.
prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an expert ESG (Environmental, Social, and Governance) analyst who conducts ESG research by analyzing texts to identify the presence of climate balance targets. Your primary task is to classify identified targets into one of four predefined classes and determine the target year for the climate balance target. Only consider overall climate balance targets, meaning that they are company-wide.
The possible classes are “Carbon neutral(ity)”, “Emissions reduction target”, “Net zero”, and “No target”.
Each class has equal importance, and the correct classification should reflect the most explicit target mentioned in the text. In cases where multiple classes are present:
	•	“Net zero” should only be prioritized if explicitly mentioned as a company’s overarching target.
	•	“Carbon neutral(ity)” takes precedence over “Emissions reduction target” only if it is the primary focus of the text.
	•	“Emissions reduction target” should be classified if it is directly stated and not overshadowed by “Net zero” or “Carbon neutral(ity)” commitments.
	•	If no explicit target is mentioned, classify as “No target”.
Ensure the classification is based on explicit information from the text, without assuming that one target implies another unless clearly stated.

### Context:
{}

### Response Formatting: 
Only answer in the following XML format:\n<answer><classification><end_target>Target</end_target></classification><extraction><end_target_year>Year</end_target_year></extraction><quote>...</quote></answer>
"""

# Assistant-turn template. The three `{}` placeholders are filled, in order,
# with the target class, the target year and the supporting quote via
# `output.format(end_target, end_target_year, quote)` in `formatting_prompts_func`.
output = """
<answer>
<classification>
<end_target>{}</end_target>
</classification>
<extraction>
<end_target_year>{}</end_target_year>
</extraction>
<quote>{}</quote>
</answer>
"""

def formatting_prompts_func(examples):
    """Render every example of a batch as one chat-formatted training string.

    Args:
        examples: A batch keyed by column name. The columns 'end_target',
            'end_target_year', 'custom_text' and 'custom_short_description'
            are read and iterated in lockstep (the function is used with
            `Dataset.map(..., batched=True)`).

    Returns:
        A dict with a single key "text" holding one rendered conversation
        string per example, in input order.
    """

    def _render(target, year, passage, evidence):
        # One user turn (instruction + passage) followed by the gold answer.
        conversation = [
            {"role": "user", "content": prompt.format(passage)},
            {"role": "assistant", "content": output.format(target, year, evidence)},
        ]
        # https://huggingface.co/docs/transformers/main/chat_templating
        return tokenizer.apply_chat_template(
            conversation,
            add_generation_prompt=False,
            return_tensors="pt",
            tokenize=False,
            add_special_tokens=False,
        )

    rows = zip(
        examples['end_target'],
        examples['end_target_year'],
        examples['custom_text'],
        examples['custom_short_description'],
    )
    return {"text": [_render(*row) for row in rows]}
pass

# Add the rendered "text" column, keep the 'train' split and shuffle with a
# fixed seed so that runs are reproducible.
dataset = dataset.map(formatting_prompts_func, batched=True)['train'].shuffle(seed=1234)
train_data = dataset

# The validation revision is processed the same way (its examples are read from 'train').
val_dataset = val_dataset.map(formatting_prompts_func, batched=True)['train'].shuffle(seed=1234)

# Naming for the output folder and the run.
project = "esg"
base_model_name = "gemma2"
run_name = f"{base_model_name}-{project}"
# Checkpoints go into a sub-folder of "outputs". The previous plain string
# concatenation dropped the path separator and produced "outputsgemma2-esg".
output_dir = f"outputs/{run_name}"

# Core training hyperparameters, consumed by the TrainingArguments below.
num_train_epochs = 3
batch_size = 2
gradient_accumulation_steps = 4

# Templates needed for training on completions only. They are specific to the
# gemma-2b-it chat format and must be updated when a different model is used.
# The response template should start with the first tokens of the response
# turn; if there is also a system prompt, it should start with that.
# The markers can be read from the tokenizer's chat template, or by calling
# apply_chat_template without tokenizing.
instruction_template = "<start_of_turn>user"
response_template = "<start_of_turn>model"

# This data collator trains on the completions only: the input tokens are
# masked and ignored by the model.
# https://huggingface.co/docs/transformers/main_classes/data_collator
# https://huggingface.co/docs/trl/v0.11.4/en/sft_trainer#train-on-completions-only
collator = DataCollatorForCompletionOnlyLM(
    instruction_template=instruction_template,
    response_template=response_template,
    tokenizer=tokenizer,
    mlm=False,
)
# Supervised fine-tuning trainer.
# https://huggingface.co/docs/trl/main/en/sft_trainer#trl.SFTTrainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=val_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    data_collator=collator,
    # https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments
    args=TrainingArguments(
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=2e-4,
        warmup_ratio=0.1,
        # Use bf16 when the hardware supports it, otherwise fall back to fp16.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        num_train_epochs=num_train_epochs,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        do_eval=True,
        eval_strategy="steps",
        eval_steps=5,
        # The model gets saved every epoch. Alter this to "steps" for large datasets.
        save_strategy="epoch",
        #report_to="wandb",
        logging_steps=5,
        seed=3407,
        output_dir=output_dir,
        # A minute-resolution timestamp keeps run names distinct between launches.
        run_name=f"{run_name}-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')}",
    ),
)

In [None]:
# Record the properties of GPU 0 and the peak reserved memory (in GB, rounded
# to three decimals) before training starts.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)

# Run the fine-tuning; as the last expression, its result is shown as the cell output.
trainer.train()

In [None]:
# Report the GPU's total memory (in GB) next to the reserved-memory figure
# captured before training.
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

### Examples:
Input: "Our company aims to achieve carbon neutrality by 2040 through a combination of emissions reduction and carbon offsets."
Output:
<end_target>Carbon neutral(ity)</end_target>
<end_target_year>2040</end_target_year>
<quote>company aims to achieve carbon neutrality by 2040</quote>
Input: "We're committed to reducing our greenhouse gas emissions by 50% compared to 2015 levels by the year 2030. Our
journey to 2030 is focused on first reducing our scope 1, 2, and 3 greenhouse gas emissions by 50 percent compared with 2015, and investing in high-quality carbon removal solutions for the remaining emissions"
Output:
<end_target>Emissions reduction target</end_target>
<end_target_year>2030</end_target_year>
<quote>reducing our greenhouse gas emissions by 50% compared to 2015 levels by the year 2030</quote>
Input: "Our long-term vision is to achieve net zero emissions across our entire value chain by 2050."
Output:
<end_target>Net zero</end_target>
<end_target_year>2050</end_target_year>
<quote>long-term vision is to achieve net zero emissions (...) by 2050.</quote>
Input: "We recognize the importance of environmental sustainability and are continuously working to improve our operations."
Output:
<end_target>No target</end_target>
<end_target_year>No target</end_target_year>
<quote>None</quote>