In [1]:
# import holidays
# import pandas as pd
# from tqdm.auto import tqdm

# from src.data import german_regions
# from src.features.aggregation import treatment_unaggregated
# from src.paths import processed_data

# path = processed_data / "propensity_scores/nlp/transformer"
# path.mkdir(exist_ok=True, parents=True)


# def get_text_for_dates(start, end, df, region):
#     region_code = [a.code for a in german_regions if a.name == region][0]
#     df = df[df["region"] == region]
#     holi = holidays.Germany(years=range(2018, 2023), subdiv=region_code)
#     items = []
#     for date in pd.date_range(start, end):
#         text = ""
#         text += f"{region}, "
#         protests = df[df["date"] == date]
#         date_info = (
#             f"{date.strftime('%A')}, {date.day}. {date.month_name()} {date.year}"
#         )
#         if date in holi:
#             date_info += f", {holi[date]}"
#         text += date_info + "\n"
#         text += "has protests: " + ("yes" if len(protests) > 0 else "no") + "\n"
#         if len(protests) > 0:
#             text += "number of protests: " + str(len(protests)) + "\n"
#             for _, protest in protests.iterrows():
#                 text += f"{protest['actor']}: {protest['notes']}\n"
#         text += "\n"
#         items.append(text)
#     return items


# df = treatment_unaggregated("acled")
# df = df[df.country == "Germany"]
# protest_group = None
# if protest_group is not None:
#     df = df[(df.protest == protest_group)]
# X = []
# y = []
# for region in list(df.region.unique())[:1]:
#     for date in tqdm(pd.date_range("2020-02-01", "2022-12-31")):
#         start = date - pd.Timedelta(days=30)
#         items = get_text_for_dates(start, date, df, region)
#         i = items[-1].index("has protests: ") + len("has protests: ")
#         x = "\n".join(items[:-1]) + items[-1][:i]
#         X.append(x)
#         y.append(items[-1][i : i + 3].strip())
# df = pd.DataFrame({"text": X, "label": y})
# train_df = df.sample(frac=0.8, random_state=42)
# test_df = df.drop(train_df.index)
# # train_df.to_json(path / "train.jsonl", orient="records", lines=True)
# # test_df.to_json(path / "test.jsonl", orient="records", lines=True)

Sources:

- Llama 2 https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/
    - https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
- QLoRA https://arxiv.org/abs/2305.14314
    - https://huggingface.co/docs/transformers/v4.32.0/perf_train_gpu_one
    - https://huggingface.co/blog/4bit-transformers-bitsandbytes
    - https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing#scrollTo=jq0nX33BmfaC
    - https://blog.ovhcloud.com/fine-tuning-llama-2-models-using-a-single-gpu-qlora-and-ai-notebooks/
    - https://huggingface.co/docs/transformers/main/main_classes/trainer

In [2]:
# %pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 scikit-learn scipy xformers evaluate

In [3]:
import os

import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    logging,
    pipeline,
    AutoModelForSequenceClassification
)
from trl import SFTTrainer

In [4]:
model_name = "meta-llama/Llama-2-7b-chat-hf"
new_model = "llama-2-7b-custom"
max_seq_length = 1024

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"

In [6]:
path = "../data/processed/propensity_scores/nlp/transformer/"
train_dataset = load_dataset("json", data_files=path+"train.jsonl", split="train").shuffle(seed=100)
valid_dataset = load_dataset("json", data_files=path+"test.jsonl", split="train").shuffle(seed=100)

def mapper(examples):
    return {'text': examples["X"], "label": [int(y == "yes") for y in examples["y"]]}

train_dataset = train_dataset.map(mapper, batched=True, remove_columns=["X", "y"])
valid_dataset = valid_dataset.map(mapper, batched=True, remove_columns=["X", "y"])

def tokenize_(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_seq_length)

train_tokenized = train_dataset.map(tokenize_, batched=True, remove_columns=["text"])
valid_tokenized = valid_dataset.map(tokenize_, batched=True, remove_columns=["text"])
train_tokenized
# train_tokenized["decoder_input_ids"] = train_tokenized["input_ids"].copy()


Map:   0%|          | 0/13632 [00:00<?, ? examples/s]

Map:   0%|          | 0/3408 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 13632
})

In [7]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForSequenceClassification.from_pretrained(model_name, quantization_config=bnb_config, device_map={"": 0}) # max_memory={0: "24564MB"}
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [9]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [10]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)
modules

['o_proj', 'down_proj', 'q_proj', 'gate_proj', 'k_proj', 'v_proj', 'up_proj']

In [11]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=4, 
    lora_alpha=1, 
    # target_modules=modules,
    lora_dropout=0.05, 
    bias="none", 
    task_type="SEQ_CLS"
)

model_ = get_peft_model(model, config)
print_trainable_parameters(model_)

trainable params: 2113536 || all params: 3371454464 || trainable%: 0.06268914566600535


In [12]:
# from sklearn.metrics import f1_score
# from tqdm.auto import tqdm

# def classify(prompt):
#     pipe = pipeline(task="text-classification", model=model, tokenizer=tokenizer)
#     return pipe(prompt)[0]["label"] == "LABEL_1"

# n = 100
# y_pred = [classify(x) for x in tqdm(train_dataset["text"][:n])]
# f1_score(train_dataset["label"][:n], y_pred)

In [28]:
import evaluate
import numpy as np

metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

from torch import nn, tensor
from transformers import Trainer

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # adapted from https://huggingface.co/docs/transformers/main/main_classes/trainer
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss (class 0 has 95% of the samples and class 1 has 5% of the samples)
        weights = tensor([1/0.95, 1/0.05], device=model.device)
        loss_fct = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


In [31]:
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    log_level="info",
    evaluation_strategy="steps",
    eval_steps=10,
    logging_steps=10,
    # save_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    warmup_steps=2,
    # max_steps=10,
    learning_rate=2e-4,
    fp16=True,
    optim="paged_adamw_8bit",
    report_to="none",
)
model.config.use_cache = False
trainer = CustomTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_tokenized,
    eval_dataset=valid_tokenized.select(range(256)),
    compute_metrics=compute_metrics,
)

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices
You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
The model is loaded in 8-bit precision. To train this model you need to add additional modules inside the model such as adapters using `peft` library and freeze the model weights. Please check  the examples in https://github.com/huggingface/peft for more details.


In [32]:
trainer.train()
# trainer.model.save_pretrained(new_model)

***** Running training *****
  Num examples = 13,632
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 852
  Number of trainable parameters = 2,113,536


Step,Training Loss,Validation Loss,F1
10,133.0787,93.179977,0.0
20,148.0123,162.130783,0.104265
30,154.5587,365.400543,0.0
40,307.4859,247.945389,0.104265
50,122.1298,205.260468,0.104265
60,171.9983,61.229492,0.0
70,96.1027,60.192383,0.0
80,65.8461,279.961365,0.0


***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
