In [1]:
# import holidays
# import pandas as pd
# from tqdm.auto import tqdm

# from src.data import german_regions
# from src.features.aggregation import treatment_unaggregated
# from src.paths import processed_data

# path = processed_data / "propensity_scores/nlp/transformer"
# path.mkdir(exist_ok=True, parents=True)


# def get_text_for_dates(start, end, df, region):
#     region_code = [a.code for a in german_regions if a.name == region][0]
#     df = df[df["region"] == region]
#     holi = holidays.Germany(years=range(2018, 2023), subdiv=region_code)
#     items = []
#     for date in pd.date_range(start, end):
#         text = ""
#         text += f"{region}, "
#         protests = df[df["date"] == date]
#         date_info = (
#             f"{date.strftime('%A')}, {date.day}. {date.month_name()} {date.year}"
#         )
#         if date in holi:
#             date_info += f", {holi[date]}"
#         text += date_info + "\n"
#         text += "has protests: " + ("yes" if len(protests) > 0 else "no") + "\n"
#         if len(protests) > 0:
#             text += "number of protests: " + str(len(protests)) + "\n"
#             for _, protest in protests.iterrows():
#                 text += f"{protest['actor']}: {protest['notes']}\n"
#         text += "\n"
#         items.append(text)
#     return items


# df = treatment_unaggregated("acled")
# df = df[df.country == "Germany"]
# protest_group = None
# if protest_group is not None:
#     df = df[(df.protest == protest_group)]
# X = []
# y = []
# for region in list(df.region.unique())[:1]:
#     for date in tqdm(pd.date_range("2020-02-01", "2022-12-31")):
#         start = date - pd.Timedelta(days=30)
#         items = get_text_for_dates(start, date, df, region)
#         i = items[-1].index("has protests: ") + len("has protests: ")
#         x = "\n".join(items[:-1]) + items[-1][:i]
#         X.append(x)
#         y.append(items[-1][i : i + 3].strip())
# df = pd.DataFrame({"text": X, "label": y})
# train_df = df.sample(frac=0.8, random_state=42)
# test_df = df.drop(train_df.index)
# # train_df.to_json(path / "train.jsonl", orient="records", lines=True)
# # test_df.to_json(path / "test.jsonl", orient="records", lines=True)

In [2]:
# %pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 scikit-learn scipy xformers evaluate

In [3]:
import os

import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    logging,
    pipeline,
    AutoModelForSequenceClassification
)
from trl import SFTTrainer

In [4]:
import torch
import gc

# Collect garbage first: tensors kept alive only by reference cycles are
# destroyed here and their blocks go back to PyTorch's caching allocator.
# Only afterwards can empty_cache() release those cached blocks.
gc.collect()
torch.cuda.empty_cache()

def print_gpu_memory(device=0):
    """Print a summary of PyTorch's CUDA memory usage for one GPU.

    Args:
        device: Index of the CUDA device to inspect. Defaults to 0.

    Prints the device's total memory, the memory reserved by PyTorch, the
    memory currently allocated, and the difference between the last two
    (reserved but not allocated). If CUDA is unavailable, a short notice is
    printed instead of raising.
    """
    if not torch.cuda.is_available():
        print("CUDA is not available; no GPU memory to report.")
        return

    total = torch.cuda.get_device_properties(device).total_memory
    reserved = torch.cuda.memory_reserved(device)
    allocated = torch.cuda.memory_allocated(device)
    free_in_reserved = reserved - allocated  # reserved by PyTorch but not in use

    print(f"Total GPU Memory: {total:,}")
    print(f"Reserved GPU Memory: {reserved:,}")
    print(f"Allocated GPU Memory: {allocated:,}")
    print(f"Free Inside Reserved: {free_in_reserved:,}")

print_gpu_memory()

Total GPU Memory: 25,393,692,672
Reserved GPU Memory: 0
Allocated GPU Memory: 0
Free Inside Reserved: 0


In [5]:
# Hugging Face Hub id of the base checkpoint; passed to both the tokenizer and
# the model `from_pretrained` calls.
model_name = "meta-llama/Llama-2-7b-chat-hf"
# Target passed to `save_pretrained` once training has finished.
new_model = "llama-2-7b-custom"
# Token length every example is padded or truncated to during tokenization.
max_seq_length = 1024

In [6]:
# Directory holding the JSON-lines splits; the trailing slash lets file names
# be appended directly.
path = "../data/processed/propensity_scores/nlp/transformer/"
# Each file is loaded on its own and read back through split="train".
# Note that the validation set is read from test.jsonl.
train_dataset = load_dataset("json", split="train", data_files=f"{path}train.jsonl")
valid_dataset = load_dataset("json", split="train", data_files=f"{path}test.jsonl")

In [7]:
from transformers import AutoTokenizer

# `trust_remote_code` is deliberately not passed: the Llama tokenizer is
# implemented inside transformers, so no Hub-hosted code needs to be executed.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token
# Pad and truncate on the left so the end of each prompt is always kept.
# NOTE(review): presumably the tail holds the day being predicted — confirm.
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"

# Token counts of the first ten training prompts. Slice the rows first so only
# ten examples are read, not the whole "X" column.
[len(tokenizer.tokenize(prompt)) for prompt in train_dataset[:10]["X"]]

[1507, 947, 914, 756, 1689, 952, 816, 868, 1299, 1412]

In [8]:
def mapper(examples):
    """Turn a batch with raw ``X``/``y`` columns into ``text``/``label`` columns.

    ``X`` is passed through unchanged as ``text``. Each entry of ``y`` becomes
    the integer 1 when it equals the string "yes" and 0 otherwise.
    """
    labels = []
    for answer in examples["y"]:
        labels.append(int(answer == "yes"))
    return {"text": examples["X"], "label": labels}

# Replace the raw "X"/"y" columns of both splits with the "text"/"label"
# columns produced by `mapper` (training split first, then validation).
train_dataset, valid_dataset = (
    split.map(mapper, batched=True, remove_columns=["X", "y"])
    for split in (train_dataset, valid_dataset)
)

The following is copied in large part from https://github.com/mshumer/gpt-llm-trainer/blob/main/One_Prompt___Fine_Tuned_LLaMA_2.ipynb

In [9]:
# 4-bit NF4 quantisation with float16 compute; double quantisation stays off.
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)
# Load the checkpoint with a sequence-classification head, mapped to device 0.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map={"": 0},
    quantization_config=bnb_config,
)
# Config overrides applied after loading; the two assignments are independent.
model.config.pretraining_tp = 1
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# from sklearn.metrics import f1_score
# from tqdm.auto import tqdm

# def classify(prompt):
#     pipe = pipeline(task="text-classification", model=model, tokenizer=tokenizer)
#     return pipe(prompt)[0]["label"] == "LABEL_1"

# n = 100
# y_pred = [classify(x) for x in tqdm(train_dataset["text"][:n])]
# f1_score(train_dataset["label"][:n], y_pred)

In [11]:
# train_dataset = train_dataset.shuffle().select(range(100))

In [12]:
def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch to ``max_seq_length`` tokens.

    Padding to the maximum length and truncation are both enabled, so every
    example in the batch is encoded with the same length budget.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=max_seq_length,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Tokenize both splits in batches (training split first, then validation).
train_tokenized, valid_tokenized = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, valid_dataset)
)

Map:   0%|          | 0/3408 [00:00<?, ? examples/s]

In [13]:
# Lengths of the first ten entries of the "text" column (the raw text, not
# "input_ids"). Slice the rows first so only ten examples are read.
[len(text) for text in train_tokenized[:10]["text"]]

[4194, 2415, 1944, 1663, 4832, 2369, 1744, 1937, 3306, 3603]

In [14]:
# [len(a) for a in train_tokenized["input_ids"][:10]]

In [15]:
import evaluate
import numpy as np

# F1 metric from the `evaluate` library, loaded once and reused on every call.
metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute F1 from a ``(logits, labels)`` pair.

    The predicted class of each example is the index of its largest logit
    along the last axis.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    return metric.compute(references=labels, predictions=predicted_classes)

# gelectra config:

# training_args = TrainingArguments(
#     output_dir=f"./results/",
#     evaluation_strategy="epoch",
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
#     lr_scheduler_type="linear",
#     warmup_ratio=0.1,
#     learning_rate=5e-6,
#     weight_decay=0.2,
#     num_train_epochs=6,
# )

In [16]:
# LoRA adapter settings for the sequence-classification task.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="SEQ_CLS",
)
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,  # 1 example x 16 accumulation steps per optimizer step
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=5,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    # NOTE(review): presumably has no effect with the plain "constant" schedule
    # below; "constant_with_warmup" would be needed for a warmup phase.
    # Confirm the intended schedule.
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="all",
    evaluation_strategy="steps",
    eval_steps=5,  # Evaluate every 5 optimizer steps (5 x 16 = 80 training examples)
)
# Wrap the base model only once so that re-running this cell does not nest a
# PeftModel inside another PeftModel. To apply a changed `peft_config`, reload
# the base model first.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, peft_config)
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_tokenized,
    eval_dataset=valid_tokenized,
    compute_metrics=compute_metrics,
)

In [17]:
# Run the fine-tuning loop configured on `trainer`.
trainer.train()
# Save the trained model under the name stored in `new_model`. This line is
# only reached if training finishes without being interrupted.
trainer.model.save_pretrained(new_model)

Step,Training Loss,Validation Loss,F1
5,0.8277,0.269043,0.0


KeyboardInterrupt: 

In [None]:
import gc
# Collect garbage first so that memory held by unreachable tensors goes back
# to PyTorch's caching allocator; empty_cache() can then release it.
gc.collect()
torch.cuda.empty_cache()

2533