In [3]:
import holidays
import pandas as pd
from tqdm.auto import tqdm

from src.data import german_regions
from src.features.aggregation import treatment_unaggregated
from src.paths import processed_data

path = processed_data / "propensity_scores/nlp/transformer"
path.mkdir(exist_ok=True, parents=True)


def get_text_for_dates(start, end, df, region):
    region_code = [a.code for a in german_regions if a.name == region][0]
    df = df[df["region"] == region]
    holi = holidays.Germany(years=range(2018, 2023), subdiv=region_code)
    items = []
    for date in pd.date_range(start, end):
        text = ""
        text += f"{region}, "
        protests = df[df["date"] == date]
        date_info = (
            f"{date.strftime('%A')}, {date.day}. {date.month_name()} {date.year}"
        )
        if date in holi:
            date_info += f", {holi[date]}"
        text += date_info + "\n"
        text += "has protests: " + ("yes" if len(protests) > 0 else "no") + "\n"
        if len(protests) > 0:
            text += "number of protests: " + str(len(protests)) + "\n"
            for _, protest in protests.iterrows():
                text += f"{protest['actor']}: {protest['notes']}\n"
        text += "\n"
        items.append(text)
    return items


df = treatment_unaggregated("acled")
df = df[df.country == "Germany"]
protest_group = None
if protest_group is not None:
    df = df[(df.protest == protest_group)]
X = []
y = []
for region in list(df.region.unique())[:1]:
    for date in tqdm(pd.date_range("2020-02-01", "2022-12-31")):
        start = date - pd.Timedelta(days=30)
        items = get_text_for_dates(start, date, df, region)
        i = items[-1].index("has protests: ") + len("has protests: ")
        x = "\n".join(items[:-1]) + items[-1][:i]
        X.append(x)
        y.append(items[-1][i : i + 3].strip())
df = pd.DataFrame({"text": X, "label": y})
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)
# train_df.to_json(path / "train.jsonl", orient="records", lines=True)
# test_df.to_json(path / "test.jsonl", orient="records", lines=True)

  0%|          | 0/1065 [00:00<?, ?it/s]

In [55]:
from datasets import load_dataset
path = "../data/processed/propensity_scores/nlp/transformer/"
train_dataset = load_dataset("json", data_files=path + "train.jsonl", split="train")
valid_dataset = load_dataset("json", data_files=path+"test.jsonl", split="train")

In [56]:
from transformers import AutoTokenizer

model_name = "NousResearch/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
[len(tokenizer.tokenize(a)) for a in train_dataset["X"][:10]]

[1507, 947, 914, 756, 1689, 952, 816, 868, 1299, 1412]

In [57]:
def mapper(examples):
    return {'text': examples["X"], "label": [int(y == "yes") for y in examples["y"]]}

train_dataset = train_dataset.map(mapper, batched=True)
valid_dataset = valid_dataset.map(mapper, batched=True)

Map:   0%|          | 0/13632 [00:00<?, ? examples/s]

Map:   0%|          | 0/3408 [00:00<?, ? examples/s]

The following is copied in large part from https://github.com/mshumer/gpt-llm-trainer/blob/main/One_Prompt___Fine_Tuned_LLaMA_2.ipynb

In [None]:
%pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [58]:
import os

import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    logging,
    pipeline,
    AutoModelForSequenceClassification
)
from trl import SFTTrainer

In [59]:
model_name = "NousResearch/Llama-2-7b-hf"
dataset_name = "/content/train.jsonl"
new_model = "llama-2-7b-custom"
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False
output_dir = "./results"
num_train_epochs = 1
fp16 = False
bf16 = False
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "constant"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 25
logging_steps = 5
max_seq_length = None
packing = False
device_map = {"": 0}

In [60]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, quantization_config=bnb_config, device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at NousResearch/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [68]:
from sklearn.metrics import f1_score

def classify(prompt):
    pipe = pipeline(task="text-classification", model=model, tokenizer=tokenizer)
    return pipe(prompt)[0]["label"] == "LABEL_1"

n = 50
y_pred = [classify(x) for x in tqdm(train_dataset["text"][:n])]
f1_score(train_dataset["label"][:n], y_pred)

  0%|          | 0/50 [00:00<?, ?it/s]

0.0392156862745098

In [72]:
n = 100

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="SEQ_CLS",
)
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="all",
    evaluation_strategy="steps",
    eval_steps=5,  # Evaluate every 20 steps
)
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset[:n],
    eval_dataset=valid_dataset[:n],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)
trainer.train()
trainer.model.save_pretrained(new_model)



AttributeError: 'dict' object has no attribute 'map'