In [8]:
!pip install transformers datasets evaluate accelerate peft bitsandbytes



In [9]:
  import transformers, peft, datasets
  from datasets import load_dataset
  from transformers import GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, AutoModelForSequenceClassification, BitsAndBytesConfig
  from peft import LoraConfig, get_peft_model
  import pandas as pd

In [10]:
dataset = load_dataset("mteb/tweet_sentiment_extraction")

In [16]:
df = pd.DataFrame(dataset["train"])
df

Unnamed: 0,id,text,label,label_text
0,cb774db0d1,"I`d have responded, if I were going",1,neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,0,negative
2,088c60f138,my boss is bullying me...,0,negative
3,9642c003ef,what interview! leave me alone,0,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",0,negative
...,...,...,...,...
26727,4eac33d1c0,wish we could come see u on Denver husband l...,0,negative
26728,4f4c4fc327,I`ve wondered about rake to. The client has ...,0,negative
26729,f67aae2310,Yay good for both of you. Enjoy the break - y...,2,positive
26730,ed167662a5,But it was worth it ****.,2,positive


In [18]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def token_fun(example):
  return tokenizer(example["text"], padding="max_length", truncation=True, max_length = 128)

tokenized_dataset = dataset.map(token_fun, batched=True)

Map:   0%|          | 0/26732 [00:00<?, ? examples/s]

Map:   0%|          | 0/3432 [00:00<?, ? examples/s]

In [23]:
# tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [24]:
small_train_dataset = tokenized_dataset["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_dataset["test"].shuffle(seed=42).select(range(500))

In [26]:
model_name = "facebook/opt-1.3b"
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                           quantization_config=bnb_config,
                                                           num_labels=3,                                                           device_map="auto")

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-1.3b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
from peft import TaskType

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj","v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS,
    modules_to_save=["score"]
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,579,008 || all params: 1,317,343,232 || trainable%: 0.1199


In [33]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [46]:
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./results/logs",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=2e-5,
    logging_steps=10,
    save_strategy="steps",
    eval_strategy="steps",
    num_train_epochs=2,
    save_steps=50,
    eval_steps=50,
    report_to="none",
    fp16=False,
    gradient_accumulation_steps=8
)

In [47]:
import evaluate
import numpy as np

metric=evaluate.load("accuracy")

def compute_metrics(eval_pred):
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)


In [48]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()



Step,Training Loss,Validation Loss,Accuracy
50,0.0,,0.27
100,0.0,,0.27


