In [30]:
from datasets import load_dataset, DatasetDict, Dataset
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score,precision_score,recall_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    BertForSequenceClassification)

In [31]:
news = pd.read_csv("news_labels_en.csv") # load the dataset 


In [32]:
news['label'] = news['ASTOR'] # take the ASTOR column as the label

In [33]:
X_train,X_test,y_train,y_test = train_test_split(news['text_en'],news['label'],test_size = 0.2) # split the dataset into train and test

In [34]:
dataset = DatasetDict({'train':Dataset.from_dict({'text':X_train,'label': y_train}),'test': Dataset.from_dict({'text': X_test,'label': y_test})}) # create a dataset object

In [35]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 607
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 152
    })
})

In [36]:
model_checkpoint = 'bert-base-cased'

id2label = {0: "Decrasing",1: "Stable", 2: "Increasing"}
label2id = {"Decreasing":0,"Stable":1 ,"Increasing":2}

model = BertForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3) # load the model


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [38]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True) # load the tokenizer

In [39]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [40]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" # print the number of trainable parameters
    )

In [41]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=1,#attention heads
                        lora_alpha=1, # alpha scaling
                        lora_dropout=0.01)

model = get_peft_model(model, peft_config)
print_trainable_parameters(model) # print the number of trainable parameters

trainable params: 39171 || all params: 108351750 || trainable%: 0.03615170036478414


In [42]:
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True) # tokenize the dataset


Map:   0%|          | 0/607 [00:00<?, ? examples/s]

Map:   0%|          | 0/152 [00:00<?, ? examples/s]

In [43]:
tokenized_dataset # print the tokenized dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 607
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 152
    })
})

In [44]:
len(tokenized_dataset['train'][0]['text']) # print the length of the tokenized dataset

631

In [45]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # create a data collator


In [46]:
# hyperparameters
lr = 1e-3
batch_size = 2
num_epochs = 3

In [47]:
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification", # output directory
    learning_rate=lr,
    per_device_train_batch_size=batch_size, # batch size per device during training
    per_device_eval_batch_size=batch_size, # batch size for evaluation
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [48]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1) # get the predicted labels

    return {"accuracy": accuracy_score(labels, predictions)} # compute the accuracy

In [49]:
print("Untrained Bert-base model predictions:")
print("----------------------------")
predicted = []
for text in dataset["test"]['text']:
    # tokenize text

    inputs = tokenizer.encode(text, max_length = 512,return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    predicted.append(predictions.tolist())

accuracy_score(dataset['test']['label'],predicted) # compute the accuracy

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Untrained Bert-base model predictions:
----------------------------


0.29605263157894735

In [50]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train() # train the model

  0%|          | 0/912 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/76 [00:00<?, ?it/s]

{'eval_loss': 1.1071287393569946, 'eval_accuracy': 0.3815789473684211, 'eval_runtime': 53.0392, 'eval_samples_per_second': 2.866, 'eval_steps_per_second': 1.433, 'epoch': 1.0}
{'loss': 1.214, 'learning_rate': 0.00045175438596491233, 'epoch': 1.64}


  0%|          | 0/76 [00:00<?, ?it/s]

{'eval_loss': 1.151179313659668, 'eval_accuracy': 0.3355263157894737, 'eval_runtime': 52.4296, 'eval_samples_per_second': 2.899, 'eval_steps_per_second': 1.45, 'epoch': 2.0}


  0%|          | 0/76 [00:00<?, ?it/s]

{'eval_loss': 1.1052850484848022, 'eval_accuracy': 0.34868421052631576, 'eval_runtime': 46.3662, 'eval_samples_per_second': 3.278, 'eval_steps_per_second': 1.639, 'epoch': 3.0}
{'train_runtime': 1726.5121, 'train_samples_per_second': 1.055, 'train_steps_per_second': 0.528, 'train_loss': 1.165413538614909, 'epoch': 3.0}


TrainOutput(global_step=912, training_loss=1.165413538614909, metrics={'train_runtime': 1726.5121, 'train_samples_per_second': 1.055, 'train_steps_per_second': 0.528, 'train_loss': 1.165413538614909, 'epoch': 3.0})