In [1]:
from datasets import load_dataset, DatasetDict, Dataset
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score,precision_score,recall_score, f1_score
from sklearn.model_selection import train_test_split
import evaluate
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

In [2]:
news = pd.read_csv("news_labels_en.csv") # news dataset 


In [3]:
news['label'] = news['ASTOR'] # take stock price label as label

In [4]:
X_train,X_test,y_train,y_test = train_test_split(news['text_en'],news['label'],test_size = 0.2) # split train and test set

In [5]:
dataset = DatasetDict({'train':Dataset.from_dict({'text':X_train,'label': y_train}),'test': Dataset.from_dict({'text': X_test,'label': y_test})}) # create dataset

In [6]:
dataset # show dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 607
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 152
    })
})

In [7]:
model_checkpoint = 'distilbert-base-uncased' # model checkpoint

id2label = {0: "Decrasing",1: "Stable", 2: "Increasing"} # label mapping 
label2id = {"Decreasing":0,"Stable":1 ,"Increasing":2} # label mapping

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=3, id2label=id2label, label2id=label2id)# load model


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True) # load tokenizer

In [10]:
if tokenizer.pad_token is None: 
    tokenizer.add_special_tokens({'pad_token': '[PAD]'}) # add pad token 
    model.resize_token_embeddings(len(tokenizer)) # resize token embedding

In [11]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters(): # pylint: disable=protected-access
        all_param += param.numel() # count all parameters
        if param.requires_grad: # count trainable parameters
            trainable_params += param.numel() # count trainable parameters
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [13]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=32,#attention heads
                        lora_alpha=4, # alpha scaling
                        lora_dropout=0.01,
                        target_modules = ['q_lin']) # target module 

model = get_peft_model(model, peft_config)
print_trainable_parameters(model)

trainable params: 887811 || all params: 67843590 || trainable%: 1.3086144173679488


In [14]:
def tokenize_function(examples):
    # extract text
    text = examples["text"] # extract text 

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/607 [00:00<?, ? examples/s]

Map:   0%|          | 0/152 [00:00<?, ? examples/s]

In [15]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 607
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 152
    })
})

In [16]:
len(tokenized_dataset['train'][0]['text'])

777

In [17]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # data collator


In [18]:
# hyperparameters
lr = 1e-3
batch_size = 2
num_epochs = 10

In [19]:
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [20]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy_score(labels, predictions)}

In [29]:
print("Untrained Distilbert-base model predictions:")
print("----------------------------")
predicted = []
for text in dataset["test"]['text']:
    # tokenize text

    inputs = tokenizer.encode(text, max_length = 512,return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    predicted.append(predictions.tolist())

accuracy_score(dataset['test']['label'],predicted)

Untrained Distilbert-base model predictions:
----------------------------


0.2894736842105263

In [20]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/3040 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/76 [00:00<?, ?it/s]

{'eval_loss': 1.1238776445388794, 'eval_accuracy': 0.42105263157894735, 'eval_runtime': 24.1828, 'eval_samples_per_second': 6.285, 'eval_steps_per_second': 3.143, 'epoch': 1.0}
{'loss': 1.1402, 'learning_rate': 0.0008355263157894737, 'epoch': 1.64}


  0%|          | 0/76 [00:00<?, ?it/s]

{'eval_loss': 1.1225227117538452, 'eval_accuracy': 0.3026315789473684, 'eval_runtime': 23.4736, 'eval_samples_per_second': 6.475, 'eval_steps_per_second': 3.238, 'epoch': 2.0}


  0%|          | 0/76 [00:00<?, ?it/s]

{'eval_loss': 1.2503454685211182, 'eval_accuracy': 0.3223684210526316, 'eval_runtime': 23.9971, 'eval_samples_per_second': 6.334, 'eval_steps_per_second': 3.167, 'epoch': 3.0}
{'loss': 1.0201, 'learning_rate': 0.0006710526315789473, 'epoch': 3.29}


  0%|          | 0/76 [00:00<?, ?it/s]

{'eval_loss': 1.4150484800338745, 'eval_accuracy': 0.32894736842105265, 'eval_runtime': 23.8287, 'eval_samples_per_second': 6.379, 'eval_steps_per_second': 3.189, 'epoch': 4.0}
{'loss': 0.7704, 'learning_rate': 0.0005065789473684211, 'epoch': 4.93}


  0%|          | 0/76 [00:00<?, ?it/s]

{'eval_loss': 1.8546684980392456, 'eval_accuracy': 0.34210526315789475, 'eval_runtime': 23.5357, 'eval_samples_per_second': 6.458, 'eval_steps_per_second': 3.229, 'epoch': 5.0}


  0%|          | 0/76 [00:00<?, ?it/s]

{'eval_loss': 2.310481071472168, 'eval_accuracy': 0.375, 'eval_runtime': 23.3351, 'eval_samples_per_second': 6.514, 'eval_steps_per_second': 3.257, 'epoch': 6.0}
{'loss': 0.4626, 'learning_rate': 0.00034210526315789477, 'epoch': 6.58}


  0%|          | 0/76 [00:00<?, ?it/s]

{'eval_loss': 2.591395616531372, 'eval_accuracy': 0.34868421052631576, 'eval_runtime': 23.5834, 'eval_samples_per_second': 6.445, 'eval_steps_per_second': 3.223, 'epoch': 7.0}


  0%|          | 0/76 [00:00<?, ?it/s]

{'eval_loss': 2.808746576309204, 'eval_accuracy': 0.3157894736842105, 'eval_runtime': 23.3492, 'eval_samples_per_second': 6.51, 'eval_steps_per_second': 3.255, 'epoch': 8.0}
{'loss': 0.2901, 'learning_rate': 0.00017763157894736843, 'epoch': 8.22}


  0%|          | 0/76 [00:00<?, ?it/s]

{'eval_loss': 3.0203635692596436, 'eval_accuracy': 0.3157894736842105, 'eval_runtime': 24.4754, 'eval_samples_per_second': 6.21, 'eval_steps_per_second': 3.105, 'epoch': 9.0}
{'loss': 0.1614, 'learning_rate': 1.3157894736842104e-05, 'epoch': 9.87}


  0%|          | 0/76 [00:00<?, ?it/s]

{'eval_loss': 3.136854648590088, 'eval_accuracy': 0.3092105263157895, 'eval_runtime': 26.129, 'eval_samples_per_second': 5.817, 'eval_steps_per_second': 2.909, 'epoch': 10.0}
{'train_runtime': 2184.4337, 'train_samples_per_second': 2.779, 'train_steps_per_second': 1.392, 'train_loss': 0.6350146127374549, 'epoch': 10.0}


TrainOutput(global_step=3040, training_loss=0.6350146127374549, metrics={'train_runtime': 2184.4337, 'train_samples_per_second': 2.779, 'train_steps_per_second': 1.392, 'train_loss': 0.6350146127374549, 'epoch': 10.0})

In [21]:
text_list = tokenized_dataset['test']['text'][0:5] # take first 5 text from test set

In [22]:
##test
print("Untrained model predictions:")
print("----------------------------")
for text in text_list: # print predictions of trained model
    # tokenize text

    inputs = tokenizer.encode(text, max_length = 512,return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Untrained model predictions:
----------------------------
Public Disclosure Platform (KAP) by the company in a statement, in August 31 Migros, 12 Migros jet, 1 Macrocenter and Cosmetic Merchandise format, including 3 mions, including a total of 47 new stores said.The statement, as of August 31, 2023, the total number of stores of the company was expressed 3 thousand 176. - Decrasing
Russian President Vladimir Putin, at a meeting with government officials in the capital Moscow, the country's gasoline and motorin exports on September 21, evaluated the export ban.Pointing out that the rise in fuel prices in Russia was due to an increase in oil prices, Putin said, "Oil prices are rising, companies want to make maximum profit by exporting. Everything is very understandable," he said.Putin, despite the ban on fuel exports, prices continue to increase in Russia, "measures have been taken, but retail prices are increasing. Consumer needs results. I want you to respond to the events faster," he