In [1]:
task_prefix = "classify single sequence"
labels = ['RESULTS', 'METHODS', 'CONCLUSIONS', 'OBJECTIVE', 'BACKGROUND']


In [2]:
id2label = {}
label2id = {}
label_i2i = {}
label_i2label = {}

for i, label in enumerate(labels):
  id2label[i] = label
  label2id[label] = i
  label_i2i[f"LABEL_{i}"] = i
  label_i2label[f"LABEL_{i}"] = label


In [3]:
import pandas as pd

df_train = pd.read_csv("Dataset/train.csv")
df_test = pd.read_csv("Dataset/test.csv")
df_val = pd.read_csv("Dataset/dev.csv")


In [4]:
from transformers import T5Tokenizer, T5ForSequenceClassification

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForSequenceClassification.from_pretrained("t5-small", num_labels=len(labels))


  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at t5-small and are newly initialized: ['classification_head.dense.weight', 'classification_head.dense.bias', 'classification_head.out_proj.weight', 'classification_head.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [5]:
from datasets import Dataset

def prep(df):
  text = [f"{task_prefix}: {text}" for text in df['Text'].to_list()]
  inputs = [tokenizer(f"{task_prefix}: {text}", padding=True, truncation=True, return_tensors="pt").input_ids[0] for text in df['Text'].to_list()]
  y = [labels.index(y) for y in df['Label'].to_list()]
  return Dataset.from_dict({"input_ids": inputs, "text": text, "labels": y})


In [6]:
# Get X & y parts
ds_train = prep(df_test)
ds_test = prep(df_test)
ds_val = prep(df_val)


In [7]:
print(ds_train[0])


{'input_ids': [853, 4921, 712, 5932, 10, 1404, 2071, 20853, 2842, 11, 6716, 33, 8, 741, 13, 46, 3, 49, 52, 782, 1162, 5817, 257, 13, 8, 10090, 1990, 6615, 11, 3, 9, 381, 13, 20197, 7, 13, 10090, 43, 2932, 118, 14650, 21, 1181, 18, 15329, 2874, 3, 5, 1], 'text': 'classify single sequence: Many pathogenic processes and diseases are the result of an erroneous activation of the complement cascade and a number of inhibitors of complement have thus been examined for anti-inflammatory actions .', 'labels': 4}


In [8]:
import evaluate
accuracy = evaluate.load("accuracy")


2023-12-01 00:49:11.757415: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-01 00:49:11.775698: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-01 00:49:11.775718: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-01 00:49:11.776233: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-01 00:49:11.779429: I tensorflow/core/platform/cpu_feature_guar

In [9]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


In [10]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


In [11]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="t5-small-single-sent-classify",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    #evaluation_strategy="epoch",
    save_strategy="epoch",
    #load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    #eval_dataset=ds_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [12]:
trainer.train()


Step,Training Loss
500,1.2133
1000,0.7207
1500,0.5944
2000,0.5947
2500,0.5399
3000,0.5514
3500,0.5385


TrainOutput(global_step=3660, training_loss=0.6719908187949593, metrics={'train_runtime': 447.234, 'train_samples_per_second': 130.884, 'train_steps_per_second': 8.184, 'total_flos': 1610493835362048.0, 'train_loss': 0.6719908187949593, 'epoch': 2.0})

In [13]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

with torch.no_grad():
  text = ds_test[0]['text']
  print(text)
  label = ds_test[0]['labels']
  print(label)
  inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
  print(inputs)
  logits = model(**inputs).logits
  predicted_class_id = logits.argmax().item()
  print(predicted_class_id)
  #pred_class = model.config.id2label[predicted_class_id]
  #print(pred_class)
  pred_class = id2label[predicted_class_id]
  print(pred_class)


classify single sequence: Many pathogenic processes and diseases are the result of an erroneous activation of the complement cascade and a number of inhibitors of complement have thus been examined for anti-inflammatory actions .
4
{'input_ids': tensor([[  853,  4921,   712,  5932,    10,  1404,  2071, 20853,  2842,    11,
          6716,    33,     8,   741,    13,    46,     3,    49,    52,   782,
          1162,  5817,   257,    13,     8, 10090,  1990,  6615,    11,     3,
             9,   381,    13, 20197,     7,    13, 10090,    43,  2932,   118,
         14650,    21,  1181,    18, 15329,  2874,     3,     5,     1]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]], device='cuda:0')}
4
BACKGROUND


In [14]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

y_actual = []
y_pred = []

with torch.no_grad():
  for i in range(len(ds_test)):
    text = ds_test[i]['text']
    label_id = ds_test[i]['labels']
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
    logits = model(**inputs).logits
    predicted_class_id = logits.argmax().item()

    y_actual.append(id2label[label_id])
    y_pred.append(id2label[predicted_class_id])


In [15]:
from sklearn.metrics import accuracy_score
accuracy_score(y_actual, y_pred)


0.8123206232062321

In [16]:
from sklearn.metrics import classification_report
print(classification_report(y_actual, y_pred))


              precision    recall  f1-score   support

  BACKGROUND       0.59      0.42      0.49      2586
 CONCLUSIONS       0.69      0.73      0.71      4414
     METHODS       0.87      0.92      0.90      9629
   OBJECTIVE       0.69      0.67      0.68      2377
     RESULTS       0.87      0.88      0.87     10262

    accuracy                           0.81     29268
   macro avg       0.74      0.72      0.73     29268
weighted avg       0.81      0.81      0.81     29268



In [17]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_actual, y_pred))


[[1074  730  197  529   56]
 [ 301 3235   96   34  748]
 [  79   46 8868  125  511]
 [ 337  260  167 1585   28]
 [  18  404  815   12 9013]]
