In [1]:
labels = ['RESULTS', 'METHODS', 'CONCLUSIONS', 'OBJECTIVE', 'BACKGROUND']


In [2]:
id2label = {}
label2id = {}
label_i2i = {}
label_i2label = {}

for i, label in enumerate(labels):
  id2label[i] = label
  label2id[label] = i
  label_i2i[f"LABEL_{i}"] = i
  label_i2label[f"LABEL_{i}"] = label


In [3]:
import pandas as pd

df_train = pd.read_csv("Dataset/train.csv")
df_test = pd.read_csv("Dataset/test.csv")
df_val = pd.read_csv("Dataset/dev.csv")


In [4]:
df_train['Context'] = df_train[['Abstract Name','Text']].groupby(['Abstract Name'])['Text'].transform(lambda x: ' '.join(x))
df_test['Context'] = df_test[['Abstract Name','Text']].groupby(['Abstract Name'])['Text'].transform(lambda x: ' '.join(x))
df_val['Context'] = df_val[['Abstract Name','Text']].groupby(['Abstract Name'])['Text'].transform(lambda x: ' '.join(x))


In [5]:
df_train['Input_Text'] = df_train[['Text','Context']].apply(lambda x: ' [SEP] '.join(x), axis=1)
df_test['Input_Text'] = df_test[['Text','Context']].apply(lambda x: ' [SEP] '.join(x), axis=1)
df_val['Input_Text'] = df_val[['Text','Context']].apply(lambda x: ' [SEP] '.join(x), axis=1)


In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(labels))


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from datasets import Dataset

def prep(df):
  text = [f"{text}" for text in df['Input_Text'].to_list()]
  inputs = [tokenizer(f"{text}", padding=True, truncation=True, return_tensors="pt").input_ids[0] for text in df['Text'].to_list()]
  y = [labels.index(y) for y in df['Label'].to_list()]
  return Dataset.from_dict({"input_ids": inputs, "text": text, "labels": y})


In [17]:
# Get X & y parts
ds_train = prep(df_test)
ds_test = prep(df_test)
ds_val = prep(df_val)


In [18]:
print(ds_train[0])


{'input_ids': [101, 2116, 26835, 2594, 6194, 1998, 7870, 2024, 1996, 2765, 1997, 2019, 9413, 20793, 3560, 13791, 1997, 1996, 13711, 16690, 1998, 1037, 2193, 1997, 25456, 1997, 13711, 2031, 2947, 2042, 8920, 2005, 3424, 1011, 20187, 4506, 1012, 102], 'text': 'Many pathogenic processes and diseases are the result of an erroneous activation of the complement cascade and a number of inhibitors of complement have thus been examined for anti-inflammatory actions . [SEP] Many pathogenic processes and diseases are the result of an erroneous activation of the complement cascade and a number of inhibitors of complement have thus been examined for anti-inflammatory actions . It was recently demonstrated that supraphysiological concentrations of the endogenous complement inhibitor MAp44 ( also denoted MAP1 ) protect against myocardial reperfusion injury . In the present study , we examined the association between outcome after acute myocardial infarction ( MI ) and the plasma levels of MAp44 and i

In [19]:
import evaluate
accuracy = evaluate.load("accuracy")


In [20]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


In [21]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


In [23]:
from transformers import TrainingArguments, Trainer

# Eval during training disabled due to high memory
training_args = TrainingArguments(
    output_dir="distilbert-multi-sent-classify",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    #evaluation_strategy="epoch",
    save_strategy="epoch",
    #load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    #eval_dataset=ds_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [24]:
trainer.train()


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.6379
1000,0.4786
1500,0.428
2000,0.4042
2500,0.3411
3000,0.3542
3500,0.3464


TrainOutput(global_step=3660, training_loss=0.4224070418727854, metrics={'train_runtime': 297.2368, 'train_samples_per_second': 196.934, 'train_steps_per_second': 12.313, 'total_flos': 1275041051517000.0, 'train_loss': 0.4224070418727854, 'epoch': 2.0})

In [25]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

with torch.no_grad():
  text = ds_test[0]['text']
  print(text)
  label = ds_test[0]['labels']
  print(label)
  inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
  print(inputs)
  logits = model(**inputs).logits
  predicted_class_id = logits.argmax().item()
  print(predicted_class_id)
  #pred_class = model.config.id2label[predicted_class_id]
  #print(pred_class)
  pred_class = id2label[predicted_class_id]
  print(pred_class)


Many pathogenic processes and diseases are the result of an erroneous activation of the complement cascade and a number of inhibitors of complement have thus been examined for anti-inflammatory actions . [SEP] Many pathogenic processes and diseases are the result of an erroneous activation of the complement cascade and a number of inhibitors of complement have thus been examined for anti-inflammatory actions . It was recently demonstrated that supraphysiological concentrations of the endogenous complement inhibitor MAp44 ( also denoted MAP1 ) protect against myocardial reperfusion injury . In the present study , we examined the association between outcome after acute myocardial infarction ( MI ) and the plasma levels of MAp44 and its related proteins MASP-1 and MASP-3 in patients with first-time MI . In addition , we compared plasma levels of MAp44 , MASP-1 , and MASP-3 in MI patients to levels in a healthy control group . A total of 192 MI patients and 140 control persons were include

In [26]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

y_actual = []
y_pred = []

with torch.no_grad():
  for i in range(len(ds_test)):
    text = ds_test[i]['text']
    label_id = ds_test[i]['labels']
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
    logits = model(**inputs).logits
    predicted_class_id = logits.argmax().item()

    y_actual.append(id2label[label_id])
    y_pred.append(id2label[predicted_class_id])


In [27]:
from sklearn.metrics import accuracy_score
accuracy_score(y_actual, y_pred)


0.4776547765477655

In [28]:
from sklearn.metrics import classification_report
print(classification_report(y_actual, y_pred))


              precision    recall  f1-score   support

  BACKGROUND       0.31      0.21      0.25      2586
 CONCLUSIONS       0.38      0.06      0.11      4414
     METHODS       0.52      0.57      0.55      9629
   OBJECTIVE       0.76      0.20      0.31      2377
     RESULTS       0.46      0.70      0.56     10262

    accuracy                           0.48     29268
   macro avg       0.49      0.35      0.35     29268
weighted avg       0.48      0.48      0.44     29268



In [29]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_actual, y_pred))


[[ 554   97  565   61 1309]
 [ 327  272 1033   19 2763]
 [ 391   92 5483   30 3633]
 [ 221   30  876  466  784]
 [ 305  226 2487   39 7205]]
