In [36]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, DistilBertTokenizer
from datasets import Dataset
from sklearn.metrics import accuracy_score

import torch
import pandas as pd

In [2]:
num_labels = 3
model_ckp = 'distilbert-base-uncased'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [3]:
# Load the data
data = Dataset.from_csv('./synthetic_data.csv')
cats = {cat: i for i, cat in enumerate(set(data['category']))}
labels = [cats[i] for i in data['category']]
data = data.add_column(name='labels', column=labels)
data = data.shuffle()
data = data.train_test_split(test_size=0.1)
data

DatasetDict({
    train: Dataset({
        features: ['category', 'entry', 'has_typo', 'labels'],
        num_rows: 251
    })
    test: Dataset({
        features: ['category', 'entry', 'has_typo', 'labels'],
        num_rows: 28
    })
})

In [4]:
tokenizer = DistilBertTokenizer.from_pretrained(model_ckp)
model = (AutoModelForSequenceClassification.from_pretrained(model_ckp, num_labels=num_labels)).to(device)

def tokenize(batch, tokenizer):
    return tokenizer(batch["entry"], padding=True, truncation=True)

encoded = data.map(tokenize, batched=True, batch_size=None, fn_kwargs={'tokenizer': tokenizer})

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/251 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

In [5]:
batch_size = 16 
logging_steps = len(encoded['train']) // batch_size
model_name = f'{model_ckp}-finetuned-banking'
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=10,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy='epoch',
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=False,
                                  log_level='error')

In [6]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc}

In [7]:
trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=encoded['train'],
                  eval_dataset=encoded['test'],
                  tokenizer=tokenizer)
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.957,0.609435,1.0
2,0.4626,0.194522,1.0
3,0.1774,0.072854,1.0
4,0.0723,0.036844,1.0
5,0.0423,0.0248,1.0
6,0.0289,0.019376,1.0
7,0.0247,0.016506,1.0
8,0.0206,0.01491,1.0
9,0.0198,0.014033,1.0
10,0.0176,0.013753,1.0


TrainOutput(global_step=160, training_loss=0.17200825605541467, metrics={'train_runtime': 10.4912, 'train_samples_per_second': 239.248, 'train_steps_per_second': 15.251, 'total_flos': 12988246100400.0, 'train_loss': 0.17200825605541467, 'epoch': 10.0})

In [23]:
x = encoded['test']['entry'][0]
y_hat = model(**tokenizer(x, return_tensors='pt').to(device)).logits.argmax(-1)

In [14]:
print(x)
print(y_hat.detach().cpu().numpy()[0])

No-Contrast Abdomen-Pelvis CT Imaging
2


In [15]:
cats

{'MR Abdomen Adrenal Without then with Contrast': 0,
 'CT Head Angiography with Contrast': 1,
 'CT Abdomen Pelvis without Contrast': 2}

In [29]:
def predict(batch, model, tokenizer, cats):
    x = tokenizer(batch['entry'], return_tensors='pt', padding=True, truncation=True)
    y_hat = model(**x.to(device)).logits.argmax(-1)
    cats_inv = {v: k for k, v in cats.items()}
    y_hat = {'prediction': cats_inv[y_hat.detach().cpu().numpy()[0]]}
    return y_hat

In [31]:
preds = data.map(predict, batched=False, batch_size=None, fn_kwargs={'model': model, 'tokenizer': tokenizer, 'cats': cats})

Map:   0%|          | 0/251 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

In [38]:
pd.DataFrame(preds['test'])

Unnamed: 0,category,entry,has_typo,labels,prediction
0,CT Abdomen Pelvis without Contrast,No-Contrast Abdomen-Pelvis CT Imaging,0,2,CT Abdomen Pelvis without Contrast
1,MR Abdomen Adrenal Without then with Contrast,"MR Adrenal Abdomen: Contrast Progression, None...",0,0,MR Abdomen Adrenal Without then with Contrast
2,MR Abdomen Adrenal Without then with Contrast,Abdomen Adrenal MR Imaging: Transition from No...,0,0,MR Abdomen Adrenal Without then with Contrast
3,MR Abdomen Adrenal Without then with Contrast,"Adrenal Abdomen MR: No Contrast, Followed by W...",0,0,MR Abdomen Adrenal Without then with Contrast
4,MR Abdomen Adrenal Without then with Contrast,Adrenal Abdomn in MR: Contrast Phased Imgaging...,1,0,MR Abdomen Adrenal Without then with Contrast
5,CT Abdomen Pelvis without Contrast,Computed Tomography of Abdomen & Pelvis withou...,0,2,CT Abdomen Pelvis without Contrast
6,MR Abdomen Adrenal Without then with Contrast,"MR Adrenal in the Abdomn: Without Contrast, Fo...",1,0,MR Abdomen Adrenal Without then with Contrast
7,CT Head Angiography with Contrast,Head Angiography and CT: Use of Contrast,0,1,CT Head Angiography with Contrast
8,CT Abdomen Pelvis without Contrast,CT Abdomen & Pelvis - No Contras,1,2,CT Abdomen Pelvis without Contrast
9,CT Abdomen Pelvis without Contrast,CT Abdomen & Pelvis - No Contrast,0,2,CT Abdomen Pelvis without Contrast
