In [2]:
!pip install transformers[torch]
!pip install datasets
!pip install scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import os
import subprocess

# UCI-hosted zip archive of the drug review data, and the local file name it is stored under.
dataset_url = "https://archive.ics.uci.edu/static/public/462/drug+review+dataset+drugs+com.zip"
filename = "drug_review_dataset.zip"

# Download and extract only when the archive is not already present.
# wget is told to write to `filename` (-O) so this guard matches the file that
# actually lands on disk; without it the archive keeps its URL name and the
# guard never sees it, causing a fresh download on every run.
if not os.path.exists(filename):
  try:
    # Argument lists (no shell) avoid any shell interpretation of the URL;
    # check=True raises on a failed download or extraction.
    subprocess.run(["wget", "-O", filename, dataset_url], check=True)
    # -o overwrites previously extracted files instead of prompting.
    subprocess.run(["unzip", "-o", filename], check=True)
  except (subprocess.CalledProcessError, OSError):
    # Remove a partial or unusable archive so the next run retries cleanly.
    if os.path.exists(filename):
      os.remove(filename)
    raise




## Load Data

In [4]:
from datasets import load_dataset

# Map each split name to the tab-separated file it is read from.
data_files = {
    'train': 'drugsComTrain_raw.tsv',
    'test': 'drugsComTest_raw.tsv'
}

# The generic 'csv' loader is used with a tab delimiter because the files are TSV.
drug_dataset = load_dataset('csv', data_files=data_files, delimiter='\t')
drug_dataset



  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

## Preprocess Conditions

In [5]:
def null_conditions(batch):
  """Filter predicate: True when the row's 'condition' value is present.

  Despite the name, rows whose condition is None are the ones rejected
  (the function returns False for them).
  """
  condition = batch['condition']
  return not (condition is None)

# Keep only rows whose condition is present; the predicate is applied one row at a time.
drug_dataset = drug_dataset.filter(null_conditions, batched=False)
drug_dataset



DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 160398
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53471
    })
})

In [6]:
# Switch the dataset's output format to pandas so that slicing yields a DataFrame.
drug_dataset.set_format('pandas')
# `[:]` selects every row of the train split.
drug_df = drug_dataset['train'][:]
drug_df

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37
...,...,...,...,...,...,...,...
160393,191035,Campral,Alcohol Dependence,"""I wrote my first report in Mid-October of 201...",10.0,"May 31, 2015",125
160394,127085,Metoclopramide,Nausea/Vomiting,"""I was given this in IV before surgey. I immed...",1.0,"November 1, 2011",34
160395,187382,Orencia,Rheumatoid Arthritis,"""Limited improvement after 4 months, developed...",2.0,"March 15, 2014",35
160396,47128,Thyroid desiccated,Underactive Thyroid,"""I&#039;ve been on thyroid medication 49 years...",10.0,"September 19, 2015",79


In [7]:
# Number of distinct conditions in the train split, ignoring letter case.
drug_df['condition'].str.lower().nunique()

884

In [8]:
# Only keep the 10 most frequent conditions.
# Count rows per condition (non-null 'rating' values), sorted with the most frequent first.
condition_freqs = drug_df.groupby('condition').agg({'rating': 'count'}).reset_index().sort_values('rating', ascending=False)
# NOTE(review): the displayed list contains 'Bipolar Disorde', which looks truncated in
# the source data — verify before presenting these names to users.
conditions_to_pred = condition_freqs.head(10)['condition'].tolist()
conditions_to_pred

['Birth Control',
 'Depression',
 'Pain',
 'Anxiety',
 'Acne',
 'Bipolar Disorde',
 'Insomnia',
 'Weight Loss',
 'Obesity',
 'ADHD']

In [9]:
def is_condition_to_pred(batch):
  """Filter predicate: True when the row's condition is listed in `conditions_to_pred`."""
  row_condition = batch['condition']
  return row_condition in conditions_to_pred

# Restrict every split to rows whose condition is in the selected list.
drug_dataset = drug_dataset.filter(is_condition_to_pred)
drug_dataset



DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 73951
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 24772
    })
})

## Preprocess Reviews

In [10]:
# Undo the earlier set_format('pandas') so rows come back in the default format.
drug_dataset.reset_format()

In [11]:
import html

# Take the last review of the train DataFrame as a sample containing HTML entities.
temp = drug_df.iloc[-1]['review']
print(temp)

# html.unescape turns entities such as &#039; back into the characters they encode.
print(html.unescape(temp))

"I&#039;ve had chronic constipation all my adult life. Tried Linzess, it worked for a month then stopped. Doctor started me on Amitiza (24 mg) and it is a miracle. I&#039;ve been on for four months now. After a life of 7-10 days between bowel motions, I now go comfortably 3-4 times per week. No pain, very little gas. It&#039;s a miracle for me."
"I've had chronic constipation all my adult life. Tried Linzess, it worked for a month then stopped. Doctor started me on Amitiza (24 mg) and it is a miracle. I've been on for four months now. After a life of 7-10 days between bowel motions, I now go comfortably 3-4 times per week. No pain, very little gas. It's a miracle for me."


In [12]:
# Display the current splits, columns and row counts.
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 73951
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 24772
    })
})

In [13]:
def remove_html_codes(sample):
  """Decode HTML entities in a single row's review text.

  The row is modified in place (its 'review' entry is overwritten) and
  the same object is returned.
  """
  decoded_review = html.unescape(sample['review'])
  sample['review'] = decoded_review
  return sample

# Apply the entity decoding row by row to every split.
drug_dataset = drug_dataset.map(remove_html_codes, batched=False)
# Spot-check the last train review after decoding.
drug_dataset['train'][-1]['review']



'"This would be my second month on Junel. I\'ve been on Birth Control for about 10 years now. I changed due to spotting and increased mood swings with my previous birth control. Since the switch I have had shorter periods about 2-3 days, but I have gained major weight and increased appetite. I switched up my regular exercise routine and still have not managed to drop the extra 7 lbs ;("'

## Load Model

In [14]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Start from a DistilBERT checkpoint whose classification head has 2 outputs.
checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    # One output per condition to predict.
    num_labels = len(conditions_to_pred),
    # The checkpoint's 2-way head does not fit; let it be newly initialized
    # instead of failing on the shape mismatch.
    ignore_mismatched_sizes=True,
    )
# Tokenizer matching the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

model

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

## Train Model Using the Trainer API

In [15]:
# Train-test and eval splits
# Hold out 20% of the original train file as 'test' (fixed seed for a repeatable split).
full_dataset = drug_dataset['train'].train_test_split(test_size=0.2, seed=2023)
# The original test file is attached under the name 'eval'.
full_dataset['eval'] = drug_dataset['test']

full_dataset



DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 59160
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 14791
    })
    eval: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 24772
    })
})

In [16]:
# Set up input and target columns
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the selected condition names so each one maps to an integer class id.
label_encoder = LabelEncoder()
label_encoder.fit(conditions_to_pred)

def create_labels(batch):
  """Add a 'label' entry holding the encoded class ids of the batch's conditions.

  The whole 'condition' column of the batch is encoded at once with the
  module-level `label_encoder`; the batch is updated in place and returned.
  """
  encoded_conditions = label_encoder.transform(batch['condition'])
  batch['label'] = encoded_conditions
  return batch

# Add the integer 'label' column to every split, encoding whole batches at a time.
full_dataset = full_dataset.map(create_labels, batched=True)
full_dataset



DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'label'],
        num_rows: 59160
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'label'],
        num_rows: 14791
    })
    eval: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'label'],
        num_rows: 24772
    })
})

In [17]:
# Tokenize dataset
def tokenize_dataset(batch):
  """Tokenize a batch of reviews with truncation enabled and padding disabled."""
  reviews = batch['review']
  return tokenizer(reviews, truncation=True, padding=False)

# Tokenize every split in batches using two worker processes.
tokenized_dataset = full_dataset.map(tokenize_dataset, batched=True, num_proc=2)

# Drop every column except the label and the two tokenizer outputs.
unnecessary_cols = [col for col in list(tokenized_dataset['train'].features.keys()) if col not in ['label', 'input_ids', 'attention_mask']]
tokenized_dataset = tokenized_dataset.remove_columns(unnecessary_cols)
tokenized_dataset



DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 59160
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 14791
    })
    eval: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 24772
    })
})

In [18]:
from transformers import Trainer, TrainingArguments

# Hyperparameters for the Trainer run.
training_args = TrainingArguments(
    output_dir='./trainer',
    learning_rate=1e-5,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    # Evaluate and log on the same 1000-step cadence.
    evaluation_strategy='steps',
    eval_steps=1000,
    # Mixed-precision (fp16) training.
    fp16=True,
    logging_steps=1000,
    # Cosine learning-rate schedule with no warmup steps.
    lr_scheduler_type='cosine',
    warmup_steps=0,

)

In [19]:
# Custom metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from transformers import EvalPrediction
import numpy as np

def compute_metrics(eval_pred: EvalPrediction):
  """Compute plain and class-balanced accuracy from the model's raw outputs.

  The predicted class of each row is the argmax of its scores along axis 1.
  Returns a dict with the keys 'accuracy' and 'balanced_accuracy'.
  """
  true_ids = eval_pred.label_ids
  predicted_ids = np.argmax(eval_pred.predictions, axis=1)

  return {
      'accuracy': accuracy_score(true_ids, predicted_ids),
      'balanced_accuracy': balanced_accuracy_score(true_ids, predicted_ids),
  }

In [20]:
# Data Collator
from transformers.data.data_collator import DataCollatorWithPadding
# Collator that pads the examples of each batch using the tokenizer
# (the tokenization step itself is run with padding=False).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [21]:
# Trainer
# Trains on the 'train' split; evaluation during training runs on the 'eval' split
# and reports the metrics returned by compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['eval'],
    compute_metrics = compute_metrics,
    data_collator=data_collator,
)

In [22]:
%%time
# Run the Trainer's training loop; %%time reports how long the cell takes.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,Balanced Accuracy
1000,0.8499,0.426773,0.851889,0.777819
2000,0.4044,0.358447,0.872961,0.812171
3000,0.3313,0.329977,0.882569,0.819616
4000,0.307,0.319923,0.883619,0.826995
5000,0.2741,0.315029,0.887494,0.830304


CPU times: user 23min 57s, sys: 15.6 s, total: 24min 12s
Wall time: 26min 1s


TrainOutput(global_step=5547, training_loss=0.4169653587692038, metrics={'train_runtime': 1561.4764, 'train_samples_per_second': 113.662, 'train_steps_per_second': 3.552, 'total_flos': 9492125698440000.0, 'train_loss': 0.4169653587692038, 'epoch': 3.0})

In [23]:
# Score the trained model on the 'test' split (the 20% held out of the train file),
# which is not the split passed to the Trainer as eval_dataset.
trainer.evaluate(tokenized_dataset['test'])

{'eval_loss': 0.3089316189289093,
 'eval_accuracy': 0.8923669799202217,
 'eval_balanced_accuracy': 0.8345630959154917,
 'eval_runtime': 31.98,
 'eval_samples_per_second': 462.508,
 'eval_steps_per_second': 28.924,
 'epoch': 3.0}

## PyTorch Training

In [24]:
# Recreate model
# Drop the notebook's reference to the previously trained model before loading a fresh one.
# NOTE(review): `trainer` presumably still holds a reference to the old model, so this
# `del` alone may not release its memory — confirm if GPU memory is tight.
if model is not None:
  del model

# Same checkpoint and number of labels as the first load; the classification head is
# newly initialized again (see the warning printed below).
checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels = len(conditions_to_pred),
    ignore_mismatched_sizes=True
    )
model

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [25]:
# Tokenize dataset
def tokenize_dataset(batch):
  """Tokenize a batch of reviews with truncation enabled and padding disabled."""
  reviews = batch['review']
  return tokenizer(reviews, truncation=True, padding=False)

# Tokenize every split in batches using two worker processes.
tokenized_dataset = full_dataset.map(tokenize_dataset, batched=True, num_proc=2)

# Drop every column except the label and the two tokenizer outputs.
unnecessary_cols = [col for col in list(tokenized_dataset['train'].features.keys()) if col not in ['label', 'input_ids', 'attention_mask']]
tokenized_dataset = tokenized_dataset.remove_columns(unnecessary_cols)
tokenized_dataset

Map (num_proc=2):   0%|          | 0/59160 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/14791 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/24772 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 59160
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 14791
    })
    eval: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 24772
    })
})

In [26]:
# Data Loaders
from torch.utils.data import DataLoader

# Only the training loader shuffles; every loader builds its batches with the shared data_collator.
train_loader = DataLoader(tokenized_dataset['train'], batch_size=32, shuffle=True, collate_fn=data_collator)
test_loader = DataLoader(tokenized_dataset['test'], batch_size=16, shuffle=False, collate_fn=data_collator)
eval_loader = DataLoader(tokenized_dataset['eval'], batch_size=16, shuffle=False, collate_fn=data_collator)

In [27]:
# Optimizer and scheduler
import torch
from torch.optim import AdamW
from transformers import get_scheduler

# AdamW over all model parameters with a learning rate of 1e-5.
optimizer = AdamW(params=model.parameters(), lr=1e-5)
# Cosine schedule without warmup, spanning 3 passes over the training loader.
scheduler = get_scheduler(
    'cosine',
    optimizer,
    num_warmup_steps=0,
    # NOTE(review): the factor 3 is hard-coded here; it has to agree with the
    # number of epochs used by the training loop.
    num_training_steps=len(train_loader)*3,
    )

In [28]:
%%time
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from torch.cuda.amp import autocast, GradScaler

# Set up configuration
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('device =', device)

model.to(device)
model.train()

num_epochs = 3
num_steps = len(train_loader) * num_epochs
progress_bar = tqdm(range(num_steps))

# Initialize train metrics (running sums, reset after every evaluation)
train_loss = 0
train_num_samples = 0
train_steps = 0
eval_steps = 1000

# Train loop (mixed precision: autocast for the forward pass, GradScaler for the backward pass)
scaler = GradScaler()
for epoch in range(num_epochs):
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # `batch` is a dict, so len(batch) is its number of keys, not its number
        # of rows. The row count is the leading dimension of the labels tensor.
        batch_size = batch['labels'].size(0)

        # fp16
        with autocast():
            output = model(**batch)
            loss = output.loss

        scaler.scale(loss).backward()
        # Weight the (mean) batch loss by the batch size so the running average
        # is a true per-sample average even when the last batch is smaller.
        train_loss += loss.item() * batch_size
        train_num_samples += batch_size

        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        scheduler.step()

        train_steps += 1
        progress_bar.update(1)

        # eval loop, run every `eval_steps` optimizer steps
        if train_steps % eval_steps == 0:

            # Report and reset train metrics
            print(f'steps = {train_steps} train loss = {train_loss / train_num_samples}')
            train_loss = 0
            train_num_samples = 0

            # Initialize eval metrics
            eval_loss = 0
            eval_preds = []
            eval_trues = []
            eval_num_samples = 0

            model.eval()
            with torch.no_grad():
                for eval_batch in eval_loader:
                    eval_batch = {k: v.to(device) for k, v in eval_batch.items()}
                    # Same dict-vs-rows distinction as in the training loop.
                    eval_batch_size = eval_batch['labels'].size(0)
                    output = model(**eval_batch)
                    loss = output.loss

                    # Predicted class = index of the largest logit for each row.
                    y_pred = output.logits.argmax(dim=-1).cpu()
                    y_true = eval_batch['labels'].cpu()

                    # Store plain Python ints rather than 0-d tensors.
                    eval_preds.extend(y_pred.tolist())
                    eval_trues.extend(y_true.tolist())

                    eval_loss += loss.item() * eval_batch_size
                    eval_num_samples += eval_batch_size

            # Calculate eval metrics
            eval_loss /= eval_num_samples
            eval_acc = accuracy_score(eval_trues, eval_preds)
            eval_balanced_acc = balanced_accuracy_score(eval_trues, eval_preds)

            print(f'eval_loss = {eval_loss} // eval_acc = {eval_acc} // eval_balanced_acc = {eval_balanced_acc}')

            # Back to training mode before continuing the epoch.
            model.train()


device = cuda


  0%|          | 0/5547 [00:00<?, ?it/s]

steps = 1000 train loss = 0.8417338743805886
eval_loss = 0.41071684495095284 // eval_acc = 0.857782980784757 // eval_balanced_acc = 0.785810066026381
steps = 2000 train loss = 0.3943223807513714
eval_loss = 0.3436121447720835 // eval_acc = 0.8755853382851607 // eval_balanced_acc = 0.8102444507659499
steps = 3000 train loss = 0.3233245058059692
eval_loss = 0.32838996919412766 // eval_acc = 0.8815194574519619 // eval_balanced_acc = 0.8167069249606808
steps = 4000 train loss = 0.3027351135313511
eval_loss = 0.31048738875216053 // eval_acc = 0.8880994671403197 // eval_balanced_acc = 0.8283095443705504
steps = 5000 train loss = 0.26741051191091536
eval_loss = 0.30649930324765945 // eval_acc = 0.8880994671403197 // eval_balanced_acc = 0.8291792258477118
CPU times: user 31min 16s, sys: 5.85 s, total: 31min 22s
Wall time: 31min 36s


In [29]:
# torch.save(model.state_dict(), 'torch_model.pt')

In [30]:
# Test model on the test data
progress_bar = tqdm(range(len(test_loader)))

model.eval()
preds = []
with torch.no_grad():
  for batch in test_loader:
    batch = {k: v.to(device) for k,v in batch.items()}
    output = model(**batch)

    pred = output.logits.argmax(dim=-1)
    preds.extend(pred.cpu())

    progress_bar.update(1)

true_labels = tokenized_dataset['test']['label']

test_acc = accuracy_score(true_labels, preds)
test_balanced_acc = balanced_accuracy_score(true_labels, preds)

print(f'Test accuracy = {test_acc:.4f} // Test balanced accuracy = {test_balanced_acc:.4f}')

  0%|          | 0/463 [00:00<?, ?it/s]

Test accuracy = 0.8934 // Test balanced accuracy = 0.8372
