In [None]:
from datasets import load_from_disk
# Load the previously saved drug-review DatasetDict (train/validation/test) from Drive.
drug_dataset = load_from_disk("/content/drive/MyDrive/Datasets/Drug-Condition-Classification/drug-reviews")

In [None]:
# Show the available splits with their columns and row counts.
print(drug_dataset)

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 126743
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 31686
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 52825
    })
})


In [None]:
from transformers import AutoTokenizer
# Name of the pretrained checkpoint; the tokenizer loaded from it is used to encode the reviews.
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Inspect the first raw training example.
print(drug_dataset["train"][0])

{'patient_id': 72006, 'drugName': 'Tramadol', 'condition': 'chronic pain', 'review': '"Works well with controlling UTI.  This medication  will cause drowsiness.  Exactly what I needed was rest.  In my experience, about 30 minutes later, pain subsidies enough to tolerate."', 'rating': 10.0, 'date': 'September 21, 2015', 'usefulCount': 37, 'review_length': 186}


In [None]:
import html
def _unescape_conditions(batch):
    """Decode HTML entities in a batch of 'condition' values.

    Args:
        batch: a batched example dict whose "condition" entry is a list of
            strings (or None for missing conditions).

    Returns:
        A dict with the replacement "condition" column. None values are passed
        through unchanged, because html.unescape raises TypeError on None;
        the later is_clean filter is what drops them.
    """
    return {
        "condition": [
            html.unescape(condition) if condition is not None else None
            for condition in batch["condition"]
        ]
    }

drug_dataset = drug_dataset.map(_unescape_conditions, batched=True)

In [None]:
import re

# Junk detector for condition strings. A condition is junk when it
#   * contains something shaped like an HTML tag,
#   * contains the phrase "users found this comment helpful", or
#   * consists of only one or two word characters (likely an abbreviation or error).
junk_pattern = re.compile(r'<.*?>|users found this comment helpful|^\w{1,2}$')


def is_clean(example):
    """
    Decide whether an example's 'condition' field is usable.

    Returns False when the condition is None, is empty or whitespace-only,
    or matches junk_pattern; returns True otherwise.
    """
    condition_text = example['condition']
    if condition_text is None or not condition_text.strip():
        return False
    return junk_pattern.search(condition_text) is None

In [None]:
# Keep only the examples whose condition passes is_clean.
cleaned_drug_dataset = drug_dataset.filter(is_clean)

In [None]:
# Collect the distinct conditions seen in any split, sorted so the id assignment is deterministic
all_unique_conditions = sorted({
    condition
    for split_name in ('train', 'validation', 'test')
    for condition in cleaned_drug_dataset[split_name]['condition']
})
num_labels = len(all_unique_conditions)

print(f"Number of clean, unique conditions: {num_labels}")


# Two lookup tables over the same ordering: condition -> id and id -> condition
labels = dict(zip(all_unique_conditions, range(num_labels)))
id_to_label = dict(enumerate(all_unique_conditions))

Number of clean, unique conditions: 821


In [None]:
def condition_to_label(examples):
  """Look up the integer id of every condition in a batch via the `labels` table.

  Returns a dict holding the new "label" column for the batch.
  """
  label_ids = []
  for condition in examples['condition']:
    label_ids.append(labels[condition])
  return {"label": label_ids}
# Add the integer "label" column produced by condition_to_label, processing in batches.
cleaned_drug_dataset = cleaned_drug_dataset.map(condition_to_label, batched = True)

In [None]:
def tokenize_function(example):
  """Run the tokenizer over the 'review' texts of a batch with truncation enabled."""
  reviews = example['review']
  encoded = tokenizer(reviews, truncation=True)
  return encoded
# Tokenize the reviews in batches; the tokenizer outputs are added as new columns.
tokenized_dataset = cleaned_drug_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/126024 [00:00<?, ? examples/s]

In [None]:
# Inspect the first training example after labelling and tokenization.
print(tokenized_dataset["train"][0])

{'patient_id': 72006, 'drugName': 'Tramadol', 'condition': 'chronic pain', 'review': '"Works well with controlling UTI.  This medication  will cause drowsiness.  Exactly what I needed was rest.  In my experience, about 30 minutes later, pain subsidies enough to tolerate."', 'rating': 10.0, 'date': 'September 21, 2015', 'usefulCount': 37, 'review_length': 186, 'label': 146, 'input_ids': [101, 107, 5853, 1218, 1114, 9783, 158, 21669, 119, 1188, 15683, 1209, 2612, 173, 20876, 8405, 119, 18342, 1184, 146, 1834, 1108, 1832, 119, 1130, 1139, 2541, 117, 1164, 1476, 1904, 1224, 117, 2489, 24708, 1536, 1106, 21073, 119, 107, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
# Drop the raw columns so only the label and the tokenizer outputs remain.
tokenized_dataset = tokenized_dataset.remove_columns(
    [
        'patient_id',
        'review',
        'date',
        'review_length',
        'drugName',
        'usefulCount',
        'rating',
        'condition',
    ]
)

In [None]:
# Confirm which columns and how many rows remain in each split.
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 126024
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 31509
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 52554
    })
})


In [None]:
# Persist the tokenized, labelled dataset so the training section can reload it.
tokenized_dataset.save_to_disk("/content/drive/MyDrive/Datasets/Drug-Condition-Classification/cleaned-drug-reviews")

Saving the dataset (0/1 shards):   0%|          | 0/126024 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/31509 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/52554 [00:00<?, ? examples/s]

# Training Model

In [None]:
from datasets import load_from_disk
# Reload the tokenized dataset from the path it was saved to at the end of preprocessing.
mapped_tokenized_dataset = load_from_disk("/content/drive/MyDrive/Datasets/Drug-Condition-Classification/cleaned-drug-reviews")

In [None]:
from transformers import AutoTokenizer
# Same checkpoint and tokenizer as in preprocessing, re-created for the training section.
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
from transformers import DataCollatorWithPadding
# Collator that uses the tokenizer to pad the examples of each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import TrainingArguments
# Write run outputs under "bert-drug-classification" and evaluate once per epoch.
training_args = TrainingArguments(
    output_dir="bert-drug-classification",
    eval_strategy="epoch",
)

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
import evaluate
import numpy as np

# Accuracy metric, loaded on first use and then reused, so evaluate.load is not
# repeated on every evaluation pass.
_accuracy_metric = None

def compute_metric(eval_preds):
  """Compute accuracy for one evaluation pass.

  Args:
    eval_preds: a (logits, labels) pair. The logits are reduced to predicted
      class ids with argmax over the last axis.

  Returns:
    Whatever the "accuracy" metric's compute() returns for the predicted ids
    against the reference labels.
  """
  global _accuracy_metric
  if _accuracy_metric is None:
    _accuracy_metric = evaluate.load("accuracy")
  # Named `references` so the global `labels` mapping is not shadowed here.
  logits, references = eval_preds
  predictions = np.argmax(logits, axis=-1)
  return _accuracy_metric.compute(predictions=predictions, references=references)

In [None]:
from transformers import AutoModelForSequenceClassification
# Build a sequence-classification model from the base checkpoint.
# NOTE: num_labels, id_to_label and labels are created by the preprocessing cells above
# and are not restored by load_from_disk, so those cells must have been run in this
# kernel session before this cell.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels, # Use the new count of clean labels
    id2label=id_to_label,  # Pass the mapping directly
    label2id=labels)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import Trainer
# Assemble the Trainer: train on the "train" split, evaluate on "validation".
# The tokenizer is passed as `processing_class`, which replaces the deprecated
# `tokenizer` argument of Trainer and avoids its deprecation warning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=mapped_tokenized_dataset["train"],
    eval_dataset=mapped_tokenized_dataset["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metric,
)

  trainer = Trainer(


In [None]:
import torch
# Report which device PyTorch can use: CUDA when available, otherwise the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdevangborkar3[0m ([33mdevangborkar3-uc-davis[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,1.4875,1.437285,0.667428
2,1.1351,1.182855,0.721826
3,0.775,1.079257,0.754165


Downloading builder script: 0.00B [00:00, ?B/s]

TrainOutput(global_step=47259, training_loss=1.3506367387837999, metrics={'train_runtime': 3734.3687, 'train_samples_per_second': 101.241, 'train_steps_per_second': 12.655, 'total_flos': 3.706344216797966e+16, 'train_loss': 1.3506367387837999, 'epoch': 3.0})

# Testing the Model

In [None]:
# Evaluate the fine-tuned model on the test split and show the resulting metrics.
test_results = trainer.evaluate(mapped_tokenized_dataset["test"])
print(test_results)

{'eval_loss': 1.0535321235656738, 'eval_accuracy': 0.7593522852684857, 'eval_runtime': 157.2599, 'eval_samples_per_second': 334.186, 'eval_steps_per_second': 41.778, 'epoch': 3.0}


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

# Get predictions from the model on the test set
predictions = trainer.predict(mapped_tokenized_dataset["test"])

# Extract logits and the reference label ids.
# The references go in `true_labels`, not `labels`: `labels` is already the
# condition -> id mapping built during preprocessing and passed to the model as
# label2id, and rebinding it here would break re-running those cells.
logits = predictions.predictions
true_labels = predictions.label_ids

# Get the predicted class indices
predicted_labels = np.argmax(logits, axis=-1)

# Calculate metrics. zero_division=0 scores a class with no predicted (or no true)
# samples as 0 without emitting a warning; with hundreds of classes this is common.
accuracy = accuracy_score(true_labels, predicted_labels)
precision, recall, f1, _ = precision_recall_fscore_support(
    true_labels, predicted_labels, average='weighted', zero_division=0
)

print(f"Accuracy: {accuracy}")
print(f"Precision (weighted): {precision}")
print(f"Recall (weighted): {recall}")
print(f"F1-score (weighted): {f1}")

Error in callback <bound method _WandbInit._post_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x79fcb412f560>> (for post_run_cell):



KeyboardInterrupt



KeyboardInterrupt: 

# Save model to Hugging Face

In [None]:
from huggingface_hub import notebook_login
# Show the interactive Hugging Face login widget; needed before pushing to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Push the model to the Hugging Face Hub
# (the accuracy quoted in the commit message is the epoch-3 validation accuracy, 0.754165)
trainer.push_to_hub(commit_message="Finished 3 epochs on cleaned data, 75.4% accuracy")

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ication/model.safetensors:   0%|          |  558kB /  436MB            

  ...091765.83723f457ee4.671.0:   2%|1         | 1.35kB / 82.0kB            

  ...095727.83723f457ee4.671.1:   1%|1         |  6.00B /   417B            

  ...ication/training_args.bin:   2%|1         |  96.0B / 5.84kB            

CommitInfo(commit_url='https://huggingface.co/devangb4/bert-drug-classification/commit/b99530daebbab15092b094f2bf41d7df3a039380', commit_message='Finished 3 epochs on cleaned data, 75.4% accuracy', commit_description='', oid='b99530daebbab15092b094f2bf41d7df3a039380', pr_url=None, repo_url=RepoUrl('https://huggingface.co/devangb4/bert-drug-classification', endpoint='https://huggingface.co', repo_type='model', repo_id='devangb4/bert-drug-classification'), pr_revision=None, pr_num=None)

In [None]:
# Add the id_to_label mapping to the model's configuration
# (id_to_label was already passed as id2label when the model was created, so this
# assigns the same mapping again before the upload)
model.config.id2label = id_to_label

# Push the model to the Hugging Face Hub
model.push_to_hub("bert-drug-classification")

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmpdorvf8pu/model.safetensors    :   8%|7         | 33.5MB /  436MB            

CommitInfo(commit_url='https://huggingface.co/devangb4/bert-drug-classification/commit/3784e5d46888b756f9f76b39b153a893e3d4307d', commit_message='Upload BertForSequenceClassification', commit_description='', oid='3784e5d46888b756f9f76b39b153a893e3d4307d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/devangb4/bert-drug-classification', endpoint='https://huggingface.co', repo_type='model', repo_id='devangb4/bert-drug-classification'), pr_revision=None, pr_num=None)

In [None]:
# Upload the tokenizer to the same "bert-drug-classification" repository as the model.
tokenizer.push_to_hub("bert-drug-classification")

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/devangb4/bert-drug-classification/commit/deb3db1f82f8452f15cdcc369c274da1b7c9f7c7', commit_message='Upload tokenizer', commit_description='', oid='deb3db1f82f8452f15cdcc369c274da1b7c9f7c7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/devangb4/bert-drug-classification', endpoint='https://huggingface.co', repo_type='model', repo_id='devangb4/bert-drug-classification'), pr_revision=None, pr_num=None)

# Playing Around

In [None]:
from transformers import pipeline
# Load the published model from the Hub into a text-classification pipeline.
classifier = pipeline("text-classification", model="devangb4/bert-drug-classification")

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# Test the pipeline with an example review
# (the text is the review of test-split example 14, without its surrounding quote characters)
example_review = "A doctor in the ER prescribed me 200 mg of Provigil when I was first diagnosed with Narcolepsy. It didn\'t seem to have any effect on me at all. Then I went to see my sleep doctor and he prescribed me 250 mg of Nuvigil. It finally allowed me to stay awake for an entire day. But if I went out somewhere I would be able to come home and take a nap and be fully awake again. Taking that with Xyrem makes me feel 98%..not quite 100, but still pretty great."
prediction = classifier(example_review)
print(prediction)

[{'label': 'narcolepsy', 'score': 0.9791269898414612}]


In [None]:
# Show test example 14 so its recorded condition can be compared with a model prediction.
print(drug_dataset['test'][14])

{'patient_id': 213376, 'drugName': 'Nuvigil', 'condition': 'narcolepsy', 'review': '"A doctor in the ER prescribed me 200 mg of Provigil when I was first diagnosed with Narcolepsy. It didn\'t seem to have any effect on me at all. Then I went to see my sleep doctor and he prescribed me 250 mg of Nuvigil. It finally allowed me to stay awake for an entire day. But if I went out somewhere I would be able to come home and take a nap and be fully awake again. Taking that with Xyrem makes me feel 98%..not quite 100, but still pretty great."', 'rating': 9.0, 'date': 'June 30, 2010', 'usefulCount': 14, 'review_length': 459}
