In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import joblib

In [6]:
import os
os.environ["TRANSFORMERS_NO_TORCH_LOAD_VERSION_CHECK"] = "1"

In [7]:
df = pd.read_csv("symptoms.csv")   # downloaded from Kaggle
# combine all symptom columns into one text field
symptom_cols = [c for c in df.columns if "Symptom" in c]
df["text"] = df[symptom_cols].fillna("").agg(" ".join, axis=1)
df = df[["text", "Disease"]]

df.head()

Unnamed: 0,text,Disease
0,itching skin_rash nodal_skin_eruptions disc...,Fungal infection
1,skin_rash nodal_skin_eruptions dischromic _...,Fungal infection
2,itching nodal_skin_eruptions dischromic _pat...,Fungal infection
3,itching skin_rash dischromic _patches ...,Fungal infection
4,itching skin_rash nodal_skin_eruptions ...,Fungal infection


In [8]:
# encode labels
le = LabelEncoder()
df["label"] = le.fit_transform(df["Disease"])
num_labels = len(le.classes_)
joblib.dump(le, "label_encoder.joblib")

['label_encoder.joblib']

In [9]:
# convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df)

In [10]:
# -------- Tokenizer --------
MODEL = "emilyalsentzer/Bio_ClinicalBERT"   # medical BERT
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True)

dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/4920 [00:00<?, ? examples/s]Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 4920/4920 [00:00<00:00, 21132.66 examples/s]


In [11]:
# train/test split
train_test = dataset.train_test_split(test_size=0.2)
train_test.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [12]:
model = AutoModelForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=num_labels,
    trust_remote_code=True,
    use_safetensors=False,   # 🚨 disable safetensors
    ignore_mismatched_sizes=True
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
args = TrainingArguments(
    "symptom-checker-model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="logs",
    save_total_limit=2
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_test["train"],
    eval_dataset=train_test["test"],
    tokenizer=tokenizer,
)
trainer.train()
trainer.evaluate()

 34%|███▍      | 500/1476 [26:32<58:29,  3.60s/it]  

{'loss': 1.7568, 'grad_norm': 4.2133378982543945, 'learning_rate': 1.3224932249322495e-05, 'epoch': 1.02}


 68%|██████▊   | 1000/1476 [1:18:30<45:56,  5.79s/it]

{'loss': 0.1743, 'grad_norm': 0.8908286690711975, 'learning_rate': 6.449864498644986e-06, 'epoch': 2.03}


100%|██████████| 1476/1476 [2:07:44<00:00,  5.19s/it]  


{'train_runtime': 7664.6837, 'train_samples_per_second': 1.541, 'train_steps_per_second': 0.193, 'train_loss': 0.6691516408429237, 'epoch': 3.0}


100%|██████████| 123/123 [02:53<00:00,  1.41s/it]


{'eval_loss': 0.026451895013451576,
 'eval_runtime': 175.2727,
 'eval_samples_per_second': 5.614,
 'eval_steps_per_second': 0.702,
 'epoch': 3.0}

In [11]:
# -------- Save everything --------
model.save_pretrained("model/")
tokenizer.save_pretrained("model/")

('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/vocab.txt',
 'model/added_tokens.json',
 'model/tokenizer.json')

In [13]:
from datasets import load_from_disk
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report

from datasets import load_from_disk
import torch

# Assuming you did:
train_test = dataset.train_test_split(test_size=0.2)

test_dataset = train_test["test"]
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])


# Load model & tokenizer
model = AutoModelForSequenceClassification.from_pretrained("model/")
tokenizer = AutoTokenizer.from_pretrained("model/")
model.eval()

y_true = []
y_pred = []

for batch in test_dataset:
    inputs = {
        "input_ids": batch["input_ids"].unsqueeze(0),       # add batch dim
        "attention_mask": batch["attention_mask"].unsqueeze(0)
    }
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        pred = torch.argmax(logits, dim=1).item()

    y_pred.append(pred)
    y_true.append(batch["label"].item())
