In [1]:
#!jupyter nbconvert --to script config_template.ipynb
#jupyter: create interactive window

In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader

# Load and preprocess the VAERS data and symptoms
vaers_data_path = 'data/2023VAERSDATA.csv'
vaers_symptoms_path = 'data/2023VAERSSYMPTOMS.csv'
vaers_data = pd.read_csv(vaers_data_path, encoding='ISO-8859-1')
vaers_symptoms = pd.read_csv(vaers_symptoms_path, encoding='ISO-8859-1')

# Merge datasets on VAERS_ID
merged_data = vaers_data.merge(vaers_symptoms, on='VAERS_ID')
merged_data['SYMPTOM_TEXT'] = merged_data['SYMPTOM_TEXT'].astype(str)

# Convert SYMPTOM1 to numerical labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
merged_data['encoded_labels'] = label_encoder.fit_transform(merged_data['SYMPTOM1'])


# Get the unique labels count
number_of_symptom_codes = len(vaers_symptoms['SYMPTOM1'].unique())  

  from .autonotebook import tqdm as notebook_tqdm


bin c:\Users\User\text-generation-webui-main\installer_files\env\Lib\site-packages\bitsandbytes\libbitsandbytes_cpu.so
function 'cadam32bit_grad_fp32' not found


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [3]:
# Reduce to just a few rows for testing
merged_data = merged_data[0:200]

In [4]:
# Preprocess the data for DistilBERT
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Split the data with labels
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    merged_data['SYMPTOM_TEXT'].tolist(), 
    merged_data['encoded_labels'].tolist(), 
    test_size=0.1
)

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

In [5]:
# PyTorch Dataset updated to include labels
class VAERSSymptomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create datasets with labels
train_dataset = VAERSSymptomDataset(train_encodings, train_labels)
val_dataset = VAERSSymptomDataset(val_encodings, val_labels)

# Load Pretrained DistilBERT Model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=number_of_symptom_codes)

# DataLoader for validation set
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [6]:
# Evaluation Function
def evaluate_model(model, data_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted = torch.max(outputs.logits, 1)
            total += len(batch)
            correct += (predicted == batch['labels']).sum().item()
    return correct / total



In [7]:
# Evaluate the Model Without Fine-Tuning
print("Evaluating Pretrained Model...")
pretrained_accuracy = evaluate_model(model, val_loader)
print(f'Pretrained Model Accuracy: {pretrained_accuracy:.4f}')

Evaluating Pretrained Model...
Pretrained Model Accuracy: 0.0000


In [8]:
import os
os.environ['WANDB_DISABLED'] = 'true'

# Fine-Tuning the Model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

print("Starting Fine-Tuning...")
trainer.train()

# Evaluate the Fine-Tuned Model
print("Evaluating Fine-Tuned Model...")
fine_tuned_accuracy = evaluate_model(model, val_loader)
print(f'Fine-Tuned Model Accuracy: {fine_tuned_accuracy:.4f}')

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Starting Fine-Tuning...


100%|██████████| 36/36 [28:46<00:00, 47.95s/it]


{'train_runtime': 1726.1195, 'train_samples_per_second': 0.313, 'train_steps_per_second': 0.021, 'train_loss': 8.41167958577474, 'epoch': 3.0}
Evaluating Fine-Tuned Model...
Fine-Tuned Model Accuracy: 1.5000


In [9]:
# Evaluate the Fine-Tuned Model
print("Evaluating Fine-Tuned Model...")
fine_tuned_accuracy = evaluate_model(model, val_loader)
print(f'Fine-Tuned Model Accuracy: {fine_tuned_accuracy:.4f}')

Evaluating Fine-Tuned Model...
Fine-Tuned Model Accuracy: 1.5000
