### A Fine-Tuned DistilBERT LLM to Classify VAERS Adverse Event Symptoms from User Text Descriptions
#### To predict more than just the first symptom (SYMPTOM1), a different approach is needed to handle multi-label prediction. This is typically done with a multi-label classification setup, where each symptom is treated as a separate label and the model learns to predict the presence or absence of each symptom independently (the labels are encoded with a MultiLabelBinarizer).

In [1]:
import os
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast
import torch
from torch.utils.data import Dataset
from transformers import DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

# Load and preprocess the VAERS data and symptoms
vaers_data_path = 'data/2023VAERSDATA.csv'
vaers_symptoms_path = 'data/2023VAERSSYMPTOMS.csv'
vaers_data = pd.read_csv(vaers_data_path, encoding='ISO-8859-1')
vaers_symptoms = pd.read_csv(vaers_symptoms_path, encoding='ISO-8859-1')

# Merge datasets on VAERS_ID. The per-VAERS_ID aggregation below exists
# because one VAERS_ID can appear on several merged rows.
merged_data = vaers_data.merge(vaers_symptoms, on='VAERS_ID')
# Missing narratives become '' instead of the literal string 'nan'.
merged_data['SYMPTOM_TEXT'] = merged_data['SYMPTOM_TEXT'].fillna('').astype(str)

symptom_columns = ['SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']

# Human-readable, comma-separated summary of the symptoms on each row
merged_data['ALL_SYMPTOMS'] = merged_data[symptom_columns].apply(
    lambda row: ', '.join(row.dropna().astype(str)), axis=1
)

# Collect the symptoms of each row as a real list. Going through a joined
# string and splitting it again would fuse the last symptom of one row with
# the first symptom of the next row, split symptom names that themselves
# contain ', ', and create an empty-string label for rows without symptoms.
row_symptoms = pd.Series(
    [
        [str(symptom) for symptom in row if pd.notna(symptom)]
        for row in merged_data[symptom_columns].to_numpy()
    ],
    index=merged_data.index,
)

# Concatenate the per-row lists of every VAERS_ID into one list per report
symptoms_by_report = row_symptoms.groupby(merged_data['VAERS_ID']).agg(
    lambda lists: [symptom for symptoms in lists for symptom in symptoms]
)

# Group by VAERS_ID: one row per report with its first narrative text
grouped = merged_data.groupby('VAERS_ID').agg({
    # Include all necessary columns here
    'SYMPTOM_TEXT': 'first',
}).reset_index()

# 'ALL_SYMPTOMS' holds the list of symptoms reported for each VAERS_ID
grouped['ALL_SYMPTOMS'] = grouped['VAERS_ID'].map(symptoms_by_report)

# Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Transform the symptoms into a multi-label format: each 'Encoded_Symptoms'
# entry is one row of the binary indicator matrix (one column per symptom in
# mlb.classes_).
# NOTE(review): fit_transform builds a dense reports x symptoms matrix for the
# whole file; confirm this fits in memory or subset before fitting.
grouped['Encoded_Symptoms'] = list(mlb.fit_transform(grouped['ALL_SYMPTOMS']))

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Keep only the first 5,000 reports so fine-tuning finishes in reasonable time.
# .copy() detaches the subset so the column assignment below does not write
# into a slice of the full frame.
grouped = grouped.iloc[:5000].copy()

# Refit the binarizer on the subset. The earlier fit covered every report in
# the file, so the label matrix (and the model's output layer, sized from
# mlb.classes_) would otherwise carry columns for symptoms that never occur
# in these 5,000 rows.
grouped['Encoded_Symptoms'] = list(mlb.fit_transform(grouped['ALL_SYMPTOMS']))
grouped

NameError: name 'groupedsys' is not defined

In [None]:
# Split the narratives and their encoded label vectors into 90% train and
# 10% validation. A fixed random_state makes the split (and therefore the
# reported metrics) reproducible across notebook runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    grouped['SYMPTOM_TEXT'].tolist(),
    grouped['Encoded_Symptoms'].tolist(),
    test_size=0.1,
    random_state=42,
)


In [None]:
# Tokenize both splits with the pretrained DistilBERT tokenizer.
# Each split is encoded in its own call, so padding=True pads to the longest
# sequence within that split; truncation=True cuts over-long narratives.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings, val_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts)
)

In [None]:
# Create a dataset for pytorch

class VAERSSymptomDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with multi-label targets.

    Args:
        encodings: Mapping from field name (it must include 'input_ids') to a
            per-sample indexable sequence, e.g. the tokenizer output.
        labels: Per-sample indexable sequence of label vectors.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is the number of encoded input sequences.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Labels are converted to float tensors for the multi-label setup.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample


In [None]:
# Wrap the tokenized splits as datasets and load the pretrained DistilBERT
# model for multi-label classification

train_dataset = VAERSSymptomDataset(train_encodings, train_labels)
val_dataset = VAERSSymptomDataset(val_encodings, val_labels)

# Number of unique labels (symptoms) to classify
num_labels = len(mlb.classes_)

# Load Pretrained DistilBERT Model.
# problem_type is set explicitly so the classification head always uses the
# multi-label loss, instead of relying on the library inferring it from the
# dtype of the labels at training time.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
    problem_type='multi_label_classification',
)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:
# Fine-tune the model
import os

# Turn off Weights & Biases reporting for this run.
os.environ['WANDB_DISABLED'] = 'true'

# Hyperparameters for the run, kept together so they are easy to tweak.
# NOTE(review): warmup_steps=500 looks large relative to the number of
# optimizer steps a few thousand samples give at batch size 16 over 3 epochs;
# confirm it is intended.
training_config = {
    'output_dir': './results',          # checkpoints and outputs
    'num_train_epochs': 3,
    'per_device_train_batch_size': 16,
    'warmup_steps': 500,
    'weight_decay': 0.01,
    'logging_dir': './logs',
}
training_args = TrainingArguments(**training_config)

# The Trainer is given both splits; no metric callback is configured.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run the fine-tuning loop
trainer.train()

# Observation: 1000 samples took 2.5 hours, and the model predicts the same labels for all texts

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(preds, labels):
    """Score binary multi-label predictions against the true labels.

    Args:
        preds: Predicted binary indicator matrix (samples x labels).
        labels: True binary indicator matrix of the same shape.

    Returns:
        dict with 'accuracy', 'precision', 'recall' and 'f1'. Precision,
        recall and F1 are micro-averaged over all sample/label pairs. For
        multi-label input scikit-learn's accuracy_score is subset accuracy,
        i.e. a sample only counts when every label matches.
    """
    # zero_division=0 keeps the scores at 0 without emitting warnings when no
    # label is predicted (or present) at all.
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='micro', zero_division=0)
    recall = recall_score(labels, preds, average='micro', zero_division=0)
    f1 = f1_score(labels, preds, average='micro', zero_division=0)
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


In [None]:
# Evaluate the fine-tuned model on the eval_dataset given to the Trainer
# (val_dataset) and print the returned metrics dict.
# NOTE(review): the Trainer was created without a compute_metrics callback,
# so this presumably reports loss/runtime figures only - confirm.
results = trainer.evaluate()
print(results)


In [None]:
# Get predictions (raw model outputs) and the true labels for the validation set
predictions, labels, _ = trainer.predict(val_dataset)

# The model outputs logits, not probabilities: apply a sigmoid first so the
# threshold below is a probability cut-off for each label independently.
probabilities = torch.sigmoid(torch.as_tensor(predictions)).numpy()

# Convert predictions to binary format (0 or 1)
threshold = 0.5  # You might need to adjust this threshold
predicted_labels = (probabilities > threshold).astype(int)
# Labels come back as floats (they are fed to the model as float tensors);
# cast them to the same 0/1 integer form as the predictions.
labels = labels.astype(int)

# Inverse transform to get symptom names from binary labels
predicted_symptoms = mlb.inverse_transform(predicted_labels)
actual_symptoms = mlb.inverse_transform(labels)

# Recover the VAERS_ID of each validation text. The split only carried texts
# and labels, so the id is looked up through the narrative text in `grouped`.
# A positional lookup into merged_data would return the ids of its first rows,
# which are unrelated to the validation samples. If the same text belongs to
# several reports the id is ambiguous and is left missing.
ids_by_text = grouped.groupby('SYMPTOM_TEXT')['VAERS_ID'].agg(list).to_dict()
val_ids = []
for text in val_texts:
    candidates = ids_by_text.get(text, [])
    val_ids.append(candidates[0] if len(candidates) == 1 else None)

# Creating the DataFrame
val_texts_series = pd.Series(val_texts, name='SYMPTOM_TEXT')
df_results = pd.DataFrame({
    'VAERS_ID': pd.array(val_ids, dtype='Int64'),
    'SYMPTOM_TEXT': val_texts_series.values,
    'ActualSymptoms': ['; '.join(symptoms) for symptoms in actual_symptoms],
    'PredictedSymptoms': ['; '.join(symptoms) for symptoms in predicted_symptoms]
})


In [None]:
# Display the per-text comparison of actual and predicted symptoms
df_results

In [None]:
# Compute evaluation metrics: compare the thresholded predictions with the
# true labels returned by trainer.predict (compute_metrics takes the
# predictions first, then the labels)
metrics = compute_metrics(predicted_labels, labels)

# Print the resulting dict of accuracy, precision, recall and f1
print(metrics)
