In [1]:
#!jupyter nbconvert --to script config_template.ipynb
#jupyter: create interactive window
import os
print("Current Working Directory:", os.getcwd())

Current Working Directory: c:\Users\User\Git-Repo\cdc-vaers-llm


In [2]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader

# Load and preprocess the VAERS data and symptoms
vaers_data_path = 'data/2023VAERSDATA.csv'
vaers_symptoms_path = 'data/2023VAERSSYMPTOMS.csv'
vaers_data = pd.read_csv(vaers_data_path, encoding='ISO-8859-1')
vaers_symptoms = pd.read_csv(vaers_symptoms_path, encoding='ISO-8859-1')

# Merge datasets on VAERS_ID
merged_data = vaers_data.merge(vaers_symptoms, on='VAERS_ID')
merged_data['SYMPTOM_TEXT'] = merged_data['SYMPTOM_TEXT'].astype(str)

# Convert SYMPTOM1 to numerical labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
merged_data['encoded_labels'] = label_encoder.fit_transform(merged_data['SYMPTOM1'])

# Get the unique labels count
number_of_symptom_codes = len(vaers_symptoms['SYMPTOM1'].unique())  


  from .autonotebook import tqdm as notebook_tqdm


bin c:\Users\User\text-generation-webui-main\installer_files\env\Lib\site-packages\bitsandbytes\libbitsandbytes_cpu.so
function 'cadam32bit_grad_fp32' not found


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [3]:
# Rows have more than 1 row, if more that 5 symptoms 
merged_data

Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,...,SYMPTOMVERSION1,SYMPTOM2,SYMPTOMVERSION2,SYMPTOM3,SYMPTOMVERSION3,SYMPTOM4,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5,encoded_labels
0,2547730,01/01/2023,DE,53.00,,,F,,The adverse event is that the patient went int...,,...,25.1,COVID-19,25.1,Coma,25.1,Computerised tomogram,25.1,Exposure to SARS-CoV-2,25.1,796
1,2547730,01/01/2023,DE,53.00,,,F,,The adverse event is that the patient went int...,,...,25.1,Headache,25.1,Laboratory test,25.1,Magnetic resonance imaging,25.1,SARS-CoV-2 antibody test negative,25.1,2133
2,2547730,01/01/2023,DE,53.00,,,F,,The adverse event is that the patient went int...,,...,25.1,Unresponsive to stimuli,25.1,X-ray,25.1,,,,,3810
3,2547731,01/01/2023,MA,6.00,6.0,,M,,Error: Incorrect Reconstitution-,,...,25.1,,,,,,,,,3544
4,2547732,01/01/2023,MA,38.00,38.0,,F,,Error: Patient Accidentally Stuck by Needle-,,...,25.1,,,,,,,,,2487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136138,2728410,12/29/2023,MA,61.00,,,F,,Alkaline Phosphate had gone up to 91 to 188 (u...,,...,26.1,Alanine aminotransferase increased,26.1,Aspartate aminotransferase,26.1,Aspartate aminotransferase increased,26.1,Blood alkaline phosphatase,26.1,173
136139,2728410,12/29/2023,MA,61.00,,,F,,Alkaline Phosphate had gone up to 91 to 188 (u...,,...,26.1,,,,,,,,,664
136140,2728411,12/29/2023,,35.00,35.0,,F,,localized pain; pregnant patient who was accid...,,...,26.1,Extra dose administered,26.1,Pain,26.1,,,,,1781
136141,2728412,12/29/2023,AZ,60.00,60.0,,F,,"HSV2 outbreak after 10 years without, severe h...",,...,26.1,Herpes simplex,26.1,Pain,26.1,,,,,2134


In [4]:
# Concatenate symptoms into a single string or list for each row
merged_data['ALL_SYMPTOMS'] = merged_data[['SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']].apply(lambda x: ', '.join(x.dropna().astype(str)), axis=1)

# Group by VAERS_ID and aggregate data
grouped = merged_data.groupby('VAERS_ID').agg({
    'RECVDATE': 'first',
    'STATE': 'first',
    'AGE_YRS': 'first',
    'CAGE_YR': 'first',
    'CAGE_MO': 'first',
    'SEX': 'first',
    'RPT_DATE': 'first',
    'SYMPTOM_TEXT': 'first',
    'DIED': 'first',
    'DATEDIED': 'first',
    'L_THREAT': 'first',
    'ER_VISIT': 'first',
    'HOSPITAL': 'first',
    'HOSPDAYS': 'first',
    'X_STAY': 'first',
    'DISABLE': 'first',
    'RECOVD': 'first',
    'VAX_DATE': 'first',
    'ONSET_DATE': 'first',
    'NUMDAYS': 'first',
    'LAB_DATA': 'first',
    'V_ADMINBY': 'first',
    'V_FUNDBY': 'first',
    'OTHER_MEDS': 'first',
    'CUR_ILL': 'first',
    'HISTORY': 'first',
    'PRIOR_VAX': 'first',
    'SPLTTYPE': 'first',
    'FORM_VERS': 'first',
    'TODAYS_DATE': 'first',
    'BIRTH_DEFECT': 'first',
    'OFC_VISIT': 'first',
    'ER_ED_VISIT': 'first',
    'ALLERGIES': 'first',
    'encoded_labels': 'first',
    'ALL_SYMPTOMS': ' '.join  # Combine all symptoms for each ID
}).reset_index()

# Split the 'ALL_SYMPTOMS' into a list of symptoms
grouped['ALL_SYMPTOMS'] = grouped['ALL_SYMPTOMS'].str.split(', ')

# Find the maximum number of symptoms in any row
max_symptoms = grouped['ALL_SYMPTOMS'].apply(len).max()

# Create new columns for each symptom
for i in range(1, max_symptoms + 1):
    column_name = f'SYMPTOM{i}'
    grouped[column_name] = grouped['ALL_SYMPTOMS'].apply(lambda x: x[i-1] if i <= len(x) else None)

# Can drop the 'ALL_SYMPTOMS' column if it's no longer needed
#grouped.drop('ALL_SYMPTOMS', axis=1, inplace=True)

#grouped.to_clipboard()

  grouped[column_name] = grouped['ALL_SYMPTOMS'].apply(lambda x: x[i-1] if i <= len(x) else None)
  grouped[column_name] = grouped['ALL_SYMPTOMS'].apply(lambda x: x[i-1] if i <= len(x) else None)
  grouped[column_name] = grouped['ALL_SYMPTOMS'].apply(lambda x: x[i-1] if i <= len(x) else None)
  grouped[column_name] = grouped['ALL_SYMPTOMS'].apply(lambda x: x[i-1] if i <= len(x) else None)
  grouped[column_name] = grouped['ALL_SYMPTOMS'].apply(lambda x: x[i-1] if i <= len(x) else None)
  grouped[column_name] = grouped['ALL_SYMPTOMS'].apply(lambda x: x[i-1] if i <= len(x) else None)
  grouped[column_name] = grouped['ALL_SYMPTOMS'].apply(lambda x: x[i-1] if i <= len(x) else None)
  grouped[column_name] = grouped['ALL_SYMPTOMS'].apply(lambda x: x[i-1] if i <= len(x) else None)
  grouped[column_name] = grouped['ALL_SYMPTOMS'].apply(lambda x: x[i-1] if i <= len(x) else None)
  grouped[column_name] = grouped['ALL_SYMPTOMS'].apply(lambda x: x[i-1] if i <= len(x) else None)
  grouped[column_nam

In [5]:
grouped

Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,...,SYMPTOM132,SYMPTOM133,SYMPTOM134,SYMPTOM135,SYMPTOM136,SYMPTOM137,SYMPTOM138,SYMPTOM139,SYMPTOM140,SYMPTOM141
0,2547730,01/01/2023,DE,53.00,,,F,,The adverse event is that the patient went int...,,...,,,,,,,,,,
1,2547731,01/01/2023,MA,6.00,6.0,,M,,Error: Incorrect Reconstitution-,,...,,,,,,,,,,
2,2547732,01/01/2023,MA,38.00,38.0,,F,,Error: Patient Accidentally Stuck by Needle-,,...,,,,,,,,,,
3,2547733,01/01/2023,CA,63.00,63.0,,M,,Error: Dose in Series Given Too Early-,,...,,,,,,,,,,
4,2547734,01/01/2023,IL,30.00,30.0,,F,,Systemic: EYE TWITCHING-Medium,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105720,2728410,12/29/2023,MA,61.00,,,F,,Alkaline Phosphate had gone up to 91 to 188 (u...,,...,,,,,,,,,,
105721,2728411,12/29/2023,,35.00,35.0,,F,,localized pain; pregnant patient who was accid...,,...,,,,,,,,,,
105722,2728412,12/29/2023,AZ,60.00,60.0,,F,,"HSV2 outbreak after 10 years without, severe h...",,...,,,,,,,,,,
105723,2728413,12/29/2023,SC,1.33,1.0,0.4,M,,"Administration error, patient was not supposed...",,...,,,,,,,,,,


In [6]:
merged_data = grouped.copy()

In [7]:
# Reduce rows for learning how the fine-tuning is working
merged_data = merged_data[0:2000]

In [8]:
merged_data

Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,...,SYMPTOM132,SYMPTOM133,SYMPTOM134,SYMPTOM135,SYMPTOM136,SYMPTOM137,SYMPTOM138,SYMPTOM139,SYMPTOM140,SYMPTOM141
0,2547730,01/01/2023,DE,53.00,,,F,,The adverse event is that the patient went int...,,...,,,,,,,,,,
1,2547731,01/01/2023,MA,6.00,6.0,,M,,Error: Incorrect Reconstitution-,,...,,,,,,,,,,
2,2547732,01/01/2023,MA,38.00,38.0,,F,,Error: Patient Accidentally Stuck by Needle-,,...,,,,,,,,,,
3,2547733,01/01/2023,CA,63.00,63.0,,M,,Error: Dose in Series Given Too Early-,,...,,,,,,,,,,
4,2547734,01/01/2023,IL,30.00,30.0,,F,,Systemic: EYE TWITCHING-Medium,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,2551008,01/05/2023,,,,,M,,Guillain-Barré syndrome; This spontaneous case...,,...,,,,,,,,,,
1996,2551030,01/05/2023,,,,,U,,Had shingles after the vaccine/Suspected vacci...,,...,,,,,,,,,,
1997,2551031,01/05/2023,,70.00,,,U,,GOT COVID; This spontaneous report received fr...,,...,,,,,,,,,,
1998,2551032,01/05/2023,,1.58,1.0,0.6,F,,rash on the abdomen and chest; This spontaneou...,,...,,,,,,,,,,


In [9]:
# Preprocess the data for DistilBERT
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Split the data with labels
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    merged_data['SYMPTOM_TEXT'].tolist(), 
    merged_data['encoded_labels'].tolist(), 
    test_size=0.1
)

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

In [10]:
# PyTorch Dataset updated to include labels
class VAERSSymptomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create datasets with labels
train_dataset = VAERSSymptomDataset(train_encodings, train_labels)
val_dataset = VAERSSymptomDataset(val_encodings, val_labels)

# Load Pretrained DistilBERT Model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=number_of_symptom_codes)

# DataLoader for validation set
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [11]:
# Evaluation Function
def evaluate_model(model, data_loader, return_results=False):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted = torch.max(outputs.logits, 1)
            if return_results:
                predictions.extend(predicted.cpu().numpy())
                actuals.extend(labels.cpu().numpy())
    if return_results:
        return predictions, actuals



In [12]:
# Evaluate the Pretrained Model
print("Evaluating Pretrained Model...")
pretrained_preds, pretrained_actuals = evaluate_model(model, val_loader, return_results=True)

# Calculate accuracy for the Pretrained Model
pretrained_accuracy = sum(np.array(pretrained_preds) == np.array(pretrained_actuals)) / len(pretrained_preds)
print(f'Pretrained Model Accuracy: {pretrained_accuracy:.4f}')


Evaluating Pretrained Model...
Pretrained Model Accuracy: 0.0000


In [13]:
# Reset index on merged_data if necessary
merged_data.reset_index(drop=True, inplace=True)

# Ensure val_texts is a pandas Series with correct indices
# If val_texts is a list, convert it to a pandas Series
if isinstance(val_texts, list):
    val_texts = pd.Series(val_texts)

# When splitting, retain indices
train_texts, val_texts, train_labels, val_labels = train_test_split(
    merged_data['SYMPTOM_TEXT'], 
    merged_data['encoded_labels'], 
    test_size=0.1
)

# Create a DataFrame for the pre-fine-tuning evaluation
pretrained_df = pd.DataFrame({
    'VAERS_ID': merged_data.loc[val_texts.index, 'VAERS_ID'],
    'SYMPTOM_TEXT': val_texts,
    'ActualLabel': [label_encoder.inverse_transform([label])[0] for label in pretrained_actuals],
    'PredictedLabel': [label_encoder.inverse_transform([label])[0] for label in pretrained_preds]
})


In [14]:
# Print results for the pre-fine tuned 
pretrained_df

Unnamed: 0,VAERS_ID,SYMPTOM_TEXT,ActualLabel,PredictedLabel
1078,2549132,Vaccine frozen rather than refrigerated. Dose ...,No adverse event,Barrett's oesophagus
904,2548956,No adverse event. Vaccine given after beyond u...,Cough,Monocyte count
1353,2549638,"1st dose in 2018, 2nd not received; This case ...",COVID-19,Epididymitis
380,2548183,Patient was administered vaccine that was foun...,COVID-19,Monocyte count
1705,2550258,Menveo was ordered and Bexsero was administere...,Nasal mucosal discolouration,Epididymitis
...,...,...,...,...
1338,2549616,COVID 19 Treatment; COVID 19 Treatment; This i...,COVID-19,Monocyte count
1942,2550952,Treatment of COVID-19; Treatment of COVID-19; ...,Expired product administered,Sexually transmitted disease test
705,2548525,Patient was administered vaccine that was foun...,Asthenia,Monocyte count
700,2548520,vaccine was frozen rather than refrigerated. p...,Blood sodium decreased,Monocyte count


In [15]:
# Fine-Tuning the Model
os.environ['WANDB_DISABLED'] = 'true'

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

print("Starting Fine-Tuning...")
trainer.train()

# Trained on 200 records in 26 minutes, Fine-Tunded Model Accuracy: 0.1000
# Trained on 20,000 records in 252 minutes (4 hours), Fine-Tunded Model Accuracy: 0.5550

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Starting Fine-Tuning...


100%|██████████| 339/339 [4:12:36<00:00, 44.71s/it]  


{'train_runtime': 15156.731, 'train_samples_per_second': 0.356, 'train_steps_per_second': 0.022, 'train_loss': 5.833323250829646, 'epoch': 3.0}


TrainOutput(global_step=339, training_loss=5.833323250829646, metrics={'train_runtime': 15156.731, 'train_samples_per_second': 0.356, 'train_steps_per_second': 0.022, 'train_loss': 5.833323250829646, 'epoch': 3.0})

In [16]:
# Evaluate the Fine-Tuned Model
print("Evaluating Fine-Tuned  Model...")
fine_tuned_preds, fine_tuned_actuals = evaluate_model(model, val_loader, return_results=True)

# Calculate accuracy for the Fine-Tuned Model
fine_tuned_accuracy = sum(np.array(fine_tuned_preds) == np.array(fine_tuned_actuals)) / len(fine_tuned_preds)
print(f'Fine-Tunded Model Accuracy: {fine_tuned_accuracy:.4f}')


Evaluating Fine-Tuned  Model...
Fine-Tunded Model Accuracy: 0.5550


In [17]:
'''
# Reset index on merged_data if necessary
merged_data.reset_index(drop=True, inplace=True)

# Ensure val_texts is a pandas Series with correct indices
# If val_texts is a list, convert it to a pandas Series
if isinstance(val_texts, list):
    val_texts = pd.Series(val_texts)

# When splitting, retain indices
train_texts, val_texts, train_labels, val_labels = train_test_split(
    merged_data['SYMPTOM_TEXT'], 
    merged_data['encoded_labels'], 
    test_size=0.1
)

# Create a DataFrame for the pre-fine-tuning evaluation
pretrained_df = pd.DataFrame({
    'VAERS_ID': merged_data.loc[val_texts.index, 'VAERS_ID'],
    'SYMPTOM_TEXT': val_texts,
    'ActualLabel': [label_encoder.inverse_transform([label])[0] for label in pretrained_actuals],
    'PredictedLabel': [label_encoder.inverse_transform([label])[0] for label in pretrained_preds]
})
'''
# Create a DataFrame for the post-fine-tuning evaluation
fine_tuned_df = pd.DataFrame({
    'VAERS_ID': merged_data.loc[val_texts.index, 'VAERS_ID'],
    'SYMPTOM_TEXT': val_texts,
    'ActualLabel': [label_encoder.inverse_transform([label])[0] for label in fine_tuned_actuals],
    'PredictedLabel': [label_encoder.inverse_transform([label])[0] for label in fine_tuned_preds]
})

In [21]:
fine_tuned_df

Unnamed: 0,VAERS_ID,SYMPTOM_TEXT,ActualLabel,PredictedLabel
1078,2549132,Vaccine frozen rather than refrigerated. Dose ...,No adverse event,No adverse event
904,2548956,No adverse event. Vaccine given after beyond u...,Cough,COVID-19
1353,2549638,"1st dose in 2018, 2nd not received; This case ...",COVID-19,COVID-19
380,2548183,Patient was administered vaccine that was foun...,COVID-19,COVID-19
1705,2550258,Menveo was ordered and Bexsero was administere...,Nasal mucosal discolouration,COVID-19
...,...,...,...,...
1338,2549616,COVID 19 Treatment; COVID 19 Treatment; This i...,COVID-19,COVID-19
1942,2550952,Treatment of COVID-19; Treatment of COVID-19; ...,Expired product administered,No adverse event
705,2548525,Patient was administered vaccine that was foun...,Asthenia,COVID-19
700,2548520,vaccine was frozen rather than refrigerated. p...,Blood sodium decreased,COVID-19
