# Dataset processing

This notebook processes the raw CSV outputs from VAERS into Hugging Face datasets. End users should not generally need to run it.

In [None]:
import pandas as pd
import numpy as np
import datasets
import glob
import tqdm.notebook as tqdm
from sklearn.model_selection import train_test_split
from typing import Tuple
from datetime import datetime
from skmultilearn.problem_transform import LabelPowerset

pd.set_option('future.no_silent_downcasting', True)

In [None]:
# Hugging Face Hub repository that receives the processed dataset.
HF_URL: str = "chrisvoncsefalvay/vaers-outcomes"

# Column groups of the VAERS data table that this notebook works with.
ID_COLUMNS: list = ["VAERS_ID"]
TEXT_COLUMNS: list = ["SYMPTOM_TEXT"]
DEMOGRAPHIC_COLUMNS: list = ["AGE_YRS", "SEX"]
FLAG_COLUMNS: list = ["DIED", "ER_VISIT", "HOSPITAL"]

# Split proportions. TEST_TRAIN_FRACTION is the share of all records held out
# from training (test and validation combined); TRAIN_VAL_FRACTION is the share
# of those held-out records that becomes the validation set.
TEST_TRAIN_FRACTION: float = 0.3
TRAIN_VAL_FRACTION: float = 0.5

## Reading data files

In [None]:
def read_aggregate(pattern: str, data_dir: str = "../data") -> pd.DataFrame:
    """Read every CSV file matching a glob pattern and stack them into one frame.

    Args:
        pattern: Glob pattern, relative to ``data_dir``, selecting the files to read.
        data_dir: Directory that is searched for matching files.

    Returns:
        A single DataFrame holding the rows of all matched files, re-indexed
        from 0.

    Raises:
        ValueError: If no file matches the pattern.
    """
    # glob returns paths in arbitrary, filesystem-dependent order; sorting keeps
    # the row order of the result stable between runs and machines.
    files = sorted(glob.glob(f"{data_dir}/{pattern}"))
    if not files:
        raise ValueError(f"No files matching '{pattern}' found in '{data_dir}'.")

    dfs = [
        pd.read_csv(file, encoding="latin-1", low_memory=False)
        for file in tqdm.tqdm(files)
    ]
    res = pd.concat(dfs, ignore_index=True)

    print(f"Processed {len(dfs)} files for a total of {len(res)} records.")

    return res

In [None]:
# Load all files matching *VAERSDATA.csv from the data directory into one frame.
data = read_aggregate("*VAERSDATA.csv")

In [None]:
# Restrict the frame to the identifier, demographic, text and outcome columns.
# ER_ED_VISIT is kept alongside FLAG_COLUMNS because it is merged into ER_VISIT
# during recoding.
_keep: list = [
    *ID_COLUMNS,
    *DEMOGRAPHIC_COLUMNS,
    *TEXT_COLUMNS,
    *FLAG_COLUMNS,
    "ER_ED_VISIT",
]
data = data.loc[:, _keep]

## Recoding

We recode as follows:

* For the outcome flags, `NaN` is recoded as `0` and `Y` is recoded as `1`.
* `ER_VISIT` and `ER_ED_VISIT` are coalesced into a single column called `ER_VISIT` that is `1`-valued if either is `1`-valued, otherwise it is `0`-valued. This is to manage the renaming of the column in the VAERS data.
* Records whose symptom text is `NaN` are dropped.

In [None]:
def recode(df: pd.DataFrame) -> pd.DataFrame:
    """Recode the outcome flags to 0/1 and drop records without symptom text.

    The flag columns (``FLAG_COLUMNS`` plus ``ER_ED_VISIT``) have ``"Y"``
    replaced by 1 and missing values by 0. ``ER_ED_VISIT`` is then folded into
    ``ER_VISIT`` (row-wise maximum of the two) and removed.

    Args:
        df: Frame containing the flag columns, ``ER_ED_VISIT`` and
            ``SYMPTOM_TEXT``. It is not modified.

    Returns:
        A new frame with integer flag columns, without ``ER_ED_VISIT`` and
        without rows whose ``SYMPTOM_TEXT`` is missing.
    """
    # Work on a copy: the input is typically a column slice of a larger frame,
    # and assigning into it would both mutate the caller's data and trigger
    # pandas' SettingWithCopyWarning.
    recoded = df.copy()

    for column in FLAG_COLUMNS + ["ER_ED_VISIT"]:
        recoded[column] = recoded[column].replace("Y", 1).fillna(0).astype(int)

    # Either column being set counts as an emergency-room visit.
    recoded['ER_VISIT'] = recoded[['ER_VISIT', 'ER_ED_VISIT']].max(axis=1)

    return recoded.drop(columns=['ER_ED_VISIT']).dropna(subset=['SYMPTOM_TEXT'])

In [None]:
# Note: recode() removes ER_ED_VISIT, so running this cell a second time on the
# already recoded frame fails with a KeyError; re-run from the loading cell instead.
data = recode(data)

In [None]:
# Display the recoded frame as a visual sanity check.
data

## Powerset encoding

We use powerset encoding to encode the outcomes as a single label, turning a multilabel problem into a multiclass problem.

In [None]:
# Only the transform / inverse_transform methods of LabelPowerset are used in
# the cells below, i.e. it serves as a label encoder here.
clf = LabelPowerset()

In [None]:
# One class id per record, identifying its combination of outcome flags.
data['label'] = clf.transform(data[FLAG_COLUMNS].values)

Because `datasets` actually stores labels as integers, we can put them in as human-readable strings and `datasets` will take care of the rest under the hood.

In [None]:
# Map every powerset class id to a human-readable name listing the flags that
# are set in that combination.
labels_dict = {}

class_ids = np.unique(clf.transform(data[FLAG_COLUMNS].values))
flag_combinations = clf.inverse_transform(class_ids).toarray()

for class_id, row in zip(class_ids, flag_combinations):
    # Names of the flag columns that are set in this combination
    row_labels = [FLAG_COLUMNS[i] for i, value in enumerate(row) if value == 1]

    # If 'DIED' is in the list, move it to the end – this is purely
    # for aesthetic reasons
    if 'DIED' in row_labels:
        row_labels.remove('DIED')
        row_labels.append('DIED')

    # The combination without any flag is named explicitly. It is recognised by
    # its empty flag list rather than by assuming it has class id 0, because the
    # id it receives is not guaranteed to be 0.
    labels_dict[int(class_id)] = ", ".join(row_labels) if row_labels else "No event"

print(labels_dict)

In [None]:
# Replace the integer class ids with their readable names.
# Not idempotent: a second run would look the already converted strings up in
# the integer-keyed dict and turn every label into NaN.
data["label"] = data["label"].map(labels_dict)

## Test/train/validate split

We do a stratified split by age quintile and sex into train, test and validation sets.

In [None]:
def stratified_split(df: pd.DataFrame, test_train_fraction: float, train_val_fraction: float, random_state: int = None) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split a frame into train, test and validation parts, stratified by sex and age quintile.

    Args:
        df: Frame with ``SEX`` and ``AGE_YRS`` columns. It is not modified.
        test_train_fraction: Share of ``df`` that is held out from the training
            set (test and validation together).
        train_val_fraction: Share of the held-out records that goes to the
            validation set; the remainder forms the test set.
        random_state: Seed forwarded to both ``train_test_split`` calls.

    Returns:
        The tuple ``(train, test, val)``; the frames have the same columns as ``df``.
    """
    age_quintile = pd.qcut(df['AGE_YRS'], 5, labels=False)

    # assign() builds a new frame, so the helper column never shows up in the
    # caller's DataFrame.
    keyed = df.assign(
        STRATIFICATION_VARIABLE=df['SEX'].astype(str) + "_" + age_quintile.astype(str)
    )

    # The first return value of train_test_split has size `train_size`, so the
    # held-out part comes first and the training set is the remainder.
    holdout, train = train_test_split(
        keyed,
        train_size=test_train_fraction,
        random_state=random_state,
        stratify=keyed.STRATIFICATION_VARIABLE,
    )

    val, test = train_test_split(
        holdout,
        train_size=train_val_fraction,
        random_state=random_state,
        stratify=holdout.STRATIFICATION_VARIABLE,
    )

    train = train.drop(columns="STRATIFICATION_VARIABLE")
    val = val.drop(columns="STRATIFICATION_VARIABLE")
    test = test.drop(columns="STRATIFICATION_VARIABLE")

    return train, test, val

In [None]:
# Fixed seed so that the train/test/val partition is reproducible between runs.
SPLIT_SEED: int = 42
train, test, val = stratified_split(data, TEST_TRAIN_FRACTION, TRAIN_VAL_FRACTION, random_state=SPLIT_SEED)

## Converting to datasets

In [None]:
def convert_to_dataset(df: pd.DataFrame) -> datasets.Dataset:
    """Turn one split into a ``datasets.Dataset`` with ``id``, ``text`` and ``label``.

    Only the identifier, text and ``label`` columns are kept; ``VAERS_ID`` is
    renamed to ``id`` and ``SYMPTOM_TEXT`` to ``text``. The ``label`` feature is
    a ``ClassLabel`` whose names are the values of ``labels_dict``.

    Args:
        df: Frame holding the columns in ``ID_COLUMNS``, ``TEXT_COLUMNS`` and ``label``.

    Returns:
        The dataset built from ``df``, without the pandas index.
    """
    column_names = {"SYMPTOM_TEXT": "text", "VAERS_ID": "id"}
    selected = df.loc[:, ID_COLUMNS + TEXT_COLUMNS + ["label"]].rename(columns=column_names)

    label_feature = datasets.ClassLabel(names=list(labels_dict.values()))
    schema = datasets.Features({
        "id": datasets.Value("int32"),
        "text": datasets.Value("string"),
        "label": label_feature,
    })

    return datasets.Dataset.from_pandas(selected, features=schema, preserve_index=False)

In [None]:
# Collect the three splits in one DatasetDict keyed by split name.
ds = datasets.DatasetDict({
    "train": convert_to_dataset(train),
    "test": convert_to_dataset(test),
    "val": convert_to_dataset(val),
})

In [None]:
# Inspect the feature schema of the train split.
ds["train"].features

In [None]:
# Spot check: decode the stored integer label of one train record back to its name.
ds["train"].features["label"].int2str(ds["train"][5]["label"])

## Saving to Huggingface Hub

In [None]:
# The commit message records the total number of records over all splits and
# the time of the upload.
commit_message = f"""Data set commit of {len(train) + len(test) + len(val)} records of VAERS data at {datetime.now().isoformat()} from 1990 to 2023, encoded using a powerset multiclass encoding."""

# create_pr=True submits the upload as a pull request on the Hub repository.
ds.push_to_hub(
    HF_URL,
    create_pr=True,
    commit_message=commit_message,
)