<a href="https://colab.research.google.com/github/dakotamurdock/bardi_experiments/blob/main/MOSSAIC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BARDI-less Research Path Forward

## Setup

In [1]:
%pip install datasets transformers bardi

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting bardi
  Downloading bardi-0.4.0-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.7/53.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import re

import datasets
import pandas as pd
from bardi.nlp_engineering.regex_library.pathology_report import PathologyReportRegexSet
from bardi.nlp_engineering.utils import tokenizers_lib
from sklearn.model_selection import GroupShuffleSplit
from transformers import AutoTokenizer

## Create a Sample Dataset

In practice, replace this with a database query.

In [3]:
# Toy clinical notes used as a stand-in for real data.
# Each record is (id, group, text, diagnosis); "group" is the key later used
# for group-aware splitting, and "diagnosis" is the label column.
sample_columns = ["id", "group", "text", "diagnosis"]
sample_records = [
    (1, 1, "Patient shows mild symptoms of fatigue and cough.", "negative"),
    (2, 1, "Patient reports high fever and loss of taste.", "positive"),
    (3, 2, "Patient exhibits occasional headaches and dizziness.", "positive"),
    (4, 3, "Patient complains of shortness of breath and chest pain.", "positive"),
    (5, 4, "Patient experiences intermittent muscle aches.", "positive"),
    (6, 4, "Patient has persistent dry cough and sore throat.", "positive"),
    (7, 4, "Patient reports no significant symptoms, feels generally well.", "negative"),
    (8, 5, "Patient shows signs of respiratory distress and fatigue.", "positive"),
    (9, 6, "Patient experiences mild nausea and occasional vomiting.", "positive"),
    (10, 7, "Patient has severe headache and high fever.", "positive"),
    (11, 8, "Patient exhibits mild cold symptoms and fatigue.", "positive"),
    (12, 8, "Patient reports persistent cough and difficulty breathing.", "positive"),
    (13, 9, "Patient shows no symptoms, regular check-up.", "negative"),
    (14, 10, "Patient complains of body aches and high fever.", "positive"),
    (15, 11, "Patient has mild symptoms of runny nose and sneezing.", "negative"),
    (16, 12, "Patient reports loss of smell and taste, severe cough.", "positive"),
    (17, 12, "Patient experiences mild fatigue and occasional headache.", "negative"),
    (18, 12, "Patient shows high fever and shortness of breath.", "positive"),
    (19, 12, "Patient has mild cold symptoms, feels generally well.", "positive"),
    (20, 13, "Patient exhibits severe symptoms, including chest pain.", "positive"),
]

df = pd.DataFrame(sample_records, columns=sample_columns)

# Wrap the pandas frame in a Hugging Face `datasets.Dataset`; every later
# step (normalize, tokenize, label-encode, split) operates on this object.
dataset = datasets.Dataset.from_pandas(df)

## Normalizer

Use the regular expression set from BARDI.

The cell below replaces BARDI's normalizer step with fewer than 20 lines of code.

In [4]:
# Pre-made BARDI regex set for normalizing pathology reports. Each entry is
# indexed below by 'regex_str' (the pattern) and 'sub_str' (its replacement).
path_report_regex_set = PathologyReportRegexSet().get_regex_set()


def normalize(row):
    """Lowercase ``row["text"]`` and apply every regex substitution in order.

    The text is lowercased exactly once, before any substitution runs.
    Lowercasing inside the loop would re-lowercase replacement strings
    inserted by earlier patterns but not the one inserted by the final
    pattern, so output casing would depend on a pattern's position in the
    set (and an empty set would skip lowercasing altogether).

    Args:
        row: A single dataset example; must hold a string under "text".

    Returns:
        The same ``row`` mapping, with "text" replaced by its normalized form.
    """
    text = row["text"].lower()
    for regex in path_report_regex_set:
        text = re.sub(regex['regex_str'], regex['sub_str'], text)
    row["text"] = text
    return row


# `normalize` already has the one-example signature that `map` expects, so no
# wrapping lambda is needed.
dataset = dataset.map(normalize)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

## Tokenizer Encoder

Utilize tokenizer utilities from BARDI.

The cell below replaces BARDI's tokenizer encoder step with fewer than 20 lines of code.

In [5]:
# Load the BioMistral tokenizer and pass it through BARDI's helper, which
# (per the original author's note) corrects common tokenizer issues.
tokenizer = tokenizers_lib.set_tokenizer_params(
    AutoTokenizer.from_pretrained('BioMistral/BioMistral-7B')
)


def tokenize(text):
    """Encode ``text`` with the notebook-level ``tokenizer``.

    Padding and truncation are both switched on, the length cap is the
    tokenizer's own ``model_max_length``, and NumPy tensors are requested.

    Args:
        text: The text (or batch of texts) handed straight to the tokenizer.

    Returns:
        Whatever the tokenizer call returns for those settings.
    """
    encode_kwargs = {
        'padding': True,
        'truncation': True,
        'max_length': tokenizer.model_max_length,
        'return_tensors': 'np',
    }
    return tokenizer(text, **encode_kwargs)


# batched=True: the callable receives a batch, so batch['text'] holds the
# texts of every example in that batch.
dataset = dataset.map(lambda batch: tokenize(batch['text']), batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

## Label Processor

The cell below replaces BARDI's label processor step with a couple of lines of code.

In [6]:
# label processor: convert the 'diagnosis' column to class ids, then keep a
# lookup from each integer id back to its label name.
dataset = dataset.class_encode_column('diagnosis')
label_names = dataset.features['diagnosis'].names
id_to_label = dict(enumerate(label_names))

Casting to class labels:   0%|          | 0/20 [00:00<?, ? examples/s]

## Splitter

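The cells below replace BARDI's splitter step with fewer than 20 lines of code each, first with Hugging Face `datasets` and then with scikit-learn.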

In [18]:
# Fixed seed so Restart & Run All reproduces the same partition (the original
# calls were unseeded and produced a different split on every run).
SPLIT_SEED = 42

# split into train/test (70/30), stratified on the label column
ds_split_train_test = dataset.train_test_split(
    test_size=0.3,
    stratify_by_column='diagnosis',
    seed=SPLIT_SEED,
)
train_ds = ds_split_train_test["train"]

# further split the held-out 30% evenly into test/val
# NOTE(review): this second split is not stratified, as in the original; with
# so few held-out rows a stratified split may not be feasible — confirm
# before enabling it.
ds_split_test_val = ds_split_train_test["test"].train_test_split(
    test_size=0.5,
    seed=SPLIT_SEED,
)
test_ds, val_ds = ds_split_test_val["train"], ds_split_test_val["test"]

In [36]:
# alternative scikit-learn method of splitting
# This is a group-aware split (not stratification): every row sharing a
# "group" value — similar to something like a unique patient — lands in the
# same partition, so no group leaks across train/test/val.


def group_shuffle_split(ds, test_size, seed=42):
    """Split ``ds`` in two while keeping each "group" value on one side.

    Args:
        ds: Dataset with a "group" column, supporting ``len`` and ``select``.
        test_size: Share of the groups assigned to the second part.
        seed: ``random_state`` for the shuffle, fixed for reproducibility.

    Returns:
        ``(first_part, second_part)`` as selections of ``ds``.
    """
    # Only one split is ever consumed, so request exactly one; for a fixed
    # random_state the first split does not depend on n_splits.
    splitter = GroupShuffleSplit(
        n_splits=1, test_size=test_size, random_state=seed
    )
    first_idx, second_idx = next(splitter.split(ds, groups=ds['group']))
    return ds.select(first_idx), ds.select(second_idx)


# split into train/held-out (70/30 by group)
train_ds, holdout_ds = group_shuffle_split(dataset, test_size=0.3)

# further split the held-out part into test/val (50/50 by group)
test_ds, val_ds = group_shuffle_split(holdout_ds, test_size=0.5)

# Invariants: no row is lost or duplicated, and no group spans two splits.
assert len(train_ds) + len(test_ds) + len(val_ds) == len(dataset)
assert set(train_ds['group']).isdisjoint(test_ds['group'])
assert set(train_ds['group']).isdisjoint(val_ds['group'])
assert set(test_ds['group']).isdisjoint(val_ds['group'])

In [37]:
# Bundle the three partitions under their split names.
named_splits = {'train': train_ds, 'test': test_ds, 'val': val_ds}
split_dataset = datasets.DatasetDict(named_splits)

## Output

In [38]:
# Report the split layout, one sample record, the label lookup and the
# test/train row counts, with a blank line between consecutive entries.
summary_lines = [
    f'dataset structure: {split_dataset}',
    f'example record: {split_dataset["test"][0]}',
    f'id_to_label: {id_to_label}',
    f'test table: {split_dataset["test"].num_rows}',
    f'train table: {split_dataset["train"].num_rows}',
]
print(*summary_lines, sep='\n\n')

dataset structure: DatasetDict({
    train: Dataset({
        features: ['id', 'group', 'text', 'diagnosis', 'input_ids', 'attention_mask'],
        num_rows: 12
    })
    test: Dataset({
        features: ['id', 'group', 'text', 'diagnosis', 'input_ids', 'attention_mask'],
        num_rows: 3
    })
    val: Dataset({
        features: ['id', 'group', 'text', 'diagnosis', 'input_ids', 'attention_mask'],
        num_rows: 5
    })
})

example record: {'id': 1, 'group': 1, 'text': 'patient shows mild symptoms of fatigue and cough.', 'diagnosis': 0, 'input_ids': [32000, 32000, 32000, 1, 7749, 4370, 16583, 12380, 302, 6370, 12216, 304, 26245, 28723], 'attention_mask': [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

id_to_label: {0: 'negative', 1: 'positive'}

test table: 3

train table: 12
