<a href="https://colab.research.google.com/github/dakotamurdock/bardi_experiments/blob/main/MOSSAIC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BARDI-less Research Path Forward

## Setup

In [None]:
import re

import datasets
import pandas as pd
from bardi.nlp_engineering.regex_library.pathology_report import PathologyReportRegexSet
from bardi.nlp_engineering.utils import tokenizers_lib
from transformers import AutoTokenizer

## Create a Sample Dataset

In practice, replace this with a database query.

In [None]:
# Toy clinical notes as (text, diagnosis) pairs; ids are assigned 1..N in
# listing order when the DataFrame is built below.
_SAMPLE_REPORTS = [
    ("Patient shows mild symptoms of fatigue and cough.", "negative"),
    ("Patient reports high fever and loss of taste.", "positive"),
    ("Patient exhibits occasional headaches and dizziness.", "positive"),
    ("Patient complains of shortness of breath and chest pain.", "positive"),
    ("Patient experiences intermittent muscle aches.", "positive"),
    ("Patient has persistent dry cough and sore throat.", "positive"),
    ("Patient reports no significant symptoms, feels generally well.", "negative"),
    ("Patient shows signs of respiratory distress and fatigue.", "positive"),
    ("Patient experiences mild nausea and occasional vomiting.", "positive"),
    ("Patient has severe headache and high fever.", "positive"),
    ("Patient exhibits mild cold symptoms and fatigue.", "positive"),
    ("Patient reports persistent cough and difficulty breathing.", "positive"),
    ("Patient shows no symptoms, regular check-up.", "negative"),
    ("Patient complains of body aches and high fever.", "positive"),
    ("Patient has mild symptoms of runny nose and sneezing.", "negative"),
    ("Patient reports loss of smell and taste, severe cough.", "positive"),
    ("Patient experiences mild fatigue and occasional headache.", "negative"),
    ("Patient shows high fever and shortness of breath.", "positive"),
    ("Patient has mild cold symptoms, feels generally well.", "positive"),
    ("Patient exhibits severe symptoms, including chest pain.", "positive"),
]

# One record per report, with columns in the order: id, text, diagnosis.
df = pd.DataFrame(
    [
        {"id": record_id, "text": report_text, "diagnosis": report_label}
        for record_id, (report_text, report_label) in enumerate(_SAMPLE_REPORTS, start=1)
    ]
)

# Wrap the pandas frame in a Hugging Face `datasets.Dataset`; the later cells
# call `.map`, `.class_encode_column` and `.train_test_split` on this object.
dataset = datasets.Dataset.from_pandas(df)

## Normalizer

Use the regular expression set from BARDI.

The cell below replaces the BARDI normalizer step with fewer than 20 lines of code.

In [None]:
# Pre-made BARDI regex set for normalizing pathology reports: instantiate the
# provider, then fetch the rules. `normalize` reads each rule's 'regex_str'
# and 'sub_str' entries.
regex_set_provider = PathologyReportRegexSet()
path_report_regex_set = regex_set_provider.get_regex_set()


# apply regex normalizations
def normalize(row, regex_set=None):
    """Lowercase ``row["text"]`` and apply each regex substitution in order.

    Args:
        row: Mapping with a string ``"text"`` entry (one dataset record).
        regex_set: Optional iterable of dicts with ``'regex_str'`` and
            ``'sub_str'`` keys, applied in iteration order. When omitted,
            the module-level ``path_report_regex_set`` is used.

    Returns:
        The same ``row`` object, with ``row["text"]`` replaced by the
        normalized text.
    """
    if regex_set is None:
        regex_set = path_report_regex_set

    # Lowercase exactly once, before any substitution. Doing it inside the
    # loop re-lowercased the output of earlier substitutions (so replacement
    # text kept its case only for the last pattern) and skipped lowercasing
    # altogether when the regex set was empty.
    text = row["text"].lower()
    for regex in regex_set:
        text = re.sub(regex['regex_str'], regex['sub_str'], text)

    row["text"] = text
    return row


# `normalize` already takes a single record and returns it, so it can be
# handed to `map` directly without a wrapper.
dataset = dataset.map(normalize)

## Tokenizer Encoder

Utilize tokenizer utilities from BARDI.

The cell below replaces the BARDI tokenizer encoder step with fewer than 20 lines of code.

In [None]:
# Load the pretrained BioMistral tokenizer and pass it through BARDI's
# `set_tokenizer_params` helper (noted in the original as correcting common
# issues); the tokenizer it returns is the one used below.
TOKENIZER_CHECKPOINT = 'BioMistral/BioMistral-7B'
tokenizer = tokenizers_lib.set_tokenizer_params(
    AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)
)


def tokenize(text):
    """Encode ``text`` with the module-level ``tokenizer``.

    Padding and truncation are enabled, the length cap is the tokenizer's own
    ``model_max_length``, and NumPy tensors are requested. The tokenizer's
    return value is passed back unchanged.
    """
    encode_options = {
        'padding': True,
        'truncation': True,
        'max_length': tokenizer.model_max_length,
        'return_tensors': 'np',
    }
    return tokenizer(text, **encode_options)


def _tokenize_text_column(batch):
    """Tokenize the 'text' entry of one batch handed over by ``Dataset.map``."""
    return tokenize(batch['text'])


# batched=True: the callable receives a batch of records, not a single row.
dataset = dataset.map(_tokenize_text_column, batched=True)

## Label Processor

The cell below replaces the BARDI label processor step with a couple of lines of code.

In [None]:
# Label processor: class-encode the 'diagnosis' column, then build the
# integer-id -> label-name lookup from the encoded feature's `names` list.
dataset = dataset.class_encode_column('diagnosis')
diagnosis_names = dataset.features['diagnosis'].names
id_to_label = dict(enumerate(diagnosis_names))

## Splitter

The cell below replaces the BARDI splitter step with fewer than 20 lines of code.

In [None]:
# Fixed seed so both shuffled splits come out identical on every
# Restart Kernel -> Run All; without it the train/test/val membership changes
# from run to run.
SPLIT_SEED = 42

# split into train (70%) and a held-out pool (30%), stratified on the label
ds_split_train_test = dataset.train_test_split(
    test_size=0.3,
    stratify_by_column='diagnosis',
    seed=SPLIT_SEED,
)
train_ds = ds_split_train_test["train"]

# further split the held-out pool in half into test/val
# NOTE(review): this second split is left unstratified, as in the original.
# With only a handful of held-out rows a class may have a single member,
# which a stratified split may reject - confirm before stratifying.
ds_split_test_val = ds_split_train_test["test"].train_test_split(
    test_size=0.5,
    seed=SPLIT_SEED,
)
test_ds, val_ds = ds_split_test_val["train"], ds_split_test_val["test"]

split_dataset = datasets.DatasetDict({
    'train': train_ds,
    'test': test_ds,
    'val': val_ds
})

## Output

In [None]:
# Summarize the result: the split layout, the first record of the test split,
# and the integer-id -> label-name mapping.
example_record = split_dataset["test"][0]

print(f'dataset structure: {split_dataset}')
print(f'example record: {example_record}')
print(f'id_to_label: {id_to_label}')