<a href="https://colab.research.google.com/github/dakotamurdock/bardi_experiments/blob/main/bardi%2Bhf_custom_step.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages this notebook imports below (polars, datasets, bardi).
# polars is explicitly upgraded; datasets and bardi are installed at whatever version pip resolves.
# NOTE(review): no versions are pinned, so a fresh run may pull different releases
# than the ones that produced the saved outputs — consider pinning.
%pip install --upgrade polars
%pip install datasets
%pip install bardi

In [2]:
import pandas as pd
import polars as pl
import datasets
from bardi.pipeline import Pipeline as BardiPipeline
from bardi.pipeline import Step as BardiStep
from bardi.data import data_handlers
from bardi import nlp_engineering
from bardi.nlp_engineering.regex_library.pathology_report import PathologyReportRegexSet
from tokenizers import Tokenizer
from transformers import AutoTokenizer

In [3]:
# Toy corpus: three fictional clinical notes, each with a patient id and a
# "dark side" diagnosis label. The frame is assembled column by column.
df = pd.DataFrame({
    "patient_id_number": [1, 2, 3],
    "text": [
        "The patient presented with notable changes in behavior, exhibiting increased aggression, impulsivity, and a distinct deviation from the Jedi Code. Preliminary examinations reveal a heightened midichlorian count and an unsettling connection to the dark side of the Force. Further analysis is warranted to explore the extent of exposure to Sith teachings. It is imperative to monitor the individual closely for any worsening symptoms and to engage in therapeutic interventions aimed at preventing further descent into the dark side. Follow-up assessments will be crucial in determining the efficacy of intervention strategies and the overall trajectory of the individual's alignment with the Force.",
        "Patient exhibits no signs of succumbing to the dark side. Preliminary assessments indicate a stable midichlorian count and a continued commitment to Jedi teachings. No deviations from the Jedi Code or indicators of dark side influence were observed. Regular check-ins with the Jedi Council will ensure the sustained well-being and alignment of the individual within the Jedi Order.",
        "The individual manifested heightened aggression, impulsivity, and a palpable deviation from established ethical codes. Initial examinations disclosed an elevated midichlorian count and an unmistakable connection to the dark side of the Force. Further investigation is imperative to ascertain the depth of exposure to Sith doctrines. Close monitoring is essential to track any exacerbation of symptoms, and therapeutic interventions are advised to forestall a deeper embrace of the dark side. Subsequent evaluations will be pivotal in gauging the effectiveness of interventions and the overall trajectory of the individual's allegiance to the Force.",
    ],
    "dark_side_dx": ["positive", "negative", "positive"],
})

In [4]:
# Create a bardi dataset from the pandas DataFrame using bardi's data_handlers;
# this is the object later passed as `dataset=` when the pipeline is constructed
bardi_dataset = data_handlers.from_pandas(df)

In [5]:
# Custom bardi step used to test Hugging Face interop within a bardi pipeline
class HuggingFaceTokenizer(BardiStep):
  """Pipeline step that runs a pretrained Hugging Face tokenizer over one text column.

  Args:
    tokenizer_name: Identifier handed to ``AutoTokenizer.from_pretrained``.
    text_field: Name of the column whose values are tokenized. Defaults to
      ``"text"``, the column this step originally had hard-coded.
  """

  # NOTE(review): BardiStep.__init__ is never invoked by this subclass; confirm
  # the base class does not set up state the pipeline expects on every step.
  def __init__(self, tokenizer_name: str, text_field: str = "text"):
    self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    self.text_field = text_field

  def run(self, data, artifacts):
    """Tokenize ``text_field`` and return the resulting table.

    Args:
      data: Tabular data supplied by the pipeline; it is passed directly to
        ``datasets.Dataset`` (presumably a pyarrow Table — TODO confirm).
      artifacts: Accepted to match the step interface; not read by this step.

    Returns:
      A 2-tuple ``(table, None)``: the table backing the mapped Hugging Face
      dataset (the input columns plus whatever the tokenizer emits), and
      ``None`` because this step returns nothing besides the data.
    """
    # Bind the column name locally so the mapped function does not depend on
    # later changes to the attribute
    text_field = self.text_field

    # Create a HuggingFace dataset from the incoming data
    hf_dataset = datasets.Dataset(data)

    # Apply the tokenizer batch-wise to the configured text column
    hf_dataset = hf_dataset.map(
        lambda examples: self.tokenizer(examples[text_field]),
        batched=True,
    )

    # Hand back the table underlying the HF Dataset object
    return hf_dataset.data.table, None

In [6]:
# Initialize a bardi pipeline on the toy dataset
# (write_outputs=False: presumably keeps results in memory instead of writing files — confirm in bardi docs)
bardi_pipeline = BardiPipeline(dataset=bardi_dataset, write_outputs=False)

# Step 1: normalize the 'text' field with the pathology-report regex set and lowercase it
path_report_regex_set = PathologyReportRegexSet().get_regex_set()
bardi_pipeline.add_step(step=nlp_engineering.CPUNormalizer(fields=['text'],
                                                           regex_set=path_report_regex_set,
                                                           lowercase=True))
# Step 2: tokenize with the custom Hugging Face step (loads the tokenizer when constructed)
# NOTE(review): the text is lowercased in step 1 but tokenized with a *cased* checkpoint —
# confirm this is intended, or use an uncased tokenizer / lowercase=False.
bardi_pipeline.add_step(step=HuggingFaceTokenizer(tokenizer_name='bert-base-cased'))
# Step 3: process the 'dark_side_dx' label field
bardi_pipeline.add_step(step=nlp_engineering.CPULabelProcessor(fields=['dark_side_dx']))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [7]:
# Run the bardi pipeline: executes every step added above
# (text normalization, Hugging Face tokenization, label processing)
bardi_pipeline.run_pipeline()
# Keep a reference to the pipeline's processed output
# NOTE(review): `data` is not referenced again in the visible cells
data = bardi_pipeline.processed_data

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [8]:
# Convert the pipeline's processed output to a pandas DataFrame and display it
final_data = bardi_pipeline.processed_data.to_pandas()
final_data

Unnamed: 0,patient_id_number,text,dark_side_dx,input_ids,token_type_ids,attention_mask
0,1,the patient presented with notable changes in ...,1,"[101, 1103, 5351, 2756, 1114, 3385, 2607, 1107...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,2,patient exhibits no signs of succumbing to the...,0,"[101, 5351, 10877, 1185, 5300, 1104, 28117, 19...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,3,the individual manifested heightened aggressio...,1,"[101, 1103, 2510, 23487, 1174, 23442, 16843, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [10]:
# Fetch the id -> label mapping stored in the pipeline artifacts under 'id_to_label';
# it is keyed by field name and decodes the integer codes shown in 'dark_side_dx'
label_map = bardi_pipeline.artifacts['id_to_label']
print(label_map)

{'dark_side_dx': {'0': 'negative', '1': 'positive'}}
