<a href="https://colab.research.google.com/github/dakotamurdock/DatabricksDeveloperFoundations/blob/main/bardi%2Bhf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's third-party dependencies into the kernel's environment.
# polars is explicitly upgraded first; datasets and bardi are installed afterwards.
# NOTE(review): none of these installs is version-pinned, and the later installs may
# re-resolve the polars version chosen by the first line -- confirm and pin known-good versions.
# NOTE(review): `transformers` and `tokenizers` are imported by this notebook but are not
# installed here -- presumably they are preinstalled in the target runtime; confirm.
%pip install --upgrade polars
%pip install datasets
%pip install bardi

In [10]:
import pandas as pd
import polars as pl
import datasets
from bardi.pipeline import Pipeline as BardiPipeline
from bardi.data import data_handlers
from bardi import nlp_engineering
from bardi.nlp_engineering.regex_library.pathology_report import PathologyReportRegexSet
from tokenizers import Tokenizer
from transformers import AutoTokenizer

In [11]:
# Toy "clinical report" corpus, laid out column by column: one id, one free-text
# report and one diagnosis label per patient (three patients in total).
df = pd.DataFrame({
    "patient_id_number": [1, 2, 3],
    "text": [
        "The patient presented with notable changes in behavior, exhibiting increased aggression, impulsivity, and a distinct deviation from the Jedi Code. Preliminary examinations reveal a heightened midichlorian count and an unsettling connection to the dark side of the Force. Further analysis is warranted to explore the extent of exposure to Sith teachings. It is imperative to monitor the individual closely for any worsening symptoms and to engage in therapeutic interventions aimed at preventing further descent into the dark side. Follow-up assessments will be crucial in determining the efficacy of intervention strategies and the overall trajectory of the individual's alignment with the Force.",
        "Patient exhibits no signs of succumbing to the dark side. Preliminary assessments indicate a stable midichlorian count and a continued commitment to Jedi teachings. No deviations from the Jedi Code or indicators of dark side influence were observed. Regular check-ins with the Jedi Council will ensure the sustained well-being and alignment of the individual within the Jedi Order.",
        "The individual manifested heightened aggression, impulsivity, and a palpable deviation from established ethical codes. Initial examinations disclosed an elevated midichlorian count and an unmistakable connection to the dark side of the Force. Further investigation is imperative to ascertain the depth of exposure to Sith doctrines. Close monitoring is essential to track any exacerbation of symptoms, and therapeutic interventions are advised to forestall a deeper embrace of the dark side. Subsequent evaluations will be pivotal in gauging the effectiveness of interventions and the overall trajectory of the individual's allegiance to the Force.",
    ],
    "dark_side_dx": ["positive", "negative", "positive"],
})

In [12]:
# Register the pandas DataFrame of reports as a bardi dataset; the resulting object
# is what the pipeline receives through its `dataset` argument.
bardi_dataset = data_handlers.from_pandas(df)

In [16]:
# Configure the normalization step: it targets the 'text' field, uses the regex set
# returned by bardi's PathologyReportRegexSet, and is asked to lowercase the text.
path_report_regex_set = PathologyReportRegexSet().get_regex_set()
text_normalizer = nlp_engineering.CPUNormalizer(
    fields=['text'],
    regex_set=path_report_regex_set,
    lowercase=True,
)

# Create the pipeline over the registered dataset (with write_outputs=False) and
# attach the normalizer as its only step.
bardi_pipeline = BardiPipeline(dataset=bardi_dataset, write_outputs=False)
bardi_pipeline.add_step(step=text_normalizer)

In [17]:
# Run the configured pipeline, then keep a handle on its result for the cells below.
# NOTE(review): presumably run_pipeline() executes the added steps in order and
# populates `processed_data` in memory (the pipeline was built with
# write_outputs=False) -- confirm against the bardi Pipeline documentation.
bardi_pipeline.run_pipeline()
data = bardi_pipeline.processed_data

In [18]:
# Wrap the bardi pipeline output in a Hugging Face Dataset.
# NOTE(review): `data` is passed straight to the Dataset constructor, so it is
# presumably an Arrow table -- confirm the type of bardi's `processed_data`.
hf_dataset = datasets.Dataset(data)

In [19]:
# Load the pretrained tokenizer for the "bert-base-cased" checkpoint (downloaded on first use).
# NOTE(review): the bardi normalizer was configured with lowercase=True, so the text
# reaching this tokenizer is already lowercased while the checkpoint is the cased
# variant -- confirm this pairing is intended.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [20]:
def _tokenize_batch(examples):
    """Run the notebook's tokenizer over the 'text' entries of one batch."""
    return tokenizer(examples["text"])


# batched=True makes map() call the function on batches of rows instead of single rows;
# the tokenizer's outputs are added to the dataset as new columns.
hf_dataset = hf_dataset.map(_tokenize_batch, batched=True)

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [24]:
# Take the table object backing the tokenized Hugging Face dataset and hand its
# underlying `.table` to polars to obtain a polars DataFrame for inspection.
# NOTE(review): this rebinds `data`, which previously held the bardi pipeline
# output -- re-running earlier cells that read `data` after this one will see a
# different object; consider a distinct name.
data = hf_dataset.data
final_df = pl.from_arrow(data.table)
# Plain-text preview of the final frame (original columns plus the tokenizer outputs).
print(final_df)

shape: (3, 6)
┌────────────────┬────────────────┬──────────────┬────────────────┬────────────────┬───────────────┐
│ patient_id_num ┆ text           ┆ dark_side_dx ┆ input_ids      ┆ token_type_ids ┆ attention_mas │
│ ber            ┆ ---            ┆ ---          ┆ ---            ┆ ---            ┆ k             │
│ ---            ┆ str            ┆ str          ┆ list[i32]      ┆ list[i8]       ┆ ---           │
│ i64            ┆                ┆              ┆                ┆                ┆ list[i8]      │
╞════════════════╪════════════════╪══════════════╪════════════════╪════════════════╪═══════════════╡
│ 1              ┆ the patient    ┆ positive     ┆ [101, 1103, …  ┆ [0, 0, … 0]    ┆ [1, 1, … 1]   │
│                ┆ presented with ┆              ┆ 102]           ┆                ┆               │
│                ┆ notab…         ┆              ┆                ┆                ┆               │
│ 2              ┆ patient        ┆ negative     ┆ [101, 5351, …  ┆ [0, 0, … 