In [31]:
import pandas as pd
from pathlib import Path

# Preprocessing / Checking Data

In [32]:
# Location of the training CSV; the path is relative, so it resolves against
# the directory the notebook kernel is running in.
data_path = Path("../data/training-dataset.csv")


In [33]:
# Load the notes into a DataFrame. The encoding is spelled out explicitly
# because free-text columns are a frequent source of decoding problems.
df = pd.read_csv(filepath_or_buffer=data_path, encoding="utf-8")

In [34]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,note_id,text,label
0,N2013,Patient reports flooding during menses since m...,1
1,N2015,Patient reports heavy menses since stopping OC...,1
2,N2022,"Patient reports HMB for the past 6 months, des...",1
3,N2042,Patient no menorrhagia; menses reported as reg...,0
4,N2030,Patient bleeding within normal limits; menses ...,0


In [35]:

# Report the frame's dimensions (row count, column count) and then the
# column names as a plain list.
print(" Shape (rows, cols):", (len(df.index), len(df.columns)))
print(" Columns:", df.columns.tolist())

 Shape (rows, cols): (50, 3)
 Columns: ['note_id', 'text', 'label']


In [36]:
# Share of missing values per column, largest share first; show the top ten.
null_fraction = (
    df.isnull()
      .mean()
      .sort_values(ascending=False)
)
display(null_fraction.iloc[:10])

note_id    0.0
text       0.0
label      0.0
dtype: float64

In [37]:
# Count rows that exactly repeat an earlier row across all columns
# (the first occurrence of each row is not counted).
dup_count = df.duplicated(subset=None, keep="first").sum()

print(f" Duplicate rows: {dup_count}")

 Duplicate rows: 0


In [38]:
# List the distinct values in the label column to confirm it is binary (0/1).
print("Unique (raw):", df['label'].unique())

Unique (raw): [1 0]


In [39]:
# Sanity check that matters more on larger datasets: flag notes whose text is
# missing, empty, or made up of whitespace only.
s = df['text'].astype(str)
# Collapse every whitespace run to one space and trim the ends, so a
# whitespace-only note ends up as the empty string.
s_norm = s.str.replace(r"\s+", " ", regex=True).str.strip()

# astype(str) can turn a missing value into a non-empty string such as "nan",
# so missing values are tested on the original column instead.
empty_mask = df['text'].isna() | s_norm.eq("")
print("Empty/whitespace-only texts:", int(empty_mask.sum()))

# Explicit branch rather than a conditional expression evaluated only for its
# side effects: show the offending rows if there are any, otherwise say so.
if empty_mask.any():
    display(df.loc[empty_mask, ['note_id', 'text']].head(10))
else:
    print(" None found")


Empty/whitespace-only texts: 0
 None found


In [66]:
# Keep the columns used downstream and drop rows that are missing a model
# input (text or label).
# `note_id` is retained on purpose: a later cell reads df["note_id"] to build
# the id list, and selecting only text/label here makes that lookup raise
# KeyError when the notebook is executed top to bottom on a fresh kernel.
# `subset` restricts the null check to text/label so a missing note_id alone
# does not remove a row.
df = df[["note_id", "text", "label"]].dropna(subset=["text", "label"])

# Preview only the two model-facing columns.
df[["text", "label"]].head()

Unnamed: 0,text,label
0,Patient reports flooding during menses since m...,1
1,Patient reports heavy menses since stopping OC...,1
2,"Patient reports HMB for the past 6 months, des...",1
3,Patient no menorrhagia; menses reported as reg...,0
4,Patient bleeding within normal limits; menses ...,0


## 2. Tokenize the text field using a Hugging Face model (e.g., `bert-base-uncased`).

In [40]:
from transformers import AutoTokenizer

# Tokenization settings, read again by the tokenization cell.
tokenizer_name = "bert-base-uncased"   # checkpoint name handed to from_pretrained
max_length = 256                       # upper bound on tokens per note

# Load the tokenizer for the chosen checkpoint, requesting the fast variant.
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name,
    use_fast=True,
)
print("Loaded tokenizer:", tokenizer.name_or_path)



Loaded tokenizer: bert-base-uncased


In [59]:
# Tokenize every note in one batched call; the tokenizer receives a plain
# list of Python strings.
texts = df['text'].astype(str).to_list()

enc_all = tokenizer(
    texts,
    padding='max_length',     # pad each note up to max_length tokens
    truncation=True,          # cut notes that exceed max_length tokens
    max_length=max_length,
)

print("Rows tokenized:", len(enc_all["input_ids"]))


Rows tokenized: 50


## 3. Train a small classifier (e.g., `distilbert-base-uncased`) for 1–2 epochs.

In [60]:
# Integer class labels and note identifiers as plain lists, taken from df in
# its current row order.
labels = df["label"].astype(int).to_list()
ids = df["note_id"].to_list()

In [61]:
# Assemble a Hugging Face Dataset from the tokenizer output, the labels and
# the note ids (one entry per note in each column).
from datasets import Dataset

ds = Dataset.from_dict(
    dict(
        input_ids=enc_all["input_ids"],
        attention_mask=enc_all["attention_mask"],
        label=labels,
        id_note=ids,
    )
)

ds


Dataset({
    features: ['input_ids', 'attention_mask', 'label', 'id_note'],
    num_rows: 50
})

In [62]:
from sklearn.model_selection import train_test_split
import numpy as np

# Stratified hold-out split over row positions of `ds`:
# 80% of the rows go to training, 20% to validation.
SEED = 42
idx = np.arange(len(ds))
y = ds["label"]

train_idx, val_idx = train_test_split(
    idx,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

train_ds, val_ds = ds.select(train_idx), ds.select(val_idx)

# Remove bookkeeping columns that are not model inputs; a column that is
# already absent is skipped.
for col in ["id_note"]:
    if col not in train_ds.column_names:
        continue
    train_ds = train_ds.remove_columns([col])
    val_ds = val_ds.remove_columns([col])

print("Train size:", len(train_ds), "| Val size:", len(val_ds))


Train size: 40 | Val size: 10


In [63]:
# Show the training split's summary (its features and row count).
train_ds

Dataset({
    features: ['input_ids', 'attention_mask', 'label'],
    num_rows: 40
})