# Section 3: Python & Hugging Face

## Load the Dataset

In [1]:
import pandas as pd

# Read the labelled notes into a DataFrame.
# NOTE(review): the filename is spelled 'traininig' — left untouched here;
# confirm it matches the file on disk before "correcting" it.
df = pd.read_csv('../data/traininig-dataset.csv')

n_samples = len(df)
print(f"Dataset loaded: {n_samples} samples")

# Count the rows per label and lay the result out as a (label, count) table.
print("\nLabel distribution:")
label_counts = (
    df['label']
    .value_counts()
    .rename_axis('label')
    .reset_index(name='count')
)
print(label_counts.to_string(index=False))

# Show the first three rows, without the index column, as a quick preview.
print("\nFirst few rows:")
preview_rows = df.head(3)
print(preview_rows.to_string(index=False))

  from pandas.core import (


Dataset loaded: 50 samples

Label distribution:
 label  count
     1     26
     0     24

First few rows:
note_id                                                                                                                                                                                                                                                                                                                                                                                                          text  label
  N2013 Patient reports flooding during menses since menarche, describing soaking a pad every 1–2 hours with episodes of 'flooding'. Associated symptoms include sleep disruption from overnight changes; iron studies pending. Relevant history: endometriosis history. Impact: requires planning around menses. Management discussed: discussed tranexamic acid; patient prefers non-hormonal options at this time.      1
  N2015                                 Patient reports heavy mense

## Tokenize the Text Field

### Split into Train and Validation Sets

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The split is stratified on the
# 'label' column and uses a fixed random_state so it is repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

n_train, n_val = len(train_df), len(val_df)
print(f"\nTrain: {n_train} samples | Validation: {n_val} samples")


Train: 40 samples | Validation: 10 samples


### Load Tokenizer

In [3]:
from transformers import AutoTokenizer

# Load the tokenizer published with the DistilBERT uncased checkpoint.
checkpoint_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

### Tokenization Function

In [4]:
from datasets import Dataset

def tokenize_function(examples, *, max_length=128):
    """Tokenize the ``'text'`` entry of ``examples`` with the notebook-level ``tokenizer``.

    Sequences are truncated to ``max_length`` tokens and padded up to
    ``max_length``, so every encoded example has the same length.

    Args:
        examples: Mapping that exposes the raw text under the ``'text'`` key
            (for instance a batch handed over by ``Dataset.map(..., batched=True)``).
        max_length: Target sequence length in tokens. Keyword-only, so any
            extra positional argument supplied by a caller cannot silently
            bind to it. Defaults to 128, the value this notebook used before
            the parameter existed.

    Returns:
        The tokenizer's output for the given text.
    """
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

### Convert to Hugging Face Datasets and Tokenize

In [5]:
def _to_tokenized_dataset(frame):
    """Convert one split's DataFrame into a tokenized, torch-formatted Dataset.

    Args:
        frame: DataFrame with at least the 'text' and 'label' columns.

    Returns:
        A ``Dataset`` whose 'input_ids', 'attention_mask' and 'label' columns
        are exposed in 'torch' format.
    """
    # preserve_index=False: the split frames keep the shuffled row index of the
    # original DataFrame, which from_pandas would otherwise store as an extra
    # '__index_level_0__' column in the dataset.
    dataset = Dataset.from_pandas(frame[['text', 'label']], preserve_index=False)
    dataset = dataset.map(tokenize_function, batched=True)
    # Only expose the columns needed downstream, in torch format.
    dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    return dataset


# Same pipeline for both splits, so the two datasets cannot drift apart.
train_dataset = _to_tokenized_dataset(train_df)
val_dataset = _to_tokenized_dataset(val_df)

print("\nTokenization complete!")

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]


Tokenization complete!


## Train a Small Classifier

In [None]:
# Sanity-check the tokenizer on the first training example.
divider = "\n" + "="*50 + "\n"

sample_text = train_df.iloc[0]['text']
print("Original text:")
print(sample_text)
print(divider)

# Encode the sample with the same padding/truncation settings used for the datasets.
tokens = tokenizer(sample_text, padding='max_length', truncation=True, max_length=128)
input_ids = tokens['input_ids']
attention_mask = tokens['attention_mask']

print("Tokenized output:")
print(f"Input IDs (first 20): {input_ids[:20]}")
print(f"Attention mask (first 20): {attention_mask[:20]}")
print(f"\nTotal length: {len(input_ids)} tokens")
# The attention mask is summed to count the non-padding positions.
print(f"Number of real tokens (non-padding): {sum(attention_mask)}")

print(divider)

# Decode back to text as a round-trip check. The tokenizer is an *uncased*
# checkpoint, so the decoded text is not expected to be byte-identical to the
# input: it is lowercased, punctuation may be re-spaced, and anything past
# max_length has been truncated. The label below says so instead of promising
# an exact match.
decoded_text = tokenizer.decode(input_ids, skip_special_tokens=True)
print("Decoded text (lowercased by the uncased tokenizer; content should match original):")
print(decoded_text)

print(divider)

# Decode the first few ids one at a time to see the individual tokens.
print("First 10 tokens decoded:")
for i, token_id in enumerate(input_ids[:10]):
    token_text = tokenizer.decode([token_id])
    print(f"  Token {i}: ID={token_id} → '{token_text}'")