# Lab 3 Implementation

### Classical Model (TF-IDF + LinearSVC)

In [None]:
import pandas as pd
import numpy as np
import random
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    confusion_matrix
)

# Seed NumPy's and Python's global random number generators with a fixed value.
np.random.seed(42)
random.seed(42)

# Paths of the three CSV splits, all located under BASE_PATH.
BASE_PATH = 'dataset_split/discharge-notes/'
TRAIN_FILE = os.path.join(BASE_PATH, 'train.csv')
VAL_FILE = os.path.join(BASE_PATH, 'val.csv')
TEST_FILE = os.path.join(BASE_PATH, 'test.csv')


print("1. Loading data...")

try:
    df_train = pd.read_csv(TRAIN_FILE)
    df_val = pd.read_csv(VAL_FILE)
    df_test = pd.read_csv(TEST_FILE)
except FileNotFoundError as e:
    # Report the file that is actually missing (any of the three reads can
    # fail); fall back to the dataset directory when the exception carries
    # no filename.
    missing_path = e.filename if e.filename else BASE_PATH
    print(f"\nERROR: File not found: {os.path.abspath(missing_path)}")
    raise

# df_train_full holds the train rows followed by the val rows, re-indexed.
df_train_full = pd.concat([df_train, df_val], ignore_index=True)

print(f"Train/Val samples: {len(df_train_full)}")
print(f"Test samples: {len(df_test)}")

# Fail early if a required column is absent from either frame.
REQUIRED_COLUMNS = ['chief_complaint', 'history_of_present_illness', 'major_surgical_procedure']
for col in REQUIRED_COLUMNS:
    if col not in df_train_full.columns or col not in df_test.columns:
        raise ValueError(f"ERROR: Missing column '{col}'")


# 2. FEATURE ENGINEERING
def create_text_feature(df):
    """Build the model input text for every row of *df*.

    Missing values in 'chief_complaint' and 'history_of_present_illness' are
    replaced by the empty string, then the two columns are joined with a
    single space.

    Returns:
        A Series with one combined string per row of *df*.
    """
    complaint = df['chief_complaint'].fillna('')
    history = df['history_of_present_illness'].fillna('')
    return complaint + " " + history

# Model inputs: one combined text string per record.
X_train = create_text_feature(df_train_full)
X_test = create_text_feature(df_test)

# Raw string targets; records without a procedure get the 'UNKNOWN' label.
y_train_str, y_test_str = (
    frame['major_surgical_procedure'].fillna('UNKNOWN')
    for frame in (df_train_full, df_test)
)

# The encoder is fitted on the labels of BOTH frames, so every label that
# occurs in the test frame has an integer id as well.
all_unique_labels = pd.concat([y_train_str, y_test_str]).unique()
le = LabelEncoder().fit(all_unique_labels)

y_train, y_test = le.transform(y_train_str), le.transform(y_test_str)

target_names = le.classes_
print(f"Number of classes: {len(target_names)}")
print(f"First 5 classes: {target_names[:5]}...")

print("2. Building Pipeline (TF-IDF + LinearSVC)...")

# Step 'tfidf': TF-IDF over unigrams and bigrams with the built-in English
#   stop-word list, min_df=5 and at most 50000 features.
# Step 'clf': linear SVM with 'balanced' class weights and a fixed random_state.
model_pipeline = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(
            max_features=50000,
            min_df=5,
            stop_words='english',
            ngram_range=(1, 2),
        ),
    ),
    (
        'clf',
        LinearSVC(random_state=42, class_weight='balanced', C=1.0),
    ),
])

print("3. Training model...")
model_pipeline.fit(X_train, y_train)
print("Training complete.")
print(50 * "-")

print("4. Evaluating model on Test Set (Sklearn metrics)...")
y_pred = model_pipeline.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_f1_macro = f1_score(y_test, y_pred, average='macro', zero_division=0)
test_f1_weighted = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\n## Primary Results")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"F1-Macro: {test_f1_macro:.4f}")
print(f"F1-Weighted: {test_f1_weighted:.4f}")

print("\n## Classification Report")
if len(target_names) > 20:
    print(f"Warning: {len(target_names)} classes. Printing full report.")

# target_names lists every class known to the encoder, but only a subset of
# them occurs in y_test / y_pred. classification_report raises a ValueError
# when target_names is longer than the set of labels it finds, so the labels
# are passed explicitly together with exactly their names.
present_labels = np.union1d(y_test, y_pred)
print(classification_report(
    y_test,
    y_pred,
    labels=present_labels,
    target_names=np.asarray(target_names)[present_labels],
    zero_division=0,
))

# Rows/columns follow present_labels (sorted ids seen in y_test or y_pred).
cm = confusion_matrix(y_test, y_pred, labels=present_labels)
print("\n## Confusion Matrix (Shape)")
print(f"Matrix dimension: {cm.shape[0]} x {cm.shape[1]}")
print("Actual matrix should be included in your lab report.")
print("-" * 50)

print("5. CONCLUSION FOR LAB REPORT")
print(f"Classical model (TF-IDF + LinearSVC) achieved a Weighted F1-Score of {test_f1_weighted:.4f}.")
print("This result serves as a strong **benchmark** for comparison against SOTA Transformer models.")

1. Loading data...
Train/Val samples: 298614
Test samples: 33180
Number of classes: 122477
First 5 classes: ['"Very mild localized erythema and loss of vascularity without bleeding were noted at 20-30cm. These findings are compatible with very mild colitis versus scope trauma and are far less pronounced than would be expected given the CT report.... Previous end to side ileo-colonic anastomosis of the ascending colon Extremely subtle mucosal changes suggestive of mild colitis in the sigmoid colon Otherwise normal colonoscopy to ileum.... The findings on this exam do not explain the degree of abdominal pain. Suggest continued consultation with the Pain Service to optimize therapy for functional abdominal pain. Consider topical therapy (cortifoam or rowasa) to treat subtle features of colitis." EGD with gastritis and retained fluids in the stomach, otherwise WNL to third part of duodenum.'
 '#  1. Open reduction internal fixation right bicondylar tibial plateau fracture with internal fix

### RoBERTa

In [None]:
import pandas as pd
import numpy as np
import random
import os
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Seed NumPy's, Python's and PyTorch's global random number generators.
np.random.seed(42)
random.seed(42)
torch.manual_seed(42)

# Paths of the three CSV splits, all located under BASE_PATH.
BASE_PATH = 'dataset_split/discharge-notes/'
TRAIN_FILE = os.path.join(BASE_PATH, 'train.csv')
VAL_FILE = os.path.join(BASE_PATH, 'val.csv')
TEST_FILE = os.path.join(BASE_PATH, 'test.csv')


print("1. Loading and Pre-processing data...")

try:
    df_train = pd.read_csv(TRAIN_FILE)
    df_val = pd.read_csv(VAL_FILE)
    df_test = pd.read_csv(TEST_FILE)
except FileNotFoundError as e:
    # Report the file that is actually missing (any of the three reads can
    # fail); fall back to the dataset directory when the exception carries
    # no filename.
    missing_path = e.filename if e.filename else BASE_PATH
    print(f"\nERROR: File not found: {os.path.abspath(missing_path)}")
    raise

# df_train_full holds the train rows followed by the val rows, re-indexed.
df_train_full = pd.concat([df_train, df_val], ignore_index=True)

print(f"Train/Val samples: {len(df_train_full)}")
print(f"Test samples: {len(df_test)}")

# Fail early if a required column is absent from either frame.
REQUIRED_COLUMNS = ['chief_complaint', 'history_of_present_illness', 'major_surgical_procedure']
for col in REQUIRED_COLUMNS:
    if col not in df_train_full.columns or col not in df_test.columns:
        raise ValueError(f"ERROR: Missing column '{col}'")

def create_text_feature(df):
    """Build the model input text for every row of *df*.

    Missing values in 'chief_complaint' and 'history_of_present_illness' are
    replaced by the empty string, then the two columns are joined with a
    single space.

    Returns:
        A Series with one combined string per row of *df*.
    """
    complaint = df['chief_complaint'].fillna('')
    history = df['history_of_present_illness'].fillna('')
    return complaint + " " + history

# Model inputs: one combined text string per record.
X_train = create_text_feature(df_train_full)
X_test = create_text_feature(df_test)

# Raw string targets; records without a procedure get the 'UNKNOWN' label.
y_train_str, y_test_str = (
    frame['major_surgical_procedure'].fillna('UNKNOWN')
    for frame in (df_train_full, df_test)
)

# Number of most frequent labels that keep their own class.
TOP_K = 100
print(f"Filtering labels: Keeping Top {TOP_K} classes + 'OTHER_PROCEDURE'")

def clean_label(label):
    """Normalize a label string.

    Every character that is neither alphanumeric nor whitespace is dropped,
    the kept characters are lower-cased one by one, and leading/trailing
    whitespace is stripped from the result.
    """
    kept = []
    for ch in label:
        if ch.isalnum() or ch.isspace():
            kept.append(ch.lower())
    return ''.join(kept).strip()

# Normalize the label strings of both frames with clean_label.
y_train_str_cleaned, y_test_str_cleaned = (
    labels.apply(clean_label) for labels in (y_train_str, y_test_str)
)

# value_counts() sorts by descending frequency, so the first TOP_K index
# entries are the TOP_K most frequent training labels.
label_counts = y_train_str_cleaned.value_counts()
top_labels = label_counts.index[:TOP_K].tolist()

def filter_labels(label, top_labels_list):
    """Return *label* if it is contained in *top_labels_list*, else 'otherprocedure'."""
    return label if label in top_labels_list else 'otherprocedure'

# Membership is tested once per row, so use a set instead of scanning the
# list of top labels every time.
top_label_set = set(top_labels)
y_train_str_filtered = y_train_str_cleaned.apply(lambda x: filter_labels(x, top_label_set))
y_test_str_filtered = y_test_str_cleaned.apply(lambda x: filter_labels(x, top_label_set))

# The encoder is fitted on the filtered labels of both frames.
all_unique_labels_filtered = pd.concat([y_train_str_filtered, y_test_str_filtered]).unique()
le = LabelEncoder()
le.fit(all_unique_labels_filtered)

y_train = le.transform(y_train_str_filtered)
y_test = le.transform(y_test_str_filtered)

target_names = le.classes_
num_labels = len(target_names)
print(f"Numar final de clase: {num_labels}")

# 'balanced' weights can only be computed for classes that occur in y_train.
# The weight vector must still have exactly num_labels entries (one per
# encoded class), otherwise a weighted CrossEntropyLoss rejects it. Classes
# that never occur in y_train keep the neutral weight 1.0.
train_classes = np.unique(y_train)
class_weights_array = np.ones(num_labels, dtype=np.float64)
class_weights_array[train_classes] = compute_class_weight(
    class_weight='balanced',
    classes=train_classes,
    y=y_train
)
class_weights = dict(zip(np.arange(num_labels), class_weights_array))
class_weights_tensor = torch.from_numpy(class_weights_array.astype(np.float32))


# Pretrained checkpoint name, token length used for truncation/padding, and
# batching settings.
MODEL_CHECKPOINT = 'roberta-base'
MAX_LENGTH = 512
BATCH_SIZE = 8
GRADIENT_ACCUMULATION_STEPS = 8

print(f"\n2. Incarcare Model: {MODEL_CHECKPOINT}")

# Tokenizer and sequence-classification model are loaded from the same
# checkpoint; the model is configured with one output per encoded class.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=num_labels)

def tokenize_function(examples):
    """Tokenize the "text" entry of *examples* with the module-level tokenizer.

    Sequences are truncated and padded to exactly MAX_LENGTH tokens.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
    )

print("3. Conversie la formatul Hugging Face Dataset...")

train_data = pd.DataFrame({'text': X_train.tolist(), 'label': y_train.tolist()})
test_data = pd.DataFrame({'text': X_test.tolist(), 'label': y_test.tolist()})

raw_train_dataset = Dataset.from_pandas(train_data)
raw_test_dataset = Dataset.from_pandas(test_data)

# Draw the training subsample BEFORE tokenizing, so only the rows that are
# actually used for training are tokenized and padded to MAX_LENGTH.
SAMPLE_SIZE = 50000
if len(raw_train_dataset) > SAMPLE_SIZE:
    print(f"Sampling set de antrenare la {SAMPLE_SIZE} mostre din {len(raw_train_dataset)}.")
    indices = np.random.choice(len(raw_train_dataset), SAMPLE_SIZE, replace=False)
    sampled_train_dataset = raw_train_dataset.select(indices)
else:
    print("Nu se aplica sampling.")
    sampled_train_dataset = raw_train_dataset

tokenized_train_dataset = sampled_train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = raw_test_dataset.map(tokenize_function, batched=True)

# The raw text is no longer needed once it has been tokenized.
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["text"])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["text"])

tokenized_train_dataset.set_format("torch")
tokenized_test_dataset.set_format("torch")

# The target column is renamed from "label" to "labels", the key that the
# custom loss in this file reads from each batch.
tokenized_train_dataset = tokenized_train_dataset.rename_column("label", "labels")
tokenized_test_dataset = tokenized_test_dataset.rename_column("label", "labels")

class WeightedTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy.

    The per-class weights are read from the module-level
    ``class_weights_tensor``.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping. Its "labels" entry is removed and the
                remaining entries are passed to ``model`` as keyword arguments.
            return_outputs: When true, the model outputs are returned
                alongside the loss.
            num_items_in_batch: Accepted for signature compatibility; not
                used by this implementation.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')

        # Take the device from the logits instead of ``model.device``: the
        # model passed in may be a wrapper (e.g. torch.nn.DataParallel) that
        # does not expose a ``device`` attribute.
        weights = class_weights_tensor.to(logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from a ``(logits, labels)`` pair.

    The predicted class of each example is the arg-max over the last axis of
    the logits.

    Returns:
        A dict with the keys "accuracy" and "f1_weighted".
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1_weighted": f1_score(labels, predicted, average="weighted", zero_division=0),
    }

# One epoch; evaluation and checkpointing both happen at the end of each
# epoch, and the best checkpoint is reloaded when training finishes.
training_args = TrainingArguments(
    output_dir="./Roberta_results",
    num_train_epochs=1,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./Roberta_logs',
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none"
)

# NOTE(review): the test set is used as eval_dataset together with
# load_best_model_at_end=True, i.e. checkpoint selection sees the test data.
# Consider holding out a separate validation split for this purpose.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its
    # replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

print("\n4. Icepe Fine-Tuning cu RoBERTa-Base (1 Epoca, 50k mostre)...")
trainer.train()

print("\n5. Evaluare Finala (Pe Test Set):")
metrics = trainer.evaluate()
print(metrics)

print("\nAntrenare terminata. Metricele sunt disponibile în 'Roberta_logs'.")

  from .autonotebook import tqdm as notebook_tqdm


1. Loading and Pre-processing data...
Train/Val samples: 298614
Test samples: 33180
Filtering labels: Keeping Top 100 classes + 'OTHER_PROCEDURE'


'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /roberta-base/resolve/main/tokenizer_config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000280C5FB82F0>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 28df95fd-ecc2-45b1-a803-f2541ba396d2)')' thrown while requesting HEAD https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].


Număr final de clase: 101

2. Încărcare Model: roberta-base


'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /roberta-base/resolve/main/tokenizer_config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000280C5F68F50>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: feaf1840-3498-4b10-b742-1d88d7a58da0)')' thrown while requesting HEAD https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json
Retrying in 2s [Retry 2/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /roberta-base/resolve/main/tokenizer_config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000280C5F69590>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: b8462544-d53e-4d36-bc3a-87a1cea45c0b)')' thrown while requesting HEAD https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json
Re

3. Conversie la formatul Hugging Face Dataset...


Map: 100%|██████████| 298614/298614 [00:56<00:00, 5248.82 examples/s]
Map: 100%|██████████| 33180/33180 [00:06<00:00, 5142.91 examples/s]
  trainer = WeightedTrainer(


Sampling set de antrenare la 50000 mostre din 298614.

4. Începe Fine-Tuning cu RoBERTa-Base (1 Epocă, 50k mostre)...


Epoch,Training Loss,Validation Loss
