In [4]:
import pandas as pd
import numpy as np
import random
import os
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import csv
import re

# --- 1. PATH CONFIGURATION & SETTINGS ---
import os
current_dir = os.getcwd()
BASE_PATH = os.path.join(current_dir, 'dataset', 'discharge')

# Use the bare relative location when the cwd-anchored directory is absent.
if not os.path.exists(BASE_PATH):
    BASE_PATH = 'dataset/discharge/'

# The three CSV splits are expected side by side under BASE_PATH.
TRAIN_FILE, VAL_FILE, TEST_FILE = (
    os.path.join(BASE_PATH, csv_name)
    for csv_name in ('train.csv', 'val.csv', 'test.csv')
)

# Seed every random number generator the script touches.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Model / training hyper-parameters.
MODEL_CHECKPOINT = 'roberta-base'
MAX_LENGTH = 512
BATCH_SIZE = 8
GRADIENT_ACCUMULATION_STEPS = 4
NUM_EPOCHS = 3

# Upper bounds on the number of rows kept for training / testing.
TRAIN_SAMPLE_SIZE = 5000
TEST_SAMPLE_SIZE = 500

# Column that holds the raw label to predict.
TARGET_COLUMN = 'discharge_disposition'

print("=== 1. Incarcare Date ===")
# Malformed CSV lines are skipped rather than aborting the load.
csv_options = {'on_bad_lines': 'skip', 'engine': 'python'}
try:
    df_train = pd.read_csv(TRAIN_FILE, **csv_options)
    df_val = pd.read_csv(VAL_FILE, **csv_options)
    df_test = pd.read_csv(TEST_FILE, **csv_options)

    # Train and validation rows are pooled into a single training frame.
    df_train_full = pd.concat([df_train, df_val], ignore_index=True)
except FileNotFoundError:
    print(f"\nCRITICAL ERROR: Nu gasesc fisierele in {os.path.abspath(BASE_PATH)}")
    raise

# --- 2. CURATARE SI MAPARE STRICTA (Cele 5 clase cerute) ---
def map_to_target_labels(label):
    """Map a raw discharge-disposition value onto one of the target classes.

    Args:
        label: Raw cell value from the target column (string, NaN, None, ...).

    Returns:
        One of "Home", "Long Term Care", "Extended Care", "Expired", the
        string 'nan' for missing/unknown values, or None when the value is
        noise or matches no rule (such rows are meant to be dropped).
    """
    # 1. Missing values: real NA/None as well as the strings 'nan'/'unknown'.
    if pd.isna(label):
        return 'nan'

    # Normalise once, *before* the sentinel check, so padded values such as
    # " Unknown " are recognised instead of silently falling through to None.
    text = str(label).strip().upper()
    if text in ("NAN", "UNKNOWN"):
        return 'nan'

    # Noise filter: long texts or letter closings are CSV parsing errors.
    if len(text) > 60 or "SINCERELY" in text or "TEAM" in text:
        return None

    # --- STRICT MAPPING LOGIC ---

    # A. Home (exact match only; "HOME WITH SERVICE" etc. is handled below)
    if text == "HOME":
        return "Home"

    # C (part 1). Skilled nursing must be tested before the generic
    # "NURSING FACILITY" rule, otherwise "SKILLED NURSING FACILITY" would be
    # labelled differently from its own abbreviation "SNF".
    if "SNF" in text or "SKILLED" in text:
        return "Extended Care"

    # B. Long Term Care (replaces Home With Service / Nursing Facility)
    if any(key in text for key in ("HOME HEALTH", "SERVICE", "LONG TERM", "NURSING FACILITY")):
        return "Long Term Care"

    # C (part 2). Extended Care (rehab and explicit extended-care values)
    if "REHAB" in text or "EXTENDED CARE" in text:
        return "Extended Care"

    # D. Expired
    if "EXPIRED" in text or "DIED" in text:
        return "Expired"

    # Anything else stays None so the row gets removed by the caller.
    return None

print("=== 2. Curatare si Mapare Etichete ===")
# Attach the normalised label to both the pooled training frame and the test frame.
for frame in (df_train_full, df_test):
    frame['clean_label'] = frame[TARGET_COLUMN].apply(map_to_target_labels)

# Drop only rows whose mapped label is None; the string 'nan' is kept as a class.
df_train_clean = df_train_full[df_train_full['clean_label'].notna()].copy()
df_test_clean = df_test[df_test['clean_label'].notna()].copy()

# Sampling: cap each split at its configured size, otherwise keep it untouched.
df_train_final = (
    df_train_clean.sample(n=TRAIN_SAMPLE_SIZE, random_state=SEED).reset_index(drop=True)
    if len(df_train_clean) > TRAIN_SAMPLE_SIZE
    else df_train_clean
)
df_test_final = (
    df_test_clean.sample(n=TEST_SAMPLE_SIZE, random_state=SEED).reset_index(drop=True)
    if len(df_test_clean) > TEST_SAMPLE_SIZE
    else df_test_clean
)

print(f"Dataset final antrenare: {len(df_train_final)}")
print(f"Dataset final testare: {len(df_test_final)}")
print(f"Clasele finale: {sorted(df_train_final['clean_label'].unique())}")
# Expected: ['Expired', 'Extended Care', 'Home', 'Long Term Care', 'nan']

# --- 3. PREPROCESARE TEXT ---
def clean_text(text):
    """Collapse every whitespace run to one space and trim the ends.

    Non-string input (e.g. NaN or None) yields an empty string.
    """
    if isinstance(text, str):
        return re.sub(r'\s+', ' ', text).strip()
    return ""

def combine_text(row):
    """Build the model input for one row.

    Reads 'chief_complaint' and 'history_of_present_illness' via ``row.get``
    (missing keys default to ''), cleans each with ``clean_text`` and joins
    them with a single space.
    """
    source_fields = ('chief_complaint', 'history_of_present_illness')
    cleaned = [clean_text(row.get(field, '')) for field in source_fields]
    return " ".join(cleaned)

# One combined text per row for each split.
X_train = df_train_final.apply(combine_text, axis=1).tolist()
X_test = df_test_final.apply(combine_text, axis=1).tolist()

y_train = df_train_final['clean_label'].tolist()
y_test = df_test_final['clean_label'].tolist()

# --- 4. ENCODING ---
# The encoder is fitted on the labels of both splits so that every test
# label can be transformed.
le = LabelEncoder()
le.fit(y_train + y_test)

y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)
target_names = le.classes_

print(f"Numar clase finale: {len(target_names)}")
print(f"Nume clase: {target_names}")

# Class Weights
# 'balanced' weights can only be computed for classes that occur in the
# training labels. Because the encoder also knows test-only classes, the
# weight vector is sized to the full class list (neutral weight 1.0 for
# classes absent from training) so its length always equals the number of
# model outputs; a shorter vector is rejected by CrossEntropyLoss.
train_class_ids = np.unique(y_train_enc)
balanced_weights = compute_class_weight('balanced', classes=train_class_ids, y=y_train_enc)
class_weights = np.ones(len(target_names), dtype=float)
class_weights[train_class_ids] = balanced_weights
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

# --- 5. TOKENIZATION ---
print(f"\n=== Tokenizare ({MODEL_CHECKPOINT}) ===")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)


def tokenize_function(texts):
    """Run the module-level tokenizer over ``texts``.

    Sequences are truncated and padded to exactly MAX_LENGTH tokens.
    """
    return tokenizer(
        texts,
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
    )


# Both splits are tokenized up front, in one call each.
train_encodings = tokenize_function(X_train)
test_encodings = tokenize_function(X_test)

class SimpleDataset(torch.utils.data.Dataset):
    """Dataset that serves pre-computed encodings together with their labels.

    ``encodings`` must offer ``.items()`` yielding (name, indexable) pairs;
    ``labels`` must be indexable and sized.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is dictated by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Turn the idx-th entry of every encoding field into a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits and their encoded labels for the Trainer.
train_dataset = SimpleDataset(encodings=train_encodings, labels=y_train_enc)
test_dataset = SimpleDataset(encodings=test_encodings, labels=y_test_enc)

# --- 6. MODEL & TRAINER ---
print("\n=== Initializare Model ===")
# The classification head gets one output per encoded class.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=len(target_names),
)

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by ``class_weights_tensor``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model handed in by the Trainer (possibly a wrapper).
            inputs: Batch dict; must contain "labels".
            return_outputs: When True, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for signature compatibility; unused.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Take the device from the logits rather than from ``model.device``:
        # wrappers such as torch.nn.DataParallel do not expose a ``.device``
        # attribute, while the logits always live where the loss is computed.
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor.to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a ``(logits, labels)`` pair.

    The predicted class is the argmax over the last axis of the logits.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1_weighted": f1_score(labels, predicted, average="weighted", zero_division=0),
    }

training_args = TrainingArguments(
    output_dir="./roberta_results_5classes",
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=50,
    weight_decay=0.01,
    report_to="none"
)

# NOTE(review): eval_dataset is the test set, so with
# load_best_model_at_end=True the checkpoint is selected on the same data
# that the final metrics below are reported on. Consider a held-out
# validation split for checkpoint selection.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

print("\n=== Start Antrenare ===")
trainer.train()

# --- 7. FINAL EVALUATION & PLOT ---
print("\n=== Evaluare Finala ===")
preds_output = trainer.predict(test_dataset)
y_pred = np.argmax(preds_output.predictions, axis=1)
final_acc = accuracy_score(y_test_enc, y_pred)

print(f"Final Accuracy: {final_acc:.4f}")

# Pin the label set explicitly: without `labels=`, a class that appears in
# neither y_test_enc nor y_pred is dropped from the outputs, which misaligns
# the heatmap tick labels and makes classification_report reject
# `target_names` because the sizes no longer match.
all_label_ids = np.arange(len(target_names))

cm = confusion_matrix(y_test_enc, y_pred, labels=all_label_ids)
plt.figure(figsize=(10, 8))

sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_names,
    yticklabels=target_names
)

plt.xticks(rotation=45, ha='right')
plt.title(f'RoBERTa - Discharge Disposition (5 Target Labels)\nAcc: {final_acc:.2f}')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.tight_layout()
plt.savefig('confusion_matrix_roberta_5classes.png', dpi=300)
print("Matricea a fost salvata ca 'confusion_matrix_roberta_5classes.png'")

report = classification_report(
    y_test_enc,
    y_pred,
    labels=all_label_ids,
    target_names=target_names,
    zero_division=0,
)
# Explicit encoding keeps the report file independent of the platform default.
with open("classification_report_roberta.txt", "w", encoding="utf-8") as f:
    f.write(report)

1. Loading Data...
2. Preprocessing & Filtering...
Training on 5000 samples.
Testing on 500 samples.
Number of classes: 101

3. Tokenization (roberta-base)...


Map: 100%|██████████| 5000/5000 [00:00<00:00, 12515.88 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 10983.53 examples/s]



4. Initializing Model...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



5. Starting Training...




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 