# HMM Baseline

This notebook implements a baseline Hidden Markov Model (HMM) for detecting negation and uncertainty markers in multilingual (Spanish and Catalan) medical texts. It covers the following steps:

- Load the preprocessed data
- Train a basic HMM model
- Evaluate performance on test data
- Save the model and results

The baseline model uses only word tokens as observations (without POS tags).

## Imports and Setup

In [None]:
import os
import sys
import json
import pickle
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

sys.path.append("..")

from src.hmm import HMMBaseline
from src.evaluation import *

# The project root is taken to be the parent of the current working directory.
# NOTE(review): this assumes the notebook is launched from a direct sub-folder
# of the project — confirm against how the notebook is run.
PATH_ROOT = os.path.dirname(os.getcwd())

# Make sure the output folders for saved models and evaluation files exist.
for results_subdir in ("models", "evaluation"):
    os.makedirs(os.path.join(PATH_ROOT, "data", "results", results_subdir), exist_ok=True)

# Seed NumPy's global random generator so repeated runs are comparable.
np.random.seed(42)

## Load Preprocessed Data

In [None]:
def load_processed_data(file_path):
    """Read a pickled object from ``file_path`` and return it.

    The file is opened in binary mode and closed automatically when the
    ``with`` block exits.

    NOTE: unpickling can execute arbitrary code, so only load files that were
    produced by a trusted source.
    """
    with open(file_path, "rb") as pickle_file:
        return pickle.load(pickle_file)

# Locate the pickled train/test splits inside the processed-data folder
processed_dir = os.path.join(PATH_ROOT, "data", "processed")
train_file = os.path.join(processed_dir, "train_baseline.pkl")
test_file = os.path.join(processed_dir, "test_baseline.pkl")

# Unpickle both splits
train_data = load_processed_data(train_file)
test_data = load_processed_data(test_file)

# Short summary of what was loaded
print(f"Loaded {len(train_data['observations'])} training sequences")
print(f"Loaded {len(test_data['observations'])} test sequences")
print(f"Vocabulary size: {len(train_data['vocabulary'])}")
print(f"State space: {train_data['state_space']}")

## Train the Baseline HMM Model

In [None]:
# Destination file for the trained model
model_path = os.path.join(PATH_ROOT, "data", "results", "models", "hmm_baseline.pkl")

# Build the baseline HMM from the training split's state space and vocabulary
hmm_model = HMMBaseline(
    state_space=train_data["state_space"],
    vocabulary=train_data["vocabulary"],
    smoothing=0.01,
)

print("Training HMM model...")
train_observations, train_states = train_data["observations"], train_data["states"]
hmm_model.train(train_observations, train_states)

# Persist the trained model and report where it went
hmm_model.save(model_path)
print(f"Model saved to {model_path}")

## Evaluate the Model

In [None]:
print("Making predictions on test data...")
test_predictions = hmm_model.predict(test_data["observations"])

print("Evaluating predictions...")
gold_states = test_data["states"]  # reference labels shared by every metric below
metrics = compute_metrics(gold_states, test_predictions)  # token-level metrics
entity_metrics = get_entity_based_metrics(gold_states, test_predictions)  # entity-level metrics

print("\nClassification Report (Token Level):")
print_classification_report(gold_states, test_predictions)

print("\nEntity-Level F1 Scores:")
for label, metrics_dict in entity_metrics.items():
    if label == "macro_avg":
        continue  # the macro average is printed on its own line after the loop
    print(f"{label}: F1 = {metrics_dict['f1']:.4f}, Precision = {metrics_dict['precision']:.4f}, "
          f"Recall = {metrics_dict['recall']:.4f}, Support = {metrics_dict.get('support', 'N/A')}")
print(f"Macro Average: F1 = {entity_metrics['macro_avg']['f1']:.4f}\n")

# Write both metric sets to a single JSON file
eval_path = os.path.join(PATH_ROOT, "data", "results", "evaluation", "evaluation_baseline.json")
evaluation_summary = {"token_metrics": metrics, "entity_metrics": entity_metrics}
save_metrics(evaluation_summary, eval_path)

print(f"Evaluation results saved to {eval_path}")

## Language-Specific Analysis

In [None]:
def extract_language_info(data):
    """Collect the language tag of every token, grouped by sequence.

    Args:
        data: Mapping with a ``"sequences"`` entry; each sequence is a mapping
            whose ``"tokens"`` entry is an iterable of token dicts.

    Returns:
        A list with one inner list per sequence, holding each token's
        ``"language"`` value, or ``"es"`` when the token has no such key.
    """
    return [
        [token.get("language", "es") for token in sequence["tokens"]]
        for sequence in data["sequences"]
    ]

# Per-token language tags for the test split, aligned with its sequences
token_languages = extract_language_info(test_data)

# Break the evaluation down by language
gold_test_states = test_data["states"]
language_metrics = analyze_by_language(gold_test_states, test_predictions, token_languages)

print("\nLanguage-Specific Performance:")
print(f"Spanish Token-level F1: {language_metrics['spanish']['macro_avg']['f1']:.4f}")
print(f"Catalan Token-level F1: {language_metrics['catalan']['macro_avg']['f1']:.4f}")

# Store the per-language metrics next to the other evaluation outputs
evaluation_dir = os.path.join(PATH_ROOT, "data", "results", "evaluation")
eval_path_lang = os.path.join(evaluation_dir, "evaluation_baseline_language.json")
save_metrics(language_metrics, eval_path_lang)


## Visualize Transition Probabilities

In [None]:
# Bare expression: the notebook shows the model's transition probabilities as this cell's output
hmm_model.transition_probs

In [None]:
# Values shared by the plotting calls below
state_labels = hmm_model.state_space
tick_positions = range(len(state_labels))
transitions_png = os.path.join(PATH_ROOT, "data", "results", "evaluation", "baseline_transitions.png")

# Heat-map of the transition matrix with a colour scale
plt.figure(figsize=(10, 8))
plt.imshow(hmm_model.transition_probs, cmap="viridis")
plt.colorbar()

# NOTE(review): the axis labels assume rows are source states and columns are
# target states — confirm against HMMBaseline.
plt.title("Transition Probabilities")
plt.xlabel("To State")
plt.ylabel("From State")
plt.xticks(tick_positions, state_labels, rotation=45)
plt.yticks(tick_positions, state_labels)

# Tighten the layout, write the image to disk, then display it
plt.tight_layout()
plt.savefig(transitions_png)
plt.show()

## Visualize Top Emission Probabilities

In [None]:
def get_top_words(state_idx, n=10, model=None):
    """Return the ``n`` highest-probability emission words for one state.

    Args:
        state_idx: Row index of the state in the model's emission matrix.
        n: Maximum number of words to return. Values of zero or below yield
            two empty lists.
        model: Object exposing ``emission_probs`` (indexed as
            ``[state, word]``) and ``vocabulary`` (indexable by word index).
            Defaults to the notebook-level ``hmm_model``.

    Returns:
        A ``(top_words, top_probs)`` tuple of equally long lists, ordered from
        most to least probable.
    """
    if model is None:
        # Fall back to the model trained earlier in the notebook.
        model = hmm_model
    if n <= 0:
        return [], []

    probs = model.emission_probs[state_idx, :]
    # Reverse first, then truncate: slicing with [-n:] would return the whole
    # vocabulary for n == 0 because -0 == 0.
    top_indices = np.argsort(probs)[::-1][:n]
    top_words = [model.vocabulary[i] for i in top_indices]
    top_probs = [probs[i] for i in top_indices]
    return top_words, top_probs

# 2x2 grid: one bar chart per label of interest, showing the words with the
# highest emission probability for that state.
plt.figure(figsize=(14, 10))

states_to_plot = ["NEG", "NSCO", "UNC", "USCO"]
for i, state in enumerate(states_to_plot):
    plt.subplot(2, 2, i + 1)
    if state not in hmm_model.state_to_idx:
        # State absent from the model: its panel is left empty.
        continue

    state_idx = hmm_model.state_to_idx[state]
    words, probs = get_top_words(state_idx, 10)

    bar_positions = range(len(words))
    plt.bar(bar_positions, probs)
    plt.xticks(bar_positions, words, rotation=45, ha="right")
    plt.title(f"Top words for {state}")

# Lay the figure out once, after every panel exists; calling tight_layout()
# inside the loop only repeated work that this final call redoes anyway.
plt.tight_layout()
plt.savefig(os.path.join(PATH_ROOT, "data", "results", "evaluation", "baseline_top_words.png"))
plt.show()