# Embedding approach

Let's use the embeddings to classify the documents.


# Load the cleaned results



In [1]:
import json 

with open('cleaned_results.json', 'r') as file:
    cleaned_results = json.load(file)

print(cleaned_results)

{'Compliance Report 4.pdf': {'text': '0\nResiVied 06/08/2023 - 07/07/2023\nAirView\nCompliance Report\nInitial compliance period 06/08/2028 - 07/07/2023\nCompliance met Yes\nCompliance percentage 70 %\nPayor Standard\nUsage days 22/30 days 73 % 4 hours 21 days 70 %\n4 hours 1 days 3 %\nUsage hours 149 hours 48 minutes\nAverage usage total days 5 hours 0 minutes\nAverage usage days used 6 hours 49 minutes\nMedian usage days used 6 hours 55 minutes\nTotal used hours value since last reset - 07/07/2023 276 hours: MirSanca AutoSe\nSerial number 23231350306\nMode CPAP\nSet pressure: 15 cemH20\nEPR Fulltime\nEPR level 3\nerap\nLeaks - L/min Median: 34, 5 95th percentile: 62. 6 Maximum: 72. 7\nEvents per hour Al: 4. 0 Hk: 21 AHI: 6. 1\nApnea Index Central: 1. 8 Obstructive: 0. 4 Unknown: 1. 7\nRERA Index 1. 3\nCheyne-Stokes respiration average duration per night 7 minutes 2 %\nUsage - hours\n8 10 12 14 16 18 20 22 24 26 28 30 2. 4 6\nPrinted on 07/11/2023 - ResMed Airview version 4. 41. 0-9. 

# EmbeddingClassifier class
Initialization and Model Loading: The EmbeddingClassifier class initializes a document classifier using the SentenceTransformer model, which helps generate embeddings for text data to classify documents.

Enhanced Descriptions: The create_enhanced_descriptions method defines multiple, detailed descriptions for each label, capturing different aspects of each document type (e.g., "Compliance," "Sleep").

Embedding Cache Creation: The create_embeddings_cache method generates and stores embeddings for each label's descriptions, caching them for efficient classification.

Document Classification: The classify_document method encodes a given document, calculates its similarity to each label's descriptions, and returns the best match and confidence level, defaulting to "Unknown" if below a threshold.

Evaluation: The evaluate_classification method compares predicted classifications against true labels, creating a classification report and computing average confidence per class.

Fit Method: The fit method prepares the classifier by generating enhanced descriptions and caching embeddings, optionally accepting training data for further description refinement.








In [2]:
import json
from sentence_transformers import SentenceTransformer, util
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import torch
from typing import Dict, Tuple, List
import pandas as pd

class EmbbedingClassifier:
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """
        Initialize the document classifier with a more powerful model and improved architecture.
        
        Args:
            model_name: Name of the SentenceTransformer model to use
        """
        self.model = SentenceTransformer(model_name)
        self.label_embeddings = None
        self.label_descriptions = None
        
    def create_enhanced_descriptions(self) -> Dict[str, List[str]]:
        """
        Create multiple detailed descriptions per class to capture various aspects.
        """
        return {
            "Compliance": [
                "Document showing patient compliance with medical device usage and therapy adherence",
                "Report containing compliance percentage, usage hours, and therapy effectiveness metrics",
                "Medical device usage tracking report with detailed usage statistics and compliance data",
                "Patient therapy compliance summary with usage patterns and achievement metrics"
            ],
            "Sleep": [
                "Clinical sleep study report with detailed polysomnography results and analysis",
                "Sleep disorder diagnostic report with sleep patterns and respiratory events",
                "Overnight sleep study data with comprehensive sleep metrics and observations",
                "Sleep laboratory report containing detailed sleep architecture and parameters"
            ],
            "Order": [
                "Medical equipment or supply order form with patient and provider details",
                "Healthcare supply requisition document with order specifications",
                "Medical device order authorization with insurance and billing information",
                "Equipment order form with delivery instructions and product details"
            ],
            "Delivery": [
                "Medical equipment delivery confirmation document with receipt details",
                "Healthcare supply delivery ticket with shipping and handling information",
                "Equipment delivery acknowledgment form with customer signatures",
                "Medical supply delivery record with inventory and tracking details"
            ],
            "Physician": [
                "Clinical progress notes from physician consultation or examination",
                "Doctor's medical assessment and treatment recommendations",
                "Patient consultation notes with medical observations and plan",
                "Physician documentation of patient encounter and clinical findings"
            ],
            "Prescription": [
                "Medical prescription with medication details and dosing instructions",
                "Drug prescription form with pharmacy instructions and refill information",
                "Medication order with specific dosage and administration details",
                "Prescription document with drug name, strength, and usage directions"
            ]
        }

    def create_embeddings_cache(self, descriptions: Dict[str, List[str]]) -> Dict[str, torch.Tensor]:
        """
        Create and cache embeddings for all descriptions.
        """
        embeddings_cache = {}
        for label, desc_list in descriptions.items():
            # Create embeddings for all descriptions of this label
            label_embeddings = self.model.encode(desc_list, convert_to_tensor=True)
            embeddings_cache[label] = label_embeddings
        return embeddings_cache

    def classify_document(self, 
                        text: str, 
                        threshold: float = 0.3) -> Tuple[str, float, Dict[str, float]]:
        """
        Classify a document using enhanced similarity calculation.
        
        Args:
            text: Document text to classify
            threshold: Minimum confidence threshold
            
        Returns:
            Tuple of (predicted_class, confidence, all_similarities)
        """
        # Generate document embedding
        doc_embedding = self.model.encode(text, convert_to_tensor=True)
        
        # Calculate similarities with all descriptions for each class
        similarities = {}
        max_similarities = {}
        
        for label, label_embeddings in self.embeddings_cache.items():
            # Calculate similarity with all descriptions for this label
            similarity_scores = util.cos_sim(doc_embedding, label_embeddings)
            
            # Take the maximum similarity score for this label
            max_similarity = torch.max(similarity_scores).item()
            similarities[label] = max_similarity
            max_similarities[label] = max_similarity
        
        # Find best class and confidence
        best_class = max(similarities, key=similarities.get)
        confidence = similarities[best_class]
        
        # If confidence is below threshold, return "Unknown"
        if confidence < threshold:
            return "Unknown", confidence, similarities
            
        return best_class, confidence, similarities

    def evaluate_classification(self, test_data: Dict) -> pd.DataFrame:
        """
        Evaluate classification performance on test data.
        
        Args:
            test_data: Dictionary of test documents with their true labels
            
        Returns:
            DataFrame with evaluation metrics
        """
        predictions = []
        true_labels = []
        confidences = []
        
        for doc_name, doc_data in test_data.items():
            pred_class, confidence, _ = self.classify_document(doc_data['text'])
            predictions.append(pred_class)
            true_labels.append(doc_data['label'])
            confidences.append(confidence)
        
        # Create evaluation report
        report = classification_report(true_labels, predictions, output_dict=True)
        df_report = pd.DataFrame(report).transpose()
        
        # Add average confidence per class
        class_confidences = pd.Series(confidences, index=true_labels).groupby(level=0).mean()
        df_report['avg_confidence'] = class_confidences
        
        return df_report

    def fit(self, train_data: Dict = None):
        """
        Prepare the classifier by creating enhanced descriptions and embeddings cache.
        
        Args:
            train_data: Optional training data to enhance descriptions
        """
        self.label_descriptions = self.create_enhanced_descriptions()
        self.embeddings_cache = self.create_embeddings_cache(self.label_descriptions)
        
        if train_data:
            # Could potentially use training data to enhance descriptions
            pass


  from tqdm.autonotebook import tqdm, trange


In [3]:
# Example usage:
# Initialize classifier
classifier = EmbbedingClassifier()
classifier.fit()

# Classify documents
for doc_name, doc_data in cleaned_results.items():
    predicted_class, confidence, similarities = classifier.classify_document(doc_data['text'])
    print(f"Document: {doc_name}")
    print(f"Predicted: {predicted_class} (confidence: {confidence:.3f})")
    print("Similarities:", {k: f"{v:.3f}" for k, v in similarities.items()})
    print()

# Evaluate performance
evaluation_report = classifier.evaluate_classification(cleaned_results)

Document: Compliance Report 4.pdf
Predicted: Compliance (confidence: 0.382)
Similarities: {'Compliance': '0.382', 'Sleep': '0.317', 'Order': '0.296', 'Delivery': '0.246', 'Physician': '0.217', 'Prescription': '0.243'}

Document: Compliance Report 1.pdf
Predicted: Compliance (confidence: 0.412)
Similarities: {'Compliance': '0.412', 'Sleep': '0.334', 'Order': '0.332', 'Delivery': '0.269', 'Physician': '0.275', 'Prescription': '0.311'}

Document: Compliance Report 2.pdf
Predicted: Compliance (confidence: 0.416)
Similarities: {'Compliance': '0.416', 'Sleep': '0.202', 'Order': '0.413', 'Delivery': '0.333', 'Physician': '0.302', 'Prescription': '0.407'}

Document: Compliance Report 3.pdf
Predicted: Order (confidence: 0.314)
Similarities: {'Compliance': '0.314', 'Sleep': '0.278', 'Order': '0.314', 'Delivery': '0.244', 'Physician': '0.203', 'Prescription': '0.289'}

Document: Sleep Study Report 3.pdf
Predicted: Sleep (confidence: 0.526)
Similarities: {'Compliance': '0.286', 'Sleep': '0.526', '

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [4]:
evaluation_report

Unnamed: 0,precision,recall,f1-score,support,avg_confidence
Compliance,0.75,0.75,0.75,4.0,0.38097
Delivery,0.0,0.0,0.0,2.0,0.444883
Order,0.5,0.75,0.6,4.0,0.448041
Physician,0.0,0.0,0.0,4.0,0.514223
Prescription,0.0,0.0,0.0,3.0,0.314499
Sleep,0.375,0.75,0.5,4.0,0.513123
Unknown,0.0,0.0,0.0,0.0,
accuracy,0.428571,0.428571,0.428571,0.428571,
macro avg,0.232143,0.321429,0.264286,21.0,
weighted avg,0.309524,0.428571,0.352381,21.0,


In [5]:
# Old Code 

"""
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Load pre-trained embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')  # You can replace with a larger model if needed# Define prototypes or example descriptions for each class
label_descriptions = {
    "Compliance": "Document about patient compliance, device usage, compliance percentage, usage days.",
    "Sleep": "Detailed sleep study results, patient sleep data, apnea index, polysomnography analysis.",
    "Order": "Medical equipment order, transaction details, MRN, authorization, supply.",
    "Delivery": "Delivery receipt for medical equipment, contains delivery date, receipt number.",
    "Physician": "Physician's notes, patient assessment, diagnosis and treatment information.",
    "Prescription": "Prescription with medication name, dosage instructions, refill options."
}

# Create prototype embeddings for each class
label_embeddings = {label: model.encode(description) for label, description in label_descriptions.items()}

# Function to classify a document using embeddings
def classify_document_embedding(text):
    # Generate embedding for the document
    doc_embedding = model.encode(text)
    
    # Calculate similarity between document and each class prototype
    similarities = {}
    for label, label_embedding in label_embeddings.items():
        similarity = util.cos_sim(doc_embedding, label_embedding).item()  # Cosine similarity
        similarities[label] = similarity

    # Select the class with the highest similarity score
    best_class = max(similarities, key=similarities.get)
    best_confidence = similarities[best_class]
    
    return best_class, best_confidence, similarities
# Sample usage
sample_text = cleaned_results[list(cleaned_results.keys())[0]]['text']
classified_class, confidence, all_similarities = classify_document_embedding(sample_text)

print(f"Classified as: {classified_class} with confidence: {confidence}")
print("Similarity scores:", all_similarities)# Let's test the function with all the documents

for doc_name, doc_data in cleaned_results.items():
    classified_class, confidence, all_similarities = classify_document_embedding(doc_data['text'])
    print(f"Document: {doc_name} - Classified as: {classified_class} with confidence: {confidence}")
    print("Similarity scores:", all_similarities)# Analysying which class was classified correctly and which was not

for doc_name, doc_data in cleaned_results.items():
    classified_class, confidence, all_similarities = classify_document_embedding(doc_data['text'])
    if classified_class == doc_data['label']:
        print(f"Document: {doc_name} - Classified as: {classified_class} with confidence: {confidence}")
# Analysying which class was classified correctly and which was not

for doc_name, doc_data in cleaned_results.items():
    classified_class, confidence, all_similarities = classify_document_embedding(doc_data['text'])
    if classified_class != doc_data['label']:
        print(f"Document: {doc_name} - Classified as: {classified_class} with confidence: {confidence}")

"""

'\nfrom sentence_transformers import SentenceTransformer, util\nimport numpy as np\n\n# Load pre-trained embedding model\nmodel = SentenceTransformer(\'all-MiniLM-L6-v2\')  # You can replace with a larger model if needed# Define prototypes or example descriptions for each class\nlabel_descriptions = {\n    "Compliance": "Document about patient compliance, device usage, compliance percentage, usage days.",\n    "Sleep": "Detailed sleep study results, patient sleep data, apnea index, polysomnography analysis.",\n    "Order": "Medical equipment order, transaction details, MRN, authorization, supply.",\n    "Delivery": "Delivery receipt for medical equipment, contains delivery date, receipt number.",\n    "Physician": "Physician\'s notes, patient assessment, diagnosis and treatment information.",\n    "Prescription": "Prescription with medication name, dosage instructions, refill options."\n}\n\n# Create prototype embeddings for each class\nlabel_embeddings = {label: model.encode(descripti