# Load the cleaned results

In [1]:
import json

with open('cleaned_results.json', 'r') as file:
    cleaned_results = json.load(file)

print(cleaned_results)

{'Compliance Report 4.pdf': {'text': '0\nResiVied 06/08/2023 - 07/07/2023\nAirView\nCompliance Report\nInitial compliance period 06/08/2028 - 07/07/2023\nCompliance met Yes\nCompliance percentage 70 %\nPayor Standard\nUsage days 22/30 days 73 % 4 hours 21 days 70 %\n4 hours 1 days 3 %\nUsage hours 149 hours 48 minutes\nAverage usage total days 5 hours 0 minutes\nAverage usage days used 6 hours 49 minutes\nMedian usage days used 6 hours 55 minutes\nTotal used hours value since last reset - 07/07/2023 276 hours: MirSanca AutoSe\nSerial number 23231350306\nMode CPAP\nSet pressure: 15 cemH20\nEPR Fulltime\nEPR level 3\nerap\nLeaks - L/min Median: 34, 5 95th percentile: 62. 6 Maximum: 72. 7\nEvents per hour Al: 4. 0 Hk: 21 AHI: 6. 1\nApnea Index Central: 1. 8 Obstructive: 0. 4 Unknown: 1. 7\nRERA Index 1. 3\nCheyne-Stokes respiration average duration per night 7 minutes 2 %\nUsage - hours\n8 10 12 14 16 18 20 22 24 26 28 30 2. 4 6\nPrinted on 07/11/2023 - ResMed Airview version 4. 41. 0-9. 

# Rules approach

Let's start by creating a very basic rule based approach to classify the documents.

In [2]:
from typing import Dict, Tuple, List, Optional
import re
import pandas as pd
from sklearn.metrics import classification_report
import json
from dataclasses import dataclass

@dataclass
class DocumentFeatures:
    """Data class to store document classification features"""
    keywords: List[str]
    structure_starts: List[str]
    measurements: List[str]
    required_fields: List[str]
    semantic_patterns: List[str]

class RulesBasedClassifier:
    def __init__(self, threshold: float = 0.4):
        """
        Initialize the rules-based document classifier.
        
        Args:
            threshold: Minimum confidence threshold for classification
        """
        self.threshold = threshold
        self.rules = self._create_enhanced_rules()
        
    def _create_enhanced_rules(self) -> Dict[str, DocumentFeatures]:
        """
        Create detailed classification rules with multiple criteria per class.
        """
        return {
            "Compliance": DocumentFeatures(
                keywords=[
                    "compliance", "usage", "period", "percentage", "met",
                    "adherence", "therapy", "utilization", "average", "daily use",
                    "pressure settings", "leak rate", "events per hour"
                ],
                structure_starts=[
                    "AirView", "Compliance Report", "Usage Summary",
                    "Patient Compliance Data", "Therapy Report"
                ],
                measurements=[
                    "cmH2O", "L/min", "hours/night", "days/week", "%",
                    "events/hour", "AHI"
                ],
                required_fields=[
                    "usage days", "compliance percentage", "average usage",
                    "therapy hours", "pressure settings"
                ],
                semantic_patterns=[
                    r"\d+(\.\d+)?\s*hours?\s*(per|/)\s*(night|day)",
                    r"\d+(\.\d+)?\s*%\s*compliance",
                    r"used\s+([0-9]+)\s+out of\s+([0-9]+)\s+nights?"
                ]
            ),
            "Sleep": DocumentFeatures(
                keywords=[
                    "sleep", "study", "apnea", "diagnostic", "polysomnography",
                    "rem", "arousal", "hypopnea", "oxygen", "saturation"
                ],
                structure_starts=[
                    "MUSC", "MEDICAL UNIVERSITY", "Sleep Study Report",
                    "Polysomnography Report", "Sleep Laboratory"
                ],
                measurements=[
                    "cm", "hours", "SpO2", "µV", "Hz", "dB",
                    "events/hour", "breaths/min"
                ],
                required_fields=[
                    "sleep study report", "patient name", "apnea index",
                    "study date", "total sleep time"
                ],
                semantic_patterns=[
                    r"AHI\s*[:<]?\s*\d+(\.\d+)?",
                    r"Stage [N|R][1-3]:\s*\d+(\.\d+)?%",
                    r"Sleep efficiency:\s*\d+(\.\d+)?%"
                ]
            ),
            "Order": DocumentFeatures(
                keywords=[
                    "order", "equipment", "supply", "authorized", "prescribed",
                    "requested", "purchase", "requisition", "authorization"
                ],
                structure_starts=[
                    "MRN", "Order Date", "Purchase Order",
                    "Equipment Request", "Supply Order"
                ],
                measurements=[],
                required_fields=[
                    "order", "MRN", "date", "provider", "equipment description"
                ],
                semantic_patterns=[
                    r"Order\s*#?\s*\d+",
                    r"MRN\s*#?\s*\d+",
                    r"Date:\s*\d{1,2}[-/]\d{1,2}[-/]\d{2,4}"
                ]
            ),
            "Delivery": DocumentFeatures(
                keywords=[
                    "delivery", "receipt", "equipment", "supplied", "received",
                    "shipment", "delivered", "confirmed", "acceptance"
                ],
                structure_starts=[
                    "DELIVERY RECEIPT", "Proof of Delivery",
                    "Equipment Delivery", "Delivery Confirmation"
                ],
                measurements=[],
                required_fields=[
                    "name", "equipment", "delivery date", "signature"
                ],
                semantic_patterns=[
                    r"Delivered\s+on:\s+\d{1,2}[-/]\d{1,2}[-/]\d{2,4}",
                    r"Received\s+by:\s+[A-Za-z\s]+",
                    r"Delivery\s+ID:\s*\w+"
                ]
            ),
            "Physician": DocumentFeatures(
                keywords=[
                    "assessment", "diagnosis", "examination", "treatment",
                    "evaluation", "plan", "symptoms", "findings"
                ],
                structure_starts=[
                    "Follow up:", "Physician's Notes", "Clinical Notes",
                    "Medical Assessment", "Progress Notes"
                ],
                measurements=[
                    "mg", "kg", "cm", "mm Hg", "bpm"
                ],
                required_fields=[
                    "patient name", "physician", "assessment", "date",
                    "diagnosis"
                ],
                semantic_patterns=[
                    r"Assessment:.*Plan:",
                    r"Diagnosis:\s*[A-Z][\w\s]+",
                    r"Dr\.\s+[A-Za-z\s,]+"
                ]
            ),
            "Prescription": DocumentFeatures(
                keywords=[
                    "rx", "prescribed", "dosage", "prescription", "refill",
                    "medication", "dispense", "pharmacy", "sig"
                ],
                structure_starts=[
                    "Rx:", "Prescription", "Medication Order",
                    "Drug Order", "Script"
                ],
                measurements=[
                    "MG", "ML", "MCG", "G", "Units"
                ],
                required_fields=[
                    "dosage", "prescription", "medication name",
                    "quantity", "refills"
                ],
                semantic_patterns=[
                    r"Take\s+\d+\s+tablet\(s\)\s+\w+",
                    r"Refills:\s*\d+",
                    r"Disp:\s*#?\d+"
                ]
            )
        }

    def _calculate_feature_score(self, text: str, feature_list: List[str], 
                               weight: float = 1.0) -> Tuple[int, int]:
        """
        Calculate score for a specific feature type.
        
        Args:
            text: Document text
            feature_list: List of features to check
            weight: Weight to apply to the score
            
        Returns:
            Tuple of (weighted matches, total possible weighted matches)
        """
        matches = sum(1 for feature in feature_list 
                     if re.search(rf'\b{re.escape(feature)}\b', text, re.IGNORECASE))
        return int(matches * weight), int(len(feature_list) * weight)

    def classify_document(self, text: str) -> Tuple[str, float, Dict[str, float]]:
        """
        Classify a document using enhanced rule-based scoring.
        
        Args:
            text: Document text to classify
            
        Returns:
            Tuple of (predicted_class, confidence, all_scores)
        """
        class_scores = {}
        
        for doc_class, features in self.rules.items():
            # Initialize score components with weights
            keyword_score, keyword_total = self._calculate_feature_score(
                text, features.keywords, weight=1.0)
            structure_score, structure_total = self._calculate_feature_score(
                text, features.structure_starts, weight=1.5)
            measurement_score, measurement_total = self._calculate_feature_score(
                text, features.measurements, weight=0.8)
            required_score, required_total = self._calculate_feature_score(
                text, features.required_fields, weight=2.0)
            
            # Calculate semantic pattern matches
            pattern_matches = sum(1 for pattern in features.semantic_patterns 
                                if re.search(pattern, text, re.IGNORECASE))
            pattern_score = pattern_matches * 1.5
            pattern_total = len(features.semantic_patterns) * 1.5
            
            # Calculate total weighted score and maximum possible score
            total_score = (keyword_score + structure_score + measurement_score + 
                         required_score + pattern_score)
            max_score = (keyword_total + structure_total + measurement_total + 
                        required_total + pattern_total)
            
            # Calculate confidence as percentage
            confidence = (total_score / max_score * 100) if max_score > 0 else 0
            class_scores[doc_class] = confidence
        
        # Find best class and confidence
        best_class = max(class_scores, key=class_scores.get)
        confidence = class_scores[best_class]
        
        # Return Unknown if confidence is below threshold
        if confidence < self.threshold:
            return "Unknown", confidence, class_scores
            
        return best_class, confidence, class_scores

    def evaluate(self, test_data: Dict[str, Dict[str, str]]) -> pd.DataFrame:
        """
        Evaluate classifier performance on test data.
        
        Args:
            test_data: Dictionary of test documents with their true labels
            
        Returns:
            DataFrame with evaluation metrics
        """
        predictions = []
        true_labels = []
        confidences = []
        
        for doc_id, doc_info in test_data.items():
            pred_class, confidence, _ = self.classify_document(doc_info['text'])
            print(f"Document: {doc_id} - Classified as: {pred_class} with confidence: {confidence}%")
            predictions.append(pred_class)
            true_labels.append(doc_info['label'])
            confidences.append(confidence)
        
        # Generate classification report
        report = classification_report(true_labels, predictions, output_dict=True)
        df_report = pd.DataFrame(report).transpose()
        
        # Add average confidence per class
        confidence_series = pd.Series(confidences, index=true_labels)
        avg_confidences = confidence_series.groupby(level=0).mean()
        df_report['avg_confidence'] = avg_confidences
        
        return df_report

    def save_rules(self, filepath: str):
        """Save classification rules to a JSON file."""
        rules_dict = {class_name: {
            'keywords': features.keywords,
            'structure_starts': features.structure_starts,
            'measurements': features.measurements,
            'required_fields': features.required_fields,
            'semantic_patterns': features.semantic_patterns
        } for class_name, features in self.rules.items()}
        
        with open(filepath, 'w') as f:
            json.dump(rules_dict, f, indent=2)

    def load_rules(self, filepath: str):
        """Load classification rules from a JSON file."""
        with open(filepath, 'r') as f:
            rules_dict = json.load(f)
            
        self.rules = {
            class_name: DocumentFeatures(**features)
            for class_name, features in rules_dict.items()
        }

In [3]:
# Initialize classifier
classifier = RulesBasedClassifier(threshold=0.4)

# Save/load rules
# classifier.save_rules('classification_rules.json')
classifier.load_rules('classification_rules.json')

# Classify a single document
class_label, confidence, all_scores = classifier.classify_document(cleaned_results[list(cleaned_results.keys())[0]]['text'])

# Evaluate on test data
test_data = cleaned_results
evaluation_results = classifier.evaluate(test_data)


Document: Compliance Report 4.pdf - Classified as: Compliance with confidence: 43.037974683544306%
Document: Compliance Report 1.pdf - Classified as: Compliance with confidence: 40.50632911392405%
Document: Compliance Report 2.pdf - Classified as: Compliance with confidence: 45.56962025316456%
Document: Compliance Report 3.pdf - Classified as: Compliance with confidence: 30.37974683544304%
Document: Sleep Study Report 3.pdf - Classified as: Sleep with confidence: 29.333333333333332%
Document: Sleep Study Report 2.pdf - Classified as: Sleep with confidence: 26.666666666666668%
Document: Sleep Study Report 1.pdf - Classified as: Sleep with confidence: 34.66666666666667%
Document: Sleep Study Report 4.pdf - Classified as: Sleep with confidence: 14.666666666666666%
Document: Order 3.pdf - Classified as: Order with confidence: 49.18032786885246%
Document: Delivery Ticket 2.pdf - Classified as: Delivery with confidence: 40.0%
Document: Physician Notes 1.pdf - Classified as: Physician with co

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [4]:
evaluation_results

Unnamed: 0,precision,recall,f1-score,support,avg_confidence
Compliance,1.0,1.0,1.0,4.0,39.873418
Delivery,0.5,1.0,0.666667,2.0,40.0
Order,1.0,0.25,0.4,4.0,30.246235
Physician,0.5,0.75,0.6,4.0,13.273632
Prescription,0.0,0.0,0.0,3.0,18.40796
Sleep,0.8,1.0,0.888889,4.0,26.333333
Unknown,0.0,0.0,0.0,0.0,
accuracy,0.666667,0.666667,0.666667,0.666667,
macro avg,0.542857,0.571429,0.507937,21.0,
weighted avg,0.67619,0.666667,0.613757,21.0,


# Analysis

### As we have less data, generalising and having enough rules for the future is difficult. Still, we can see that the rules are working.
### The Compliance class is being classified correctly most of the time.
### The Sleep class is being classified correctly most of the time.
### However, the Order class, Delivery, Physician and Prescription classes are not being classified correctly.

# Old Code 


In [5]:
"""
import re

# Define rules for each class
rules = {
    "Compliance": {
        "keywords": ["compliance", "usage", "period", "percentage", "met"],
        "structure_starts": ["AirView", "Compliance Report"],
        "measurements": ["cmH20", "L/min"],
        "required_fields": ["usage days", "compliance percentage"]
    },
    "Sleep": {
        "keywords": ["sleep", "study", "apnea", "diagnostic", "polysomnography"],
        "structure_starts": ["MUSC", "MEDICAL UNIVERSITY"],
        "measurements": ["cm", "hours"],
        "required_fields": ["sleep study report", "patient name", "apnea index"]
    },
    "Order": {
        "keywords": ["order", "equipment", "supply", "authorized"],
        "structure_starts": ["MRN", "order date"],
        "required_fields": ["order", "MRN", "date"]
    },
    "Delivery": {
        "keywords": ["delivery", "receipt", "equipment", "supplied"],
        "structure_starts": ["DELIVERY RECEIPT"],
        "required_fields": ["name", "equipment"]
    },
    "Physician": {
        "keywords": ["assessment", "diagnosis", "examination", "treatment"],
        "structure_starts": ["Follow up:", "Physician's notes"],
        "required_fields": ["patient name", "physician", "assessment"]
    },
    "Prescription": {
        "keywords": ["rx", "prescribed", "dosage", "prescription"],
        "measurements": ["MG"],
        "required_fields": ["dosage", "prescription"]
    }
}

# Function to classify a document based on rules
def classify_document(text):
    class_scores = {}
    
    for doc_class, features in rules.items():
        score = 0
        total_criteria = 0
        
        # Check keywords
        keyword_matches = sum(1 for keyword in features.get("keywords", []) if re.search(rf'\b{keyword}\b', text, re.IGNORECASE))
        score += keyword_matches
        total_criteria += len(features.get("keywords", []))
        
        # Check starting structure (if defined)
        structure_match = any(text.startswith(start) for start in features.get("structure_starts", []))
        if features.get("structure_starts"):
            score += int(structure_match)  # Add 1 if structure match found
            total_criteria += 1

        # Check for specific required fields
        required_field_matches = sum(1 for field in features.get("required_fields", []) if re.search(rf'\b{field}\b', text, re.IGNORECASE))
        score += required_field_matches
        total_criteria += len(features.get("required_fields", []))

        # Check for measurements if applicable
        measurement_matches = sum(1 for measure in features.get("measurements", []) if re.search(rf'\b{measure}\b', text, re.IGNORECASE))
        score += measurement_matches
        total_criteria += len(features.get("measurements", []))

        # Calculate confidence score as a percentage
        confidence = (score / total_criteria) * 100 if total_criteria > 0 else 0
        class_scores[doc_class] = confidence

    # Select class with the highest confidence score
    best_class = max(class_scores, key=class_scores.get)
    best_confidence = class_scores[best_class]
    
    return best_class, best_confidence, class_scores# Let's test the function with a sample document
sample_text = cleaned_results[list(cleaned_results.keys())[0]]['text']
classified_class, confidence, all_scores = classify_document(sample_text)

print(f"Classified as: {classified_class} with confidence: {confidence}%")
print("Detailed scores:", all_scores)for doc_name, doc_data in cleaned_results.items():
    classified_class, confidence, all_scores = classify_document(doc_data['text'])
    if classified_class != doc_data['label']:
        print(f"Document: {doc_name} - Classified as: {classified_class} with confidence: {confidence}%")   
# Analysying which class was classified correctly and which was not

for doc_name, doc_data in cleaned_results.items():
    classified_class, confidence, all_scores = classify_document(doc_data['text'])
    if classified_class == doc_data['label']:
        print(f"Document: {doc_name} - Classified as: {classified_class} with confidence: {confidence}%")   

"""

'\nimport re\n\n# Define rules for each class\nrules = {\n    "Compliance": {\n        "keywords": ["compliance", "usage", "period", "percentage", "met"],\n        "structure_starts": ["AirView", "Compliance Report"],\n        "measurements": ["cmH20", "L/min"],\n        "required_fields": ["usage days", "compliance percentage"]\n    },\n    "Sleep": {\n        "keywords": ["sleep", "study", "apnea", "diagnostic", "polysomnography"],\n        "structure_starts": ["MUSC", "MEDICAL UNIVERSITY"],\n        "measurements": ["cm", "hours"],\n        "required_fields": ["sleep study report", "patient name", "apnea index"]\n    },\n    "Order": {\n        "keywords": ["order", "equipment", "supply", "authorized"],\n        "structure_starts": ["MRN", "order date"],\n        "required_fields": ["order", "MRN", "date"]\n    },\n    "Delivery": {\n        "keywords": ["delivery", "receipt", "equipment", "supplied"],\n        "structure_starts": ["DELIVERY RECEIPT"],\n        "required_fields": ["