# Load the results



In [1]:
import json
results = json.load(open('selected_results.json', 'r', encoding='utf-8'))

In [2]:
results['texts']

{'Compliance Report 4.pdf': '0?\n\nResiVied 06/08/2023 - 07/07/2023\n\nAirView"\n\nCompliance Report\n\nInitial compliance period 06/08/2028 - 07/07/2023\nCompliance met Yes\nCompliance percentage 70%\nPayor Standard\nUsage days 22/30 days (73%)\n>= 4 hours 21 days (70%)\n<4 hours 1 days (3%)\n“Usage hours 149 hours 48 minutes\nAverage usage (total days) 5 hours 0 minutes\nAverage usage (days used) 6 hours 49 minutes\nMedian usage (days used) 6 hours 55 minutes\nTotal used hours (value since last reset - 07/07/2023) 276 hours:\nMirSanca AutoSe\nSerial number 23231350306\nMode CPAP\nSet pressure : 15 cemH20\nEPR Fulltime\nEPR level 3\nerap\nLeaks - L/min Median: 34,5 95th percentile: 62.6 Maximum: 72.7\nEvents per hour Al: 4.0 Hk: 21 AHI: 6.1\nApnea Index Central: 1.8 Obstructive: 0.4 Unknown: 1.7\nRERA Index 1.3\nCheyne-Stokes respiration (average duration per night) 7 minutes (2%)\n\nUsage - hours\n\n8 10 12 14 16 18 20 22 24 26 28 30 2. 4 6\n\nPrinted on 07/11/2023 - ResMed Airview v

# Adding ground truth labels

In [3]:
labeled_results = {}
for filename in results['texts']:
    labeled_results[filename] = {'text': results['texts'][filename], 'label': filename.split(' ')[0]}

In [4]:
labeled_results

{'Compliance Report 4.pdf': {'text': '0?\n\nResiVied 06/08/2023 - 07/07/2023\n\nAirView"\n\nCompliance Report\n\nInitial compliance period 06/08/2028 - 07/07/2023\nCompliance met Yes\nCompliance percentage 70%\nPayor Standard\nUsage days 22/30 days (73%)\n>= 4 hours 21 days (70%)\n<4 hours 1 days (3%)\n“Usage hours 149 hours 48 minutes\nAverage usage (total days) 5 hours 0 minutes\nAverage usage (days used) 6 hours 49 minutes\nMedian usage (days used) 6 hours 55 minutes\nTotal used hours (value since last reset - 07/07/2023) 276 hours:\nMirSanca AutoSe\nSerial number 23231350306\nMode CPAP\nSet pressure : 15 cemH20\nEPR Fulltime\nEPR level 3\nerap\nLeaks - L/min Median: 34,5 95th percentile: 62.6 Maximum: 72.7\nEvents per hour Al: 4.0 Hk: 21 AHI: 6.1\nApnea Index Central: 1.8 Obstructive: 0.4 Unknown: 1.7\nRERA Index 1.3\nCheyne-Stokes respiration (average duration per night) 7 minutes (2%)\n\nUsage - hours\n\n8 10 12 14 16 18 20 22 24 26 28 30 2. 4 6\n\nPrinted on 07/11/2023 - ResMed 

# Text cleaning and analysis: 

This code provides functions to clean and analyze text extracted from PDF files. The advanced_pdf_text_cleaning function normalizes text by removing unwanted characters, patterns (like phone numbers and emails), and specified "noise" words. It also standardizes the format (e.g., lowercase and whitespace cleanup) and filters out short texts. The batch_clean_pdf_texts function applies this cleaning process to a batch of texts and returns cleaned results. Finally, analyze_cleaned_texts calculates basic statistics on the cleaned text, such as average, minimum, and maximum text lengths, and the total number of files processed.










In [5]:
import re

def refined_clean_text(text):
    # Step 1: Remove unwanted special characters and symbols at the start
    text = re.sub(r'[^\w\s:/.,%-]+', '', text)
    
    # Step 2: Replace specific lingering artifacts (such as OCR errors)
    text = re.sub(r'\boor\b|\be\b|\baye\b|\beee\b', '', text)  # Remove specific unwanted words
    
    # Step 3: Remove repeated punctuation and excessive whitespace
    text = re.sub(r'\n+', '\n', text)  # Convert multiple newlines to a single newline
    text = re.sub(r'\s{2,}', ' ', text)  # Convert multiple spaces to a single space
    
    # Step 4: Add space after punctuation where missing
    text = re.sub(r'(?<=[.,])(?=\S)', ' ', text)  # Add space after commas and periods where needed
    
    # Step 5: Remove text following unwanted headers or patterns (for contact info, unwanted lines)
    contact_info_pattern = r'(Phone|Fax|Email):? [^\n]*\n'
    text = re.sub(contact_info_pattern, '', text)
    
    # Step 6: Remove any lingering symbols or non-word characters at the start of the text
    text = re.sub(r'^[^\w]+', '', text)

    # Step 7: Add space between numbers and units or percentages if missing
    text = re.sub(r'(\d)(cmH20|L/min|%)', r'\1 \2', text)  # Add space between number and units/percentage

    # Step 8: Ensure single space around punctuation (remove spaces before, add after if needed)
    text = re.sub(r'\s+([:,])', r'\1', text)  # Remove spaces before colons and commas
    text = re.sub(r'([:,])\s+', r'\1 ', text)  # Ensure single space after colons and commas

    # Step 9: Final trim of leading and trailing spaces
    text = text.strip()
    
    return text

# Apply cleaning function to each entry in labeled_results
cleaned_texts = {}
for filename in labeled_results:
    cleaned_texts[filename] = {'text': refined_clean_text(labeled_results[filename]['text']), 'label': labeled_results[filename]['label']}


In [6]:
cleaned_texts

{'Compliance Report 4.pdf': {'text': '0\nResiVied 06/08/2023 - 07/07/2023\nAirView\nCompliance Report\nInitial compliance period 06/08/2028 - 07/07/2023\nCompliance met Yes\nCompliance percentage 70 %\nPayor Standard\nUsage days 22/30 days 73 % 4 hours 21 days 70 %\n4 hours 1 days 3 %\nUsage hours 149 hours 48 minutes\nAverage usage total days 5 hours 0 minutes\nAverage usage days used 6 hours 49 minutes\nMedian usage days used 6 hours 55 minutes\nTotal used hours value since last reset - 07/07/2023 276 hours: MirSanca AutoSe\nSerial number 23231350306\nMode CPAP\nSet pressure: 15 cemH20\nEPR Fulltime\nEPR level 3\nerap\nLeaks - L/min Median: 34, 5 95th percentile: 62. 6 Maximum: 72. 7\nEvents per hour Al: 4. 0 Hk: 21 AHI: 6. 1\nApnea Index Central: 1. 8 Obstructive: 0. 4 Unknown: 1. 7\nRERA Index 1. 3\nCheyne-Stokes respiration average duration per night 7 minutes 2 %\nUsage - hours\n8 10 12 14 16 18 20 22 24 26 28 30 2. 4 6\nPrinted on 07/11/2023 - ResMed Airview version 4. 41. 0-9. 

In [7]:
for i in list(cleaned_texts.keys()):
    print(i)

Compliance Report 4.pdf
Compliance Report 1.pdf
Compliance Report 2.pdf
Compliance Report 3.pdf
Sleep Study Report 3.pdf
Sleep Study Report 2.pdf
Sleep Study Report 1.pdf
Sleep Study Report 4.pdf
Order 3.pdf
Delivery Ticket 2.pdf
Physician Notes 1.pdf
Prescription 4.pdf
Order 2.pdf
Physician Notes 3.pdf
Delivery Ticket 1.pdf
Physician Notes 2.pdf
Order 1.pdf
Prescription 2.pdf
Order 4.pdf
Physician Notes 4.pdf
Prescription 1.pdf


In [8]:
filename = 'Sleep Study Report 1.pdf'
print(labeled_results[filename]['text'])

7/17/2023 10:20 AM FROM: Fax null TO: 9548341807 PAGE: 027 OF 030

POLYSOMNOGRAPHY REPORT

Patient Demographics:
Patient Name: |
| Ficst Name:

i
Diagnostic Psg

9.48: 48 PM
[pir Bate eT Stopped Taz
Feighte Peg bs
BMI: 44.01 Fm arthn | - ca
Referring Provider: LMarien Perez Interpreting Physician:

Testing Type & Methods

Type of Test: Diagnostic PSG

Method: Polysomnography was conducted on the night of 6/22/2023. The following parameters were monitored:
Frontal, central and occipital EEG, electroculogram {EOG}, submentalis EMG, nasal and oral airflow, anterior tibialis
EMG, body position and electrocardiogram. Additionally, thoracic and abdominal movements were recorded by
inductance plethysmography, Oxygen saturation (¢p02) was monitored using a pulse oximeter, The tracing was
scored using 30 second epochs. Hypopneas were scored per AASM definition.

A Central Apnea was defined as a cessation of oral and nasal airflow with simultaneous cessations of respiratory
movements for at leas

In [9]:
print(cleaned_texts[filename]['text'])

7/17/2023 10:20 AM FROM: POLYSOMNOGRAPHY REPORT
Patient Demographics: Patient Name: Ficst Name: i
Diagnostic Psg
9. 48: 48 PM
pir Bate eT Stopped Taz
Feighte Peg bs
BMI: 44. 01 Fm arthn - ca
Referring Provider: LMarien Perez Interpreting Physician: Testing Type Methods
Type of Test: Diagnostic PSG
Method: Polysomnography was conducted on the night of 6/22/2023. The following parameters were monitored: Frontal, central and occipital EEG, electroculogram EOG, submentalis EMG, nasal and oral airflow, anterior tibialis
EMG, body position and electrocardiogram. Additionally, thoracic and abdominal movements were recorded by
inductance plethysmography, Oxygen saturation p02 was monitored using a pulse oximeter, The tracing was
scored using 30 second epochs. Hypopneas were scored per AASM definition.
A Central Apnea was defined as a cessation of oral and nasal airflow with simultaneous cessations of respiratory
movements for at least 10 seconds 2 respiratory cycles in children, An Obstructive

In [10]:
# save the cleaned texts to a json file
with open('cleaned_results.json', 'w', encoding='utf-8') as f:
    json.dump(cleaned_texts, f, ensure_ascii=False, indent=4)


# Future work:
1. Use LLMs to clean the data and perform the analysis 