In [12]:
import PyPDF2
import pandas as pd
import re

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one newline-separated string.

    Args:
        pdf_path: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        str: The extracted text of each page, in page order, joined with a
        newline. An empty string when the document has no pages.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction must happen while the file is still open. ``or ''`` keeps
        # a page that yields no text (None) from breaking the join below.
        page_texts = [page.extract_text() or '' for page in reader.pages]

    # Pages are joined with a newline so the last line of one page cannot fuse
    # with the first line of the next; downstream parsing is line based.
    return '\n'.join(page_texts)

def process_extracted_text(text):
    """Parse lab-report text into column-oriented lists of test results.

    The text is scanned line by line. A line matching the
    ``Collected: ... Received: ... Reported: ...`` header updates the dates
    that are attached to every result row found after it. A line that starts
    with one of the known test names produces one result row.

    Args:
        text: Report text, one entry per line (``'\\n'`` separated).

    Returns:
        dict: Keys 'Test', 'Value', 'Units', 'Reference Range',
        'Collected Date', 'Received Date' and 'Reported Date', each mapping to
        a list holding one string per result row. A field that could not be
        found on the line is stored as ''.
    """
    # Units that are fixed per test; any other test takes the token that
    # follows its value on the line.
    fixed_units = {
        'WBC': 'x 10(3)/uL',
        'PLATELET COUNT': 'x 10(3)/uL',
        'RBC': 'x 10(6)/uL',
        'HGB': 'g/dL',
        'HCT': '%',
        'MCV': 'fL',
    }

    results = {
        'Test': [],
        'Value': [],
        'Units': [],
        'Reference Range': [],
        'Collected Date': [],
        'Received Date': [],
        'Reported Date': []
    }

    current_dates = {}

    # Patterns for parsing dates and results
    date_pattern = re.compile(r'Collected:\s*(\d{2}/\d{2}/\d{4} \d{2}:\d{2} [APM]{2})\s*Received:\s*(\d{2}/\d{2}/\d{4} \d{2}:\d{2} [APM]{2})\s*Reported:\s*(\d{2}/\d{2}/\d{4})')
    test_pattern = re.compile(r'^(WBC|RBC|HGB|HCT|MCV|MCH|MCHC|RDW|PLATELET COUNT|MPV|Hemoglobin A1c)')
    value_pattern = re.compile(r'(\d+(\.\d+)?)')
    reference_range_pattern = re.compile(r'(\d+(\.\d+)?)\s*-\s*(\d+(\.\d+)?)')

    for line in text.split('\n'):
        # A date header only updates the running dates; it is not a result row.
        date_match = date_pattern.search(line)
        if date_match:
            current_dates['Collected Date'] = date_match.group(1)
            current_dates['Received Date'] = date_match.group(2)
            current_dates['Reported Date'] = date_match.group(3)
            continue

        test_match = test_pattern.search(line)
        if not test_match:
            continue

        parts = line.split()
        matched_name = test_match.group(1)
        name_width = len(matched_name.split())

        # Multi-word names ('PLATELET COUNT', 'Hemoglobin A1c') span several
        # whitespace tokens, so the name comes from the regex match. For
        # single-word names the first token is kept, because the pattern only
        # matches a prefix (e.g. 'MCH' for an 'MCHC' line).
        test_name = matched_name if name_width > 1 else parts[0]

        # Everything after the name tokens: value first, then (usually) units.
        fields = parts[name_width:]

        # A line holding only the test name has no value token.
        value_match = value_pattern.search(fields[0]) if fields else None
        test_value = value_match.group(1) if value_match else ''

        reference_range_match = reference_range_pattern.search(line)
        if reference_range_match:
            reference_range = f"{reference_range_match.group(1)} - {reference_range_match.group(3)}"
        else:
            reference_range = ''

        if test_name in fixed_units:
            units = fixed_units[test_name]
        elif len(fields) > 1:
            units = fields[1]
        else:
            units = ''

        results['Test'].append(test_name)
        results['Value'].append(test_value)
        results['Units'].append(units)
        results['Reference Range'].append(reference_range)
        results['Collected Date'].append(current_dates.get('Collected Date', ''))
        results['Received Date'].append(current_dates.get('Received Date', ''))
        results['Reported Date'].append(current_dates.get('Reported Date', ''))

    return results

def create_dataset(results):
    """Build a pandas DataFrame from the column-oriented ``results`` mapping.

    Args:
        results: Data accepted by the ``pd.DataFrame`` constructor; in this
            notebook, a dict mapping column names to equal-length lists.

    Returns:
        pd.DataFrame: The frame constructed from ``results``.
    """
    return pd.DataFrame(data=results)

# Input report. The path is relative, so it is resolved against the kernel's
# current working directory.
pdf_path = "pdf1.pdf"

# Read the raw text of the PDF into a single string
extracted_text = extract_text_from_pdf(pdf_path)

# Parse the raw text into column-oriented lists (test, value, units,
# reference range and the collected/received/reported dates)
lab_results = process_extracted_text(extracted_text)

# Turn the parsed columns into a DataFrame
dataset = create_dataset(lab_results)

# Show the first rows as plain text as a quick sanity check of the parse
print("Dataset Head:")
print(dataset.head())

# Persist the results; index=False leaves the DataFrame row index out of the CSV
dataset.to_csv("lab_results.csv", index=False)

Dataset Head:
  Test Value       Units Reference Range       Collected Date  \
0  WBC  6.13  x 10(3)/uL    4.00 - 10.10  07/17/2023 11:18 AM   
1  RBC  4.86  x 10(6)/uL     3.58 - 5.19  07/17/2023 11:18 AM   
2  HGB  13.3        g/dL     11.0 - 15.5  07/17/2023 11:18 AM   
3  HCT  41.0           %     31.5 - 44.8  07/17/2023 11:18 AM   
4  MCV  84.4          fL     78.0 - 98.0  07/17/2023 11:18 AM   

         Received Date Reported Date  
0  07/17/2023 11:27 PM    07/23/2023  
1  07/17/2023 11:27 PM    07/23/2023  
2  07/17/2023 11:27 PM    07/23/2023  
3  07/17/2023 11:27 PM    07/23/2023  
4  07/17/2023 11:27 PM    07/23/2023  


In [13]:
# Display the first rows of the parsed results; relies on `dataset` having
# been created by the previous cell.
dataset.head()

Unnamed: 0,Test,Value,Units,Reference Range,Collected Date,Received Date,Reported Date
0,WBC,6.13,x 10(3)/uL,4.00 - 10.10,07/17/2023 11:18 AM,07/17/2023 11:27 PM,07/23/2023
1,RBC,4.86,x 10(6)/uL,3.58 - 5.19,07/17/2023 11:18 AM,07/17/2023 11:27 PM,07/23/2023
2,HGB,13.3,g/dL,11.0 - 15.5,07/17/2023 11:18 AM,07/17/2023 11:27 PM,07/23/2023
3,HCT,41.0,%,31.5 - 44.8,07/17/2023 11:18 AM,07/17/2023 11:27 PM,07/23/2023
4,MCV,84.4,fL,78.0 - 98.0,07/17/2023 11:18 AM,07/17/2023 11:27 PM,07/23/2023
