In [1]:

%pprint
import os
import sys

# Make the sibling ``py`` folder importable before any project import runs.
# ``os`` is imported here (rather than relying on a later cell's ``osp``) so
# this cell works on a fresh kernel; presumably this folder is where the
# FRVRS package imported in the next cell lives - verify against the repo layout.
py_folder = os.path.join(os.pardir, 'py')
if (py_folder not in sys.path): sys.path.insert(1, py_folder)

Pretty printing has been turned OFF


In [2]:

import re

from PyPDF2 import PdfReader

from FRVRS import (nu, fu, osp, DataFrame, display)

In [3]:

def convert(file_path, verbose=False):
    """
    Read a PDF from disk and hand back the text of all its pages as one string.

    Parameters:
       file_path: Path of the PDF file to open (opened in binary mode).
       verbose: When truthy, print the character count of the extracted text.

    Returns:
       The text extracted from every page, concatenated in page order with
       nothing inserted between pages.
    """
    with open(file_path, 'rb') as pdf_file:
        # Extract while the file is still open; the reader pulls from the handle
        page_texts = [page.extract_text() for page in PdfReader(pdf_file).pages]
    text = ''.join(page_texts)
    if verbose:
        print(f'Text length for {file_path} is {len(text):,} characters.')

    return text

In [4]:

def split_by_regex(lines_list, regex_pattern, ignores_list=None):
    """
    Splits a list of strings into sublists based on a regular expression match.

    A new sublist is started at every line that ``regex_pattern`` matches
    (anchored at the start of the line, as with ``Pattern.match``), so each
    sublist after the first begins with its matching line. Lines that appear
    before the first match are returned as the first sublist.

    Parameters:
       lines_list: A list of strings.
       regex_pattern: A compiled regular expression object, or a pattern
           string that will be compiled here.
       ignores_list: Optional collection of lines to leave out of the result
           (compared by equality). An ignored line that matches the pattern
           still starts a new sublist; it just is not added to it. Defaults
           to ignoring nothing.

    Returns:
       A list of non-empty sublists, where each sublist contains the lines
       between occurrences of the regex match, with each subsequent sublist's
       first element being the match.
    """
    # Accept a raw pattern string as a convenience for callers
    if isinstance(regex_pattern, str): regex_pattern = re.compile(regex_pattern)

    # None instead of a mutable [] default, so no list is shared between calls
    ignored_lines = () if ignores_list is None else ignores_list

    sublists = []
    current_sublist = []
    for line in lines_list:
        if regex_pattern.match(line):

            # Close out the section collected so far (empty ones are dropped)
            if current_sublist: sublists.append(current_sublist)
            current_sublist = []
        if line not in ignored_lines: current_sublist.append(line)
    if current_sublist: sublists.append(current_sublist)

    return sublists

In [5]:

# Lines to drop while splitting, and the marker that prefixes each list item
ignores_list = [
    'DARPA ITM TA3  January 2024',
    'Graphic',
    'Medical Decision Maker Information',
]
item_prefix = '- '

# Header lines that open a scenario, patient, or medic section
scenario_regex = re.compile(r'SCENARIO \d')
patient_regex = re.compile(r'Patient -[A-Z]')
medic_regex = re.compile(r'Medic -\d+')

# Sub-headings that divide each kind of section into its detail parts
scenario_details_regex = re.compile(r'Situation|Available Resources')
patient_details_regex = re.compile(r'Description|Vitals|Required Decision')
medic_details_regex = re.compile(r'Actions|Explanation')

# Key/value separator (':' or '=' with optional surrounding whitespace)
kv_regex = re.compile(r'\s*[:=]\s*')

# Runs of anything other than lowercase letters and digits
ascii_regex = re.compile('[^a-z0-9]+')

In [6]:

# Load the parsed scenarios from the pickle cache when present; otherwise parse
# the PDF into one row per scenario / patient / medic section and cache it
if nu.pickle_exists('experiment_scenarios_df'): experiment_scenarios_df = nu.load_object('experiment_scenarios_df')
else:
    file_path = osp.abspath('../data/pdf/ExperimentScenarios_TA3 Jan24.pdf')
    text = convert(file_path, verbose=True)

    # Work with stripped, non-blank lines only
    lines_list = [line.strip() for line in text.split('\n') if line.strip()]

    # Split by scenario, then by patient, then by medic; each innermost sublist
    # starts with whichever header opened it (or with leading, header-less lines)
    scenario_sublists = split_by_regex(lines_list, scenario_regex, ignores_list)
    rows_list = []
    for scenario_list in scenario_sublists:
        patient_sublists = split_by_regex(scenario_list, patient_regex, ignores_list)
        for patient_list in patient_sublists:
            medic_sublists = split_by_regex(patient_list, medic_regex, ignores_list)
            for medic_list in medic_sublists:
                row_dict = {}
                header_element = medic_list[0]
                if scenario_regex.match(header_element):
                    row_dict['scenario_number'] = int(header_element.split(' ')[1])

                    # Assumes the Situation part comes before Available Resources
                    # and both are present - TODO confirm against the PDF
                    details_sublists = split_by_regex(medic_list, scenario_details_regex)
                    row_dict['scenario_situation'] = ' '.join(details_sublists[1][1:])
                    row_dict['scenario_resources'] = ' '.join(details_sublists[2][1:])
                elif patient_regex.match(header_element):
                    row_dict['patient_letter'] = header_element.split('-')[1]

                    # Assumes Description then Vitals, optionally followed by
                    # Required Decision - TODO confirm against the PDF
                    details_sublists = split_by_regex(medic_list, patient_details_regex)
                    row_dict['patient_description'] = ' '.join(details_sublists[1][1:])
                    patient_vitals_list = details_sublists[2][1:]
                    for kv_pair in patient_vitals_list:

                        # Each vitals line is "<name> : <value>" (or '='); the name
                        # becomes a snake_case column prefixed with 'vitals_'
                        kv_list = kv_regex.split(kv_pair)
                        key = 'vitals_' + ascii_regex.sub('_', kv_list[0].rstrip('• ').lower()).strip('_')
                        value = kv_list[1].rstrip(' ')
                        row_dict[key] = value
                    if (len(details_sublists) == 4): row_dict['required_decision'] = ' '.join(details_sublists[3][1:])
                elif medic_regex.match(header_element):
                    row_dict['medic_number'] = int(header_element.split('-')[1])
                    details_sublists = split_by_regex(medic_list, medic_details_regex)
                    if (len(details_sublists) == 3):

                        # NOTE: str.lstrip treats item_prefix as a set of characters,
                        # so every leading '-' and ' ' is removed, not just one prefix
                        medic_explanation = ' '.join(details_sublists[2][1:]).lstrip(item_prefix)
                        medic_explanation = re.sub(' +', ' ', medic_explanation)
                        row_dict['medic_explanation'] = medic_explanation

                    # Re-join the wrapped lines, then split them into individual actions
                    actions_list = [a.strip() for a in ' '.join(details_sublists[1][1:]).split(item_prefix) if a]
                    row_dict['medic_actions'] = actions_list
                rows_list.append(row_dict)

    # Each row only carries its own section's keys; forward-fill pushes scenario
    # and patient values down onto later rows and back-fill covers leading gaps
    experiment_scenarios_df = DataFrame(rows_list).ffill().bfill()
    print(experiment_scenarios_df.shape)
    print(experiment_scenarios_df.columns.tolist())
    nu.store_objects(experiment_scenarios_df=experiment_scenarios_df)
    nu.save_data_frames(experiment_scenarios_df=experiment_scenarios_df)

In [7]:

# Preview a random 5-row by 5-column slice: sample rows, transpose so the
# columns can be sampled the same way, then transpose back. No seed is set,
# so the slice differs on every run.
experiment_scenarios_df.sample(5).T.sample(5).T

Unnamed: 0,vitals_heart_rate,vitals_pulse_quality,scenario_number,patient_letter,medic_number
14,"140, weak thready pulses",70/palp,4.0,C,44.0
7,150,"Normal in unaffected limb, carotid pulse also ...",2.0,A,101.0
8,150,"Normal in unaffected limb, carotid pulse also ...",3.0,A,101.0
0,130 (weak),"No palpable radial pulse, weak carotid pulse.",1.0,A,77.0
11,"140, weak thready pulses",70/palp,3.0,C,101.0


In [8]:

# Summarize each column of the parsed scenarios; judging by the output below,
# the helper reports dtype, blank/unique/zero counts and min/max per column
column_descriptions_df = nu.get_column_descriptions(experiment_scenarios_df)
column_descriptions_df

Unnamed: 0,column_name,dtype,count_blanks,count_uniques,count_zeroes,has_dates,min_value,max_value,only_integers
0,scenario_number,float64,0,4.0,0,True,1.0,4.0,True
1,medic_number,float64,0,8.0,0,True,33.0,101.0,True
2,scenario_situation,object,0,4.0,0,False,"Remote rural location, 2 hours from nearest me...","Two critical patients, both in excruciating pa...",
3,scenario_resources,object,0,4.0,0,False,One dose of Morphine 10 mg IV/IO,You have run out of supplies,
4,patient_letter,object,0,3.0,0,False,A,C,
5,patient_description,object,0,7.0,0,False,24 yo male dressed in business attire. The pat...,"Male patient seated on subway platform , obvio...",
6,vitals_ability_to_follow_commands,object,0,3.0,0,False,No,Yes,
7,vitals_respiratory_effort,object,0,6.0,0,False,20/min,"Shallow, 4/min",
8,vitals_pulse_quality,object,0,7.0,0,False,120/86,unable to obtain,
9,vitals_heart_rate,object,0,6.0,0,False,100,150,
