## Extract Text from Document


In [101]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [102]:
from pdf2image import convert_from_path
import pytesseract
def extract_text_from_doc(pdf_path, start_page=10):
    images = convert_from_path(pdf_path)
    
    # Slice only the pages you want
    selected_images = images[start_page:]
    
    extracted_text = ""
    for i, image in enumerate(selected_images, start=start_page):
        print(f"Processing page {i + 1}")
        text = pytesseract.image_to_string(image)
        extracted_text += f"\n\n=== Page {i + 1} ===\n{text}"
    
    return extracted_text


In [103]:
# starting from page-10 to avoid appendix and other unnecessary pages 
text = extract_text_from_doc("q2_pdf.pdf", start_page=10)
text

Processing page 11
Processing page 12
Processing page 13
Processing page 14
Processing page 15
Processing page 16
Processing page 17
Processing page 18
Processing page 19
Processing page 20
Processing page 21
Processing page 22
Processing page 23
Processing page 24
Processing page 25
Processing page 26
Processing page 27
Processing page 28
Processing page 29
Processing page 30
Processing page 31
Processing page 32
Processing page 33
Processing page 34
Processing page 35
Processing page 36
Processing page 37
Processing page 38
Processing page 39
Processing page 40
Processing page 41
Processing page 42
Processing page 43
Processing page 44
Processing page 45
Processing page 46
Processing page 47
Processing page 48
Processing page 49
Processing page 50
Processing page 51
Processing page 52
Processing page 53
Processing page 54
Processing page 55
Processing page 56
Processing page 57
Processing page 58
Processing page 59
Processing page 60
Processing page 61
Processing page 62
Processing p

'\n\n=== Page 11 ===\nProtocol number DNDi-CH-E1224-003\nMay 04, 2018. Version 5.0.\n\nPROTOCOL SUMMARY\n\nProtocol Title\n\nDouble-blind, Double-dummy, Phase 2 Randomized, Multicenter, Proof-of-\nConcept, Safety and Efficacy Trial to Evaluate Different Oral Benznidazole\nMonotherapy and Benznidazole/E1224 Combination Regimens for the Treatment\nof Adult Patients with Chronic Indeterminate Chagas Disease\n\nPhase\n\nInvestigational — Phase 2 trial\n\nIndication\n\nChronic Indeterminate Chagas Disease\n\nProtocol\nNumber\n\nDNDi-CH-E1224-003\n\nBackground\nInformation\nand Trial\nRationale\n\nThe current treatment for Chagas disease has significant limitations, including long\ntreatment durations, safety and tolerability concerns and is currently limited to two\nnitro-heterocyclic drugs, nifurtimox and benznidazole (BZN). BZN, a nitroimidazole\nintroduced by Roche in 1971, is marketed by Laboratério Farmacéutico do Estado\nde Pernambuco S/A — LAFEPE and Laboratorio ELEA — Argentina. It 

## Extract Primary & secondary objectives

In [104]:
import re
def extract_outcomes(text):
    primary_objectives = re.search(r'Primary Objectives:(.*?):', text, re.DOTALL)
    
    if primary_objectives:
        primary_objectives = primary_objectives.group(1).strip()
    
    # Extract secondary objectives
    secondary_objectives = re.search(r'Secondary Objectives:(.*?):', text, re.DOTALL)
    if secondary_objectives:
        secondary_objectives = secondary_objectives.group(1).strip()
    primary_objectives = re.sub(r'^\s*e\b', '-', primary_objectives)
    secondary_objectives = re.sub(r'^\s*e\b', '-', secondary_objectives)
    secondary_objectives = re.sub(r'\ne', '\n-', secondary_objectives)
    return primary_objectives, secondary_objectives


In [105]:
import textwrap
def format_primary_outcome(text):
    # Replace line breaks and multiple spaces with a single space
    cleaned_text = ' '.join(text.split())

    # Optional: Wrap text at 100 characters per line for readability
    wrapped_text = textwrap.fill(cleaned_text, width=100)

    return wrapped_text


In [106]:
import textwrap
primary, secondary = extract_outcomes(text)
primary=format_primary_outcome(primary)
print("Primary Outcomes:\n", textwrap.fill(primary,width=100,replace_whitespace=False))
print("\nSecondary Outcomes:\n", secondary)

Primary Outcomes:
 - To determine the efficacy of different dosing regimens of orally administered BZN and BZN/E1224 in
individuals with chronic indeterminate CD, by determining the proportion of patients who convert
from positive to negative in serial, qualitative PCR test results (3 negative PCR results) at end of
treatment (EOT) and sustain parasitological clearance at 6 months of follow-up, in comparison to
placebo. Secondary Objectives

Secondary Outcomes:
 - To measure the reduction in parasite load at weeks 1, 2, 3, 4, 6, 10, 12 and
at 4, 6 and 12 months follow-up, as measured by quantitative PCR.

- To assess the time to parasite DNA clearance (below the quantitative PCR
[qPCR] Limit of Detection [LOD]) for each of the regimens

- To assess the sustained parasitological response at week 12, and 12
months for each of the regimens, in comparison with placebo.

- To assess the time to sustained clearance of parasitemia for each of the
treatment regimens.

- To determine the effica

## extract trial duration and age

In [107]:
import re

def extract_trial_duration(text):
    # Regular expression to find the total duration after "total duration" and stop at the time unit
    duration_match = re.search(r'total duration.*?(\d+\s*(months|weeks|days))', text, re.IGNORECASE)
    
    if duration_match:
        total_duration = duration_match.group(1)  # Extract the duration and time unit
    else:
        total_duration = None
    
    return total_duration

def extract_age_range(text):

    pattern = r'(?:age)\s*(?:>|greater than)\s*(\d{1,2})\s*(?:to|and|-)\s*(?:<|less than)\s*(\d{1,2})\s*(?:year|years|yrs)?'   
    match = re.search(pattern, text, re.IGNORECASE)
    if match:
        return int(match.group(1)), int(match.group(2))
    return None, None


In [108]:
duration = extract_trial_duration(text)
min_age, max_age = extract_age_range(text)


In [109]:
print(f"Total Duration: {duration}")
print(f"Age Range: {min_age} to {max_age} years")

Total Duration: 13 months
Age Range: 18 to 50 years
