In [1]:
import requests
import numpy as np
import json
import pandas as pd
from datetime import datetime
import time
import os
from pathlib import Path
import re

In [7]:
def fetch_wb_documents():
    """API call to get metadata"""
    base_url = "https://search.worldbank.org/api/v3/wds"
    fields = "id,docdt,docty,majdocty,url,count,lang,repnb,projid,alt_title,display_title_exact,display_title"

    offset = 0
    rows_per_page = 100

    print("Fetching data from World Bank API v3...")

    params = {
        "format": "json",
        "rows": rows_per_page,
        "docty_exact": "Implementation Completion Report Review",
        "majdocty_exact": "Project Documents",
        "strdate": "2019-01-01",
        "enddate": "2025-04-15",
        "fl": fields,
        "os": offset,
    }

    response = requests.get(base_url, params=params)
    response.raise_for_status()
    all_data = response.json()

    total_docs = all_data.get("total", 0)
    fetched_count = len(all_data.get("documents", {}))

    print(f"Fetched {fetched_count} of {total_docs} documents")

    while fetched_count < total_docs:
        offset += rows_per_page

        params["os"] = offset

        response = requests.get(base_url, params=params)
        response.raise_for_status()
        page_data = response.json()

        page_docs = page_data.get("documents", {})

        all_data["documents"].update(page_docs)

        fetched_count = len(all_data.get("documents", {}))
        print(f"Fetched {fetched_count} of {total_docs} documents")

        time.sleep(0.5)

    print(f"Total documents fetched: {fetched_count}")
    return all_data


def extract_urls(doc_json):
    meta_d = doc_json.get("documents")
    meta_values = [
        doc for key, doc in meta_d.items() if key != "facets" and isinstance(doc, dict)
    ]
    urls = [doc.get("pdfurl", "") for doc in meta_values]
    urls_2 = [u.replace("pdf", "txt") for u in urls]
    ids = [doc.get("id", "") for doc in meta_values]

    url_dict = dict(zip(ids, urls_2))

    return url_dict


def download_txts(url_dictionary, path):
    os.makedirs(path, exist_ok=True)
    for file_id, url in url_dictionary.items():
        try:
            if not url.strip():
                raise ValueError("Empty URL")
            response = requests.get(url)
            response.raise_for_status()  # Raises HTTPError for bad status
            filepath = os.path.join(path, f"{file_id}.txt")
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(response.text)
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"Failed to download {file_id}: {e}")

In [None]:
iccr_docs = fetch_wb_documents()
url_d = extract_urls(iccr_docs)

Fetching data from World Bank API v3...
Fetched 101 of 1623 documents
Fetched 201 of 1623 documents
Fetched 301 of 1623 documents
Fetched 401 of 1623 documents
Fetched 501 of 1623 documents
Fetched 601 of 1623 documents
Fetched 701 of 1623 documents
Fetched 801 of 1623 documents
Fetched 901 of 1623 documents
Fetched 1001 of 1623 documents
Fetched 1101 of 1623 documents
Fetched 1201 of 1623 documents
Fetched 1301 of 1623 documents
Fetched 1401 of 1623 documents
Fetched 1501 of 1623 documents
Fetched 1601 of 1623 documents
Fetched 1624 of 1623 documents
Total documents fetched: 1624


In [None]:
download_txts(url_d, "../02_inter_data")

Failed to download 33632066: Empty URL


In [None]:
def clean_text(text):
    """
    Clean extracted text
    """
    if not text or text == "Not found":
        return text

    text = re.sub(r"Page \d+ of \d+", "", text)

    text = re.sub(r"\(p\. \d+\)", "", text)

    text = re.sub(r"\f", "", text)

    text = re.sub(r"\n{3,}", "\n\n", text)

    text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]", "", text)

    text = text.strip()

    return text


def parse_me_sections(file_path):
    """
    parse, find and save m&e sections
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()

        # Define patterns to match each section
        design_pattern = r"a\.\s+M&E\s+Design\s+(.*?)(?=b\.\s+M&E\s+Implementation)"
        implementation_pattern = (
            r"b\.\s+M&E\s+Implementation\s+(.*?)(?=c\.\s+M&E\s+Utilization)"
        )
        utilization_pattern = (
            r"c\.\s+M&E\s+Utilization\s+(.*?)(?=M&E\s+Quality\s+Rating)"
        )
        rating_pattern = r"M&E\s+Quality\s+Rating\s+(\w+)"

        # Extract sections using regex with DOTALL flag to match across multiple lines
        design_match = re.search(design_pattern, content, re.DOTALL)
        implementation_match = re.search(implementation_pattern, content, re.DOTALL)
        utilization_match = re.search(utilization_pattern, content, re.DOTALL)
        rating_match = re.search(rating_pattern, content)

        results = {
            "M&E Design": clean_text(
                design_match.group(1).strip() if design_match else "Not found"
            ),
            "M&E Implementation": clean_text(
                implementation_match.group(1).strip()
                if implementation_match
                else "Not found"
            ),
            "M&E Utilization": clean_text(
                utilization_match.group(1).strip() if utilization_match else "Not found"
            ),
            "M&E Quality Rating": clean_text(
                rating_match.group(1).strip() if rating_match else "Not found"
            ),
        }

        return results

    except Exception as e:
        return {"Error": f"An error occurred: {str(e)}"}


def parse_directory(directory_path):
    """
    parse through all files in directory
    """
    all_results = {}
    try:
        file_paths = []
        for filename in os.listdir(directory_path):
            if filename.lower().endswith(".txt"):
                full_path = os.path.join(directory_path, filename)
                if os.path.isfile(full_path):
                    file_paths.append((filename, full_path))

        for filename, file_path in file_paths:
            clean_filename = filename.replace(".txt", "")
            file_results = parse_me_sections(file_path)

            # Add debug statement
            if not isinstance(file_results, dict):
                print(
                    f"Warning: parse_me_sections returned {type(file_results)} for {filename}"
                )

            all_results[clean_filename] = file_results

        return all_results

    except Exception as e:
        return {"Error": f"An error occurred while processing directory: {str(e)}"}
        return {}


def to_dataframe(all_results):
    """
    Convert nested dict to df
    """
    rows = []

    for doc_id, sections in all_results.items():
        # Check if sections is a dictionary
        if not isinstance(sections, dict):
            print(f"Warning: Expected dictionary for {doc_id}, got {type(sections)}")
            continue

        if "Error" in sections:
            continue

        row = {
            "document_id": doc_id,
            "design_text": sections.get("M&E Design", ""),
            "implementation_text": sections.get("M&E Implementation", ""),
            "utilization_text": sections.get("M&E Utilization", ""),
            "quality_rating": sections.get("M&E Quality Rating", ""),
        }
        rows.append(row)

    if not rows:
        return pd.DataFrame()  #

    df = pd.DataFrame(rows)
    return df

In [None]:
d_path = "../02_inter_data"
test_results = parse_directory(d_path)
df_test_results = to_dataframe(test_results)
df_test_results.to_csv("../01_data/txt_reports.csv", index=False)

In [None]:
df_test_results.head(10)

Unnamed: 0,document_id,design_text,implementation_text,utilization_text,quality_rating
0,31244746,There were three original key outcome indicato...,The ICR (paragraph 38) provides little detail ...,Data collected during implementation was used ...,Modest
1,32310135,The key outcome indicators (restoring electric...,"A dated covenant in the Legal Agreement, that ...",The project management unit utilized the M&E i...,Substantial
2,33978014,The M&E system was designed as a Results-Based...,The MDLF implemented the M&E system as designe...,M&E data informed progress against the DLIs an...,Substantial
3,33281277,Not found,Not found,Not found,Not found
4,34339832,Not found,Not found,Not found,Not found
5,30888660,The ICR (paragraph 71) notes that due to the e...,"During implementation, three key indicators we...",The ICR provides no information on whether the...,Modest
6,33580118,The PAD did not include a Theory of Change as ...,Implementation was overseen by the National Di...,"According to the ICR (paragraph 72) the ""M&E f...",Modest
7,34104048,The M&E design of the program was to use the g...,The Ministry of Agriculture and Rural Developm...,M&E data was used by the National Coordinating...,Substantial
8,32019383,M&E arrangements were generally adequate with ...,"According to the ICR, project implementation i...",M&E data were utilized for regular project pro...,Substantial
9,31030527,The main PDO-level results indicator is the in...,The MoPL had responsibility for coordinating a...,The M&E data was utilized at the international...,Substantial
