# UNDP Policies Scraper

This code crawls the policy pages of the UNDP policy site (popp.undp.org), downloads the document attached to each page whose breadcrumb places it under "Human Resources Management", converts the downloaded DOCX files into a single JSON file, and computes summary statistics (minimum, maximum, mean, and standard deviation) of the documents' word counts.

In [1]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import docx
import json
import statistics

In [2]:

# Function to get direct sublevels from a given URL
def get_direct_sublevels(url):
    direct_sublevels = set()  # Set to store direct sublevel URLs
    response = requests.get(url)  # Make a GET request to the URL
    soup = BeautifulSoup(response.content, 'html.parser')  # Parse the HTML content
    links = soup.find_all('a', href=True)  # Find all anchor tags with href attribute
    for link in links:
        href = link['href']
        absolute_url = urljoin(url, href)  # Convert relative URL to absolute URL
        if absolute_url.startswith(url) and absolute_url != url:
            direct_sublevels.add(absolute_url)  # Add the sublevel URL to the set
    return direct_sublevels

In [3]:
# Function to download a file from a sublevel URL
def download_file(sublevel_url, download_folder):
    os.makedirs(download_folder, exist_ok=True)  # Create download folder if it doesn't exist
    response = requests.get(sublevel_url)  # Make a GET request to the sublevel URL
    if response.ok:
        soup = BeautifulSoup(response.content, 'html.parser')  # Parse the HTML content
        # Select the breadcrumb anchor to check if the page is related to HRM
        breadcrumb_anchor = soup.select_one('.breadcrumb > ul:nth-child(1) > li:nth-child(2) > a:nth-child(1)')
        if breadcrumb_anchor and 'aria-label' in breadcrumb_anchor.attrs and breadcrumb_anchor['aria-label'] == "Human Resources Management":
            # Find the download section in the policy document page
            download_div = soup.find('div', class_='main-policy document-download-card')
            if download_div:
                # Check for span with specified classes to find the download button
                download_span = download_div.find('span', class_=lambda value: value and 'download-btn-position' in value.split())
                if download_span:
                    download_link = download_span.find('a', href=True)['href']
                    file_url = urljoin(sublevel_url, download_link)  # Create the full file URL
                    # Extract filename from sublevel URL
                    filename = sublevel_url.split('/')[-1] + ".docx"
                    file_path = os.path.join(download_folder, filename)
                    # Download and save the file
                    with open(file_path, 'wb') as f:
                        f.write(requests.get(file_url).content)
                    print(f"Downloaded {filename} to {download_folder}")
                else:
                    print(f"No download link found on {sublevel_url}")
            else:
                print(f"No download section found on {sublevel_url}")
        else:
            print(f"Page {sublevel_url} is not HRM")
    else:
        print(f"Failed to retrieve content from {sublevel_url}. Status code: {response.status_code}")

In [4]:
# Base URL of the policy pages
base_url = "https://popp.undp.org/policy-page/"

# Get direct sublevel URLs
direct_sublevels = get_direct_sublevels(base_url)
download_folder = "policies"  # Folder to save downloaded policy documents

In [None]:
# Download files from each sublevel URL
for sublevel_url in direct_sublevels:
    download_file(sublevel_url, download_folder)

In [None]:
# Function to convert DOCX files to JSON format
def convert_docx_to_json(docx_folder, json_file):
    data = {}

    # Loop through DOCX files in the input folder
    for root, dirs, files in os.walk(docx_folder):
        for file in files:
            if file.endswith(".docx"):
                docx_file = os.path.join(root, file)
                # Load the DOCX file
                doc = docx.Document(docx_file)
                # Extract text from paragraphs and remove newline characters
                text = ' '.join(paragraph.text for paragraph in doc.paragraphs)
                # Extract filename without extension
                filename = os.path.splitext(os.path.basename(docx_file))[0]
                # Add text to dictionary
                data[filename] = text

    # Write data to JSON file
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

In [None]:
# Define input folder and output JSON file
docx_folder = "policies"
json_file = "policies.json"

# Convert DOCX files to JSON
convert_docx_to_json(docx_folder, json_file)

In [6]:
# Function to calculate statistics for word counts in dictionary values
def calculate_stats(data):
    word_counts = [len(str(value).split()) for value in data.values()]
    min_word_count = min(word_counts)
    max_word_count = max(word_counts)
    avg_word_count = sum(word_counts) / len(word_counts)
    stddev_word_count = statistics.stdev(word_counts)
    return min_word_count, max_word_count, avg_word_count, stddev_word_count

# Load JSON data from a file
with open('policies.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Calculate statistics
min_word_count, max_word_count, avg_word_count, stddev_word_count = calculate_stats(data)

# Print the results
print(f"Minimum word count: {min_word_count}")
print(f"Maximum word count: {max_word_count}")
print(f"Average word count: {avg_word_count:.2f}")
print(f"Standard Deviation: {stddev_word_count:.2f}")

Minimum word count: 85
Maximum word count: 17288
Average word count: 2405.16
Standard Deviation: 3364.75
