# Extracting Education Policies from the OECD

**Import libraries**

In [1]:
import os
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

**Chrome Driver**

In [2]:
url = "https://gpseducation.oecd.org/revieweducationpolicies/#!node=42869&filter=all" # the URL of the page we are scraping
service = Service('/Users/hajar/Documents/ALMANARA/OECD_scrapping /chromedriver-mac-arm64/chromedriver')
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)

**Function to save the extracted text**

In [3]:
def save_to_file(content, filename="scraped_oecd.txt", mode="a"):
    """Write ``content`` followed by a newline to a UTF-8 text file.

    Args:
        content: Text to write; a trailing newline is added.
        filename: Path of the target file.
        mode: Mode passed to ``open``; the default ``"a"`` appends.
    """
    with open(filename, mode, encoding="utf-8") as handle:
        line = content + "\n"
        handle.write(line)

**Function to extract Regional Development Insights**

In [4]:
def get_main_text(soup):
    """Collect the paragraph texts found inside the ``leadTexts`` div.

    Args:
        soup: Parsed page exposing ``find``; the located div must expose
            ``find_all`` and each paragraph ``get_text``.

    Returns:
        A list with the whitespace-stripped text of every ``<p>`` inside the
        div, in document order; an empty list when the div is not found.
    """
    lead_block = soup.find("div", {'id': 'leadTexts'})
    if not lead_block:
        return []
    return [node.get_text(strip=True) for node in lead_block.find_all("p")]

**Function to extract the Key Insights**

In [6]:
def get_key_insights(soup):
    """Extract title/text pairs from the ``keyInsights`` div.

    Every ``div`` with class ``Tertiary`` inside the container is treated as
    one section; a section contributes an entry only when it has both an
    ``<h3>`` and a ``<p>``. Progress messages are printed along the way.

    Args:
        soup: Parsed page exposing ``find``.

    Returns:
        A list of ``{'title': ..., 'text': ...}`` dicts with stripped text;
        empty when the container is missing or no section is complete.
    """
    container = soup.find("div", {'id': 'keyInsights'})
    if not container:
        print("keyInsights div not found")
        return []

    print("Found keyInsights div")
    blocks = container.find_all("div", {'class': 'Tertiary'})
    print(f"Number of sections found: {len(blocks)}")

    insights = []
    for block in blocks:
        heading = block.find("h3")
        body = block.find("p")
        if not (heading and body):
            print("Missing title or paragraph")
            continue
        print("Found title and paragraph")
        entry = {
            'title': heading.get_text(strip=True),
            'text': body.get_text(strip=True),
        }
        insights.append(entry)
    return insights

**Function to extract Policy Options**

In [7]:
def get_policy_options(soup):
    """Extract title/text pairs from the ``policyDirections`` div.

    Every ``div`` with class ``Tertiary`` inside the container is treated as
    one section; a section contributes an entry only when it has both an
    ``<h3>`` and a ``div`` with class ``collapsibleParagraph``. Progress
    messages are printed along the way.

    Args:
        soup: Parsed page exposing ``find``.

    Returns:
        A list of ``{'title': ..., 'text': ...}`` dicts with stripped text;
        empty when the container is missing or no section is complete.
    """
    container = soup.find("div", {'id': 'policyDirections'})
    if not container:
        print("policyDirections div not found")
        return []

    print("Found policyDirections div")
    blocks = container.find_all("div", {'class': 'Tertiary'})
    print(f"Number of sections found: {len(blocks)}")

    options_found = []
    for block in blocks:
        heading = block.find("h3")
        body = block.find("div", {'class': 'collapsibleParagraph'})
        if not (heading and body):
            print("Missing title or paragraph")
            continue
        print("Found title and paragraph")
        entry = {
            'title': heading.get_text(strip=True),
            'text': body.get_text(strip=True),
        }
        options_found.append(entry)
    return options_found

**Extraction process**

In [11]:
def _save_section(heading, entries):
    """Append one headed section of title/text entries to the output file.

    Prints "Saved" after writing at least one entry; prints "Not found 404"
    and writes nothing when ``entries`` is empty.

    Args:
        heading: Section name written as ``"<heading>:"`` before the entries.
        entries: List of dicts with ``'title'`` and ``'text'`` keys.
    """
    if not entries:
        print("Not found 404")
        return
    save_to_file(f"\n{heading}:", mode="a")
    for entry in entries:
        content = f"\nTitle: {entry['title']}\nText: {entry['text']}\n{'-' * 50}"
        save_to_file(content, mode="a")
    print("Saved")


try:
    driver.get(url)

    # Wait for the policy container to be present, then pause for a fixed
    # period before reading the page (presumably to let the page's scripts
    # finish filling in the sections — TODO confirm and replace the sleep
    # with an explicit wait on the content itself).
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.ID, "policyDirections"))
    )
    time.sleep(10)

    page_source = driver.page_source
    soup = BeautifulSoup(page_source, "html.parser")

    text = get_main_text(soup)
    # Open in "w" mode unconditionally: when no lead text is found the file is
    # still emptied, so the sections appended below never end up after the
    # leftovers of a previous run.
    with open("scraped_oecd.txt", "w", encoding="utf-8") as file:
        file.write("\n".join(text))
    if text:
        print("Saved")
    else:
        print("Not found 404")

    key_insights = get_key_insights(soup)
    _save_section("Key Insights", key_insights)

    policy_options = get_policy_options(soup)
    _save_section("Policy Options", policy_options)

finally:
    # Close the browser even when scraping fails. The driver is created in an
    # earlier cell, so that cell must be re-run before running this one again.
    driver.quit()

Saved
Saved
Saved
