I want to simulate human interaction in code: open the website, collect the project links on the page, go to the next page, collect its links, and repeat until the last page.

In [92]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

# Launch Chrome and open the IFC disclosure search (URL carries the Investment type filter)
driver = webdriver.Chrome()
driver.get('https://disclosures.ifc.org/enterprise-search-results-home?f_type_description=Investment')

# Expand the "Development Impact" dropdown as soon as it can be clicked
dropdown_locator = (By.XPATH, "//span[contains(text(), 'Development Impact')]")
project_type_dropdown = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable(dropdown_locator)
)
project_type_dropdown.click()

# Then tick the "Development Results" checkbox once it is clickable too
checkbox_locator = (By.XPATH, "//span[contains(text(), 'Development Results')]")
investment_checkbox = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable(checkbox_locator)
)
investment_checkbox.click()


# Accumulator for the links gathered from every results page
all_links = []

# Function to extract links from the current page
def extract_links():
    """Collect the ``href`` of every result link on the page currently loaded in ``driver``.

    Result links are the ``a.search-head`` anchors located inside the
    ``row margin-top15 projects`` containers.

    Returns:
        list: one ``href`` attribute value per matching anchor.
    """
    result_anchor_xpath = '//div[contains(@class, "row margin-top15 projects")]//a[@class="search-head"]'
    hrefs = []
    for anchor in driver.find_elements(By.XPATH, result_anchor_xpath):
        hrefs.append(anchor.get_attribute('href'))
    return hrefs

# Needed so the "page changed" poll below survives the result list being re-rendered
from selenium.common.exceptions import StaleElementReferenceException

# Number of times to click the "Next" button, i.e. (number of result pages - 1).
# You need to modify it whenever the number of pages in the search result changes.
click_count = 32


def _results_changed(previous_links):
    """Return True once the page shows a non-empty link list different from ``previous_links``."""
    current_links = extract_links()
    return bool(current_links) and current_links != previous_links


for page_index in range(click_count + 1):  # +1 to include the initial page
    # Extract links from the current page
    page_links = extract_links()
    all_links.extend(page_links)

    # The last page has nothing to advance to
    if page_index == click_count:
        break

    next_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//em[@class='fa fa-chevron-right']"))
    )

    # Scroll to the next button, then click it
    ActionChains(driver).move_to_element(next_button).perform()
    next_button.click()

    # Wait until the listing really shows different results. Waiting only for a
    # result link to be *present* is not enough: the links of the page we just
    # read already satisfy that, so the same page could be scraped twice.
    WebDriverWait(
        driver, 10, ignored_exceptions=[StaleElementReferenceException]
    ).until(lambda _driver: _results_changed(page_links))

# The listing browser is not used after this point (a later cell opens its own),
# so close it instead of leaving a Chrome process behind
driver.quit()

# Print all the extracted links
for link in all_links:
    print(link)



https://disclosures.ifc.org/project-detail/SII/48626/ocp-solar-2
https://disclosures.ifc.org/project-detail/SII/46178/bcp-cmgp-rsf
https://disclosures.ifc.org/project-detail/SII/46236/somabis-sarl
https://disclosures.ifc.org/project-detail/SII/47009/tnd
https://disclosures.ifc.org/project-detail/SII/46920/rse-almarai-3
https://disclosures.ifc.org/project-detail/SII/48453/cimaf-green-loan
https://disclosures.ifc.org/project-detail/ESRS/48626/ocp-solar-2
https://disclosures.ifc.org/project-detail/ESRS/46178/bcp-cmgp-rsf
https://disclosures.ifc.org/project-detail/ESRS/46236/somabis-sarl
https://disclosures.ifc.org/project-detail/ESRS/47009/tnd
https://disclosures.ifc.org/project-detail/SII/42810/project-simba-equity-insurance-sector
https://disclosures.ifc.org/project-detail/SII/43901/ratch-loan
https://disclosures.ifc.org/project-detail/SII/43733/ocb-green-loan
https://disclosures.ifc.org/project-detail/SII/45146/psl-ultratech
https://disclosures.ifc.org/project-detail/SII/44080/wcs-covi

In [93]:
# Report how many links the pagination loop collected
total_links = len(all_links)
print(f"Number of links extracted: {total_links}")

Number of links extracted: 330


In [83]:
import pandas as pd

# Pair every collected link with its id: the second-to-last "/"-separated segment of the URL
new_rows = []
for url in all_links:
    # NOTE(review): `id` shadows the builtin of the same name; the name is kept
    # as-is because it is a module-level name that other cells may read.
    id = url.rsplit("/", 2)[-2]
    new_rows.append(dict(url=url, id=id))

# Build the DataFrame from all records in one go
df = pd.DataFrame(new_rows, columns=["url", "id"])

# Preview the first five rows
print(df.head(5))

# Sanity check: count the rows whose url is missing
na_urls = df["url"].isna().sum()
print(f"Number of NA urls in df: {na_urls}")


                                                 url     id
0  https://disclosures.ifc.org/project-detail/SII...  48626
1  https://disclosures.ifc.org/project-detail/SII...  46178
2  https://disclosures.ifc.org/project-detail/SII...  46236
3  https://disclosures.ifc.org/project-detail/SII...  47009
4  https://disclosures.ifc.org/project-detail/SII...  46920
Number of NA urls in df: 0


For each link that contains "SII", extract the relevant information.

In [102]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementClickInterceptedException, TimeoutException
import time

# Initialize the Chrome webdriver. The functions defined below in this cell use
# this module-level `driver` directly instead of receiving it as an argument.
driver = webdriver.Chrome()

def extract_complete_content_from_url(url):
    """Open a project page and gather its summary fields plus the AIMM assessment text.

    Works on the module-level ``driver`` and relies on ``wait_and_click`` to
    open the tab and the section that hold the assessment.

    Args:
        url: address of the project detail page to load.

    Returns:
        dict: text of each ``esrs-name`` element mapped to the text of the
        ``esrs-value`` element at the same position, plus the keys
        ``"AIMM Assessment Content"`` and ``"URL"``.

    Raises:
        ValueError: if no ``esrs-name`` element reads exactly
            'Project Description'.
        Selenium exceptions also propagate when a wait times out or one of the
        directly looked-up elements is missing.
    """
    driver.get(url)

    # Block (up to 30 seconds per condition) until labels and values exist
    page_wait = WebDriverWait(driver, 30)
    for css_class in ('esrs-name', 'esrs-value'):
        page_wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, css_class)))

    # Raw label and value texts
    names = []
    for label_element in driver.find_elements(By.CLASS_NAME, 'esrs-name'):
        names.append(label_element.text)
    values = []
    for value_element in driver.find_elements(By.CLASS_NAME, 'esrs-value'):
        values.append(value_element.text)

    # Overwrite the generic "Project Description" value with the text of the
    # div that directly follows its label
    description_xpath = "//div[contains(text(), 'Project Description')]/following-sibling::div[1]"
    description_text = driver.find_element(By.XPATH, description_xpath).text
    values[names.index('Project Description')] = description_text

    # Same idea for "Sector", but the value is only replaced when that label was collected
    sector_xpath = "//div[contains(text(), 'Sector')]/following-sibling::div[1]/div"
    sector_text = driver.find_element(By.XPATH, sector_xpath).text
    if 'Sector' in names:
        values[names.index('Sector')] = sector_text

    # Pair labels with values by position
    project_info = {label: value for label, value in zip(names, values)}

    # Scroll a quarter of the way down so the "Development Impact" tab is visible
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/4);")
    time.sleep(2)  # Let the page load

    # Open the "Development Impact" tab, then the AIMM assessment section,
    # pausing briefly after each click for potential page changes
    for clickable_xpath in (
        "//div[contains(text(), 'Development Impact')]",
        "//button[contains(text(), 'Anticipated Impact Measurement & Monitoring (AIMM) Assessment')]",
    ):
        wait_and_click(clickable_xpath)
        time.sleep(2)

    # Read the AIMM text and attach it, together with the source address
    aimm_text = driver.find_element(By.XPATH, "//div[@id='siiAccorAnticipated']//p[@class='disclaimer-text']").text
    project_info["AIMM Assessment Content"] = aimm_text
    project_info["URL"] = url

    return project_info

def wait_and_click(xpath, timeout=1, pause=2):
    """Click the element matching ``xpath`` on the page loaded in the module-level ``driver``.

    The element is awaited until clickable, scrolled into view, and clicked.
    This is best-effort: failures are reported with ``print`` instead of being
    raised, so the caller continues either way.

    Args:
        xpath: XPath expression locating the element to click.
        timeout: seconds to wait for the element to become clickable
            (default 1, the value that used to be hard-coded).
        pause: seconds to sleep after scrolling, before clicking
            (default 2, the value that used to be hard-coded).

    Returns:
        None.
    """
    try:
        elem = WebDriverWait(driver, timeout).until(EC.element_to_be_clickable((By.XPATH, xpath)))

        # Scroll the element into view
        driver.execute_script("arguments[0].scrollIntoView(true);", elem)
        time.sleep(pause)  # Brief pause to let the page adjust

        elem.click()
    except ElementClickInterceptedException:
        # Something else would receive the native click; trigger the click through JavaScript instead
        print(f"ElementClickInterceptedException encountered for xpath: {xpath}. Attempting JavaScript click.")
        driver.execute_script("arguments[0].click();", elem)
    except TimeoutException:
        print(f"TimeoutException: Couldn't locate or click the element with xpath: {xpath}")

# Test the function on the given URL
url = 'https://disclosures.ifc.org/project-detail/SII/36265/vicentin-pre-exp'
try:
    project_data = extract_complete_content_from_url(url)

    # Print the extracted data
    for key, value in project_data.items():
        print(f"{key}: {value}")
finally:
    # Close the WebDriver even when the extraction fails, so that no browser
    # process is left running behind the notebook
    driver.quit()


Project Number: 36265
Company Name: VICENTIN S.A.I.C.
Date SPI Disclosed: Aug 4, 2015
Country: Argentina
Region: South America
Projected Board Date: Sep 4, 2015
Environmental Category: B
Status: Active
Last Updated Date: 
Department: Gbl Ind, Manufact, Agribus &amp; Services
Industry: Agribusiness and Forestry
Previous Events: Approved : Oct 15, 2015
Sector: Vegetable Fats and Oils
Project Description: The proposed investment consists of a medium-term pre-export finance facility to support Vicentin SAIC’s working capital needs from exports of oilseed oil and sub-products (the “Project”).
AIMM Assessment Content: 1) Project will support growth and operations of a competitive domestic player, making a significant contribution to local economic development through job creation, and to local farmers and intermediaries that supply oilseeds.
2) Approximately 2,000 soybean and sunflowerseed farmers are expected to benefit from improved market access through Vicentin, directly benefiting rural

In [84]:
# Keep only the links whose URL contains "SII", then show how many remain
sii_links = list(filter(lambda link: "SII" in link, all_links))
len(sii_links)

326

In [103]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def initialize_webdriver():
    """Create a new Chrome WebDriver and hand it back to the caller."""
    chrome_driver = webdriver.Chrome()
    return chrome_driver

# Module-level browser session; extract_data_from_links rebinds this name to a
# fresh session after a URL fails.
driver = initialize_webdriver()

def extract_data_from_links(links):
    """Scrape every URL in ``links`` and return the records that could be extracted.

    A URL whose extraction raises is reported with ``print`` and skipped; the
    browser is then restarted before moving on to the next URL. The browser is
    closed when the function ends, including when the run is interrupted.

    Args:
        links: iterable of project page URLs.

    Returns:
        list: one dict per successfully scraped URL, in input order.
    """
    global driver  # the extractor reads the module-level driver, so a restart must rebind it
    all_project_data = []
    try:
        for url in links:
            try:
                project_data = extract_complete_content_from_url(url)
                all_project_data.append(project_data)
                print(f"Data extracted from {url}")
            except Exception as e:
                print(f"Error extracting data from {url}: {e}")
                driver.quit()
                time.sleep(3)  # Give it a few seconds before reinitializing
                driver = initialize_webdriver()
    finally:
        # Also runs on KeyboardInterrupt, so stopping a long scrape does not
        # leave a Chrome process running
        driver.quit()
    return all_project_data

# Now, apply the function to every URL in sii_links
all_project_data = extract_data_from_links(sii_links)


Data extracted from https://disclosures.ifc.org/project-detail/SII/48626/ocp-solar-2
Data extracted from https://disclosures.ifc.org/project-detail/SII/46178/bcp-cmgp-rsf
Data extracted from https://disclosures.ifc.org/project-detail/SII/46236/somabis-sarl
Data extracted from https://disclosures.ifc.org/project-detail/SII/47009/tnd
Data extracted from https://disclosures.ifc.org/project-detail/SII/46920/rse-almarai-3
Data extracted from https://disclosures.ifc.org/project-detail/SII/48453/cimaf-green-loan
Data extracted from https://disclosures.ifc.org/project-detail/SII/42810/project-simba-equity-insurance-sector
Data extracted from https://disclosures.ifc.org/project-detail/SII/43901/ratch-loan
Data extracted from https://disclosures.ifc.org/project-detail/SII/43733/ocb-green-loan
Data extracted from https://disclosures.ifc.org/project-detail/SII/45146/psl-ultratech
Data extracted from https://disclosures.ifc.org/project-detail/SII/44080/wcs-covid-nmbtz
Data extracted from https://di

In [104]:
# Number of records actually scraped; URLs whose extraction raised were skipped
len(all_project_data)

322

In [105]:
import json

# Persist the scraped records as JSON
filename = "all_project_data.json"
with open(filename, 'w') as file:
    file.write(json.dumps(all_project_data))

print(f"Data saved to {filename}")

Data saved to all_project_data.json


Convert this file to suit the ts template.

In [106]:
import pandas as pd
import json
import re
import jieba

# Read the scraped project records back from disk
with open('all_project_data.json') as f:
    data = json.loads(f.read())

# Define a function to clean text
def clean_text(text):
    """Normalise a piece of scraped text.

    Newlines become spaces, runs of whitespace collapse to a single space, a
    space is inserted after any period directly followed by a letter, and the
    result is stripped and lower-cased.

    Args:
        text: the string to normalise.

    Returns:
        str: the normalised text.
    """
    single_line = re.sub(r"\n", " ", text)
    collapsed = re.sub(r"\s+", " ", single_line)
    spaced = re.sub(r"\.([a-zA-Z])", r". \1", collapsed)
    return spaced.strip().lower()

# Reshape every scraped record in place into the flat "essay_*" fields used below
for project in data:
    title = project.pop('Company Name')
    number = project.pop('Project Number')
    project['essay_title'] = title
    source_url = project.pop('URL')

    # essay_date holds "<country>; <sector>", where the sector loses its first
    # seven characters unless the whole value is exactly "other"
    date_prefix = project.pop('Country') + "; "
    raw_sector = project['Sector']
    project['essay_date'] = date_prefix + ("other" if raw_sector == "other" else raw_sector[7:])
    project['essay_thanks'] = clean_text(project.pop('Sector'))

    # content = description followed by the development-impact text, normalised
    lead = project.pop('Project Description') + ' The development impact is: '
    body = clean_text(lead + project.pop('AIMM Assessment Content'))
    project['content'] = body
    project['content_length'] = len(body)
    project['id'] = number
    project['essay_url'] = source_url



# Top-level container of the output file; its "essays" list is filled in afterwards.
# "length" is the total character count and "tokens" the total number of
# whitespace-separated words over all contents.
contents = [record["content"] for record in data]
transformed_data = dict(
    current_date="2023-03-01",
    author="Your name here",
    url="Your URL here",
    length=sum(map(len, contents)),
    tokens=sum(len(text.split()) for text in contents),
    essays=[],
)

def get_chunks(content, url, title, thanks="NA"):
    """Wrap one essay's content into a single-element list of chunk dicts.

    Args:
        content: the essay text.
        url: value stored under ``essay_url``.
        title: value stored under ``essay_title``.
        thanks: value stored under ``essay_thanks``. Defaults to ``"NA"``, the
            same placeholder used for ``essay_date``. It is an explicit
            argument so the chunk never picks up whatever a module-level
            ``id`` happens to be (a leftover value from another cell, or the
            builtin function, which cannot be written to JSON).

    Returns:
        list: exactly one dict with the chunk fields; ``content_tokens`` is the
        number of segments produced by ``jieba.cut`` and ``embedding`` is an
        empty list.
    """
    content_length = len(content)
    content_tokens = len(list(jieba.cut(content, cut_all=False)))
    chunk = {
        "essay_title": title,
        "essay_url": url,
        "essay_date": "NA",
        "essay_thanks": thanks,
        "content": content,
        "content_length": content_length,
        "content_tokens": content_tokens,
        "embedding": []
    }
    return [chunk]

# Turn every reshaped project into an essay entry that carries a single chunk
for project in data:
    heading = project["essay_title"]
    link = project["essay_url"]
    body = project["content"]
    essay = dict(
        title=heading,
        url=link,
        date=project["essay_date"],
        thanks=project["essay_thanks"],
        content=body,
        content_length=project["content_length"],
        content_tokens=len(body.split()),
        chunks=get_chunks(body, link, heading),
        embedding=[],
    )
    transformed_data["essays"].append(essay)

# Save the transformed data.
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of depending on the platform's default encoding
# (which may be unable to encode some characters).
with open('pg.json', 'w', encoding='utf-8') as f:
    json.dump(transformed_data, f, ensure_ascii=False, indent=4)
