In [15]:
# !pip install selenium

In [16]:
from tqdm import tqdm

In [17]:
from __future__ import absolute_import, division, print_function, unicode_literals

import datetime
import logging
import os
import random
import time
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep

# import shapefile
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait

In [18]:
# Fallback browser window size as (width, height); used by
# ``init_chrome_webdriver`` when no ``window_size`` is passed.
DEFAULT_WINDOW_SIZE = (1366, 768)
# Fallback ChromeDriver service log path; ``os.path.devnull`` discards the log.
# Used by ``init_chrome_webdriver`` when no ``log_path`` is passed.
DEFAULT_LOG_PATH = os.path.devnull

# Pool of desktop browser user-agent strings; ``init_chrome_webdriver`` picks
# one with ``random.choice`` when the caller does not supply ``user_agent``.
USER_AGENTS = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/604.4.7 (KHTML, like Gecko) Version/11.0.2 Safari/604.4.7',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    )
"""
Tuple[str]: Set of user agents that we randomly choose from to seem "human".
    Source: https://techblog.willshouse.com/2012/01/03/most-common-user-agents/
"""


def init_chrome_webdriver(
        executable_path='chromedriver', download_dir=None,
        window_size=None, user_agent=None, log_path=None,
        headless=True, incognito=True,
        ignore_certificate_errors=True,
        disable_gpu=True, disable_notifications=True, disable_infobars=True):
    """
    Configure and initialize the ChromeDriver service, then create and return a
    new instance of the Chrome web driver, via ``selenium``.

    Args:
        executable_path (str): Path to ChromeDriver executable, downloadable from
            http://chromedriver.storage.googleapis.com/index.html. If 'chromedriver',
            the executable must be somewhere in ``$PATH``.
        download_dir (str): Directory written to Chrome's
            ``download.default_directory`` preference. If None (or empty), the
            preference is left unset.
        window_size (Tuple[int, int]): Size (width x height) of browser window.
            If None, ``DEFAULT_WINDOW_SIZE`` is used.
        user_agent (str): Set browser's user agent; If None, a common user agent
            is randomly selected from ``USER_AGENTS``. This can be useful for
            "spoofing".
        log_path (str): Path on disk to which the driver service log is written.
            If None, ``DEFAULT_LOG_PATH`` (``os.path.devnull``) is used,
            effectively disabling logging.
        headless (bool): If truthy, run browser in headless mode, i.e. without a UI
            or display server dependencies; otherwise, open a regular browser window.
        incognito (bool): If truthy, launch browser in incognito mode.
        ignore_certificate_errors (bool): If truthy, pass
            ``--ignore-certificate-errors`` to the browser.
        disable_gpu (bool): If truthy, pass ``--disable-gpu`` to the browser.
        disable_notifications (bool): If truthy, pass ``--disable-notifications``
            to the browser.
        disable_infobars (bool): If truthy, pass ``--disable-infobars`` to the
            browser.

    Returns:
        :class:`webdriver.Chrome()`

    References:
        A complete list of chrome options can be found here:
        https://peter.sh/experiments/chromium-command-line-switches/
    """
    options = webdriver.ChromeOptions()

    # Boolean switches: each flag is added only when its parameter is truthy.
    # Plain truthiness (instead of ``is True``) means values such as ``1`` are
    # honoured rather than silently ignored.
    boolean_switches = (
        (headless, '--headless'),
        (incognito, '--incognito'),
        (ignore_certificate_errors, '--ignore-certificate-errors'),
        (disable_gpu, '--disable-gpu'),
        (disable_notifications, '--disable-notifications'),
        (disable_infobars, '--disable-infobars'),
    )
    for enabled, switch in boolean_switches:
        if enabled:
            options.add_argument(switch)

    # set window size, using a global default if not specified
    # (does this matter if `headless=True`?)
    if not window_size:
        window_size = DEFAULT_WINDOW_SIZE
    options.add_argument('--window-size={w},{h}'.format(w=window_size[0], h=window_size[1]))

    # set user-agent, using a randomly selected default if not specified.
    # The value is deliberately NOT wrapped in quotes: the argument is not
    # passed through a shell, so literal quotes would end up inside the
    # user-agent string itself.
    if not user_agent:
        user_agent = random.choice(USER_AGENTS)
    options.add_argument('--user-agent={}'.format(user_agent))

    # get a global default for logging path if not specified
    # value passed in webdriver.Chrome init
    if not log_path:
        log_path = DEFAULT_LOG_PATH

    capabilities = DesiredCapabilities.CHROME.copy()
    capabilities['version'] = 'latest'
    # 'loggingPrefs' (plural) is the capability key for per-log-type levels.
    capabilities['loggingPrefs'] = {
        'driver': 'WARNING', 'server': 'WARNING', 'browser': 'WARNING'}

    # Chrome profile preferences; the download directory is only set on request.
    prefs = {
        'profile.default_content_settings.popups': False,
        'directory_upgrade': True,
    }
    if download_dir:
        prefs['download.default_directory'] = download_dir
    options.add_experimental_option('prefs', prefs)

    # initialize the driver with specified configuration
    driver = webdriver.Chrome(
        executable_path=executable_path,
        options=options,
        desired_capabilities=capabilities,
        service_log_path=log_path)

    return driver


def randomized_sleep(duration):
    """
    Pause for ``duration`` seconds plus a random extra of up to ``duration``
    seconds, i.e. somewhere between ``duration`` and 2 * ``duration`` seconds.
    """
    # random.random() is in [0, 1), so the jitter is a fraction of ``duration``.
    jitter = duration * random.random()
    time.sleep(duration + jitter)
    
    
   

In [19]:

def _page_soup(driver):
    """Parse the driver's current page source with an explicitly chosen parser."""
    # Naming the parser keeps results independent of which optional parser
    # libraries happen to be installed.
    return BeautifulSoup(driver.page_source, 'html.parser')


def _parse_total_pages(soup):
    """Return the page count shown next to the " Page" label, or None if absent."""
    page_label = soup.find(text=" Page")
    if page_label is None:
        return None
    # The count is read from the 4th node following the label's parent element.
    following = list(page_label.parent.next_siblings)
    return int(following[3].text)


def _extract_result_links(soup):
    """Return ``[label, url]`` pairs for each result block that has a title link."""
    links = []
    for block in soup.find_all('div', {"class": "projects"}):
        anchor = block.find('a', {'class': 'search-head'})
        # Result blocks without a usable title link are skipped.
        if anchor is None or not anchor.has_attr('href'):
            continue
        links.append([anchor.text, anchor['href']])
    return links


def execute_search(search_term):
    """
    Search the IFC disclosures site for ``search_term`` (as a quoted phrase) and
    collect the title and link of every result across all result pages.

    A new Chrome window is opened for the search and is always closed again,
    even when scraping fails part-way through.

    Args:
        search_term (str): Phrase to search for; it is wrapped in double quotes
            before being typed into the site's search box.

    Returns:
        pandas.DataFrame: One row per result with columns ``'Project Name'``,
        ``'url'`` and ``'search_term'``.
    """
    print("#"*30)
    print('Searching for Term:', search_term)
    ## Build the chrome window
    driver = init_chrome_webdriver(headless=False, download_dir=None)
    results = []
    try:
        sleep(2) ## Wait for it

        ## Load the search page
        search_url = "https://disclosures.ifc.org/#/enterpriseSearchResultsHome/*"
        driver.get(search_url)
        print('Initializing Website')
        sleep(3) ## Wait for it

        ## Execute the search
        search_box = driver.find_element(By.ID, "searchBox")
        search_box.clear() ## Clear it just in case
        search_box.send_keys('"{}"'.format(search_term))
        search_box.send_keys(Keys.ENTER)
        print('searching for term')
        sleep(3)

        ## Now collect the links
        soup = _page_soup(driver)
        total_pages = _parse_total_pages(soup)
        if total_pages is None:
            # No pager on the page: scrape whatever is shown as a single page.
            print('No page counter found; assuming a single page of results')
            total_pages = 1
        print('Total Pages', total_pages)

        print('Scraping Results')
        for current_page in range(1, total_pages + 1):
            if current_page > 1:
                # Re-read the DOM after having clicked through to this page.
                soup = _page_soup(driver)

            print ('\nProcessing Page: %s' % current_page, '\n')
            results.extend(_extract_result_links(soup))

            if current_page < total_pages:
                sleep(2)
                driver.find_element(By.CLASS_NAME, 'next').click()
                sleep(2)
    finally:
        # Always release the browser, otherwise a failed search leaves a
        # Chrome window and chromedriver process behind.
        driver.quit()

    df = pd.DataFrame(results,columns=['Project Name','url'])
    df['search_term'] = search_term
    print('Completed Search for', search_term,'\n')
    return df

In [20]:
search_terms = ["China Construction Bank", "Bank of China",'Deutsche Bank','JPMorgan Chase']

# Run one search per term and keep each per-term frame. Concatenating once at
# the end replaces DataFrame.append (removed in pandas 2.0) and avoids
# re-copying the accumulated frame on every iteration.
dfs = [execute_search(term) for term in tqdm(search_terms)]

# Row labels are kept as produced by each search (no index reset here).
# ``master_df`` stays None when there are no search terms.
master_df = pd.concat(dfs) if dfs else None

  0%|          | 0/4 [00:00<?, ?it/s]

China Construction Bank
##############################
Searching for Term: China Construction Bank
Initializing Website
searching for term
Total Pages 1
Scraping Results

Processing Page: 1 



 25%|██▌       | 1/4 [00:16<00:48, 16.23s/it]

Completed Search for China Construction Bank 

Bank of China
##############################
Searching for Term: Bank of China
Initializing Website
searching for term
Total Pages 2
Scraping Results

Processing Page: 1 

<selenium.webdriver.remote.webelement.WebElement (session="bfe0986638711e8a91d9e526b3a0f36f", element="0.33977600427145505-2")>


 50%|█████     | 2/4 [00:37<00:37, 18.54s/it]


Processing Page: 2 

Completed Search for Bank of China 

Deutsche Bank
##############################
Searching for Term: Deutsche Bank
Initializing Website
searching for term
Total Pages 2
Scraping Results

Processing Page: 1 

<selenium.webdriver.remote.webelement.WebElement (session="0857ba1439c766e25e31a5d3436cdd48", element="0.22010257161358293-2")>


 75%|███████▌  | 3/4 [00:56<00:18, 18.68s/it]


Processing Page: 2 

Completed Search for Deutsche Bank 

JPMorgan Chase
##############################
Searching for Term: JPMorgan Chase
Initializing Website
searching for term
Total Pages 1
Scraping Results

Processing Page: 1 



100%|██████████| 4/4 [01:12<00:00, 18.09s/it]

Completed Search for JPMorgan Chase 






In [21]:
# Renumber rows 0..n-1 and add an empty 'reviewed' column.
master_df = master_df.reset_index(drop=True)
master_df['reviewed'] = None

# Write the frame to an Excel workbook using XlsxWriter as the engine.
# The context manager saves and closes the file on exit (ExcelWriter.save()
# no longer exists in pandas 2.0) and also closes it if to_excel raises.
with pd.ExcelWriter('idi_example.xlsx', engine='xlsxwriter') as writer:
    master_df.to_excel(writer, sheet_name='Sheet1')

In [22]:
# Count how many times each result URL occurs across all search terms.
master_df['url'].value_counts()

https://disclosures.ifc.org/#/projectDetail/SPI/29386     2
https://disclosures.ifc.org/#/projectDetail/SII/31937     2
https://disclosures.ifc.org/#/projectDetail/ESRS/34297    1
https://disclosures.ifc.org/#/projectDetail/AS/592067     1
https://disclosures.ifc.org/#/projectDetail/SPI/10603     1
https://disclosures.ifc.org/#/projectDetail/SPI/556906    1
https://disclosures.ifc.org/#/projectDetail/SPI/29920     1
https://disclosures.ifc.org/#/projectDetail/AS/586187     1
https://disclosures.ifc.org/#/projectDetail/SII/34297     1
https://disclosures.ifc.org/#/projectDetail/SII/33120     1
https://disclosures.ifc.org/#/projectDetail/SPI/23524     1
https://disclosures.ifc.org/#/projectDetail/SPI/25114     1
https://disclosures.ifc.org/#/projectDetail/SPI/24385     1
https://disclosures.ifc.org/#/projectDetail/SPI/30572     1
https://disclosures.ifc.org/#/projectDetail/SPI/30815     1
https://disclosures.ifc.org/#/projectDetail/SPI/21476     1
https://disclosures.ifc.org/#/projectDet

# End