In [4]:
import re
import time

import pymongo
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm.notebook import tqdm

# Define the MongoDB connection parameters
mongo_uri = "<uri>"
db_name = 'fc'
collection_name = 'publications'

# Create a new MongoDB client
client = pymongo.MongoClient(mongo_uri)

# Select the database
db = client[db_name]

# Select the collection
collection = db[collection_name]

url = 'https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=Faculty+of+Computing%2C+Universiti+Teknologi+Malaysia'

scholar_list = []
astart = 0
#documents = []

# Set up the Selenium driver
driver = webdriver.Chrome()

while True:
    
    driver.get(url)

    # Wait for the page to load
    driver.implicitly_wait(10)

    # Get the page source and parse it using Beautiful Soup
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    # Find all the scholars, their affiliations and emails on the page
    scholars = soup.find_all('h3', class_='gs_ai_name')
    affs = soup.find_all('div', class_='gs_ai_aff')
    emails = soup.find_all('div', class_='gs_ai_eml')

    for scholar, aff, eml in zip(scholars, affs, emails):
        # ignore scholars from Universiti Teknologi Mara (UiTM)
        if not (re.search('MARA', aff.text, re.IGNORECASE) or re.search('uitm', eml.text, re.IGNORECASE)):
            # add UTM FC scholars to the list
            scholar_list.append(f"https://scholar.google.com{scholar.find('a')['href']}")

    # get next page link from the next page button if it is present
    if soup.select_one(".gsc_pgn button.gs_btnPR").get('onclick'):
        after_author = re.search(r"after_author\\x3d(.*)\\x26", str(soup.select_one(".gsc_pgn button.gs_btnPR").get('onclick'))).group(1)
        astart += 10
        url = f'{url}&after_author={after_author}&astart={astart}'
    else:
        break

with tqdm(total=len(scholar_list)) as pbar:  #progress bar
    for scholar_url in scholar_list:

        driver.get(scholar_url)

        # click show more button in the profile page
        for _ in range(0,3):
            try:
                #Wait up to 10s until the element is loaded on the page
                element = WebDriverWait(driver, 5).until(
                    #Locate element by id
                    EC.presence_of_element_located((By.ID, 'gsc_bpf'))
                )
            finally:
                element.click()
            time.sleep(2)

        # Get the page source and parse it using Beautiful Soup
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # get the links for all articles on the page of the scholar
        links = soup.find_all('a', class_='gsc_a_at')
        links_list = []
        for link in links:
            links_list.append(f'https://scholar.google.com{link["href"]}')

        # loop through all article links of the scholar
        for url in links_list:

            driver.get(url)

            # Wait for the page to load
            driver.implicitly_wait(10)

            # Get the page source and parse it using Beautiful Soup
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')

            result = soup.find('div', id='gsc_vcpb')

            # format the data in dictionary format
            document = {}
            if result.find('a', class_='gsc_oci_title_link'):
                document['Title'] = result.find('a', class_='gsc_oci_title_link').text
                document['Link'] = result.find('a', class_='gsc_oci_title_link')['href']
            else:
                document['Title'] = result.find('div', id='gsc_oci_title').text
            
            for field, value in zip(result.find_all('div', class_='gsc_oci_field'), result.find_all('div', class_='gsc_oci_value')):
                if field.text == 'Scholar articles':
                    break
                elif field.text == 'Total citations':
                    document[field.text] = int(re.search(r'\d+', value.find('a').text).group())
                else:
                    document[field.text] = value.text
                
            #documents.append(document)

            # insert the document into MongoDB database
            collection.insert_one(document)

            time.sleep(2)
        
        # update progress bar after each scholar iteration
        pbar.update(1)

driver.quit()

  0%|          | 0/50 [00:00<?, ?it/s]