# Part 2: Web Scraping Text Content 

#### Get the list of researchers from Faculty of Computing at the University Technology Malaysia.

In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import time
import pandas as pd

# Set up the Selenium webdriver to open a headless Chrome browser
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome(options=options)

# Navigate to the Google Scholar search page
driver.get('https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=&btnG=')

# Find the search box and enter the search query
search_box = driver.find_element(By.NAME, 'mauthors')
search_box.send_keys('Faculty of Computing, "Universiti Teknologi Malaysia"')

# Submit the search query
search_box.submit()

# Wait for the page to load and get the page source
time.sleep(5)
html = driver.page_source

# Parse the HTML data using BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

names = []
profiles = []

# Find all the search results
search_results = soup.find_all('div', {'class': 'gs_ai_t'})

# get the names and links of the researchers on the first page
for result in search_results:
    name = result.find('h3', {'class': 'gs_ai_name'}).text
    link = result.find('a')['href']
    names.append(name)
    profiles.append(link)
    
# navigate to the next page and get the names and links of the researchers      
while True:
    next_button = driver.find_element(By.XPATH, '//button[@aria-label="Next"]')
    if not next_button.is_enabled():
        break
        
    # click on the "Next" button to load the next page of results
    next_button.click()
    time.sleep(2)  # wait for the page to load
    
    # Get the new page source and parse it using BeautifulSoup
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    # Find all the search results
    search_results = soup.find_all('div', {'class': 'gs_ai_t'})

    # Loop over the search results
    for result in search_results:
        name = result.find('h3', {'class': 'gs_ai_name'}).text
        link = result.find('a')['href']

        names.append(name)
        profiles.append(link)

print(pd.DataFrame({'Researcher': names, 'Link': profiles}))

                            Researcher                                Link
0                 Waleed Al-Rahmi, PhD  /citations?hl=en&user=jEJuIWAAAAAJ
1                 Mohd Shahizan Othman  /citations?hl=en&user=QzgVq24AAAAJ
2                     Dayang NA Jawawi  /citations?hl=en&user=4zE7K1YAAAAJ
3             Syed Hamid Hussain Madni  /citations?hl=en&user=MUUFa60AAAAJ
4             Bander Ali Saleh Al-rimy  /citations?hl=en&user=8vIXgG8AAAAJ
5                Mohd Murtadha Mohamad  /citations?hl=en&user=F4HHScQAAAAJ
6                     Shahida Sulaiman  /citations?hl=en&user=5hOCRTgAAAAJ
7            Syed Zainudeen Mohd Shaid  /citations?hl=en&user=XErI7_oAAAAJ
8                 Norris Syed Abdullah  /citations?hl=en&user=dG_zPsgAAAAJ
9                       Farhan Mohamed  /citations?hl=en&user=lU_OdNsAAAAJ
10                     Noraini Ibrahim  /citations?hl=en&user=-zEjYpUAAAAJ
11           Ts. Dr. Maheyzah Md Siraj  /citations?hl=en&user=2oMpyioAAAAJ
12                      C

#### Store the articles data of the researchers.

In [2]:
titles = []
authors = []
pub_dates = []
journal_names = []
citations = []

for profile in profiles:
    # Construct the URL for the Google Scholar profile
    url = f"https://scholar.google.com/{profile}"
    
    # Visit the profile page using Selenium webdriver
    driver.get(url)
    
    # Find the "Show more" button and click it repeatedly until it is disabled
    while True:
        show_more_button = driver.find_element(By.XPATH, '//button[@id="gsc_bpf_more"]')
        if not show_more_button.is_enabled():
            break
            
        show_more_button.click()
        time.sleep(2)  # wait for the result to load
            
    # Extract the page content with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    publications = soup.find_all("tr", {"class": "gsc_a_tr"})
    
    # Extract metadata from the parsed HTML
    titles += [title.text for title in soup.find_all('a', class_='gsc_a_at')]
    pub_dates += [pub_date.text for pub_date in soup.find_all('span', class_='gsc_a_hc')]
    citations += [citation.text for citation in soup.find_all('a', class_='gsc_a_ac')]
    
    for pub in publications:
        author_elem = pub.find_all("div", {"class": "gs_gray"})[0]
        author = author_elem.text.strip()
        
        journal_elem = pub.find_all("div", {"class": "gs_gray"})[1]
        journal = journal_elem.text.strip()

        journal_names.append(journal)
        authors.append(author)
    

# Print the extracted metadata
print("Title:", titles)
print("Authors:", authors)
print("Publication Date:", pub_dates)
print("Journal or Conference Name:", journal_names)
print("Citations:", citations)
print("------")
    
# Close the Selenium webdriver after scraping is done
driver.close()

Authors: ['W Al-Rahmi, M Othman', 'WM Al-Rahmi, N Yahaya, AA Aldraiweesh, MM Alamri, NA Aljarboa, ...', 'WM Al-Rahmi, AM Zeki', 'WM Al-Rahmi, MS Othman, MA Musa', 'WM Al-Rahmi, N Alias, MS Othman, VI Marin, G Tur', 'H Abuhassna, WM Al-Rahmi, N Yahya, MAZM Zakaria, ABM Kosnin, ...', 'W Al-Rahmi, MS Othman, LM Yusuf', 'MA Almaiah, MM Alamri, W Al-Rahmi', 'NURSAR WALEED AL-RAHMI, NORMA ALIAS, MOHD SHAHIZAN OTHMAN , AHMED IBRAHIM ...', 'WM Al-Rahmi, N Yahaya, MM Alamri, IY Alyoussef, AM Al-Rahmi, ...', 'W Al-Rahmi, A Aldraiweesh, N Yahaya, YB Kamin, AM Zeki', 'WM Al-Rahmi, MS Othman, LM Yusuf', 'WM Al-Rahmi, MS Othman, LM Yusof, MA Musa', 'WM Al-Rahmi, AI Alzahrani, N Yahaya, N Alalwan, YB Kamin', 'N Alalwan, WM Al-Rahmi, O Alfarraj, A Alzahrani, N Yahaya, ...', 'WM Al-Rahimi, MS Othman, MA Musa', 'MM Alamri, MA Almaiah, WM Al-Rahmi', 'WM Al-Rahmi, N Yahaya, U Alturki, A Alrobai, AA Aldraiweesh, ...', 'WM Al-Rahmi, MS Othman, LM Yusuf', 'WM Al-Rahmi, MS Othman', 'Q Al-Maatouk, MS Othman, A

#### Store the data in DataFrame

In [3]:
article_dict = {'Title': titles, 'Authors': authors, "Publication Date": pub_dates, 'Journal or Conference Name': journal_names, 'Citations': citations}
df = pd.DataFrame(article_dict)
df

Unnamed: 0,Title,Authors,Publication Date,Journal or Conference Name,Citations
0,The impact of social media use on academic per...,"W Al-Rahmi, M Othman",2013,Journal of information systems research and in...,317
1,Integrating technology acceptance model with i...,"WM Al-Rahmi, N Yahaya, AA Aldraiweesh, MM Alam...",2019,"Ieee Access 7, 26797-26809, 2019",295
2,A model of using social media for collaborativ...,"WM Al-Rahmi, AM Zeki",2017,Journal of King Saud University-Computer and I...,250
3,The improvement of students' academic performa...,"WM Al-Rahmi, MS Othman, MA Musa",2014,"Asian Social Science 10 (8), 210, 2014",243
4,A model of factors affecting learning performa...,"WM Al-Rahmi, N Alias, MS Othman, VI Marin, G Tur",2018,"Computers & Education 121, 59-72, 2018",233
...,...,...,...,...,...
1811,A Review on Linear encoding methods and Geomet...,"S Fotoohifiroozabadi, S Deris, MS Mohamad, J R...",2014,"Life Science Journal 11 (3s), 2014",
1812,Knowledge audit for Phd information system at ...,S Fotoohi,2012,"Universiti Teknologi Malaysia, Faculty of Comp...",
1813,Intelligent Caching Wireless Data Access in th...,"SM Jais, S Sulaiman, SM Shamsuddin",2013,TELKOMNIKA Indonesian Journal of Electrical En...,1
1814,Blockchain-Based Distributed File System Secur...,"ZZ Mohtar, MY Idris, F Mohamed",2022,2022 4th International Conference on Smart Sen...,


#### Export the data as .csv file

In [4]:
df.to_csv('GoogleScholar.csv',index=False)

#### Save the data to MongoDB

In [15]:
import pymongo

# Connect to MongoDB
client = pymongo.MongoClient(URL)
db = client['google_scholar']
collection = db['google_scholar']

data = df.to_dict(orient='records')
collection.insert_many(data)

  return rust_x509.load_der_x509_certificate(data)


<pymongo.results.InsertManyResult at 0x1c5b9c02b20>