# Google Scholar Scraper
Scraping Google Scholar citations based on replication dataset utilizing Playwright to search the page, NopeCHA to get past any CAPTCHAs and BeautifulSoup to scrape HTMLs

### Set Up

In [41]:
from playwright.async_api import async_playwright
import pandas as pd
from bs4 import BeautifulSoup
import time

#playwright = await async_playwright().start()
#browser = await playwright.chromium.launch(headless=False)

#### Add in NopeCHA extension to Playwright

In [42]:
import requests
import zipfile
import json

# https://developers.nopecha.com/guides/extension/#loading-the-nopecha-extension-in-a-browser
with open('chromium_automation.zip', 'wb') as f:
    f.write(requests.get('https://github.com/NopeCHALLC/nopecha-extension/releases/latest/download/chromium_automation.zip').content)

with zipfile.ZipFile('chromium_automation.zip', 'r') as zip_ref:
    zip_ref.extractall("nopecha")

#### Launch NopeCHA-enabled browser 

In [43]:
import asyncio
from playwright.async_api import async_playwright, Playwright

user_data_dir = "./user-data"

playwright = await async_playwright().start()
browser = await playwright.chromium.launch_persistent_context(
    user_data_dir,
    headless=False,
    ignore_default_args=['--enable-automation'],
    args=[
        f"--disable-extensions-except=./nopecha",
        f"--load-extension=./nopecha",
    ],
)

### Load in data

In [35]:
article_df = pd.read_csv("../data/replication_database.csv")
article_refs = article_df['ref_original']

In [36]:
article_refs = list(set(article_refs))
len(article_refs)

346

### Set up scraper
Loop through all articles and append data, getting citations and abstracts
Note: this version FAILED after I discovered a nan in the list. To save time and risk of Google banning me, I started again with a second scraper below about mid-way through

In [23]:
article_list = []
problem_articles = []
page = await browser.new_page()

for article in article_refs:
    # set up dict
    article_dict = {}
    print(article)
    # col for joining back
    article_dict['ref_original'] = article
    
    await page.goto("https://scholar.google.com/")

    time.sleep(1)

    # fill text box
    await page.get_by_role('textbox').fill(article)

    # click search
    await page.get_by_role("button", name="Search").click()

    time.sleep(2)

    # if page.get_by_text == "Please show you're not a robot":
    #     print("hit a captcha")

    # try:
    #     await expect(page.get_by_text("Articles").toBeVisible())
    #     print("got the articles id")
    # except:
    #     print("oops")

    # scrape page
    html = await page.content()
    doc = BeautifulSoup(html)
    
    # get first citation from search results
    first_citation = doc.find(class_="gs_ri")
    
    # abstract
    try:
        article_dict['abstract'] = first_citation.find(class_="gs_fma_abs").text
    except:
        print("Couldn't scrape abstract")
        problem_articles.append(article_dict)   
        
    # citation #
    try:
        article_dict['cited_by_count'] = first_citation.find(class_="gs_fl").find_all("a")[2].text.replace("Cited by ", "")
    except:
        print("Couldn't scrape citations")
        problem_articles.append(article_dict)   
        
    # name in google scholar for confirmation
    try:
        article_dict['gs_title'] = first_citation.find(class_="gs_rt").text
    except:
        print("Couldn't scrape title")
        problem_articles.append(article_dict)          

    article_list.append(article_dict)

    time.sleep(2)



Stavrova, O., Ehlebracht, D., & Vohs, K. D. (2020). Victims, perpetrators, or both? The vicious cycle of disrespect and cynical beliefs about human nature. Journal of Experimental Psychology: General, 149(9), 1736.
Schnall, S., Benton, J., & Harvey, S. (2008). With a Clean Conscience: Cleanliness Reduces the Severity of Moral Judgments. Psychological Science, 19(12), 1219-1222.  Study 2
Wu, S.-L., & Ortega, L. (2013). Measuring global oral proficiency in SLA research: A new elicited imitation test of L2 Chinese. Foreign Language Annals, 46(4), 680-704.
Livingston, R. W., Rosette, A. S., & Washington, E. F. (2012). Can an agentic Black woman get ahead? The impact of race and interpersonal dominance on perceptions of female leaders. Psychological science, 23(4), 354-358.
Masicampo, E.J., & Baumeister, R.F. (2008). Toward a physiology of dual-process reasoning and judgment: Lemonade, willpower, and expensive rule-based analysis. Psychological Science, 19(3), 255-260.
Nekmat, E., Gower, K.

Exception: Locator.fill: Connection closed while reading from the driver

In [26]:
gs_df = pd.DataFrame(article_list) 

In [27]:
gs_df.to_csv("../data/gs_scrape.csv", index=False)

## Resets scraper without the nan that caused the failure

In [44]:
# get rid of nan that caused break and restart
article_refs = [x for x in article_refs if str(x) != 'nan']


In [45]:
article_list_2 = []
problem_articles_2 = []
page = await browser.new_page()

for article in article_refs[130:]:
    # set up dict
    article_dict = {}
    print(article)
    # col for joining back
    article_dict['ref_original'] = article
    
    await page.goto("https://scholar.google.com/")

    time.sleep(1)

    # fill text box
    await page.get_by_role('textbox').fill(article)

    # click search
    await page.get_by_role("button", name="Search").click()

    time.sleep(2)

    # if page.get_by_text == "Please show you're not a robot":
    #     print("hit a captcha")

    # try:
    #     await expect(page.get_by_text("Articles").toBeVisible())
    #     print("got the articles id")
    # except:
    #     print("oops")

    # scrape page
    html = await page.content()
    doc = BeautifulSoup(html)
    
    # get first citation from search results
    first_citation = doc.find(class_="gs_ri")
    
    # abstract
    try:
        article_dict['abstract'] = first_citation.find(class_="gs_fma_abs").text
    except:
        print("Couldn't scrape abstract")
        problem_articles.append(article_dict)   
        
    # citation #
    try:
        article_dict['cited_by_count'] = first_citation.find(class_="gs_fl").find_all("a")[2].text.replace("Cited by ", "")
    except:
        print("Couldn't scrape citations")
        problem_articles.append(article_dict)   
        
    # name in google scholar for confirmation
    try:
        article_dict['gs_title'] = first_citation.find(class_="gs_rt").text
    except:
        print("Couldn't scrape title")
        problem_articles.append(article_dict)          

    article_list_2.append(article_dict)

    time.sleep(2)



Hauser, M. D., Cushman, F. A., Young, L., Jin, R., & Mikhail, J. M. (2007). A dissociation between moral judgments and justifications. Mind & Language, 22, 1–21.  j.1468-0017.2006.00297.x Scenarios 1 and 2
Kruger, J. (1999). Lake Wobegon be gone! The "below-average effect" and the egocentric nature of comparative ability judgments. Journal of Personality and Social Psychology, 77(2), 221–232.
Reis, H. T., Lee, K. Y., O'Keefe, S. D., & Clark, M. S. (2018). Perceived partner responsiveness promotes intellectual humility. Journal of Experimental Social Psychology, 79, 21-33.
Exline, J. J., Baumeister, R. F., Zell, A. L., Kraft, A. J., & Witvliet, C. V. (2008). Not so innocent: Does seeing one's own capability for wrongdoing predict forgiveness?. Journal of Personality and Social Psychology, 94(3), 495.
Thürmer, J. L., & McCrea, S. M. (2018). Beyond motivated reasoning: Hostile reactions to critical comments from the outgroup. Motivation Science, 4(4), 333-346.
Santello, M., Flanders, M., 

In [46]:
gs_df_2 = pd.DataFrame(article_list_2) 

In [47]:
gs_df_2.to_csv("../data/gs_scrape_2.csv", index=False)

### Aggregate and join
Takes the two successful dfs and concats before joining back to original df

In [56]:
gs_df = pd.read_csv("../data/gs_scrape.csv")
gs_df_2 = pd.read_csv("../data/gs_scrape_2.csv")
gs_df_full = pd.concat([gs_df, gs_df_2])

In [74]:
# First, check the types again to confirm
print("Article DataFrame type:", article_df["ref_original"].dtype)
print("GS DataFrame type:", gs_df_full["ref_original"].dtype)

# Explicitly convert with string methods to ensure clean strings
article_df["ref_original"] = article_df["ref_original"].astype(str).str.strip()
gs_df_full["ref_original"] = gs_df_full["ref_original"].astype(str).str.strip()

# Verify conversion
print("After conversion:")
print("Article DataFrame type:", article_df["ref_original"].dtype)
print("GS DataFrame type:", gs_df_full["ref_original"].dtype)

# Try merge instead of join (sometimes works better)
result_df = pd.merge(article_df, 
                    gs_df_full, 
                    on="ref_original", 
                    how='left')

Article DataFrame type: object
GS DataFrame type: object
After conversion:
Article DataFrame type: object
GS DataFrame type: object


In [None]:
result_df.to_csv("../data/replication_database.csv", index=False)