## Prototype for Google Scholar Code

- Web scraper I developed to scrape Google Scholar
- The CSV is the list of schools scraped for this project

In [41]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import time
import random
import fake_useragent # rotating user agents
import os

In [40]:
from selenium import webdriver
# Location of the chromedriver binary on this machine.
chrome_driver_path = '/usr/local/bin/chromedriver'
# Wrap the binary in a selenium Service object and launch it right away.
# NOTE(review): sel_page builds its drivers with webdriver.Chrome(options=...)
# and never passes this service, and nothing visible here stops it again --
# confirm whether this standalone chromedriver process is still needed.
chrome_service = webdriver.chrome.service.Service(chrome_driver_path)
chrome_service.start()
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


In [None]:
# USING SELENIUM
def sel_page(url):
    '''
    Opens a url in a fresh headless Chrome session and parses the rendered page.
    Inputs:
        url: (str) the address to load
    Returns:
        soup: (BeautifulSoup) the parsed page source
        driver: the live selenium driver, still open on the page; the caller
            is responsible for calling driver.quit() when done with it
    Raises:
        whatever loading or reading the page raises; the browser that was
        started for this call is shut down before the error propagates
    '''
    c_options = Options()
    c_options.add_argument('--headless') # this wont open a new window
    # a different random user agent for every session
    c_options.add_argument(f'user-agent={fake_useragent.UserAgent().random}')

    driver = webdriver.Chrome(options=c_options)

    try:
        driver.get(url)
        # parse the rendered source once; comparing the soup object with the
        # raw source string is never equal, so that check only re-parsed
        soup = bs(driver.page_source, 'html.parser')
    except BaseException:
        # do not leave an orphaned browser behind when the load fails
        driver.quit()
        raise

    return soup, driver

In [None]:
# GETTING SCHOOL NAMES FROM CSV
def extracting_school_names(csv):
    '''
    Reads the csv and collects the 'name' field of every row.
    Input: csv (csv) file
    Returns: names_lst (list) the school names, in the order they appear
    '''
    school_table = pd.read_csv(csv)

    # walk the rows one at a time and keep only the name field of each
    return [record['name'] for _, record in school_table.iterrows()]

In [None]:
# MAKING GS URLS FROM SCHOOL NAMES
def making_first_urls(csv):
    '''
    Creates followable urls from names extracted from csv
    Input: csv (csv) file
    Returns: full_urls (lst) a list of tuples containing followable urls and school name
    '''
    # imported here so the function does not depend on another cell's imports
    from urllib.parse import quote_plus

    names_lst = extracting_school_names(csv)

    full_urls = []

    for name in names_lst:
        # collapse runs of whitespace, then url-encode: spaces become '+',
        # and characters such as '&' (e.g. "Texas A&M University") are escaped
        # so they cannot cut the mauthors parameter short
        search = quote_plus(' '.join(name.split()))
        test_url = f'https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors={search}&btnG='
        full_urls.append((test_url, name))

    return full_urls

In [None]:
# NAVIGATING TO CORRECT PAGE
def getting_correct_page(csv):
    '''
    The name + the link does not take you to the correct page.
    This function navigates to the completely correct page.
    Inputs:
        csv: (csv)
    Outputs:
        full_links: (dict) a dictionary of the correct links to begin scraping
        {school: link}
    Errors for a single school are printed and that school is skipped.
    '''
    # imported here so the function does not depend on another cell's imports
    from urllib.parse import urljoin

    website = 'https://scholar.google.com/'

    result = making_first_urls(csv)

    full_links = {}
    for (url, school_name) in result:
        driver = None
        try:
            soup, driver = sel_page(url)
            time.sleep(random.randint(5, 10))
            if soup:
                # the first highlighted span on the page; it does not depend on
                # the link being inspected, so look it up once per page
                # NOTE(review): this is a page-level match, not a per-link one;
                # confirm whether each link should be checked on its own text
                highlighted = soup.find('span', class_='gs_hlt')
                name_matches = (highlighted is not None
                                and highlighted.text == school_name)

                # for element in that tiny block on the top
                for element in soup.find_all('div', class_='gsc_instbox_sec'):
                    # for the links in that block (sometimes multiple)
                    for link in element.find_all('a', href=True):
                        if name_matches:
                            # urljoin avoids the doubled '/' that plain
                            # concatenation gives for hrefs starting with '/'
                            full_link = urljoin(website, link['href'])
                            full_links[school_name] = full_link
                            print('link added:', full_link)
                            time.sleep(random.randint(5, 10))
                            break
                            # breaking or it will continue to grab
                            # all the links in the box
                        else:
                            print('link not added:', school_name)
                            time.sleep(random.randint(5, 10))

        except Exception as e:
            print('Error:', e,
                'School:', school_name)

        finally:
            # the search page is no longer needed; close its browser so one
            # Chrome process is not left running per school
            if driver is not None:
                driver.quit()

    return full_links

In [None]:
# EXTRACTING INFO FROM CORRECT PAGE
def _tag_text(tag):
    '''Returns the text of a parsed tag, or None when the tag was not found.'''
    return tag.get_text() if tag is not None else None


def getting_info(soup, school):
    '''
    extracting the actual info from the page
    Inputs:
        soup: (BeautifulSoup) The souped page
        school: (str) the name of the school
    Outputs:
        df: (dataframe) a pd dataframe containing the school information
            (empty when no profile was found on the page)
    Side effect: appends the rows to '<school>_csv.csv', writing the header
    only when that file does not exist yet.
    '''

    data = []
    df = pd.DataFrame()
    try:
        container = soup.find('div', id='gs_bdy_ccl')
        if container is None:
            print('Error: profile container not found')
            profiles = []
        else:
            # search the container itself: looping over its children breaks on
            # plain text nodes, which have no find_all
            profiles = container.find_all('div', class_='gsc_1usr')

        for profile in profiles:
            name = _tag_text(profile.find('h3', class_='gs_ai_name'))

            # only care about the number, which is the last word of the text;
            # None (not 0) marks "info unavailable" as opposed to 0 citations
            cited_text = _tag_text(profile.find('div', class_='gs_ai_cby'))
            cited_words = cited_text.split() if cited_text else []
            citation = cited_words[-1] if cited_words else None

            aff = _tag_text(profile.find('div', 'gs_ai_aff'))

            # we only want verified prof emails once dataset is cleaned
            email = _tag_text(profile.find('div', 'gs_ai_eml'))

            print(school, name, citation, aff, email)

            d = { 'name': name, 'num_cited': citation, 'affiliation': aff, 'email': email}
            data.append(d)

    except Exception as e:
        # best effort: report the problem and keep whatever was parsed so far
        print('Error:', e)

    # writing scraped info into csv
    finally:
        if data:
            out_path = f'{school}_csv.csv'
            df = pd.DataFrame(data)
            df['school'] = school
            df = df[['school', 'name', 'num_cited', 'affiliation', 'email']]
            df.to_csv(out_path, index=False, mode='a', header=not os.path.exists(out_path))

    return df

In [None]:
def next_page(driver, school):
    '''
    This function uses selenium to walk through the pages of google scholar,
    scraping each page before moving on to the next one.
    Inputs:
        driver: the selenium driver, already on the first page; it is closed
            by this function (it is replaced after every page turn)
        school: (str) the name of the school
    Outputs:
        full_df: (df) the rows of every page that was crawled, in page order
            (empty when nothing could be scraped)
    Raises:
        KeyboardInterrupt: re-raised after reporting, so the crawl can be stopped
    '''

    times_run = 0
    page_frames = []  # one dataframe per scraped page

    # locator of the "Next" button
    next_button = (By.XPATH, "//button[contains(@onclick, \
                                        'window.location') \
                                        and contains(@aria-label, 'Next')]")

    try:
        while True:
            try:
                soup = bs(driver.page_source, 'html.parser') # souping page
                page_df = getting_info(soup, school) # making into dataframe
                if not page_df.empty:
                    page_frames.append(page_df)

                # finding the button to click
                button = WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable(next_button))

                # waiting, then clicking button
                time.sleep(random.randint(10, 15))
                button.click()
                # on next page, waiting, then restarting
                time.sleep(random.randint(1, 5))
                times_run += 1

                # restarting driver, helps avoid block
                current_url = driver.current_url
                driver.quit()
                _, driver = sel_page(current_url)

            except TimeoutException:
                print('timeout') # times out when its reached the last page.
                                 #(or we got blocked if not last page)
                break

            except KeyboardInterrupt:
                # KeyboardInterrupt is not an Exception subclass, so it needs
                # its own clause; report it and let it stop the crawl
                print('kb int')
                raise

            except Exception as e:
                print('Error:', e)
                break

    finally:
        print('pages crawled:', times_run)
        # close the last browser; it may already be closed if the restart failed
        try:
            driver.quit()
        except Exception:
            pass

    if not page_frames:
        return pd.DataFrame()

    full_df = pd.concat(page_frames, ignore_index=True)
    return full_df

In [None]:
def crawling_loop(csv):
    '''
    This is the loop that brings everything together and the only one you have to run
    Inputs:
        csv: (csv file) schools you want to scrape ('R1.CSV'/'R2.CSV')
    Returns:
        full_df: (dataframe) full data frame of scraped information
            (empty when no school could be scraped)
    Side effect: writes the combined dataframe to 'test_run.csv' when at least
    one school was crawled.
    '''

    links = getting_correct_page(csv)
    indiv_df = []

    for school, link in links.items():
        # only the driver is needed here; next_page re-parses the page itself
        _, driver = sel_page(link)
        indiv_df.append(next_page(driver, school))

    # pd.concat raises ValueError on an empty list, so handle "no links found"
    if not indiv_df:
        print('no schools were crawled')
        return pd.DataFrame()

    full_df = pd.concat(indiv_df, ignore_index=True)
    full_df.to_csv('test_run.csv', index=False) # combined output of the run

    return full_df

In [None]:
# Example of how you would run the notebook to get all the schools together:
# `csv` is the path handed to crawling_loop, and `school_df` receives the
# dataframe that crawling_loop returns.
csv = 'test_csv.csv'
school_df = crawling_loop(csv)

## Regex Expression

- A few times early on, the scraper did not time out when it reached the last page, or something else went wrong. When this happened, I used a regular expression to recover the necessary information from the profile lines the scraper had printed.

In [35]:
import re

In [36]:
# first, you copy and paste the lines that the scraper printed out
# into string format. Using these lines as an example
# (one profile per line, in the order getting_info prints the fields:
# school, name, citation count, affiliation, email)
df_lines = '''
Ball State University Scott Trappe 17074 Professor of Human Bioenergetics, Ball State University Verified email at bsu.edu
Ball State University Leonard Kaminsky 13665 Professor, Ball State University Verified email at bsu.edu
Ball State University Thomas Holtgraves 10098 Professor of Psychological Science, Ball State University Verified email at bsu.edu
Ball State University Gerardo Ramirez 8618 Ball State University Verified email at bsu.edu
Ball State University Jerrell Cassady 7568 Ball State University Verified email at bsu.edu
Ball State University Guy mittleman 6671 Ball State University Verified email at bsu.edu
Ball State University Dr. Craig Webster 5403 Associate Professor, Ball State University Verified email at bsu.edu
Ball State University Lawrence Gerstein 4822 Ball State University Verified email at bsu.edu
Ball State University Sergiy Rosokha 4798 Professor of Chemistry Ball State University Verified email at bsu.edu
Ball State University Bowen Zhang 4089 Ball State University Verified email at bsu.edu
Ball State University Douglas W Naffziger 4063 Associate Professor of Management, Ball State University Verified email at bsu.edu
Ball State University Panayiotis Theodossiou 3717 Professor of Finance, Ball State University Verified email at bsu.edu
Ball State University Kristin M. Perrone 3624 Ball State University Verified email at bsu.edu
Ball State University John Millis 3525 Professor of Physics, Ball State University Verified email at bsu.edu
Ball State University Richard J. Petts 3507 Ball State University Verified email at bsu.edu
Ball State University Sungok Serena Shim 3405 Ball State University Verified email at bsu.edu
Ball State University Lawrence Judge 3386 Ball State University Verified email at bsu.edu
Ball State University James M Nyce 3372 Professor Emeritus, Ball State University · Department of Anthropology. PhD Brown 87 … Verified email at bsu.edu
Ball State University Matthew R Marvel 3355 Associate Dean and The George A. Ball Distinguished Professor, Ball State University Verified email at bsu.edu
Ball State University Stefanía Ægisdóttir 3206 Ball State University Verified email at bsu.edu
'''

In [37]:
# then, you strip and split those lines
lines = df_lines.strip().split('\n')

# and formulate your regex expression
# - name is lazy (.*?) so it stops at the FIRST standalone number, which is the
#   citation count; a greedy name runs on to a later number in the affiliation
#   (e.g. "... PhD Brown 87 ...") and swallows the real count and title
# - \s* before the email keeps trailing whitespace out of the title
exp = (
r'(?P<school>Ball State University)\s+(?P<name>.*?)\s(?P<num_cited>\d+)\s(?P<title>.*?)\s*(?P<email>Verified.*)')

In [38]:
# and lastly, you would run them through this
data = []
df = pd.DataFrame()

# number the lines from 1 so a skipped line can be reported by its position
for line_no, line in enumerate(lines, start=1):
    match = re.search(exp, line)
    if match is None:
        # one line that does not fit the pattern should not stop the rest
        print(f"Error on line {line_no:d}: no match for {line!r}")
        continue

    school = match.group('school')
    name = match.group('name')
    num_cited = match.group('num_cited')
    title = match.group('title')
    email = match.group('email')
    print(school, name, num_cited, title, email)
    d = {'school': school, 'name': name, 'num_cited': num_cited,
         'affiliation': title, 'email': email}
    data.append(d)

# df stays an empty dataframe when no line matched
if data:
    df = pd.DataFrame(data)
    df = df[['school', 'name', 'num_cited', 'affiliation', 'email']]

Ball State University Scott Trappe 17074 Professor of Human Bioenergetics, Ball State University  Verified email at bsu.edu
Ball State University Leonard Kaminsky 13665 Professor, Ball State University  Verified email at bsu.edu
Ball State University Thomas Holtgraves 10098 Professor of Psychological Science, Ball State University  Verified email at bsu.edu
Ball State University Gerardo Ramirez 8618 Ball State University  Verified email at bsu.edu
Ball State University Jerrell Cassady 7568 Ball State University  Verified email at bsu.edu
Ball State University Guy mittleman 6671 Ball State University  Verified email at bsu.edu
Ball State University Dr. Craig Webster 5403 Associate Professor, Ball State University  Verified email at bsu.edu
Ball State University Lawrence Gerstein 4822 Ball State University  Verified email at bsu.edu
Ball State University Sergiy Rosokha 4798 Professor of Chemistry Ball State University  Verified email at bsu.edu
Ball State University Bowen Zhang 4089 Ball

In [39]:
# and this would return the dataframe
# there are still things that would need to be cleaned
# (the name on row 17, for example), but for the most part it separates the data correctly
df

Unnamed: 0,school,name,num_cited,affiliation,email
0,Ball State University,Scott Trappe,17074,"Professor of Human Bioenergetics, Ball State U...",Verified email at bsu.edu
1,Ball State University,Leonard Kaminsky,13665,"Professor, Ball State University",Verified email at bsu.edu
2,Ball State University,Thomas Holtgraves,10098,"Professor of Psychological Science, Ball State...",Verified email at bsu.edu
3,Ball State University,Gerardo Ramirez,8618,Ball State University,Verified email at bsu.edu
4,Ball State University,Jerrell Cassady,7568,Ball State University,Verified email at bsu.edu
5,Ball State University,Guy mittleman,6671,Ball State University,Verified email at bsu.edu
6,Ball State University,Dr. Craig Webster,5403,"Associate Professor, Ball State University",Verified email at bsu.edu
7,Ball State University,Lawrence Gerstein,4822,Ball State University,Verified email at bsu.edu
8,Ball State University,Sergiy Rosokha,4798,Professor of Chemistry Ball State University,Verified email at bsu.edu
9,Ball State University,Bowen Zhang,4089,Ball State University,Verified email at bsu.edu
