### Multi-page, Multi-term Web Scraper For Google Scholar version 6
* Designed to scrape all results of Google Scholar searches, up to Scholar's imposed maximum of 100 pages (1000 results) for each search.
* Developed using the beautifulsoup and pandas packages; it also requires the requests package and Python's built-in time module.
* This code extends a single-page scraping architecture for Google.com search results developed by Edmund Martin, whose original work is available [here](https://edmundmartin.com/scraping-google-with-python/).
* Adaptation for Google Scholar, iteration over pages, data extraction and manipulation, and export formatting were coded by Cory J. Combs.

#### This scraper consists of five components:
1. A user agent, which provides identifying information to the server
2. A function to fetch results
3. A function to parse results
4. A function to execute fetching and parsing with error handlers
5. The main search script, which:
  * executes the search with the input parameters,
  * outputs the results in a pandas data frame,
  * extracts metadata elements not consistently identifiable through Google Scholar's html or xml alone,
  * cleans and formats the data, and
  * exports the fully formatted dataframe into Excel

The results may be explored in the output Excel file or in Python using pandas. The final formatted pandas data frame is called "data_df_clean".

In [0]:
# Imports
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

In [0]:
# Build the user agent: HTTP request headers sent with every Google Scholar request
# (passed as ``headers=`` to ``requests.get`` in ``fetch_results``). The value identifies
# the client as a desktop Chrome 61 browser on 64-bit Windows 10.
USER_AGENT = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

In [0]:
# Create function to fetch Google Scholar results given specified search term
def fetch_results(search_term, language_code):
    """Fetch one page of Google Scholar search results.

    The page offset and the publication-year range are not parameters: they are
    read from the module-level names ``pagination``, ``from_year`` and ``to_year``,
    which must be defined before this function is called.

    Args:
        search_term: The query, written in Google Scholar search syntax.
        language_code: Interface language code sent as the ``hl`` URL parameter.

    Returns:
        A ``(search_term, html)`` tuple, where ``html`` is the response body text.

    Raises:
        AssertionError: If ``search_term`` is not a string.
        requests.HTTPError: If the server answers with an error status code.
        requests.RequestException: On connection problems or a timeout.
    """
    # Local import keeps this function self-contained
    from urllib.parse import quote_plus

    # Raise explicitly rather than using ``assert`` so the check survives ``python -O``;
    # AssertionError is kept because scrape_google catches exactly that type
    if not isinstance(search_term, str):
        raise AssertionError('Search term must be a string')

    # Percent-encode the whole term (spaces become '+') so that characters such as
    # '&', '#', '%' or '+' inside the query cannot corrupt the URL
    escaped_search_term = quote_plus(search_term)

    # Establish template URL; "start" is the result offset, i.e. the page index followed by a 0
    scholar_url = 'https://scholar.google.com/scholar?start={}0&q={}&hl={}&as_sdt=1,21&as_ylo={}&as_yhi={}'.format(pagination, escaped_search_term, language_code, from_year, to_year)

    # Response handling; the timeout prevents an unresponsive server from hanging the scraper
    response = requests.get(scholar_url, headers=USER_AGENT, timeout=30)
    response.raise_for_status()

    return search_term, response.text

In [0]:
# Create function to parse results
def parse_results(html, keyword):
    """Extract the individual search results from one Google Scholar results page.

    Each ``div.gs_ri`` element that has both a heading and a real link becomes a
    dictionary with the keys Keyword, Page, Rank, Title, Metadata, Description and
    Link. The page number and the progress-message switch are read from the
    module-level names ``pagination`` and ``verbose``.

    Args:
        html: The HTML text of a results page.
        keyword: The search term that produced the page; stored with every entry.

    Returns:
        A list of result dictionaries in page order. ``Metadata`` holds the raw
        ``div.gs_a`` element (or None); ``Description`` holds text (or None).
    """
    page = BeautifulSoup(html, 'html.parser')

    entries = []
    position = 1  # Position among the accepted results on this page

    for block in page.find_all('div', attrs={'class': 'gs_ri'}):
        heading = block.find('h3')
        anchor = block.find('a', href=True)
        meta_tag = block.find('div', attrs={'class': 'gs_a'})
        snippet = block.find('div', attrs={'class': 'gs_rs'})

        # A usable entry needs both a link and a title
        if not (anchor and heading):
            continue

        href = anchor['href']
        heading_text = heading.get_text()
        snippet_text = snippet.get_text() if snippet else snippet

        # Placeholder links carry no destination, so the entry is skipped
        if href == '#':
            continue

        entry = {
            'Keyword': keyword,
            'Page': pagination + 1,
            'Rank': position,
            'Title': heading_text,
            'Metadata': meta_tag,
            'Description': snippet_text,
            'Link': href,
        }
        entries.append(entry)
        if verbose == "yes":
            print("* New result added...")

        # Pause for two seconds after each entry to slow the overall request rate
        time.sleep(2)

        position += 1

    return entries

In [0]:
# Create scraping function and establish error handling mechanisms
def scrape_google(search_term, language_code):
    """Fetch and parse one results page, translating failures into readable errors.

    Args:
        search_term: The query, written in Google Scholar search syntax.
        language_code: Interface language code forwarded to ``fetch_results``.

    Returns:
        The list of result dictionaries produced by ``parse_results``.

    Raises:
        Exception: With an explanatory message when the arguments are invalid
            (AssertionError), the server returns an error status
            (requests.HTTPError), or the request itself fails
            (requests.RequestException). The original error is attached as
            ``__cause__`` so the root cause remains visible in tracebacks.
    """

    try:
        keyword, html = fetch_results(search_term, language_code)  # Fetch search results
        return parse_results(html, keyword)  # Parse fetched results
    except AssertionError as err:
        raise Exception("Incorrect arguments were parsed to the function; please revisit your inputs") from err
    except requests.HTTPError as err:
        # Must precede RequestException, of which HTTPError is a subclass
        raise Exception("Google may have blocked the search; if the problem persists, try running the search from a different IP address, e.g. by connecting to a different network, resetting your router, or using a VPN") from err
    except requests.RequestException as err:
        raise Exception("Connection issue detected; please check your internet connection") from err

In [0]:
# Implement search from script execution
if __name__ == '__main__':

    # Select parameters - use standard Google Scholar search syntax within single quotes;
    # every element of the list is run as a separate search
    keywords = ['"life cycle assessment" OR LCA, electricity, temporal']
    from_year = 2009  # Start year/lower bound (read as a global by fetch_results)
    to_year = 2009  # End year/upper bound (read as a global by fetch_results)
    language_code = "en"  # English-language results only
    verbose = "yes"  # Enter "yes" to show addition of each result; any other value turns this off

    # Setup
    page_max = 100  # Number of page requests to attempt from the start pagination
    # Google Scholar only shows 100 pages; iterating over empty pages will simply add no results
    data = []

    # Initialize Search
    print("Initiating search")
    print("-----------------")
    for keyword_number, keyword in enumerate(keywords, start=1):
        # Zero-based index of the first page to request; fetch_results and parse_results
        # read this module-level name directly, so it must keep the name "pagination".
        # NOTE(review): 79 looks like a resume point from an earlier run - set to 0 to
        # start from the first page of results.
        pagination = 79
        print("Now searching for", keyword)
        for _ in range(page_max):
            # pagination only advances after a successful page, so a failed page is retried
            # on the next iteration (at most page_max attempts in total per keyword)
            try:
                print("# Now on page", pagination + 1)
                results = scrape_google(keyword, language_code)
                # Requests only results in the chosen language - however, the last pages
                # tend to include results in other languages
                for result in results:
                    data.append(result)
                    time.sleep(1.05)  # Add pauses to relieve pressure on server
                pagination += 1
                print("# Pausing between pages to relieve pressure on server...")
                time.sleep(6)
            except Exception as e:
                print(e)
                # Wait before retrying so a blocked or failing server is not hit in a tight loop
                print("# Pausing before retrying...")
                time.sleep(6)
        if keyword_number < len(keywords):
            # Only pause when another keyword follows; after the last one there is nothing to protect
            print("# Pausing between iterations to relieve pressure on server...")
            time.sleep(30)  # For multiple keywords, use at least 30 (seconds)
        print("-------------------------------------------------------------")

    # Without any results there is no "Metadata" column to clean, so stop with a clear message
    if not data:
        raise SystemExit("No results were collected; nothing to format or export.")

    # Format the raw data as a data frame
    print("Formatting raw data...")
    data_df = pd.DataFrame(data)  # Convert list to data frame

    # NOTE: the author, year, journal, and publisher info are not consistently identifiable in the html or xml alone,
    # and so are extracted and manipulated using pandas, below

    # Prepare metadata: replace each stored html element with its plain text
    print("Cleaning data...")
    data_df_extraction = data_df.copy()  # Work on a copy so the raw frame stays untouched
    data_df_extraction['Metadata'] = [
        '' if tag is None else BeautifulSoup(str(tag), 'html.parser').get_text()
        for tag in data_df_extraction['Metadata']
    ]
    metadata_text = data_df_extraction['Metadata']

    # Extract year from metadata: four digits surrounded by spaces (only the digits are kept)
    data_df_extraction['Year'] = metadata_text.str.extract(r' (\d{4}) ', expand=False)

    # Extract publisher info - always appears after year and a hyphen.
    # n=1 splits at the first "year -" only; reindex guarantees both columns exist
    # even when no row contains a year.
    publisher_split = (metadata_text.str.split(r'\d\d\d\d -', n=1, expand=True)
                       .reindex(columns=[0, 1])
                       .rename(columns={0: 'Temp', 1: 'Publisher'}))
    data_df_extraction = data_df_extraction.join(publisher_split)

    # Extract and clean author and journal info - the indirect approach was required to handle hyphenated names
    author_split = (data_df_extraction['Temp'].str.split('- ', expand=True)
                    .reindex(columns=[0, 1, 2])
                    .rename(columns={0: 'Author(s)', 1: 'Journal', 2: 'Unexpected Terms'}))
    data_df_extraction = data_df_extraction.join(author_split)
    # Strip away unnecessary commas; astype(object) keeps the .str accessor usable
    # when the column contains no journal text at all
    data_df_extraction['Journal'] = data_df_extraction['Journal'].astype(object).str.replace(',', '', regex=False)

    # Drop now-obsolete "Metadata" and "Temp" columns
    data_df_extraction = data_df_extraction.drop(columns=['Metadata', 'Temp'])

    # Rearrange columns into final order
    col = ['Keyword', 'Year', 'Page', 'Rank', 'Title', 'Author(s)', 'Description', 'Journal', 'Publisher', 'Link']
    data_df_clean = data_df_extraction[col]  # Create new data frame with specified column order

    # Export Finalized Data
    # NOTE(review): the file name is fixed and does not follow from_year/to_year
    print("Exporting final data frame to Excel...")
    print("--------------------------------------")
    data_df_clean.to_excel(r'scholar_search_results_2009-2018_3.xlsx', index=False, header=True)

    print("Search complete. Results exported to Excel.")