## The idea: 
First, we collect all the links we need: one base link for each company and each region. Other filters can then be obtained simply by modifying these base links.


## Installing the necessary packages & driver
We need two specific Python packages: Selenium, which allows us to automate a web browser, and Beautiful Soup, which is a classic HTML parser. Both can easily be installed with pip or conda. We also need a driver for the browser we are going to automate. The links to install the driver can be found here: https://selenium-python.readthedocs.io/installation.html

Note: Be careful to install the driver version that corresponds to your browser. To find out which version of Google Chrome you are using, go to Help > About Google Chrome.

## Setup

In [1]:
# Imports
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, ElementNotInteractableException, WebDriverException

import logging
import re
import random
import time
import pandas as pd
from openpyxl import load_workbook

In [2]:
# # For logging
# logger = logging.getLogger()
# fhandler = logging.FileHandler(filename='links.log', mode='a')
# formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# fhandler.setFormatter(formatter)
# logger.addHandler(fhandler)
# logger.setLevel(logging.DEBUG)

In [3]:
def get_status():
    """
    Load the first worksheet of 'status.xlsx' and return its columns A to E.

    Judging by the names they are bound to, the columns hold, in order: the
    company names, the number of options found at search, the links, the
    filtered links and the pages scraped so far. Each column is returned
    exactly as the worksheet yields it for that column letter (openpyxl
    cells, not plain values).
    """
    workbook = load_workbook('status.xlsx')
    first_sheet = workbook.worksheets[0]
    # Columns are read left to right, one worksheet lookup per letter.
    companies, number_options, links, links_filtered, pages = (
        first_sheet[letter] for letter in 'ABCDE'
    )
    return companies, number_options, links, links_filtered, pages

In [4]:
def cells_to_list(cells):
    """
    Return a list holding the ``value`` attribute of every cell in ``cells``.

    The values are kept in iteration order and are not converted in any way.
    """
    return [cell.value for cell in cells]

In [5]:
def get_rid_overlay(driver):
    """
    Remove the blocking 'hardsellOverlay' element from the current page.

    Sometimes the site shows an overlay that cannot be dismissed by clicking
    and that blocks interaction with the page. The injected script removes
    that overlay when it is present, re-enables scrolling on the body, hides
    the '#LoginModal' element through an added stylesheet and stops scroll
    events from propagating.

    Calling this when no overlay is displayed is safe: the removal step is
    simply skipped.

    :param driver: object exposing ``execute_script`` (a Selenium WebDriver)
    :return: the same ``driver`` that was passed in
    """
    # The overlay lookup is guarded: indexing an empty collection gives
    # `undefined`, and calling .remove() on it would make the script throw.
    driver.execute_script("""
        (function(){
          var overlays = document.getElementsByClassName('hardsellOverlay');
          if (overlays.length > 0) {
            overlays[0].remove();
          }
          document.getElementsByTagName("body")[0].style.overflow = "scroll";
          let style = document.createElement('style');
          style.innerHTML = `
            #LoginModal {
              display: none!important;
            }
          `;
          document.head.appendChild(style);
          window.addEventListener("scroll", function (event) {
            event.stopPropagation();
          }, true);
        })();
        """)
    return driver

In [6]:
def setup_driver_to_reviews_search(
        driver_path='/Users/corentin/OneDrive - Universite de Liege/chromedriver',
        start_url='https://www.glassdoor.com/Reviews/index.htm'):
    """
    Launch a Chrome driver, open the reviews search page and size the window.

    :param driver_path: path of the chromedriver executable; the default is
        the location used on the original author's machine, so pass your own
        path when running elsewhere
    :param start_url: page loaded right after the browser starts
    :return: the configured driver, with an implicit wait of 20 seconds and
        a 1280x800 window
    """
    driver = webdriver.Chrome(driver_path)
    # Pause after the browser process starts, before configuring it.
    time.sleep(3)
    driver.implicitly_wait(20)
    driver.get(start_url)
    driver.set_window_size(1280, 800)

    logging.info('Driver succesfully setup')
    return driver

In [7]:
def set_site_language(driver):
    """
    Switch the site to its 'United States' version through the footer menu.

    Scrolls to the bottom of the page, clicks the language selector in the
    footer, parses the page source to find the dropdown entry whose label is
    'United States', then clicks that entry. Random pauses are inserted
    between the steps.

    :param driver: Selenium driver currently showing a page with the footer
    :return: the same ``driver``
    :raises NoSuchElementException: if the expanded dropdown or its
        'United States' entry cannot be found in the page source
    """
    time.sleep(random.uniform(2, 4))
    # Scroll to the bottom of the page, where the footer is
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(random.uniform(2, 4))

    # Find the box for language choice and click on it
    language_choice = driver.find_element_by_xpath('//*[@id="Footer"]/nav/ul[2]/li[3]/div/div/div[1]')
    language_choice.click()

    # Parse the html of the page and look for the expanded list of choices
    soup = BeautifulSoup(driver.page_source, 'lxml')
    language_click_box = soup.find('div', class_='dropdownOptions dropdownExpanded animated above')
    if language_click_box is None:
        raise NoSuchElementException('Expanded language dropdown not found in page source')

    # The label is read from the second <span> of each entry; entries with
    # fewer spans are skipped instead of raising IndexError.
    us_id = None
    for li in language_click_box.find_all('li'):
        spans = li.find_all('span')
        if len(spans) > 1 and spans[1].string == 'United States':
            us_id = li.get('id')
            break
    if us_id is None:
        raise NoSuchElementException("No 'United States' entry in the language dropdown")

    us = driver.find_element_by_id(us_id)
    time.sleep(random.uniform(2, 4))
    us.click()
    time.sleep(random.uniform(2, 4))

    logging.info('Website succesfully put in english')
    return driver

## Looking for the companies

In [8]:
def _reviews_url_from_page(driver):
    """
    Build the absolute reviews URL from the 'reviewSeeAllLink' anchor of the
    page currently loaded in ``driver``.

    Raises AttributeError when the anchor is not present in the page source.
    """
    soup = BeautifulSoup(driver.page_source, 'lxml')
    reviews_tag = soup.find('a', {'data-test': 'reviewSeeAllLink'})
    return 'https://www.glassdoor.com' + reviews_tag.get('href')


def search_company(company, driver):
    """
    Search ``company`` in the search bar and return the link to its reviews.

    Three outcomes are handled after submitting the search:

    * a results list is shown: the count is read from the page, the first
      result is clicked and the reviews link is read from the company page;
    * the search lands directly on a company page: the reviews link is read
      from it and the count is reported as ``'1'``;
    * neither works: both the count and the link are reported as ``'/'``.

    :param company: text typed in the keyword search bar
    :param driver: Selenium driver showing a page with the search bars
    :return: tuple ``(driver, number, reviews_url)`` where ``number`` and
        ``reviews_url`` are strings
    """
    # Type the company name, empty the location field and submit
    company_search_bar = driver.find_element_by_xpath('//*[@id="KeywordSearch"]')
    company_search_bar.send_keys(company)
    time.sleep(random.uniform(2, 4))
    location_search_bar = driver.find_element_by_xpath('//*[@id="LocationSearch"]')
    location_search_bar.clear()
    time.sleep(random.uniform(2, 4))
    company_search_bar.send_keys(Keys.ENTER)
    time.sleep(random.uniform(2, 4))

    try:  # Case 1: page listing several results
        soup = BeautifulSoup(driver.page_source, 'lxml')
        div_number = soup.find('div', class_='pb-lg-xxl pb-std')
        number = div_number.find_all('strong')[-1].text
        first_result = driver.find_element_by_xpath('//*[@id="MainCol"]/div/div[1]/div/div[1]/div/div[2]/h2/a')
        first_result.click()
        time.sleep(random.uniform(2, 4))
        reviews_url = _reviews_url_from_page(driver)
    except (AttributeError, IndexError):
        # The results container (or its count) is missing: the search may
        # have landed straight on a company page.
        try:  # Case 2: company page reached directly
            time.sleep(random.uniform(2, 4))
            reviews_url = _reviews_url_from_page(driver)
            number = '1'
        except Exception:  # Case 3: no usable result
            # Best effort: record '/' rather than aborting the whole run.
            # Exception (not a bare except) lets Ctrl-C still stop the loop.
            logging.info('No reviews link found for %s', company)
            reviews_url = '/'
            number = '/'

    time.sleep(random.uniform(2, 4))

    return driver, number, reviews_url

## Flow
1) Read the Excel file and load the data into lists
2) Set up the driver and switch the site to English
3) Loop over the companies: if the scraper has already tried to get a link for a company, skip it; otherwise, try. Update the Excel file after each try.

In [9]:
# Read the five status columns from the workbook (still as worksheet cells)
(
    companies_cells,
    number_options_cells,
    links_cells,
    links_filtered_cells,
    pages_cells,
) = get_status()

In [10]:
# Turn each column of cells into a plain list of cell values
companies, number_options, links, links_filtered, pages = (
    cells_to_list(column)
    for column in (
        companies_cells,
        number_options_cells,
        links_cells,
        links_filtered_cells,
        pages_cells,
    )
)

In [11]:
# Start the browser on the reviews search page; the driver is reused by the cells below
driver = setup_driver_to_reviews_search()

Driver succesfully setup


In [12]:
# Pick 'United States' in the footer language menu; the function hands back the driver
driver = set_site_language(driver)

Website succesfully put in english


In [None]:
# Re-open the workbook so that results can be written back as they come in
wb = load_workbook('status.xlsx')
sheet = wb.worksheets[0]
for i, company in enumerate(companies):
    # '_' marks a company whose link has not been looked up yet
    if links[i] != '_':
        continue
    driver.get('https://www.glassdoor.com/Reviews/index.htm')
    driver, number_options[i], links[i] = search_company(company, driver)
    # Worksheet rows are 1-based while the lists are 0-based
    row = i + 1
    sheet.cell(row=row, column=2).value = number_options[i]
    sheet.cell(row=row, column=3).value = links[i]
    # Save after every company so an interrupted run can be resumed
    wb.save('status.xlsx')

In [None]:
# # If need to specify location
# def get_url_by_location(locations, driver):
#     """
#     Adjusts window size because otherwise no scrolling possible
#     Selects a location and returns url of the page
#     """
#     driver.set_window_size(1384, 789)
#     time.sleep(random.uniform(2, 4))
    
#     urls = {}
#     for location in locations:
#         time.sleep(random.uniform(2, 4))
#         driver.execute_script("window.scrollBy(0, arguments[0]);", 500)

#         filter_button = driver.find_element_by_xpath('//*[@id="MainContent"]/div/div[1]/div[1]/div[1]/div/div[1]/div[2]/div[2]/button/span')
#         filter_button.click()
#         time.sleep(random.uniform(2, 3))

#         location_box = driver.find_element_by_xpath('//*[@id="MainContent"]/div/div[1]/div[1]/div[1]/div/div[1]/div[3]/div[2]/div[1]/div/div[1]')
#         location_box.click()
#         time.sleep(random.uniform(2, 3))

#         location_input = driver.find_element_by_xpath('//*[@id="MainContent"]/div/div[1]/div[1]/div[1]/div/div[1]/div[3]/div[2]/div[1]/div/div[1]/div/div/div/input')
#         location_input.send_keys(location)
#         time.sleep(random.uniform(1, 2))
#         location_input.send_keys(Keys.ARROW_DOWN)
#         time.sleep(random.uniform(1, 2))
#         location_input.send_keys(Keys.ENTER)

#         time.sleep(random.uniform(2, 4))
#         urls[location] = driver.current_url
#         driver = get_rid_overlay(driver)
    
#     return driver, urls