## The idea: 
First, we collect all the links we need: one base link for each company and each region. Other filters can then be obtained simply by modifying these base links.


## Installing the necessary packages & driver
We need two specific Python packages: selenium, which allows us to automate a web browser, and beautifulsoup, which is a classic HTML parser. Both can easily be installed with pip or conda. We also need a driver for the browser we are going to automate. The links to install the driver can be found here: https://selenium-python.readthedocs.io/installation.html

Note: Be careful to install the driver whose version matches the browser you are using. To find out which version of Google Chrome you are using, go to Help > About Google Chrome.

## Setup

In [1]:
# Imports
import logging
import random
import re
import time
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, ElementNotInteractableException, WebDriverException
from selenium.webdriver.common.keys import Keys

In [2]:
# Logging: the root logger records everything from DEBUG upwards and appends
# it, timestamped, to links.log.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler = logging.FileHandler(filename='links.log', mode='a')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)

In [36]:
def read_status():
    """
    Load the link-status table from ``link_status.csv``.

    The file is read as semicolon-separated values and its first column is
    used as the index of the returned DataFrame.
    """
    return pd.read_csv('link_status.csv', sep=';', index_col=0)

In [15]:
def get_rid_overlay(driver):
    """
    Remove the blocking overlay and hide the login modal.

    Sometimes the site shows an overlay that cannot be removed by clicking,
    which blocks interaction with the site. This injects JavaScript that
    deletes the first ``hardsellOverlay`` element when one is present,
    re-enables scrolling on ``<body>``, hides ``#LoginModal`` through an
    injected stylesheet and stops scroll events from propagating.

    Because the overlay only appears sometimes, the script is safe to run
    when no overlay is on the page.

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver).

    Returns:
        The same ``driver`` that was passed in.
    """
    driver.execute_script("""
        javascript:(function(){
          // The overlay is not always there: only remove it when it exists,
          // otherwise `undefined.remove()` would abort the whole script.
          let overlay = document.getElementsByClassName('hardsellOverlay')[0];
          if (overlay) {
            overlay.remove();
          }
          document.getElementsByTagName("body")[0].style.overflow = "scroll";
          let style = document.createElement('style');
          style.innerHTML = `
            #LoginModal {
              display: none!important;
            }
          `;
          document.head.appendChild(style);
          window.addEventListener("scroll", function (event) {
            event.stopPropagation();
          }, true);
        })();
        """)
    return driver

In [16]:
def setup_driver_to_reviews_search(
        driver_path='/Users/corentin/OneDrive - Universite de Liege/chromedriver',
        start_url='https://www.glassdoor.com/Reviews/index.htm'):
    """
    Launch Chrome, open the reviews search page and configure the session.

    Args:
        driver_path: Path to the chromedriver executable. The default is the
            original author's local path; pass the location of your own
            chromedriver.
        start_url: Page loaded once the browser is running.

    Returns:
        The ``webdriver.Chrome`` instance, with a 20 second implicit wait
        and a 1280x800 window.

    Raises:
        Any exception raised while configuring the session; the browser is
        closed before the exception propagates.
    """
    driver = webdriver.Chrome(driver_path)
    try:
        time.sleep(3)
        driver.implicitly_wait(20)
        driver.get(start_url)
        driver.set_window_size(1280, 800)
    except Exception:
        # Do not leave an orphaned browser/chromedriver process behind.
        driver.quit()
        raise

    logging.info('Driver succesfully setup')
    return driver

In [18]:
def set_site_language(driver):
    """
    Set the site language to US English through the footer language picker.

    Scrolls to the bottom of the page, opens the language dropdown, looks up
    the "United States" entry in the parsed page source and clicks it. Random
    pauses of 2-4 seconds are inserted between the steps.

    Args:
        driver: Selenium WebDriver showing a page that has the footer
            language picker.

    Returns:
        The same ``driver`` that was passed in.

    Raises:
        NoSuchElementException: If the expanded dropdown, or a
            "United States" entry with an id, is not found in the page
            source.
    """
    time.sleep(random.uniform(2, 4))
    # Scroll to the bottom of the page
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(random.uniform(2, 4))

    # Find the box for language choice and click on it
    language_choice = driver.find_element_by_xpath('//*[@id="Footer"]/nav/ul[2]/li[3]/div/div/div[1]')
    language_choice.click()

    # Parse the html of the page and find the choices of language
    soup = BeautifulSoup(driver.page_source, 'lxml')
    language_click_box = soup.find('div', class_='dropdownOptions dropdownExpanded animated above')
    if language_click_box is None:
        raise NoSuchElementException('Expanded language dropdown not found in the page source')

    us_id = None
    for li in language_click_box.find_all('li'):
        spans = li.find_all('span')
        # The country name is read from the second <span>; entries with
        # fewer spans cannot be a country choice and are skipped.
        if len(spans) > 1 and spans[1].string == 'United States':
            us_id = li.get('id')
            break
    if us_id is None:
        raise NoSuchElementException('No "United States" entry found in the language dropdown')

    # Click on united states
    us = driver.find_element_by_id(us_id)
    time.sleep(random.uniform(2, 4))
    us.click()
    time.sleep(random.uniform(2, 4))

    logging.info('Website succesfully put in english')
    return driver

## Looking for the companies

In [56]:
def search_company(company, driver):
    """
    Search a company by name and open the first search result.

    Types ``company`` into the keyword search bar, empties the location
    search bar, submits the search with ENTER and clicks the first result.
    Random pauses of 2-4 seconds are inserted between the steps.

    Args:
        company: Name of the company to type into the search bar.
        driver: Selenium WebDriver showing a page that contains the
            ``KeywordSearch`` and ``LocationSearch`` inputs.

    Returns:
        The same ``driver``, after the first result has been clicked.
    """
    company_search_bar = driver.find_element_by_xpath('//*[@id="KeywordSearch"]')
    company_search_bar.send_keys(company)
    time.sleep(random.uniform(2, 4))

    # Empty the location field before submitting the search.
    location_search_bar = driver.find_element_by_xpath('//*[@id="LocationSearch"]')
    location_search_bar.clear()
    time.sleep(random.uniform(2, 4))

    company_search_bar.send_keys(Keys.ENTER)
    time.sleep(random.uniform(2, 4))

    first_result = driver.find_element_by_xpath('//*[@id="MainCol"]/div/div[1]/div/div[1]/div/div[2]/h2/a')
    first_result.click()

    time.sleep(random.uniform(2, 4))
    return driver

In [21]:
def get_reviews_page(driver):
    """
    Extract the URL of the "see all reviews" page from the current page.

    Parses the driver's page source and reads the ``href`` of the anchor
    marked ``data-test="reviewSeeAllLink"``. The browser itself is not
    navigated to that URL.

    Args:
        driver: Selenium WebDriver showing a company page.

    Returns:
        tuple: ``(driver, reviews_url)`` where ``reviews_url`` is the href
        resolved against ``https://www.glassdoor.com``.

    Raises:
        NoSuchElementException: If the anchor, or its ``href``, is missing.
    """
    soup = BeautifulSoup(driver.page_source, 'lxml')
    reviews_tag = soup.find('a', {'data-test': 'reviewSeeAllLink'})
    url_path = reviews_tag.get('href') if reviews_tag is not None else None
    if not url_path:
        raise NoSuchElementException('No "reviewSeeAllLink" anchor with an href on the current page')

    # urljoin copes with site-relative hrefs (with or without leading slash)
    # as well as hrefs that are already absolute.
    reviews_url = urljoin('https://www.glassdoor.com', url_path)

    return driver, reviews_url

In [73]:
def get_links(status, driver):
    """
    Fill in the missing review links of ``status``, one company at a time.

    For every row whose ``link`` is null, the company is searched on the
    reviews index page and the URL of its reviews page is stored in
    ``link``. When the lookup fails, ``'-'`` is stored instead; such rows
    are no longer null, so later calls skip them. The table is written to
    ``link_status.csv`` after every attempt, so an interrupted run can be
    resumed.

    Args:
        status: DataFrame with ``companies`` and ``link`` columns. It is
            modified in place.
        driver: Selenium WebDriver used for the lookups.

    Returns:
        tuple: ``(driver, status)``.
    """
    link_col = status.columns.get_loc('link')
    pending = [pos for pos, missing in enumerate(status['link'].isna()) if missing]

    # A link column holding only missing values has a numeric dtype; make it
    # an object column so that URL strings can be stored in it.
    if pending and pd.api.types.is_numeric_dtype(status['link']):
        status['link'] = status['link'].astype(object)

    for pos in pending:
        company = status['companies'].iloc[pos]
        try:
            driver.get('https://www.glassdoor.com/Reviews/index.htm')
            driver = search_company(company, driver)
            driver, url = get_reviews_page(driver)
        except Exception:
            # `except Exception` lets KeyboardInterrupt through, so an
            # interrupted company stays null and is retried next time.
            # Write through the frame itself (not `status['link'].iloc[...]`),
            # which would only modify a temporary under Copy-on-Write.
            status.iloc[pos, link_col] = '-'
            logging.exception(f'Unsuccesful attempt for {company}')
        else:
            status.iloc[pos, link_col] = url
            logging.info(f'Link {company}: {url}')
        status.to_csv('link_status.csv', sep=';')

    return driver, status

In [71]:
# Load the table of companies and already-collected links from link_status.csv.
status = read_status()

In [63]:
# Start the browser on the reviews search page.
driver = setup_driver_to_reviews_search()

In [64]:
# Pick "United States" in the footer language picker.
driver = set_site_language(driver)

In [74]:
# Look up the reviews link of every company whose link is still missing;
# the table is saved to link_status.csv as links are found.
driver, status = get_links(status, driver)

https://www.glassdoor.com/Reviews/Google-Reviews-E9079.htm
https://www.glassdoor.com/Reviews/Deloitte-Reviews-E2763.htm
https://www.glassdoor.com/Reviews/PwC-Reviews-E8450.htm
https://www.glassdoor.com/Reviews/Meta-Reviews-E40772.htm
https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036.htm
https://www.glassdoor.com/Reviews/Intel-Corporation-Reviews-E1519.htm
https://www.glassdoor.com/Reviews/AMD-Reviews-E15.htm
https://www.glassdoor.com/Reviews/HP-Inc-Reviews-E1093161.htm
https://www.glassdoor.com/Reviews/Roku-Reviews-E26760.htm


## Specifying regions

In [12]:
# # If need to specify location
# def get_url_by_location(locations, driver):
#     """
#     Adjusts window size because otherwise no scrolling possible
#     Selects a location and returns url of the page
#     """
#     driver.set_window_size(1384, 789)
#     time.sleep(random.uniform(2, 4))
    
#     urls = {}
#     for location in locations:
#         time.sleep(random.uniform(2, 4))
#         driver.execute_script("window.scrollBy(0, arguments[0]);", 500)

#         filter_button = driver.find_element_by_xpath('//*[@id="MainContent"]/div/div[1]/div[1]/div[1]/div/div[1]/div[2]/div[2]/button/span')
#         filter_button.click()
#         time.sleep(random.uniform(2, 3))

#         location_box = driver.find_element_by_xpath('//*[@id="MainContent"]/div/div[1]/div[1]/div[1]/div/div[1]/div[3]/div[2]/div[1]/div/div[1]')
#         location_box.click()
#         time.sleep(random.uniform(2, 3))

#         location_input = driver.find_element_by_xpath('//*[@id="MainContent"]/div/div[1]/div[1]/div[1]/div/div[1]/div[3]/div[2]/div[1]/div/div[1]/div/div/div/input')
#         location_input.send_keys(location)
#         time.sleep(random.uniform(1, 2))
#         location_input.send_keys(Keys.ARROW_DOWN)
#         time.sleep(random.uniform(1, 2))
#         location_input.send_keys(Keys.ENTER)

#         time.sleep(random.uniform(2, 4))
#         urls[location] = driver.current_url
#         driver = get_rid_overlay(driver)
    
#     return driver, urls

## Add filters

In [13]:
def add_filter(base_url, filter_query='filter.iso3Language=eng'):
    """
    Append a filter to a reviews link.

    You can use this function to alter the link if you want to change the
    language or the job types (intern, full time, etc).

    Args:
        base_url: Reviews URL, the ``'-'`` failure marker, or a missing
            value (NaN/None).
        filter_query: ``key=value`` query parameter to append. Modify it
            according to needs.

    Returns:
        ``base_url`` with ``filter_query`` appended, using ``?`` when the
        URL has no query string yet and ``&`` otherwise. ``'-'`` and
        non-string values are returned unchanged, and so is a URL that
        already carries ``filter_query``, which makes repeated application
        (e.g. re-running a notebook cell) harmless.
    """
    # Missing links and the '-' failure marker carry no URL to alter.
    if not isinstance(base_url, str) or base_url == '-':
        return base_url

    _, has_query, query = base_url.partition('?')
    if filter_query in query.split('&'):
        return base_url

    separator = '&' if has_query else '?'
    return base_url + separator + filter_query

In [77]:
# Apply add_filter to every collected link, then save the table back to
# link_status.csv.
status['link'] = status['link'].map(add_filter)
status.to_csv('link_status.csv', sep=';')