# Scraper (master)
This notebook scrapes company data from urls found on glassdoor.com's [Browse Companies](https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=0&page=1&isHiringSurge=0) page in order to create a data base for our NLP_Diversity_and_Inclusion project.

Data includes:
- Company Name
- Size
- Headquarters location
- Industry
- Ratings given by employees *(Overall, Diversity & Inclusion, Culture & Values, Work/Life Balance, Senior Management, Compensation and Benefits, and Career Opportunities)*
- Number of Reviews
- Company Description
- Mission Statement

In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import pandas as pd
import numpy as np
import time

**Setting up User-Agent to prevent ip block:**

In [5]:
#Setting driver and User-Agent to prevent ip block
    
opts = Options()
opts.add_argument("Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36")
#opts.add_argument('headless')

driver = webdriver.Chrome(options=opts)
time.sleep(5)

**DRIVER HERE:**

In [6]:
#start driver on main url, 
url_main = 'https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=0&page=1&isHiringSurge=0' #main url

driver.get(url_main)
#time.sleep(10)


## Scrapes
Beginning from most recent scrape

### Attept 8:

In [130]:
##8th try

unsuccessful_links8 = [] ##UPDATE## this line to create unique list for this scrape attempt
companies8 = [] ##UPDATE## this line to create unique list for this scrape attempt

def scraping_pages(num_pages):
    #Creating 'n' urls with url_roots to scrape
    url_root = 'https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=0&page=' #root url
    nums = [x+290 for x in range(num_pages)] ##UPDATE## x + __ according to where last scrape attempt left off
    url_mains = list(map(lambda n: url_root + str(n), nums)) #adding 'n' number to call url_root 
    time.sleep(10) #give page plenty of time to load (page 1 loads first, then specified 'n' page)
    
    for u in url_mains:
        driver.get(u)
        time.sleep(10)
        
    #looking for 'Overview' links from each main search page
        elems = driver.find_elements_by_tag_name('a') #find links on an individual search page tagged with the 'a' tag
        company_links = []
        for elem in elems:
            company_link = elem.get_attribute('href') #returns every item with 'href' attribute (these are the links for each company)
            if 'Overview' in company_link:
                company_links.append(company_link) #each company's 'Overview' link added to company_link list  
    
    #iterating through each company's "Overview" url
        for url in company_links:
            try: #fail safe for inevitable errors
                driver.get(url)
                time.sleep(5)

##---------------------------------------- Handling login ------------------------------------------##
                name = 'n' # <---- ENTER GLASSDOOR CREDENTIALS HERE
                pw = 'pw'
                
                try: #login                    
                    username = driver.find_element_by_id("userEmail")
                    password = driver.find_element_by_id("userPassword")
                    submit = driver.find_element_by_xpath('//*[@id="InlineLoginModule"]/div/div[2]/div/div[1]/div[3]/form/div[3]/div[1]/button')
                    username.send_keys(name)
                    password.send_keys(pw)
                    submit.click()
                    time.sleep(4) #let page load
                except: #no login required
                    time.sleep(2)
                    pass

##---------------------------------- Gathering Variables - Main Page ---------------------------------##                                
                name = driver.find_element_by_xpath('//*[@id="EmpHeroAndEmpInfo"]/div[3]/div[2]').text
                size = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[3]/div').text
                headquarters = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[2]/div').text
                industry = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[6]/div').text
                try:
                    num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[3]/a').text
                except: 
                    num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[3]/a').text        

            #Gather Description - handling "Read More" button
                try:
                    read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class 
                    read_more.click()
                    time.sleep(2)
                    description = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[1]/span').text
                except:
                    description = "N/A"

            #Gather Mission - handling "Read More" button    
                try:
                    read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class
                    read_more.click()
                    time.sleep(2)
                    mission = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[2]').text
                except:
                    mission = "N/A"

##-------------------------------- Gathering Variables - Ratings Pop-up --------------------------------##    
            #Webpage layout 1
                try: 
                    driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[1]/div[2]').click()
                    time.sleep(5) #let page load

                    rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
                    rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
                    rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
                    rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
                    rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
                    rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
                    rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text

                    time.sleep(np.random.choice([x/10 for x in range(7,22)])) #some time to rest 
            #Webpage layout 2
                except: 
                    driver.get(url) #recalling url
                    driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[1]/div[2]').click()
                    time.sleep(5) #let page load
                    
                    rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
                    rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
                    rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
                    rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
                    rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
                    rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
                    rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text

                    time.sleep(np.random.choice([x/10 for x in range(7,22)])) #some time to rest 
                                        
##---------------------------------------- Creating a Dictionary ----------------------------------------##
                companies8.append({ ##UPDATE## this line to create unique dictionary for this scrape attempt
                    "NAME" : name,
                    "SIZE" : size,
                    "LOCATION_HQ" : headquarters,
                    "INDUSTRY" : industry,
                    "RATING_OVERALL" : rating_overall,
                    "RATING_DI" : rating_DI,
                    "RATING_CV" : rating_CV,
                    "RATING_WL" : rating_WL,
                    "RATING_SM" : rating_SM,
                    "RATING_CB" : rating_CB,
                    "RATING_CO" : rating_CO,
                    "NUM_REVIEWS" : num_reviews,
                    "DESCRIPTION" : description,
                    "MISSION" : mission
                                 })

            except: #fail safe for inevitable errors
                unsuccessful_links8.append(url) #adding unsuccessful urls to a list ##UPDATE## unsuccessful_links list
                print('ERROR: ', url)
                time.sleep(10)
                
        print(f'Finished scraping {len(companies8)} companies') ##UPDATE## companies list name
        df8 = pd.DataFrame(companies8)                          ##UPDATE## df number and companies list name
    return df8                                                  ##UPDATE## df number


In [131]:
scraping_pages(100)

ERROR:  https://www.glassdoor.com/Overview/Working-at-Achievement-First-EI_IE297029.11,28.htm
Finished scraping 9 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-American-Campus-Communities-EI_IE35177.11,38.htm
Finished scraping 16 companies
Finished scraping 25 companies
Finished scraping 32 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-American-Campus-Communities-EI_IE35177.11,38.htm
ERROR:  https://www.glassdoor.com/Overview/Working-at-Altran-EI_IE28187.11,17.htm
Finished scraping 40 companies
Finished scraping 47 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-Beam-Suntory-EI_IE817030.11,23.htm
Finished scraping 56 companies
Finished scraping 64 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-Natixis-EI_IE10682.11,18.htm
Finished scraping 73 companies
Finished scraping 83 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-GlobalLogic-EI_IE23009.11,22.htm
Finished scraping 92 companies
Finished scraping 1

Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
0,"Travis County, Texas",1001 to 5000 Employees,"Austin, TX",Municipal Governments,3.8,3.3,3.4,3.7,3.2,3.7,3.4,See All 119 Reviews,,
1,bareMinerals\nPart of Shiseido,1001 to 5000 Employees,"San Francisco, CA",Beauty & Personal Accessories Stores,3.9,4.3,4.3,3.7,3.6,3.5,3.1,See All 743 Reviews,The foundation that launched the mineral makeu...,
2,Valley Medical Center\nHiring Surge,1001 to 5000 Employees,"Renton, WA",Health Care Services & Hospitals,3.8,3.6,3.6,3.6,3.1,4.5,3.6,See All 125 Reviews,Valley Medical Center is the largest nonprofit...,"Mission: Valley Medical Center, the District’s..."
3,2U,1001 to 5000 Employees,"Lanham, MD",Enterprise Software & Network Solutions,3.9,3.9,3.9,3.9,3.4,3.6,3.5,See All 790 Reviews,"2U, Inc. is a diverse collection of more than ...",
4,St. Joseph Health System,10000+ Employees,"Irvine, CA",Health Care Services & Hospitals,3.7,3.4,3.6,3.5,3.2,3.4,3.5,See All 566 Reviews,"Based in Irvine, Calif., St. Joseph Health ser...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868,Valeo,10000+ Employees,"Paris, France",Transportation Equipment Manufacturing,3.6,3.9,3.4,3.0,2.9,3.5,3.4,"See All 2,492 Reviews","Valeo is an automotive supplier, partner to al...",
869,TVA,10000+ Employees,"Knoxville, TN",Energy,3.6,3.6,3.3,3.8,3.0,4.1,3.2,See All 378 Reviews,"For more than eight decades, the Tennessee Val...",Mission: Mission:\nTVA was built for the peopl...
870,Dierbergs Markets,1001 to 5000 Employees,"Chesterfield, MO",Grocery Stores & Supermarkets,3.6,3.8,3.3,3.5,3.6,2.5,3.4,See All 209 Reviews,Discover your hometown career at the Hometown ...,
871,Deckers Brands,1001 to 5000 Employees,"Goleta, CA","Department, Clothing, & Shoe Stores",3.6,4.0,3.9,3.7,3.2,3.5,3.0,See All 293 Reviews,Deckers Brands is a global leader in designing...,


In [132]:
print(len(companies8))
print('unsuccessful: ', len(unsuccessful_links8))
unsuccessful_links8

873
unsuccessful:  50


['https://www.glassdoor.com/Overview/Working-at-Achievement-First-EI_IE297029.11,28.htm',
 'https://www.glassdoor.com/Overview/Working-at-American-Campus-Communities-EI_IE35177.11,38.htm',
 'https://www.glassdoor.com/Overview/Working-at-American-Campus-Communities-EI_IE35177.11,38.htm',
 'https://www.glassdoor.com/Overview/Working-at-Altran-EI_IE28187.11,17.htm',
 'https://www.glassdoor.com/Overview/Working-at-Beam-Suntory-EI_IE817030.11,23.htm',
 'https://www.glassdoor.com/Overview/Working-at-Natixis-EI_IE10682.11,18.htm',
 'https://www.glassdoor.com/Overview/Working-at-GlobalLogic-EI_IE23009.11,22.htm',
 'https://www.glassdoor.com/Overview/Working-at-Versiti-EI_IE1788941.11,18.htm',
 'https://www.glassdoor.com/Overview/Working-at-Booking-com-EI_IE256653.11,22.htm',
 'https://www.glassdoor.com/Overview/Working-at-CHOC-Children-s-EI_IE18961.11,26.htm',
 'https://www.glassdoor.com/Overview/Working-at-Campbell-Soup-Company-EI_IE129.11,32.htm',
 'https://www.glassdoor.com/Overview/Working

In [136]:
##renaming df with len(companies)
df8_873 = pd.DataFrame(companies8) ##############################################################
df8_873

Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
0,"Travis County, Texas",1001 to 5000 Employees,"Austin, TX",Municipal Governments,3.8,3.3,3.4,3.7,3.2,3.7,3.4,See All 119 Reviews,,
1,bareMinerals\nPart of Shiseido,1001 to 5000 Employees,"San Francisco, CA",Beauty & Personal Accessories Stores,3.9,4.3,4.3,3.7,3.6,3.5,3.1,See All 743 Reviews,The foundation that launched the mineral makeu...,
2,Valley Medical Center\nHiring Surge,1001 to 5000 Employees,"Renton, WA",Health Care Services & Hospitals,3.8,3.6,3.6,3.6,3.1,4.5,3.6,See All 125 Reviews,Valley Medical Center is the largest nonprofit...,"Mission: Valley Medical Center, the District’s..."
3,2U,1001 to 5000 Employees,"Lanham, MD",Enterprise Software & Network Solutions,3.9,3.9,3.9,3.9,3.4,3.6,3.5,See All 790 Reviews,"2U, Inc. is a diverse collection of more than ...",
4,St. Joseph Health System,10000+ Employees,"Irvine, CA",Health Care Services & Hospitals,3.7,3.4,3.6,3.5,3.2,3.4,3.5,See All 566 Reviews,"Based in Irvine, Calif., St. Joseph Health ser...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868,Valeo,10000+ Employees,"Paris, France",Transportation Equipment Manufacturing,3.6,3.9,3.4,3.0,2.9,3.5,3.4,"See All 2,492 Reviews","Valeo is an automotive supplier, partner to al...",
869,TVA,10000+ Employees,"Knoxville, TN",Energy,3.6,3.6,3.3,3.8,3.0,4.1,3.2,See All 378 Reviews,"For more than eight decades, the Tennessee Val...",Mission: Mission:\nTVA was built for the peopl...
870,Dierbergs Markets,1001 to 5000 Employees,"Chesterfield, MO",Grocery Stores & Supermarkets,3.6,3.8,3.3,3.5,3.6,2.5,3.4,See All 209 Reviews,Discover your hometown career at the Hometown ...,
871,Deckers Brands,1001 to 5000 Employees,"Goleta, CA","Department, Clothing, & Shoe Stores",3.6,4.0,3.9,3.7,3.2,3.5,3.0,See All 293 Reviews,Deckers Brands is a global leader in designing...,


In [137]:
df_csv8 = df8_873.to_csv('glassdoor_ratings8_873.csv', index=False) #######################

In [138]:
driver.close()
driver.quit()
#scraper stopped at 390

#### Attept 7:

In [117]:
##7th try

unsuccessful_links7 = [] ###############################################
companies7 = [] ########################################################

def scraping_pages(num_pages):
    #Creating 'n' urls with url_roots to scrape
    url_root = 'https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=0&page=' #root url
    nums = [x+234 for x in range(num_pages)] ####################################################################
    url_mains = list(map(lambda n: url_root + str(n), nums)) #adding 'n' number to call url_root 
    time.sleep(10) #give page plenty of time to load (page 1 loads first, then specified 'n' page)
    
    for u in url_mains:
        driver.get(u)
        time.sleep(10)
        
    #looking for 'Overview' links from each main search page
        elems = driver.find_elements_by_tag_name('a') #find links on an individual search page tagged with the 'a' tag
        company_links = []
        for elem in elems:
            company_link = elem.get_attribute('href') #returns every item with 'href' attribute (these are the links for each company)
            if 'Overview' in company_link:
                company_links.append(company_link) #each company's 'Overview' link added to company_link list  
    
    #iterating through each company's "Overview" url
        for url in company_links:
            try: #fail safe for inevitable errors
                driver.get(url)
                time.sleep(5)

##---------------------------------------- Handling login ------------------------------------------##
                name = 'n' # <---- ENTER GLASSDOOR CREDENTIALS HERE
                pw = 'pw'
                
                try: #login                    
                    username = driver.find_element_by_id("userEmail")
                    password = driver.find_element_by_id("userPassword")
                    submit = driver.find_element_by_xpath('//*[@id="InlineLoginModule"]/div/div[2]/div/div[1]/div[3]/form/div[3]/div[1]/button')
                    username.send_keys(name)
                    password.send_keys(pw)
                    submit.click()
                    time.sleep(4) #let page load
                except: #no login required
                    time.sleep(2) #########do we need this much time?
                    pass

##---------------------------------- Gathering Variables - Main Page ---------------------------------##                                
                name = driver.find_element_by_xpath('//*[@id="EmpHeroAndEmpInfo"]/div[3]/div[2]').text
                size = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[3]/div').text
                headquarters = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[2]/div').text
                industry = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[6]/div').text
                try:
                    num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[3]/a').text
                except: 
                    num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[3]/a').text        

            #Gather Description - handling "Read More" button
                try:
                    read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class 
                    read_more.click()
                    time.sleep(2)
                    description = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[1]/span').text
                except:
                    description = "N/A"

            #Gather Mission - handling "Read More" button    
                try:
                    read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class
                    read_more.click()
                    time.sleep(2)
                    mission = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[2]').text
                except:
                    mission = "N/A"

##-------------------------------- Gathering Variables - Ratings Pop-up --------------------------------##    
            #Webpage layout 1
                try: 
                    driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[1]/div[2]').click()
                    time.sleep(5) #let page load

                    rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
                    rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
                    rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
                    rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
                    rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
                    rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
                    rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text

                    time.sleep(np.random.choice([x/10 for x in range(7,22)])) #some time to rest 
            #Webpage layout 2
                except: 
                    driver.get(url) #recalling url
                    driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[1]/div[2]').click()
                    time.sleep(5) #let page load
                    
                    rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
                    rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
                    rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
                    rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
                    rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
                    rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
                    rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text

                    time.sleep(np.random.choice([x/10 for x in range(7,22)])) #some time to rest 
                                        
##---------------------------------------- Creating a Dictionary ----------------------------------------##
                companies7.append({ ###############################################
                    "NAME" : name,
                    "SIZE" : size,
                    "LOCATION_HQ" : headquarters,
                    "INDUSTRY" : industry,
                    "RATING_OVERALL" : rating_overall,
                    "RATING_DI" : rating_DI,
                    "RATING_CV" : rating_CV,
                    "RATING_WL" : rating_WL,
                    "RATING_SM" : rating_SM,
                    "RATING_CB" : rating_CB,
                    "RATING_CO" : rating_CO,
                    "NUM_REVIEWS" : num_reviews,
                    "DESCRIPTION" : description,
                    "MISSION" : mission
                                 })

            except: #fail safe for inevitable errors
                unsuccessful_links7.append(url) #adding unsuccessful urls to a list ###############
                print('ERROR: ', url)
                time.sleep(10)
                
        print(f'Finished scraping {len(companies7)} companies') ###################
        df7 = pd.DataFrame(companies7) ############################################
    return df7 ####################################################################

In [118]:
scraping_pages(100)

ERROR:  https://www.glassdoor.com/Overview/Working-at-Progrexion-EI_IE396304.11,21.htm
Finished scraping 8 companies
Finished scraping 18 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-Mutual-of-Omaha-EI_IE3678.11,26.htm
Finished scraping 26 companies
Finished scraping 36 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-GEP-EI_IE238741.11,14.htm
ERROR:  https://www.glassdoor.com/Overview/Working-at-Progrexion-EI_IE396304.11,21.htm
Finished scraping 42 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-Progrexion-EI_IE396304.11,21.htm
Finished scraping 51 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-NXP-Semiconductors-EI_IE13528.11,29.htm
Finished scraping 60 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-Otter-Products-EI_IE372107.11,25.htm
Finished scraping 68 companies
Finished scraping 77 companies
Finished scraping 87 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-NXP-Semiconductors-EI

WebDriverException: Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=88.0.4324.150)


In [120]:
print(len(companies7))
print('unsuccessful: ', len(unsuccessful_links7))
unsuccessful_links7

454
unsuccessful:  48


['https://www.glassdoor.com/Overview/Working-at-Progrexion-EI_IE396304.11,21.htm',
 'https://www.glassdoor.com/Overview/Working-at-Mutual-of-Omaha-EI_IE3678.11,26.htm',
 'https://www.glassdoor.com/Overview/Working-at-GEP-EI_IE238741.11,14.htm',
 'https://www.glassdoor.com/Overview/Working-at-Progrexion-EI_IE396304.11,21.htm',
 'https://www.glassdoor.com/Overview/Working-at-Progrexion-EI_IE396304.11,21.htm',
 'https://www.glassdoor.com/Overview/Working-at-NXP-Semiconductors-EI_IE13528.11,29.htm',
 'https://www.glassdoor.com/Overview/Working-at-Otter-Products-EI_IE372107.11,25.htm',
 'https://www.glassdoor.com/Overview/Working-at-NXP-Semiconductors-EI_IE13528.11,29.htm',
 'https://www.glassdoor.com/Overview/Working-at-Relativity-EI_IE1771815.11,21.htm',
 'https://www.glassdoor.com/Overview/Working-at-Sonoco-Products-EI_IE1890.11,26.htm',
 'https://www.glassdoor.com/Overview/Working-at-Republic-National-Distributing-Company-EI_IE253796.11,49.htm',
 'https://www.glassdoor.com/Overview/Work

In [123]:
df7_453 = pd.DataFrame(companies7) ##############################################################
df7_453

Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
0,Torchy's Tacos,1001 to 5000 Employees,"Austin, TX",Casual Restaurants,3.9,4.6,4.1,3.7,3.4,3.3,3.5,See All 199 Reviews,"The Torchy's legacy began in Austin, Texas in ...",
1,Loma Linda University Health,10000+ Employees,"Loma Linda, CA",Health Care Services & Hospitals,3.8,3.6,3.8,3.6,3.0,3.3,3.5,See All 440 Reviews,Loma Linda University Medical Center (LLUMC) o...,
2,F5,5001 to 10000 Employees,"Seattle, WA",Computer Hardware & Software,3.9,4.2,4.0,4.0,3.6,4.2,3.6,"See All 1,002 Reviews","Today, business is about people, applications,...",Mission: F5 (NASDAQ: FFIV) gives the world’s l...
3,Oakley\nPart of Luxottica,1001 to 5000 Employees,"Foothill Ranch, CA",Consumer Products Manufacturing,3.9,4.1,4.0,3.9,3.4,3.4,2.8,See All 713 Reviews,"Oakley sells so many sunglasses, it goggles th...",
4,St. Dominic's,1001 to 5000 Employees,"Jackson, MS",Health Care Services & Hospitals,4.0,4.5,4.1,3.5,3.3,3.5,3.4,See All 126 Reviews,"Since 1946, when the Dominican Sisters operate...",Mission: St. Dominic’s recognizes its basic pa...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449,dressbarn,10000+ Employees,"Mahwah, NJ","Department, Clothing, & Shoe Stores",3.8,2.7,3.7,3.6,3.0,3.0,2.8,See All 781 Reviews,"At dressbarn, in everything we do, we inspire ...",
450,JLG Industries\nPart of Oshkosh Corporation,1001 to 5000 Employees,"Mc Connellsburg, PA",Industrial Manufacturing,3.9,3.2,4.0,3.6,3.3,4.1,3.8,See All 211 Reviews,"JLG Industries, Inc. is the world’s leading de...",
451,CallidusCloud\nAcquired by SAP,1001 to 5000 Employees,"Dublin, CA",Enterprise Software & Network Solutions,3.8,4.7,3.7,3.5,3.4,3.7,3.6,See All 270 Reviews,"CallidusCloud, now part of SAP, is the global ...",
452,Chevron Phillips Chemical,5001 to 10000 Employees,"The Woodlands, TX",Chemical Manufacturing,3.8,3.1,3.5,3.6,2.9,3.9,3.0,See All 253 Reviews,"You may not realize it, but you’ve likely used...",


In [124]:
df_csv7_453 = df7_453.to_csv('glassdoor_ratings7_453.csv', index=False) #######################

In [125]:
driver.close()
driver.quit()
#stopped at page 290

#### Attept 6:

In [105]:
##6th try

unsuccessful_links6 = [] ###############################################
companies6 = [] ########################################################

def scraping_pages(num_pages):
    #Creating 'n' urls with url_roots to scrape
    url_root = 'https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=0&page=' #root url
    nums = [x+134 for x in range(num_pages)] ############################################################
    url_mains = list(map(lambda n: url_root + str(n), nums)) #adding 'n' number to call url_root 
    time.sleep(10) #give page plenty of time to load (page 1 loads first, then specified 'n' page)
    
    for u in url_mains:
        driver.get(u)
        time.sleep(5)
        
    #looking for 'Overview' links from each main search page
        elems = driver.find_elements_by_tag_name('a') #find links on an individual search page tagged with the 'a' tag
        company_links = []
        for elem in elems:
            company_link = elem.get_attribute('href') #returns every item with 'href' attribute (these are the links for each company)
            if 'Overview' in company_link:
                company_links.append(company_link) #each company's 'Overview' link added to company_link list  
    
    #iterating through each company's "Overview" url
        for url in company_links:
            try: #fail safe for inevitable errors
                driver.get(url)
                time.sleep(5)

##---------------------------------------- Handling login ------------------------------------------##
                name = 'n' # <---- ENTER GLASSDOOR CREDENTIALS HERE
                pw = 'pw'
                
                try: #login                    
                    username = driver.find_element_by_id("userEmail")
                    password = driver.find_element_by_id("userPassword")
                    submit = driver.find_element_by_xpath('//*[@id="InlineLoginModule"]/div/div[2]/div/div[1]/div[3]/form/div[3]/div[1]/button')
                    username.send_keys(name)
                    password.send_keys(pw)
                    submit.click()
                    time.sleep(4) #let page load
                except: #no login required
                    time.sleep(2) #########do we need this much time?
                    pass

##---------------------------------- Gathering Variables - Main Page ---------------------------------##                                
                name = driver.find_element_by_xpath('//*[@id="EmpHeroAndEmpInfo"]/div[3]/div[2]').text
                size = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[3]/div').text
                headquarters = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[2]/div').text
                industry = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[6]/div').text
                try:
                    num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[3]/a').text
                except: 
                    num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[3]/a').text        

            #Gather Description - handling "Read More" button
                try:
                    read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class 
                    read_more.click()
                    time.sleep(2)
                    description = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[1]/span').text
                except:
                    description = "N/A"

            #Gather Mission - handling "Read More" button    
                try:
                    read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class
                    read_more.click()
                    time.sleep(2)
                    mission = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[2]').text
                except:
                    mission = "N/A"

##-------------------------------- Gathering Variables - Ratings Pop-up --------------------------------##    
            #Webpage layout 1
                try: 
                    driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[1]/div[2]').click()
                    time.sleep(5) #let page load

                    rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
                    rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
                    rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
                    rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
                    rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
                    rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
                    rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text

                    time.sleep(np.random.choice([x/10 for x in range(7,22)])) #some time to rest 
            #Webpage layout 2
                except: 
                    driver.get(url) #recalling url
                    driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[1]/div[2]').click()
                    time.sleep(5) #let page load
                    
                    rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
                    rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
                    rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
                    rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
                    rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
                    rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
                    rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text

                    time.sleep(np.random.choice([x/10 for x in range(7,22)])) #some time to rest 
                                        
##---------------------------------------- Creating a Dictionary ----------------------------------------##
                companies6.append({ ###############################################
                    "NAME" : name,
                    "SIZE" : size,
                    "LOCATION_HQ" : headquarters,
                    "INDUSTRY" : industry,
                    "RATING_OVERALL" : rating_overall,
                    "RATING_DI" : rating_DI,
                    "RATING_CV" : rating_CV,
                    "RATING_WL" : rating_WL,
                    "RATING_SM" : rating_SM,
                    "RATING_CB" : rating_CB,
                    "RATING_CO" : rating_CO,
                    "NUM_REVIEWS" : num_reviews,
                    "DESCRIPTION" : description,
                    "MISSION" : mission
                                 })

            except: #fail safe for inevitable errors
                unsuccessful_links6.append(url) #adding unsuccessful urls to a list ###############
                print('ERROR: ', url)
                time.sleep(10)
                
        print(f'Finished scraping {len(companies6)} companies') ###################
        df6 = pd.DataFrame(companies6) ######################################################################
    return df6 ##############################################################################################

In [106]:
scraping_pages(100)

Finished scraping 10 companies
Finished scraping 20 companies
Finished scraping 30 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-Twilio-EI_IE410790.11,17.htm
Finished scraping 37 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-Twilio-EI_IE410790.11,17.htm
ERROR:  https://www.glassdoor.com/Overview/Working-at-Blue-Cross-Blue-Shield-of-Massachusetts-EI_IE6294.11,50.htm
Finished scraping 45 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-American-Fidelity-Assurance-Company-EI_IE17535.11,46.htm
ERROR:  https://www.glassdoor.com/Overview/Working-at-Blue-Cross-Blue-Shield-of-Massachusetts-EI_IE6294.11,50.htm
ERROR:  https://www.glassdoor.com/Overview/Working-at-Great-American-Insurance-Group-EI_IE38078.11,41.htm
Finished scraping 51 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-Colgate-Palmolive-EI_IE162.11,28.htm
Finished scraping 59 companies
Finished scraping 67 companies
Finished scraping 76 companies
Finished scraping 8

Finished scraping 640 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-UW-Health-EI_IE721652.11,20.htm
ERROR:  https://www.glassdoor.com/Overview/Working-at-The-Aerospace-Corporation-EI_IE5634.11,36.htm
ERROR:  https://www.glassdoor.com/Overview/Working-at-ASML-EI_IE5781.11,15.htm
Finished scraping 647 companies
Finished scraping 657 companies
Finished scraping 667 companies
Finished scraping 667 companies
Finished scraping 676 companies
Finished scraping 684 companies
Finished scraping 694 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-First-Republic-Bank-EI_IE859.11,30.htm
Finished scraping 703 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-Match-EI_IE15905.11,16.htm
Finished scraping 712 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-Match-EI_IE15905.11,16.htm
Finished scraping 719 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-Anaplan-EI_IE695685.11,18.htm
ERROR:  https://www.glassdoor.com/Overview/W

Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
0,Rockstar Games\nPart of Take-Two,1001 to 5000 Employees,"New York, NY",Video Games,4.3,4.7,4.4,4.0,4.0,4.3,3.9,See All 236 Reviews,Rockstar Games doesn't see why only the kids s...,
1,National Renewable Energy Lab,1001 to 5000 Employees,"Golden, CO",Energy,4.3,4.6,4.5,4.3,4.0,4.1,4.1,See All 370 Reviews,The National Renewable Energy Laboratory (NREL...,Mission: Our emphasis is on a comprehensive en...
2,FirstLight Home Care,1001 to 5000 Employees,"Cincinnati, OH",Health Care Services & Hospitals,4.3,4.0,4.3,4.1,4.2,3.7,4.0,See All 251 Reviews,FirstLight Home Care is a leading provider of ...,Mission: The FirstLight Home Care mission is r...
3,ManTech,5001 to 10000 Employees,"Herndon, VA",Research & Development,4.3,4.3,4.2,4.3,4.1,4.3,4.2,"See All 1,005 Reviews",ManTech was founded in 1968 to provide advance...,
4,MedVet,1001 to 5000 Employees,"Worthington, OH",Veterinary Services,4.3,4.2,4.3,3.7,4.1,4.2,4.2,See All 214 Reviews,"MedVet, with headquarters in Columbus, Ohio, i...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841,Protective Life,1001 to 5000 Employees,"Birmingham, AL",Insurance Carriers,3.9,3.8,3.9,4.1,3.6,4.1,3.7,See All 257 Reviews,We are a growing company with a bright future....,
842,Legg Mason,1001 to 5000 Employees,"Baltimore, MD",Investment Banking & Asset Management,3.9,4.0,3.8,4.1,3.4,4.0,3.1,See All 195 Reviews,Legg Mason has served investors for over 110 y...,
843,Johns Manville,5001 to 10000 Employees,"Denver, CO",Miscellaneous Manufacturing,3.9,3.7,3.5,3.5,3.3,4.1,3.4,See All 213 Reviews,Johns Manville is a proud member of the Berksh...,
844,PATH,1001 to 5000 Employees,"Seattle, WA",Social Assistance,3.9,4.1,4.1,4.0,3.5,4.0,3.7,See All 231 Reviews,PATH is a global organization that works to ac...,


In [108]:
print(len(companies6))
print('unsuccessful: ', len(unsuccessful_links6))
unsuccessful_links6


846
unsuccessful:  80


['https://www.glassdoor.com/Overview/Working-at-Twilio-EI_IE410790.11,17.htm',
 'https://www.glassdoor.com/Overview/Working-at-Twilio-EI_IE410790.11,17.htm',
 'https://www.glassdoor.com/Overview/Working-at-Blue-Cross-Blue-Shield-of-Massachusetts-EI_IE6294.11,50.htm',
 'https://www.glassdoor.com/Overview/Working-at-American-Fidelity-Assurance-Company-EI_IE17535.11,46.htm',
 'https://www.glassdoor.com/Overview/Working-at-Blue-Cross-Blue-Shield-of-Massachusetts-EI_IE6294.11,50.htm',
 'https://www.glassdoor.com/Overview/Working-at-Great-American-Insurance-Group-EI_IE38078.11,41.htm',
 'https://www.glassdoor.com/Overview/Working-at-Colgate-Palmolive-EI_IE162.11,28.htm',
 'https://www.glassdoor.com/Overview/Working-at-Infostretch-EI_IE268709.11,22.htm',
 'https://www.glassdoor.com/Overview/Working-at-InterSystems-EI_IE6282.11,23.htm',
 'https://www.glassdoor.com/Overview/Working-at-National-Agents-Alliance-EI_IE225028.11,35.htm',
 'https://www.glassdoor.com/Overview/Working-at-Infostretch-EI

In [109]:
df6_846 = pd.DataFrame(companies6)
df6_846

Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
0,Rockstar Games\nPart of Take-Two,1001 to 5000 Employees,"New York, NY",Video Games,4.3,4.7,4.4,4.0,4.0,4.3,3.9,See All 236 Reviews,Rockstar Games doesn't see why only the kids s...,
1,National Renewable Energy Lab,1001 to 5000 Employees,"Golden, CO",Energy,4.3,4.6,4.5,4.3,4.0,4.1,4.1,See All 370 Reviews,The National Renewable Energy Laboratory (NREL...,Mission: Our emphasis is on a comprehensive en...
2,FirstLight Home Care,1001 to 5000 Employees,"Cincinnati, OH",Health Care Services & Hospitals,4.3,4.0,4.3,4.1,4.2,3.7,4.0,See All 251 Reviews,FirstLight Home Care is a leading provider of ...,Mission: The FirstLight Home Care mission is r...
3,ManTech,5001 to 10000 Employees,"Herndon, VA",Research & Development,4.3,4.3,4.2,4.3,4.1,4.3,4.2,"See All 1,005 Reviews",ManTech was founded in 1968 to provide advance...,
4,MedVet,1001 to 5000 Employees,"Worthington, OH",Veterinary Services,4.3,4.2,4.3,3.7,4.1,4.2,4.2,See All 214 Reviews,"MedVet, with headquarters in Columbus, Ohio, i...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841,Protective Life,1001 to 5000 Employees,"Birmingham, AL",Insurance Carriers,3.9,3.8,3.9,4.1,3.6,4.1,3.7,See All 257 Reviews,We are a growing company with a bright future....,
842,Legg Mason,1001 to 5000 Employees,"Baltimore, MD",Investment Banking & Asset Management,3.9,4.0,3.8,4.1,3.4,4.0,3.1,See All 195 Reviews,Legg Mason has served investors for over 110 y...,
843,Johns Manville,5001 to 10000 Employees,"Denver, CO",Miscellaneous Manufacturing,3.9,3.7,3.5,3.5,3.3,4.1,3.4,See All 213 Reviews,Johns Manville is a proud member of the Berksh...,
844,PATH,1001 to 5000 Employees,"Seattle, WA",Social Assistance,3.9,4.1,4.1,4.0,3.5,4.0,3.7,See All 231 Reviews,PATH is a global organization that works to ac...,


In [110]:
df_csv6 = df6_846.to_csv('glassdoor_ratings6_846.csv', index=False)

**CLOSE DRIVER:**

In [111]:
driver.close()
driver.quit()
#made it to page 233

#### Attept 5:

In [85]:
##5th try

unsuccessful_links5 = []
companies5 = []

def scraping_pages(num_pages):
    #Creating 'n' urls with url_roots to scrape
    url_root = 'https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=0&page=' #root url
    nums = [x+21 for x in range(num_pages)] 
    url_mains = list(map(lambda n: url_root + str(n), nums)) #adding 'n' number to call url_root
    
    for u in url_mains:
        driver.get(u)
            
    #looking for 'Overview' links from each main search page
        elems = driver.find_elements_by_tag_name('a') #find links on an individual search page tagged with the 'a' tag
        company_links = []
        for elem in elems:
            company_link = elem.get_attribute('href') #returns every item with 'href' attribute (these are the links for each company)
            if 'Overview' in company_link:
                company_links.append(company_link) #each company's 'Overview' link added to company_link list  

        for url in company_links:
            try: #fail safe for inevitable errors
                driver.get(url)
                time.sleep(10)

##---------------------------------------- Handling login ------------------------------------------##
                name = 'n' # <---- ENTER GLASSDOOR CREDENTIALS HERE
                pw = 'pw'
                
                try: #login                    
                    username = driver.find_element_by_id("userEmail")
                    password = driver.find_element_by_id("userPassword")
                    submit = driver.find_element_by_xpath('//*[@id="InlineLoginModule"]/div/div[2]/div/div[1]/div[3]/form/div[3]/div[1]/button')
                    username.send_keys(name)
                    password.send_keys(pw)
                    submit.click()
                    time.sleep(6) #lettin page load
                except: #no login required
                    pass

##---------------------------------- Gathering Variables - Main Page ---------------------------------##                
                time.sleep(6)
                name = driver.find_element_by_xpath('//*[@id="EmpHeroAndEmpInfo"]/div[3]/div[2]').text
                size = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[3]/div').text
                headquarters = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[2]/div').text
                industry = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[6]/div').text
                try:
                    num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[3]/a').text
                except: 
                    num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[3]/a').text        

            #Gather Description - handling "Read More" button
                try:
                    read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class 
                    read_more.click()
                    time.sleep(2)
                    description = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[1]/span').text
                except:
                    description = "N/A"

            #Gather Mission - handling "Read More" button    
                try:
                    read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class
                    read_more.click()
                    time.sleep(2)
                    mission = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[2]').text
                except:
                    mission = "N/A"

##-------------------------------- Gathering Variables - Ratings Pop-up --------------------------------##    
            #Webpage layout 1
                try: 
                    driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[1]/div[2]').click()
                    time.sleep(5) #let page load

                    rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
                    rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
                    rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
                    rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
                    rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
                    rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
                    rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text

                    time.sleep(np.random.choice([x/10 for x in range(7,22)])) #some time to rest 
            #Webpage layout 2
                except: 
                    driver.get(url) #recalling url
                    driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[1]/div[2]').click()
                    time.sleep(5) #let page load
                    
                    rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
                    rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
                    rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
                    rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
                    rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
                    rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
                    rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text

                    time.sleep(np.random.choice([x/10 for x in range(7,22)])) #some time to rest 
                                        
##---------------------------------------- Creating a Dictionary ----------------------------------------##
                companies5.append({
                    "NAME" : name,
                    "SIZE" : size,
                    "LOCATION_HQ" : headquarters,
                    "INDUSTRY" : industry,
                    "RATING_OVERALL" : rating_overall,
                    "RATING_DI" : rating_DI,
                    "RATING_CV" : rating_CV,
                    "RATING_WL" : rating_WL,
                    "RATING_SM" : rating_SM,
                    "RATING_CB" : rating_CB,
                    "RATING_CO" : rating_CO,
                    "NUM_REVIEWS" : num_reviews,
                    "DESCRIPTION" : description,
                    "MISSION" : mission
                                 })

            except: #fail safe for inevitable errors
                unsuccessful_links5.append(url) #adding unsuccessful urls to a list
                print('ERROR: ', url)
                time.sleep(10)
                
        print(f'Finished scraping {len(companies5)} companies')
        dfoops = pd.DataFrame(companies5)
    return dfoops

In [86]:
scraping_pages(130)


ERROR:  https://www.glassdoor.com/Overview/Working-at-Texas-Instruments-EI_IE651.11,28.htm
ERROR:  https://www.glassdoor.com/Overview/Working-at-Nokia-EI_IE3494.11,16.htm
ERROR:  https://www.glassdoor.com/Overview/Working-at-Schneider-Electric-EI_IE3956.11,29.htm
Finished scraping 7 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-Memorial-Sloan-Kettering-EI_IE4711.11,35.htm
ERROR:  https://www.glassdoor.com/Overview/Working-at-NYU-Langone-Health-EI_IE121521.11,29.htm
Finished scraping 14 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-Airbnb-EI_IE391850.11,17.htm
ERROR:  https://www.glassdoor.com/Overview/Working-at-NYU-Langone-Health-EI_IE121521.11,29.htm
ERROR:  https://www.glassdoor.com/Overview/Working-at-United-Airlines-EI_IE683.11,26.htm
Finished scraping 21 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-LexisNexis-Legal-and-Professional-EI_IE4146.11,44.htm
Finished scraping 30 companies
ERROR:  https://www.glassdoor.com/Overview/W

Finished scraping 474 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-Coforge-EI_IE36319.11,18.htm
ERROR:  https://www.glassdoor.com/Overview/Working-at-Sogeti-EI_IE12028.11,17.htm
ERROR:  https://www.glassdoor.com/Overview/Working-at-Sunrise-Senior-Living-EI_IE6023.11,32.htm
Finished scraping 481 companies
Finished scraping 491 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-Banfield-Pet-Hospital-EI_IE137890.11,32.htm
ERROR:  https://www.glassdoor.com/Overview/Working-at-ExxonMobil-EI_IE237.11,21.htm
Finished scraping 499 companies
Finished scraping 509 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-DISH-EI_IE4501.11,15.htm
Finished scraping 518 companies
Finished scraping 518 companies
Finished scraping 528 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-Alorica-EI_IE17814.11,18.htm
Finished scraping 537 companies
Finished scraping 547 companies
Finished scraping 557 companies
Finished scraping 557 companies
ERROR:  htt

StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=88.0.4324.146)


In [88]:
print(len(companies5))
print("unsuccessful: ", len(unsuccessful_links5))
unsuccessful_links5


797
unsuccessful:  108


['https://www.glassdoor.com/Overview/Working-at-Texas-Instruments-EI_IE651.11,28.htm',
 'https://www.glassdoor.com/Overview/Working-at-Nokia-EI_IE3494.11,16.htm',
 'https://www.glassdoor.com/Overview/Working-at-Schneider-Electric-EI_IE3956.11,29.htm',
 'https://www.glassdoor.com/Overview/Working-at-Memorial-Sloan-Kettering-EI_IE4711.11,35.htm',
 'https://www.glassdoor.com/Overview/Working-at-NYU-Langone-Health-EI_IE121521.11,29.htm',
 'https://www.glassdoor.com/Overview/Working-at-Airbnb-EI_IE391850.11,17.htm',
 'https://www.glassdoor.com/Overview/Working-at-NYU-Langone-Health-EI_IE121521.11,29.htm',
 'https://www.glassdoor.com/Overview/Working-at-United-Airlines-EI_IE683.11,26.htm',
 'https://www.glassdoor.com/Overview/Working-at-LexisNexis-Legal-and-Professional-EI_IE4146.11,44.htm',
 'https://www.glassdoor.com/Overview/Working-at-IHS-Markit-EI_IE1337673.11,21.htm',
 'https://www.glassdoor.com/Overview/Working-at-Cengage-EI_IE20055.11,18.htm',
 'https://www.glassdoor.com/Overview/Wor

In [112]:
df5_797 = pd.DataFrame(companies5)
df5_797

Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
0,Huron Consulting Group,1001 to 5000 Employees,"Chicago, IL",Consulting,4.1,4.0,4.2,3.9,3.8,3.9,3.8,See All 950 Reviews,Huron is a global consultancy that helps our c...,
1,Walt Disney Company,10000+ Employees,"Burbank, CA",Motion Picture Production & Distribution,4.1,4.3,4.2,3.6,3.5,3.9,3.7,"See All 7,415 Reviews",The monarch of this magic kingdom is no man bu...,
2,ServiceNow,10000+ Employees,"Santa Clara, CA",Enterprise Software & Network Solutions,4.1,4.4,4.1,3.8,3.9,4.2,3.9,"See All 1,322 Reviews","At ServiceNow, we make the world of work, work...",
3,Universal Studios\nPart of NBCUniversal,10000+ Employees,"Universal City, CA",Motion Picture Production & Distribution,4.1,4.4,4.0,3.7,3.4,4.0,3.7,"See All 1,163 Reviews",Movies and theme parks are the center of Unive...,
4,Experian\nHiring Surge,10000+ Employees,"Dublin, Ireland",Financial Analytics & Research,4.1,4.3,4.2,3.9,3.8,4.0,3.8,"See All 3,427 Reviews",Experian is the leading global information ser...,Mission: We help to give our customers the pow...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
792,Colorado State University,5001 to 10000 Employees,"Fort Collins, CO",Colleges & Universities,4.3,3.4,4.0,4.0,3.7,3.3,3.7,"See All 1,016 Reviews",Colorado State University (CSU) got its start ...,
793,Ascent Services Group,1001 to 5000 Employees,"Concord, CA",Staffing & Outsourcing,4.3,3.9,4.4,4.4,4.2,4.1,4.2,See All 163 Reviews,The Ascent Services Group (Ascent) is an Ameri...,
794,Jackson Health System,10000+ Employees,"Miami, FL",Health Care Services & Hospitals,4.3,4.5,3.9,3.7,3.5,4.3,3.9,See All 155 Reviews,"Jackson Health System, an integrated healthcar...",
795,Blue Yonder\nHiring Surge,5001 to 10000 Employees,"Scottsdale, AZ",Enterprise Software & Network Solutions,4.3,4.2,4.4,4.1,4.0,4.0,4.1,"See All 1,855 Reviews",Blue Yonder is the world leader in digital sup...,Mission: Our mission is to empower every perso...


In [113]:
df_csv5 = df5_797.to_csv('glassdoor_ratings5_797.csv', index=False) ###########################################

#### Attept 3:

In [58]:
##3rd try

unsuccessful_links3 = []
companies3 = []

def scraping_pages(num_pages):
    #Creating 'n' urls with url_roots to scrape
    url_root = 'https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=0&page=' #root url
    nums = [x+11 for x in range(num_pages)] 
    url_mains = list(map(lambda n: url_root + str(n), nums)) #adding 'n' number to call url_root
    
    for u in url_mains:
        driver.get(u)
            
    #looking for 'Overview' links from each main search page
        elems = driver.find_elements_by_tag_name('a') #find links on an individual search page tagged with the 'a' tag
        company_links = []
        for elem in elems:
            company_link = elem.get_attribute('href') #returns every item with 'href' attribute (these are the links for each company)
            if 'Overview' in company_link:
                company_links.append(company_link) #each company's 'Overview' link added to company_link list  

        for url in company_links:
            try: #fail safe for inevitable errors
                driver.get(url)
                time.sleep(1)

##---------------------------------------- Handling login ------------------------------------------##
                name = 'n' # <---- ENTER GLASSDOOR CREDENTIALS HERE
                pw = 'pw'
                
                try: #login                    
                    username = driver.find_element_by_id("userEmail")
                    password = driver.find_element_by_id("userPassword")
                    submit = driver.find_element_by_xpath('//*[@id="InlineLoginModule"]/div/div[2]/div/div[1]/div[3]/form/div[3]/div[1]/button')
                    username.send_keys(name)
                    password.send_keys(pw)
                    submit.click()
                    time.sleep(3) #lettin page load
                except: #no login required
                    pass

##---------------------------------- Gathering Variables - Main Page ---------------------------------##                
                time.sleep(4)
                name = driver.find_element_by_xpath('//*[@id="EmpHeroAndEmpInfo"]/div[3]/div[2]').text
                size = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[3]/div').text
                headquarters = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[2]/div').text
                industry = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[6]/div').text
                try:
                    num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[3]/a').text
                except: 
                    num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[3]/a').text        

            #Gather Description - handling "Read More" button
                try:
                    read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class 
                    read_more.click()
                    time.sleep(1)
                    description = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[1]/span').text
                except:
                    description = "N/A"

            #Gather Mission - handling "Read More" button    
                try:
                    read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class
                    read_more.click()
                    time.sleep(1)
                    mission = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[2]').text
                except:
                    mission = "N/A"

##-------------------------------- Gathering Variables - Ratings Pop-up --------------------------------##    
            #Webpage layout 1
                try: 
                    driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[1]/div[2]').click()
                    time.sleep(5) #let page load

                    rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
                    rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
                    rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
                    rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
                    rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
                    rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
                    rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text

                    time.sleep(np.random.choice([x/10 for x in range(7,22)])) #some time to rest 
            #Webpage layout 2
                except: 
                    driver.get(url) #recalling url
                    driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[1]/div[2]').click()
                    time.sleep(5) #let page load
                    
                    rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
                    rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
                    rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
                    rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
                    rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
                    rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
                    rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text

                    time.sleep(np.random.choice([x/10 for x in range(7,22)])) #some time to rest 
                                        
##---------------------------------------- Creating a Dictionary ----------------------------------------##
                companies3.append({
                    "NAME" : name,
                    "SIZE" : size,
                    "LOCATION_HQ" : headquarters,
                    "INDUSTRY" : industry,
                    "RATING_OVERALL" : rating_overall,
                    "RATING_DI" : rating_DI,
                    "RATING_CV" : rating_CV,
                    "RATING_WL" : rating_WL,
                    "RATING_SM" : rating_SM,
                    "RATING_CB" : rating_CB,
                    "RATING_CO" : rating_CO,
                    "NUM_REVIEWS" : num_reviews,
                    "DESCRIPTION" : description,
                    "MISSION" : mission
                                 })

            except: #fail safe for inevitable errors
                unsuccessful_links3.append(url) #adding unsuccessful urls to a list
                print('ERROR: ', url)
                time.sleep(10)
                
        print(f'Finished scraping {len(companies3)} companies')
        df = pd.DataFrame(companies)
    return df

In [59]:
scraping_pages(140)


Finished scraping 10 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-Stryker-EI_IE1918.11,18.htm
ERROR:  https://www.glassdoor.com/Overview/Working-at-University-of-Southern-California-EI_IE3027.11,44.htm
ERROR:  https://www.glassdoor.com/Overview/Working-at-Penn-State-EI_IE2931.11,21.htm
Finished scraping 17 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-Boston-Scientific-EI_IE2187.11,28.htm
Finished scraping 25 companies
Finished scraping 35 companies
Finished scraping 45 companies
Finished scraping 45 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-AstraZeneca-EI_IE9214.11,22.htm
Finished scraping 54 companies
Finished scraping 54 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-Houston-Methodist-EI_IE4460.11,28.htm
Finished scraping 61 companies
Finished scraping 71 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-Nokia-EI_IE3494.11,16.htm
ERROR:  https://www.glassdoor.com/Overview/Working-at-Schneider-E

StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=88.0.4324.146)


In [114]:
print(len(companies3))
print('unsuccessful: ', len(unsuccessful_links3))

unsuccessful_links3


199
unsuccessful:  35


['https://www.glassdoor.com/Overview/Working-at-Stryker-EI_IE1918.11,18.htm',
 'https://www.glassdoor.com/Overview/Working-at-University-of-Southern-California-EI_IE3027.11,44.htm',
 'https://www.glassdoor.com/Overview/Working-at-Penn-State-EI_IE2931.11,21.htm',
 'https://www.glassdoor.com/Overview/Working-at-Boston-Scientific-EI_IE2187.11,28.htm',
 'https://www.glassdoor.com/Overview/Working-at-AstraZeneca-EI_IE9214.11,22.htm',
 'https://www.glassdoor.com/Overview/Working-at-Houston-Methodist-EI_IE4460.11,28.htm',
 'https://www.glassdoor.com/Overview/Working-at-Nokia-EI_IE3494.11,16.htm',
 'https://www.glassdoor.com/Overview/Working-at-Schneider-Electric-EI_IE3956.11,29.htm',
 'https://www.glassdoor.com/Overview/Working-at-Texas-Instruments-EI_IE651.11,28.htm',
 'https://www.glassdoor.com/Overview/Working-at-Memorial-Sloan-Kettering-EI_IE4711.11,35.htm',
 'https://www.glassdoor.com/Overview/Working-at-NYU-Langone-Health-EI_IE121521.11,29.htm',
 'https://www.glassdoor.com/Overview/Work

In [63]:
df3_199 = pd.DataFrame(companies)
df3_199

Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
0,Google,10000+ Employees,"Mountain View, CA",Internet,4.5,4.4,4.4,4.3,4.1,4.5,4.3,"See All 18,362 Reviews","Google is not a conventional company, and we d...",
1,Microsoft,10000+ Employees,"Redmond, WA",Computer Hardware & Software,4.4,4.4,4.3,4.1,4.0,4.1,4.1,"See All 29,390 Reviews",Our drive to change the world unites us!\n\nMi...,
2,Apple,10000+ Employees,"Cupertino, CA",Computer Hardware & Software,4.3,4.4,4.3,3.7,3.7,4.3,3.8,"See All 20,850 Reviews",We’re a diverse collective of thinkers and doe...,
3,US Air Force,10000+ Employees,"Washington, DC",Federal Agencies,4.2,4.2,4.0,3.3,3.4,4.3,4.2,"See All 16,601 Reviews",The mission of the US Department of the Air Fo...,
4,Cisco Systems,10000+ Employees,"San Jose, CA",Computer Hardware & Software,4.2,4.3,4.3,4.2,3.8,4.1,3.9,"See All 21,803 Reviews","#WeAreCisco, where each person is unique, but ...",Mission: Cisco wants you to bring your uniquen...
5,Capital One,10000+ Employees,"Mc Lean, VA",Banks & Credit Unions,4.1,4.2,4.1,4.1,3.6,4.2,3.7,"See All 9,223 Reviews",Capital One is where professional achievement ...,Mission: Capital One is an equal opportunity e...
6,Qualcomm,10000+ Employees,"San Diego, CA",Computer Hardware & Software,4.1,3.9,3.8,3.4,3.6,4.1,4.0,"See All 6,803 Reviews",We’re looking for inventive minds to help fuel...,
7,Intel Corporation,10000+ Employees,"Santa Clara, CA",Computer Hardware & Software,4.1,4.4,4.1,4.0,3.5,4.0,3.9,"See All 17,699 Reviews",Intel’s mission is to shape the future of tech...,
8,Bank of America,10000+ Employees,"Charlotte, NC",Banks & Credit Unions,4.0,4.3,4.0,3.8,3.5,4.0,3.8,"See All 26,089 Reviews",Bank of America is one of the world's largest ...,
9,Accenture,10000+ Employees,"Dublin, Ireland",Consulting,4.0,4.2,3.9,3.4,3.5,3.9,3.9,"See All 73,975 Reviews",Accenture is a global professional services co...,


In [93]:
df_csv3 = df3_199.to_csv('glassdoor_ratings3_199.csv', index=False)

#### Attempt 2:

In [44]:
##2nd try

unsuccessful_links = []
companies = []

def scraping_pages(num_pages):
    #Creating 'n' urls with url_roots to scrape
    url_root = 'https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=0&page=' #root url
    nums = [x+7 for x in range(num_pages)] 
    url_mains = list(map(lambda n: url_root + str(n), nums)) #adding 'n' number to call url_root
    
    for u in url_mains:
        driver.get(u)
            
    #looking for 'Overview' links from each main search page
        elems = driver.find_elements_by_tag_name('a') #find links on an individual search page tagged with the 'a' tag
        company_links = []
        for elem in elems:
            company_link = elem.get_attribute('href') #returns every item with 'href' attribute (these are the links for each company)
            if 'Overview' in company_link:
                company_links.append(company_link) #each company's 'Overview' link added to company_link list  

        for url in company_links:
            try: #fail safe for inevitable errors
                driver.get(url)
                time.sleep(1)

##---------------------------------------- Handling login ------------------------------------------##
                name = 'n' # <---- ENTER GLASSDOOR CREDENTIALS HERE
                pw = 'pw'
                
                try: #login                    
                    username = driver.find_element_by_id("userEmail")
                    password = driver.find_element_by_id("userPassword")
                    submit = driver.find_element_by_xpath('//*[@id="InlineLoginModule"]/div/div[2]/div/div[1]/div[3]/form/div[3]/div[1]/button')
                    username.send_keys(name)
                    password.send_keys(pw)
                    submit.click()
                    time.sleep(3) #lettin page load
                except: #no login required
                    pass

##---------------------------------- Gathering Variables - Main Page ---------------------------------##                
                time.sleep(4)
                name = driver.find_element_by_xpath('//*[@id="EmpHeroAndEmpInfo"]/div[3]/div[2]').text
                size = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[3]/div').text
                headquarters = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[2]/div').text
                industry = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[6]/div').text
                try:
                    num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[3]/a').text
                except: 
                    num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[3]/a').text        

            #Gather Description - handling "Read More" button
                try:
                    read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class 
                    read_more.click()
                    time.sleep(1)
                    description = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[1]/span').text
                except:
                    description = "N/A"

            #Gather Mission - handling "Read More" button    
                try:
                    read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class
                    read_more.click()
                    time.sleep(1)
                    mission = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[2]').text
                except:
                    mission = "N/A"

##-------------------------------- Gathering Variables - Ratings Pop-up --------------------------------##    
            #Webpage layout 1
                try: 
                    driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[1]/div[2]').click()
                    time.sleep(5) #let page load

                    rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
                    rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
                    rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
                    rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
                    rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
                    rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
                    rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text

                    time.sleep(np.random.choice([x/10 for x in range(7,22)])) #some time to rest 
            #Webpage layout 2
                except: 
                    driver.get(url) #recalling url
                    driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[1]/div[2]').click()
                    time.sleep(5) #let page load
                    
                    rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
                    rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
                    rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
                    rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
                    rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
                    rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
                    rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text

                    time.sleep(np.random.choice([x/10 for x in range(7,22)])) #some time to rest 
                                        
##---------------------------------------- Creating a Dictionary ----------------------------------------##
                companies.append({
                    "NAME" : name,
                    "SIZE" : size,
                    "LOCATION_HQ" : headquarters,
                    "INDUSTRY" : industry,
                    "RATING_OVERALL" : rating_overall,
                    "RATING_DI" : rating_DI,
                    "RATING_CV" : rating_CV,
                    "RATING_WL" : rating_WL,
                    "RATING_SM" : rating_SM,
                    "RATING_CB" : rating_CB,
                    "RATING_CO" : rating_CO,
                    "NUM_REVIEWS" : num_reviews,
                    "DESCRIPTION" : description,
                    "MISSION" : mission
                                 })

            except: #fail safe for inevitable errors
                unsuccessful_links.append(url) #adding unsuccessful urls to a list
                print('ERROR: ', url)
                time.sleep(10)
                
        print(f'Finished scraping {len(companies)} companies')
        df = pd.DataFrame(companies)
    return df

In [45]:
scraping_pages(140)


Finished scraping 10 companies
Finished scraping 20 companies
ERROR:  https://www.glassdoor.com/Overview/Working-at-Paycom-EI_IE136736.11,17.htm
ERROR:  https://www.glassdoor.com/Overview/Working-at-Intuit-EI_IE2293.11,17.htm
ERROR:  https://www.glassdoor.com/Overview/Working-at-H-E-B-EI_IE2824.11,16.htm
Finished scraping 27 companies
Finished scraping 37 companies


StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=88.0.4324.146)


In [47]:
len(companies)

37

In [49]:
print(len(companies))
unsuccessful_links


37


['https://www.glassdoor.com/Overview/Working-at-Paycom-EI_IE136736.11,17.htm',
 'https://www.glassdoor.com/Overview/Working-at-Intuit-EI_IE2293.11,17.htm',
 'https://www.glassdoor.com/Overview/Working-at-H-E-B-EI_IE2824.11,16.htm']

In [50]:
df2_37 = pd.DataFrame(companies)
df2_37

Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
0,Google,10000+ Employees,"Mountain View, CA",Internet,4.5,4.4,4.4,4.3,4.1,4.5,4.3,"See All 18,362 Reviews","Google is not a conventional company, and we d...",
1,Microsoft,10000+ Employees,"Redmond, WA",Computer Hardware & Software,4.4,4.4,4.3,4.1,4.0,4.1,4.1,"See All 29,390 Reviews",Our drive to change the world unites us!\n\nMi...,
2,Apple,10000+ Employees,"Cupertino, CA",Computer Hardware & Software,4.3,4.4,4.3,3.7,3.7,4.3,3.8,"See All 20,850 Reviews",We’re a diverse collective of thinkers and doe...,
3,US Air Force,10000+ Employees,"Washington, DC",Federal Agencies,4.2,4.2,4.0,3.3,3.4,4.3,4.2,"See All 16,601 Reviews",The mission of the US Department of the Air Fo...,
4,Cisco Systems,10000+ Employees,"San Jose, CA",Computer Hardware & Software,4.2,4.3,4.3,4.2,3.8,4.1,3.9,"See All 21,803 Reviews","#WeAreCisco, where each person is unique, but ...",Mission: Cisco wants you to bring your uniquen...
5,Capital One,10000+ Employees,"Mc Lean, VA",Banks & Credit Unions,4.1,4.2,4.1,4.1,3.6,4.2,3.7,"See All 9,223 Reviews",Capital One is where professional achievement ...,Mission: Capital One is an equal opportunity e...
6,Qualcomm,10000+ Employees,"San Diego, CA",Computer Hardware & Software,4.1,3.9,3.8,3.4,3.6,4.1,4.0,"See All 6,803 Reviews",We’re looking for inventive minds to help fuel...,
7,Intel Corporation,10000+ Employees,"Santa Clara, CA",Computer Hardware & Software,4.1,4.4,4.1,4.0,3.5,4.0,3.9,"See All 17,699 Reviews",Intel’s mission is to shape the future of tech...,
8,Bank of America,10000+ Employees,"Charlotte, NC",Banks & Credit Unions,4.0,4.3,4.0,3.8,3.5,4.0,3.8,"See All 26,089 Reviews",Bank of America is one of the world's largest ...,
9,Accenture,10000+ Employees,"Dublin, Ireland",Consulting,4.0,4.2,3.9,3.4,3.5,3.9,3.9,"See All 73,975 Reviews",Accenture is a global professional services co...,


In [51]:
df_csv2 = df2_37.to_csv('glassdoor_ratings2_37.csv', index=False)

#### Attept 1:

In [34]:
df1_62 = pd.DataFrame(companies)
df1_62

Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
0,Google,10000+ Employees,"Mountain View, CA",Internet,4.5,4.4,4.4,4.3,4.1,4.5,4.3,"See All 18,362 Reviews",,
1,Microsoft,10000+ Employees,"Redmond, WA",Computer Hardware & Software,4.4,4.4,4.3,4.1,4.0,4.1,4.1,"See All 29,389 Reviews",Our drive to change the world unites us!\n\nMi...,
2,Apple,10000+ Employees,"Cupertino, CA",Computer Hardware & Software,4.3,4.4,4.3,3.7,3.7,4.3,3.8,"See All 20,851 Reviews",We’re a diverse collective of thinkers and doe...,
3,US Air Force,10000+ Employees,"Washington, DC",Federal Agencies,4.2,4.2,4.0,3.3,3.4,4.3,4.2,"See All 16,600 Reviews",The mission of the US Department of the Air Fo...,
4,Cisco Systems,10000+ Employees,"San Jose, CA",Computer Hardware & Software,4.2,4.3,4.3,4.2,3.8,4.1,3.9,"See All 21,804 Reviews","#WeAreCisco, where each person is unique, but ...",Mission: Cisco wants you to bring your uniquen...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,NewYork-Presbyterian Hospital,10000+ Employees,"New York, NY",Health Care Services & Hospitals,4.2,4.3,4.2,4.0,3.8,4.4,4.2,"See All 1,763 Reviews","At NewYork-Presbyterian, we put patients first...",Mission: NYP is one of the nation’s most compr...
58,Bayer,10000+ Employees,"Leverkusen, Germany",Biotech & Pharmaceuticals,4.1,4.1,4.0,4.1,3.5,4.2,3.6,"See All 4,290 Reviews",Bayer is a global company with a more than 150...,
59,Netflix,5001 to 10000 Employees,"Los Gatos, CA",Internet,4.1,4.2,3.9,3.4,3.6,4.4,3.7,"See All 1,263 Reviews",Netflix is the world's leading streaming enter...,
60,Goldman Sachs,10000+ Employees,"New York, NY",Investment Banking & Asset Management,4.1,4.1,3.9,3.0,3.7,4.0,4.1,"See All 9,096 Reviews","The Goldman Sachs Group, Inc. is a leading glo...",


In [24]:
scraping_pages(200)

ERROR:  https://www.glassdoor.com/Overview/Working-at-Google-EI_IE9079.11,17.htm
ERROR:  https://www.glassdoor.com/Overview/Working-at-Microsoft-EI_IE1651.11,20.htm


KeyboardInterrupt: 

In [7]:
driver.close()
driver.quit()

MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=49521): Max retries exceeded with url: /session/63eb3a8cc63b98e6a8fb971171021166/window (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8ea425a908>: Failed to establish a new connection: [Errno 61] Connection refused',))

In [26]:
unsuccessful_links

['https://www.glassdoor.com/Overview/Working-at-Southwest-Airlines-EI_IE611.11,29.htm',
 'https://www.glassdoor.com/Overview/Working-at-World-Wide-Technology-EI_IE9553.11,32.htm',
 'https://www.glassdoor.com/Overview/Working-at-Stryker-EI_IE1918.11,18.htm',
 'https://www.glassdoor.com/Overview/Working-at-University-of-Southern-California-EI_IE3027.11,44.htm',
 'https://www.glassdoor.com/Overview/Working-at-Penn-State-EI_IE2931.11,21.htm',
 'https://www.glassdoor.com/Overview/Working-at-REI-EI_IE7319.11,14.htm',
 'https://www.glassdoor.com/Overview/Working-at-Nokia-EI_IE3494.11,16.htm',
 'https://www.glassdoor.com/Overview/Working-at-Google-EI_IE9079.11,17.htm',
 'https://www.glassdoor.com/Overview/Working-at-Microsoft-EI_IE1651.11,20.htm']

In [30]:
len(companies) #62


62

In [35]:
df_csv = df.to_csv('glassdoor_ratings1_62.csv', index=False)

In [52]:
df2_37

Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
0,Google,10000+ Employees,"Mountain View, CA",Internet,4.5,4.4,4.4,4.3,4.1,4.5,4.3,"See All 18,362 Reviews","Google is not a conventional company, and we d...",
1,Microsoft,10000+ Employees,"Redmond, WA",Computer Hardware & Software,4.4,4.4,4.3,4.1,4.0,4.1,4.1,"See All 29,390 Reviews",Our drive to change the world unites us!\n\nMi...,
2,Apple,10000+ Employees,"Cupertino, CA",Computer Hardware & Software,4.3,4.4,4.3,3.7,3.7,4.3,3.8,"See All 20,850 Reviews",We’re a diverse collective of thinkers and doe...,
3,US Air Force,10000+ Employees,"Washington, DC",Federal Agencies,4.2,4.2,4.0,3.3,3.4,4.3,4.2,"See All 16,601 Reviews",The mission of the US Department of the Air Fo...,
4,Cisco Systems,10000+ Employees,"San Jose, CA",Computer Hardware & Software,4.2,4.3,4.3,4.2,3.8,4.1,3.9,"See All 21,803 Reviews","#WeAreCisco, where each person is unique, but ...",Mission: Cisco wants you to bring your uniquen...
5,Capital One,10000+ Employees,"Mc Lean, VA",Banks & Credit Unions,4.1,4.2,4.1,4.1,3.6,4.2,3.7,"See All 9,223 Reviews",Capital One is where professional achievement ...,Mission: Capital One is an equal opportunity e...
6,Qualcomm,10000+ Employees,"San Diego, CA",Computer Hardware & Software,4.1,3.9,3.8,3.4,3.6,4.1,4.0,"See All 6,803 Reviews",We’re looking for inventive minds to help fuel...,
7,Intel Corporation,10000+ Employees,"Santa Clara, CA",Computer Hardware & Software,4.1,4.4,4.1,4.0,3.5,4.0,3.9,"See All 17,699 Reviews",Intel’s mission is to shape the future of tech...,
8,Bank of America,10000+ Employees,"Charlotte, NC",Banks & Credit Unions,4.0,4.3,4.0,3.8,3.5,4.0,3.8,"See All 26,089 Reviews",Bank of America is one of the world's largest ...,
9,Accenture,10000+ Employees,"Dublin, Ireland",Consulting,4.0,4.2,3.9,3.4,3.5,3.9,3.9,"See All 73,975 Reviews",Accenture is a global professional services co...,


In [19]:
##1st try

unsuccessful_links = []
companies = []

def scraping_pages(num_pages):
    #Creating 'n' urls with url_roots to scrape
    url_root = 'https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=0&page=' #root url
    nums = [x+1 for x in range(num_pages)] 
    url_mains = list(map(lambda n: url_root + str(n), nums)) #adding 'n' number to call url_root
    
    for u in url_mains:
        driver.get(u)
            
    #looking for 'Overview' links from each main search page
        elems = driver.find_elements_by_tag_name('a') #find links on an individual search page tagged with the 'a' tag
        company_links = []
        for elem in elems:
            company_link = elem.get_attribute('href') #returns every item with 'href' attribute (these are the links for each company)
            if 'Overview' in company_link:
                company_links.append(company_link) #each company's 'Overview' link added to company_link list  

        for url in company_links:
            try: #fail safe for inevitable errors
                driver.get(url)
                time.sleep(1)

##---------------------------------------- Handling login ------------------------------------------##
                name = 'n' # <---- ENTER GLASSDOOR CREDENTIALS HERE
                pw = 'pw'
                
                try: #login                    
                    username = driver.find_element_by_id("userEmail")
                    password = driver.find_element_by_id("userPassword")
                    submit = driver.find_element_by_xpath('//*[@id="InlineLoginModule"]/div/div[2]/div/div[1]/div[3]/form/div[3]/div[1]/button')
                    username.send_keys(name)
                    password.send_keys(pw)
                    submit.click()
                    time.sleep(3) #lettin page load
                except: #no login required
                    pass

##---------------------------------- Gathering Variables - Main Page ---------------------------------##                
                time.sleep(4)
                name = driver.find_element_by_xpath('//*[@id="EmpHeroAndEmpInfo"]/div[3]/div[2]').text
                size = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[3]/div').text
                headquarters = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[2]/div').text
                industry = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[6]/div').text
                try:
                    num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[3]/a').text
                except: 
                    num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[3]/a').text        

            #Gather Description - handling "Read More" button
                try:
                    read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class 
                    read_more.click()
                    time.sleep(1)
                    description = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[1]/span').text
                except:
                    description = "N/A"

            #Gather Mission - handling "Read More" button    
                try:
                    read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class
                    read_more.click()
                    time.sleep(1)
                    mission = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[2]').text
                except:
                    mission = "N/A"

##-------------------------------- Gathering Variables - Ratings Pop-up --------------------------------##    
            #Webpage layout 1
                try: 
                    driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[1]/div[2]').click()
                    time.sleep(5) #let page load

                    rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
                    rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
                    rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
                    rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
                    rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
                    rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
                    rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text

                    time.sleep(np.random.choice([x/10 for x in range(7,22)])) #some time to rest 
            #Webpage layout 2
                except: 
                    driver.get(url) #recalling url
                    driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[1]/div[2]').click()
                    time.sleep(5) #let page load
                    
                    rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
                    rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
                    rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
                    rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
                    rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
                    rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
                    rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text

                    time.sleep(np.random.choice([x/10 for x in range(7,22)])) #some time to rest 
                                        
##---------------------------------------- Creating a Dictionary ----------------------------------------##
                companies.append({
                    "NAME" : name,
                    "SIZE" : size,
                    "LOCATION_HQ" : headquarters,
                    "INDUSTRY" : industry,
                    "RATING_OVERALL" : rating_overall,
                    "RATING_DI" : rating_DI,
                    "RATING_CV" : rating_CV,
                    "RATING_WL" : rating_WL,
                    "RATING_SM" : rating_SM,
                    "RATING_CB" : rating_CB,
                    "RATING_CO" : rating_CO,
                    "NUM_REVIEWS" : num_reviews,
                    "DESCRIPTION" : description,
                    "MISSION" : mission
                                 })

            except: #fail safe for inevitable errors
                unsuccessful_links.append(url) #adding unsuccessful urls to a list
                print('ERROR: ', url)
                time.sleep(10)
                
        print(f'Finished scraping {len(companies)} companies')
        df = pd.DataFrame(companies)
    return df

In [22]:
len(companies) #62

62

In [34]:
df1_62 = pd.DataFrame(companies)
df1_62

Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
0,Google,10000+ Employees,"Mountain View, CA",Internet,4.5,4.4,4.4,4.3,4.1,4.5,4.3,"See All 18,362 Reviews",,
1,Microsoft,10000+ Employees,"Redmond, WA",Computer Hardware & Software,4.4,4.4,4.3,4.1,4.0,4.1,4.1,"See All 29,389 Reviews",Our drive to change the world unites us!\n\nMi...,
2,Apple,10000+ Employees,"Cupertino, CA",Computer Hardware & Software,4.3,4.4,4.3,3.7,3.7,4.3,3.8,"See All 20,851 Reviews",We’re a diverse collective of thinkers and doe...,
3,US Air Force,10000+ Employees,"Washington, DC",Federal Agencies,4.2,4.2,4.0,3.3,3.4,4.3,4.2,"See All 16,600 Reviews",The mission of the US Department of the Air Fo...,
4,Cisco Systems,10000+ Employees,"San Jose, CA",Computer Hardware & Software,4.2,4.3,4.3,4.2,3.8,4.1,3.9,"See All 21,804 Reviews","#WeAreCisco, where each person is unique, but ...",Mission: Cisco wants you to bring your uniquen...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,NewYork-Presbyterian Hospital,10000+ Employees,"New York, NY",Health Care Services & Hospitals,4.2,4.3,4.2,4.0,3.8,4.4,4.2,"See All 1,763 Reviews","At NewYork-Presbyterian, we put patients first...",Mission: NYP is one of the nation’s most compr...
58,Bayer,10000+ Employees,"Leverkusen, Germany",Biotech & Pharmaceuticals,4.1,4.1,4.0,4.1,3.5,4.2,3.6,"See All 4,290 Reviews",Bayer is a global company with a more than 150...,
59,Netflix,5001 to 10000 Employees,"Los Gatos, CA",Internet,4.1,4.2,3.9,3.4,3.6,4.4,3.7,"See All 1,263 Reviews",Netflix is the world's leading streaming enter...,
60,Goldman Sachs,10000+ Employees,"New York, NY",Investment Banking & Asset Management,4.1,4.1,3.9,3.0,3.7,4.0,4.1,"See All 9,096 Reviews","The Goldman Sachs Group, Inc. is a leading glo...",


In [35]:
df_csv = df.to_csv('glassdoor_ratings1_62.csv', index=False)

In [36]:
df1_62

Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
0,Google,10000+ Employees,"Mountain View, CA",Internet,4.5,4.4,4.4,4.3,4.1,4.5,4.3,"See All 18,362 Reviews",,
1,Microsoft,10000+ Employees,"Redmond, WA",Computer Hardware & Software,4.4,4.4,4.3,4.1,4.0,4.1,4.1,"See All 29,389 Reviews",Our drive to change the world unites us!\n\nMi...,
2,Apple,10000+ Employees,"Cupertino, CA",Computer Hardware & Software,4.3,4.4,4.3,3.7,3.7,4.3,3.8,"See All 20,851 Reviews",We’re a diverse collective of thinkers and doe...,
3,US Air Force,10000+ Employees,"Washington, DC",Federal Agencies,4.2,4.2,4.0,3.3,3.4,4.3,4.2,"See All 16,600 Reviews",The mission of the US Department of the Air Fo...,
4,Cisco Systems,10000+ Employees,"San Jose, CA",Computer Hardware & Software,4.2,4.3,4.3,4.2,3.8,4.1,3.9,"See All 21,804 Reviews","#WeAreCisco, where each person is unique, but ...",Mission: Cisco wants you to bring your uniquen...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,NewYork-Presbyterian Hospital,10000+ Employees,"New York, NY",Health Care Services & Hospitals,4.2,4.3,4.2,4.0,3.8,4.4,4.2,"See All 1,763 Reviews","At NewYork-Presbyterian, we put patients first...",Mission: NYP is one of the nation’s most compr...
58,Bayer,10000+ Employees,"Leverkusen, Germany",Biotech & Pharmaceuticals,4.1,4.1,4.0,4.1,3.5,4.2,3.6,"See All 4,290 Reviews",Bayer is a global company with a more than 150...,
59,Netflix,5001 to 10000 Employees,"Los Gatos, CA",Internet,4.1,4.2,3.9,3.4,3.6,4.4,3.7,"See All 1,263 Reviews",Netflix is the world's leading streaming enter...,
60,Goldman Sachs,10000+ Employees,"New York, NY",Investment Banking & Asset Management,4.1,4.1,3.9,3.0,3.7,4.0,4.1,"See All 9,096 Reviews","The Goldman Sachs Group, Inc. is a leading glo...",


# Iterative process
Remaining code below this point is my iterative process in working on the `scraping_pages` function. 

In [4]:
# #2nd try: 2m 46s
# scraping_pages(1)

Finished scraping 10 companies


Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
0,Google,10000+ Employees,"Mountain View, CA",Internet,4.5,4.4,4.4,4.3,4.1,4.5,4.3,"See All 18,362 Reviews",,
1,Microsoft,10000+ Employees,"Redmond, WA",Computer Hardware & Software,4.4,4.4,4.3,4.1,4.0,4.1,4.1,"See All 29,389 Reviews",,
2,,10000+ Employees,"Cupertino, CA",Computer Hardware & Software,4.3,4.4,4.3,3.7,3.7,4.3,3.8,"See All 20,848 Reviews",,
3,US Air Force,10000+ Employees,"Washington, DC",Federal Agencies,4.2,4.2,4.0,3.3,3.4,4.3,4.2,"See All 16,600 Reviews",,
4,,10000+ Employees,"San Jose, CA",Computer Hardware & Software,4.2,4.3,4.3,4.2,3.8,4.1,3.9,"See All 21,803 Reviews",,
5,,10000+ Employees,"Mc Lean, VA",Banks & Credit Unions,4.1,4.2,4.1,4.1,3.6,4.2,3.7,"See All 9,223 Reviews",,
6,,10000+ Employees,"San Diego, CA",Computer Hardware & Software,4.1,3.9,3.8,3.4,3.6,4.1,4.0,"See All 6,804 Reviews",,
7,,10000+ Employees,"Santa Clara, CA",Computer Hardware & Software,4.1,4.4,4.1,4.0,3.5,4.0,3.9,"See All 17,700 Reviews",,
8,Bank of America,10000+ Employees,"Charlotte, NC",Banks & Credit Unions,4.0,4.3,4.0,3.8,3.5,4.0,3.8,"See All 26,088 Reviews",,
9,,10000+ Employees,"Dublin, Ireland",Consulting,4.0,4.2,3.9,3.4,3.5,3.9,3.9,"See All 73,975 Reviews",,


In [4]:
# #HEADLESS: 2m 55s
# scraping_pages(1)

  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
0,Google,10000+ Employees,"Mountain View, CA",Internet,4.5,4.4,4.4,4.3,4.1,4.5,4.3,"See All 18,365 Reviews",,
1,Microsoft,10000+ Employees,"Redmond, WA",Computer Hardware & Software,4.4,4.4,4.3,4.1,4.0,4.1,4.1,"See All 29,390 Reviews",,
2,Apple,10000+ Employees,"Cupertino, CA",Computer Hardware & Software,4.3,4.4,4.3,3.7,3.7,4.3,3.8,"See All 20,849 Reviews",We’re a diverse collective of thinkers and doe...,
3,Cisco Systems,10000+ Employees,"San Jose, CA",Computer Hardware & Software,4.2,4.3,4.3,4.2,3.8,4.1,3.9,"See All 21,808 Reviews","#WeAreCisco, where each person is unique, but ...",Mission: Cisco wants you to bring your uniquen...
4,US Air Force,10000+ Employees,"Washington, DC",Federal Agencies,4.2,4.2,4.0,3.3,3.4,4.3,4.2,"See All 16,600 Reviews",The mission of the US Department of the Air Fo...,
5,Capital One,10000+ Employees,"Mc Lean, VA",Banks & Credit Unions,4.1,4.2,4.1,4.1,3.6,4.2,3.7,"See All 9,222 Reviews",Capital One is where professional achievement ...,Mission: Capital One is an equal opportunity e...
6,Qualcomm,10000+ Employees,"San Diego, CA",Computer Hardware & Software,4.1,3.9,3.8,3.4,3.6,4.1,4.0,"See All 6,803 Reviews",We’re looking for inventive minds to help fuel...,
7,Intel Corporation,10000+ Employees,"Santa Clara, CA",Computer Hardware & Software,4.1,4.4,4.1,4.0,3.5,4.0,3.9,"See All 17,698 Reviews",Intel’s mission is to shape the future of tech...,
8,Bank of America,10000+ Employees,"Charlotte, NC",Banks & Credit Unions,4.0,4.3,4.0,3.8,3.5,4.0,3.8,"See All 26,087 Reviews",Bank of America is one of the world's largest ...,
9,Accenture,10000+ Employees,"Dublin, Ireland",Consulting,4.0,4.2,3.9,3.4,3.5,3.9,3.9,"See All 73,973 Reviews",Accenture is a global professional services co...,


In [6]:
# #OG: 3m 37s
# scraping_pages(1)

Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
0,Microsoft,10000+ Employees,"Redmond, WA",Computer Hardware & Software,4.4,4.4,4.3,4.1,4.0,4.1,4.1,"See All 29,392 Reviews",Our drive to change the world unites us!\n\nMi...,
1,Apple,10000+ Employees,"Cupertino, CA",Computer Hardware & Software,4.3,4.4,4.3,3.7,3.7,4.3,3.8,"See All 20,849 Reviews",We’re a diverse collective of thinkers and doe...,
2,US Air Force,10000+ Employees,"Washington, DC",Federal Agencies,4.2,4.2,4.0,3.3,3.4,4.3,4.2,"See All 16,597 Reviews",The mission of the US Department of the Air Fo...,
3,Cisco Systems,10000+ Employees,"San Jose, CA",Computer Hardware & Software,4.2,4.3,4.3,4.2,3.8,4.1,3.9,"See All 21,807 Reviews","#WeAreCisco, where each person is unique, but ...",Mission: Cisco wants you to bring your uniquen...
4,Capital One,10000+ Employees,"Mc Lean, VA",Banks & Credit Unions,4.1,4.2,4.1,4.1,3.6,4.2,3.7,"See All 9,222 Reviews",Capital One is where professional achievement ...,Mission: Capital One is an equal opportunity e...
5,Qualcomm,10000+ Employees,"San Diego, CA",Computer Hardware & Software,4.1,3.9,3.8,3.4,3.6,4.1,4.0,"See All 6,804 Reviews",We’re looking for inventive minds to help fuel...,
6,Intel Corporation,10000+ Employees,"Santa Clara, CA",Computer Hardware & Software,4.1,4.4,4.1,4.0,3.5,4.0,3.9,"See All 17,700 Reviews",Intel’s mission is to shape the future of tech...,
7,Bank of America,10000+ Employees,"Charlotte, NC",Banks & Credit Unions,4.0,4.3,4.0,3.8,3.5,4.0,3.8,"See All 26,087 Reviews",Bank of America is one of the world's largest ...,
8,Accenture,10000+ Employees,"Dublin, Ireland",Consulting,4.0,4.2,3.9,3.4,3.5,3.9,3.9,"See All 73,971 Reviews",Accenture is a global professional services co...,


In [None]:
# ##TESTING to create function TESTING to create function  TESTING to create function  TESTING

# def scraping_pages(num_pages):
#     #Creating 'n' urls with url_roots to scrape
#     url_root = 'https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=0&page=' ## root url
#     nums = [x+1 for x in range(num_pages)] 
#     url_mains = list(map(lambda n: url_root + str(n), nums)) ## add 'n' number to call url page 'n'
    
#     for u in url_mains:
#         driver.get(u)
            
#     #looking for 'Overview' links from each main search page
#         elems = driver.find_elements_by_tag_name('a') ## find links on an individual search page tagged with the 'a' tag
#         company_links = []
#         for elem in elems:
#             company_link = elem.get_attribute('href') ## returns every item with 'href' attribute (links for each company)
#             if 'Overview' in company_link:
#                 company_links.append(company_link) ## each company's 'Overview' link added to list  

#         for url in company_links:
#             try: #fail safe for inevitable errors
#                 driver.get(url)
#                 #source = driver.page_source
#                 #page_source = driver.page_source

# ##---------------------------------------- Handling login ------------------------------------------##
#                 name = 'n' # <---- ENTER GLASSDOOR CREDENTIALS HERE
#                 pw = 'pw'
                
#                 try: #login
#                     username = driver.find_element_by_id("userEmail")
#                     password = driver.find_element_by_id("userPassword")
#                     submit = driver.find_element_by_xpath('//*[@id="InlineLoginModule"]/div/div[2]/div/div[1]/div[3]/form/div[3]/div[1]/button')
#                     username.send_keys(name)
#                     password.send_keys(pw)
#                     submit.click()
#                     time.sleep(3) #lettin page load
#                 except: #no login required
#                     pass

# ##---------------------------------- Gathering Variables - Main Page ---------------------------------##                
#                 name = driver.find_element_by_xpath('//*[@id="EmpHeroAndEmpInfo"]/div[3]/div[2]').text
#                 size = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[3]/div').text
#                 headquarters = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[2]/div').text
#                 industry = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[6]/div').text
#                 try:
#                     num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[3]/a').text
#                 except: 
#                     num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[3]/a').text        

#             #Gather Description - handling "Read More" button
#                 try:
#                     read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class 
#                     read_more.click()
#                     description = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[1]/span').text
#                 except:
#                     description = "N/A"

#             #Gather Mission - handling "Read More" button    
#                 try:
#                     read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class
#                     read_more.click()
#                     mission = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[2]').text
#                 except:
#                     mission = "N/A"

# ##-------------------------------- Gathering Variables - Ratings Pop-up --------------------------------##    
#             #Webpage layout 1
#                 try: 
#                     driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[1]/div[2]').click()
#                     time.sleep(5) #let page load

#                     rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
#                     rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
#                     rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
#                     rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
#                     rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
#                     rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
#                     rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text

#                     #print(name, size, headquarters, industry, rating_overall, rating_DI, rating_CV, rating_WL, rating_SM, rating_CB, rating_CO, num_reviews, description, mission)
#                     time.sleep(np.random.choice([x/10 for x in range(7,22)])) #some time to rest 
#             #Webpage layout 2
#                 except: 
#                     driver.get(url) #recalling url
#                     driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[1]/div[2]').click()
#                     time.sleep(5) #let page load
                    
#                     rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
#                     rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
#                     rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
#                     rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
#                     rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
#                     rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
#                     rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text

#                     #print(name, size, headquarters, industry, rating_overall, rating_DI, rating_CV, rating_WL, rating_SM, rating_CB, rating_CO, num_reviews, description, mission)
#                     time.sleep(np.random.choice([x/10 for x in range(7,22)])) #some time to rest 
                                        
# ##-------------------------------- Creating a Dictionary --------------------------------##
#                 companies.append({
#                     "NAME" : name,
#                     "SIZE" : size,
#                     "LOCATION_HQ" : headquarters,
#                     "INDUSTRY" : industry,
#                     "RATING_OVERALL" : rating_overall,
#                     "RATING_DI" : rating_DI,
#                     "RATING_CV" : rating_CV,
#                     "RATING_WL" : rating_WL,
#                     "RATING_SM" : rating_SM,
#                     "RATING_CB" : rating_CB,
#                     "RATING_CO" : rating_CO,
#                     "NUM_REVIEWS" : num_reviews,
#                     "DESCRIPTION" : description,
#                     "MISSION" : mission
#                                  })

#             except: #fail safe for inevitable errors
#                 unsuccessful_links.append(url) #adding unsuccessful urls to a list
#                 print('ERROR: ', url)
#                 print(unsuccessful_links)
#                 time.sleep(10)
#     return pd.DataFrame(companies)

In [5]:
# ##USE THIS CODE! - pandas dataframe created

# unsuccessful_links = []
# companies = []

# def scraping_pages(num_pages):
#     #Creating 'n' urls with url_roots to scrape
#     url_root = 'https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=0&page=' #root url
#     nums = [x+1 for x in range(num_pages)] 
#     url_mains = list(map(lambda n: url_root + str(n), nums)) #adding 'n' number to call url_root
    
#     for u in url_mains:
#         driver.get(u)
            
#     #looking for 'Overview' links from each main search page
#         elems = driver.find_elements_by_tag_name('a') #find links on an individual search page tagged with the 'a' tag
#         company_links = []
#         for elem in elems:
#             company_link = elem.get_attribute('href') #returns every item with 'href' attribute (these are the links for each company)
#             if 'Overview' in company_link:
#                 company_links.append(company_link) #each company's 'Overview' link added to company_link list  

#         for url in company_links:
#             try: #fail safe for inevitable errors
#                 driver.get(url)

# ##---------------------------------------- Handling login ------------------------------------------##
#                 name = 'n' # <---- ENTER GLASSDOOR CREDENTIALS HERE
#                 pw = 'pw'
                
#                 try: #login
#                     username = driver.find_element_by_id("userEmail")
#                     password = driver.find_element_by_id("userPassword")
#                     submit = driver.find_element_by_xpath('//*[@id="InlineLoginModule"]/div/div[2]/div/div[1]/div[3]/form/div[3]/div[1]/button')
#                     username.send_keys(name)
#                     password.send_keys(pw)
#                     submit.click()
#                     time.sleep(3) #lettin page load
#                 except: #no login required
#                     pass

# ##---------------------------------- Gathering Variables - Main Page ---------------------------------##                
#                 name = driver.find_element_by_xpath('//*[@id="EmpHeroAndEmpInfo"]/div[3]/div[2]').text
#                 size = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[3]/div').text
#                 headquarters = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[2]/div').text
#                 industry = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[6]/div').text
#                 try:
#                     num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[3]/a').text
#                 except: 
#                     num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[3]/a').text        

#             #Gather Description - handling "Read More" button
#                 try:
#                     read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class 
#                     read_more.click()
#                     description = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[1]/span').text
#                 except:
#                     description = "N/A"

#             #Gather Mission - handling "Read More" button    
#                 try:
#                     read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class
#                     read_more.click()
#                     mission = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[2]').text
#                 except:
#                     mission = "N/A"

# ##-------------------------------- Gathering Variables - Ratings Pop-up --------------------------------##    
#             #Webpage layout 1
#                 try: 
#                     driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[1]/div[2]').click()
#                     time.sleep(5) #let page load

#                     rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
#                     rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
#                     rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
#                     rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
#                     rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
#                     rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
#                     rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text

#                     time.sleep(np.random.choice([x/10 for x in range(7,22)])) #some time to rest 
#             #Webpage layout 2
#                 except: 
#                     driver.get(url) #recalling url
#                     driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[1]/div[2]').click()
#                     time.sleep(5) #let page load
                    
#                     rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
#                     rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
#                     rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
#                     rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
#                     rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
#                     rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
#                     rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text

#                     time.sleep(np.random.choice([x/10 for x in range(7,22)])) #some time to rest 
                                        
# ##---------------------------------------- Creating a Dictionary ----------------------------------------##
#                 companies.append({
#                     "NAME" : name,
#                     "SIZE" : size,
#                     "LOCATION_HQ" : headquarters,
#                     "INDUSTRY" : industry,
#                     "RATING_OVERALL" : rating_overall,
#                     "RATING_DI" : rating_DI,
#                     "RATING_CV" : rating_CV,
#                     "RATING_WL" : rating_WL,
#                     "RATING_SM" : rating_SM,
#                     "RATING_CB" : rating_CB,
#                     "RATING_CO" : rating_CO,
#                     "NUM_REVIEWS" : num_reviews,
#                     "DESCRIPTION" : description,
#                     "MISSION" : mission
#                                  })

#             except: #fail safe for inevitable errors
#                 unsuccessful_links.append(url) #adding unsuccessful urls to a list
#                 print('ERROR: ', url)
#                 time.sleep(10)
#         driver.close()
#         driver.quit()
#     return pd.DataFrame(companies)

In [107]:
# # THIS CODE WORKS - run through whole page of links
# links = ["https://www.glassdoor.com/Overview/Working-at-Google-EI_IE9079.11,17.htm",
#     "https://www.glassdoor.com/Overview/Working-at-Cisco-Systems-EI_IE1425.11,24.htm"]

# unsuccessful_links = []

# for url in links:
#     try:
#         driver.get(url)
#         source = driver.page_source
#         page_source = driver.page_source

#         name = 'n'
#         pw = 'pw'
    
# ##---------------------------------------- Handling login ------------------------------------------##
#         try: #login
#             username = driver.find_element_by_id("userEmail")
#             password = driver.find_element_by_id("userPassword")
#             submit = driver.find_element_by_xpath('//*[@id="InlineLoginModule"]/div/div[2]/div/div[1]/div[3]/form/div[3]/div[1]/button')
#             username.send_keys(name)
#             password.send_keys(pw)
#             submit.click()
#             time.sleep(3) #lettin page load
#         except: #no login required
#             pass
        
# ##---------------------------------- Gathering Variables - Main Page ---------------------------------##                
#         name = driver.find_element_by_xpath('//*[@id="EmpHeroAndEmpInfo"]/div[3]/div[2]').text
#         size = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[3]/div').text
#         headquarters = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[2]/div').text
#         industry = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[6]/div').text
#         try:
#             num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[3]/a').text
#         except: 
#             num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[3]/a').text        

#     #Gather Description - handling "Read More" button
#         try:
#             read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class 
#             read_more.click()
#             description = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[1]/span').text
#         except:
#             description = "Description: N/A"

#     #Gather Mission - handling "Read More" button    
#         try:
#             read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class
#             read_more.click()
#             mission = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[2]').text
#         except:
#             mission = "Mission: N/A"

# ##-------------------------------- Gathering Variables - RatingsPop-up --------------------------------##    
#     #webpage layout 1: (Google, Microsoft, Apple, etc)
#         try: 
#             driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[1]/div[2]').click()
#             time.sleep(4) #lettin page load
        
#             rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
#             rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
#             rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
#             rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
#             rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
#             rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
#             rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text
    
#             print(name, size, headquarters, industry, rating_overall, rating_DI, rating_CV, rating_WL, rating_SM, rating_CB, rating_CO, num_reviews, description, mission)
#             time.sleep(np.random.choice([x/10 for x in range(7,20)])) #some time to rest 
#     #webpage layout 2: (Cisco, Capital One, etc)
#         except: 
#             driver.get(url) #recalling url
        
#             driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[1]/div[2]').click()
#             time.sleep(4) #lettin page load
#             rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
#             rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
#             rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
#             rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
#             rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
#             rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
#             rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text
        
#             print(name, size, headquarters, industry, rating_overall, rating_DI, rating_CV, rating_WL, rating_SM, rating_CB, rating_CO, num_reviews, description, mission)
#             time.sleep(np.random.choice([x/10 for x in range(7,15)])) #some time to rest 
        
#     except: #fail safe for inevitable errors
#         unsuccessful_links.append(url) #adding unsuccessful urls to a list
#         print('ERROR: ', url)
#         print(unsuccessful_links)
#         time.sleep(10)


Google 10000+ Employees Mountain View, CA Internet 4.5 4.4 4.4 4.3 4.1 4.5 4.3 See All 18,361 Reviews Google is not a conventional company, and we don’t intend to become one. True, we share attributes with the world’s most successful organizations – a focus on innovation and smart business practices comes to mind – but even as we continue to grow, we’re committed to retaining a small-company feel. At Google, we know that every employee has something important to say, and that every employee is integral to our success. We provide individually-tailored compensation packages that can be comprised of competitive salary, bonus, and equity components, along with the opportunity to earn further financial bonuses and rewards.

Googlers thrive in small, focused teams and high-energy environments, believe in the ability of technology to change the world, and are as passionate about their lives as they are about their work. For more information, visit careers.google.com. Mission: N/A
Microsoft 10

In [4]:
# # THIS CODE WORKS - double check that google's description and mission go through (I think I fixed after running)

# for u in url:
#     try:
#         driver.get(u)
#         source = driver.page_source
#         page_source = driver.page_source

#         name = 'n'
#         pw = 'pw'
    
# ##---------------------------------------- Handling login ------------------------------------------##
#         try: #login required
#             username = driver.find_element_by_id("userEmail")
#             password = driver.find_element_by_id("userPassword")
#             submit = driver.find_element_by_xpath('//*[@id="InlineLoginModule"]/div/div[2]/div/div[1]/div[3]/form/div[3]/div[1]/button')
#             username.send_keys(name)
#             password.send_keys(pw)
#             submit.click()
#             time.sleep(3) #lettin page load
#         except: #no login required
#             pass
        
# ##---------------------------------- Gathering Variables - Main Page ---------------------------------##                
#         name = driver.find_element_by_xpath('//*[@id="EmpHeroAndEmpInfo"]/div[3]/div[2]').text
#         size = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[3]/div').text
#         headquarters = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[2]/div').text
#         industry = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[6]/div').text
#         try:
#             num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[3]/a').text
#         except: 
#             num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[3]/a').text        

#     #Gather Description - handling "Read More" button
#         try:
#             read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class 
#             read_more.click()
#             description = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[1]/span').text
#         except:
#             description = "Description: N/A"

#     #Gather Mission - handling "Read More" button    
#         try:
#             read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class
#             read_more.click()
#             mission = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[2]').text
#         except:
#             mission = "Mission: N/A"

# ##-------------------------------- Gathering Variables - RatingsPop-up --------------------------------##    
#         try: #webpage layout 1: (Google, Microsoft, Apple, etc)
#             driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[1]/div[2]').click()
#             time.sleep(4) #lettin page load
        
#             rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
#             rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
#             rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
#             rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
#             rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
#             rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
#             rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text
    
#             print(name, size, headquarters, industry, rating_overall, rating_DI, rating_CV, rating_WL, rating_SM, rating_CB, rating_CO, num_reviews, description, mission)
#             time.sleep(np.random.choice([x/10 for x in range(7,20)])) #some time to rest 
    
#         except: #webpage layout 2: (Cisco, Capital One, etc)
#             driver.get(u) #recalling url
        
#             driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[1]/div[2]').click()
#             time.sleep(4) #lettin page load
#             rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
#             rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
#             rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
#             rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
#             rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
#             rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
#             rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text
        
#             print(name, size, headquarters, industry, rating_overall, rating_DI, rating_CV, rating_WL, rating_SM, rating_CB, rating_CO, num_reviews, description, mission)
#             time.sleep(np.random.choice([x/10 for x in range(7,15)])) #some time to rest 
        
#     except: #fail safe for inevitable errors
#         print('ERROR: ', u)
#         time.sleep(10)


Google 10000+ Employees Mountain View, CA Internet 4.5 4.4 4.4 4.3 4.1 4.5 4.3 See All 18,359 Reviews Description: N/A Mission: N/A
Microsoft 10000+ Employees Redmond, WA Computer Hardware & Software 4.4 4.4 4.3 4.1 4.0 4.1 4.1 See All 29,387 Reviews Our drive to change the world unites us!

Microsoft is the ideal place for people who have passion for their work and the desire to make an impact—in their careers, in the community and on the world. Microsoft is a unique company; and not just within the tech industry. Here, smart people thrive on their own terms and push their intelligence to its limit. The variety of job opportunities and career advancement at Microsoft is incredible and empowers you to constantly challenge yourself and chart your own course. Mission: N/A
Cisco Systems 10000+ Employees San Jose, CA Computer Hardware & Software 4.2 4.3 4.3 4.2 3.8 4.1 3.9 See All 21,801 Reviews #WeAreCisco, where each person is unique, but we bring our talents to work as a team and make a

In [11]:
# ## THIS CODE WORKS - description/mission successful

# for u in url:

#     driver.get(u)
#     source = driver.page_source
#     page_source = driver.page_source
    
#     name = 'n'
#     pw = 'pw'
    
# ##------------------------------------ Handling login ------------------------------------------##
#     try: #login required
#         username = driver.find_element_by_id("userEmail")
#         password = driver.find_element_by_id("userPassword")
#         submit = driver.find_element_by_xpath('//*[@id="InlineLoginModule"]/div/div[2]/div/div[1]/div[3]/form/div[3]/div[1]/button')
#         username.send_keys(name)
#         password.send_keys(pw)
#         submit.click()
#         time.sleep(1) #lettin page load
#     except: #no login required
#         pass

# ##---------------------------------- Gathering Variables - Main Page ------------------------------------##                

#     name = driver.find_element_by_xpath('//*[@id="EmpHeroAndEmpInfo"]/div[3]/div[2]').text
    
#     #Gather Description - handling "Read More" button
#     try:
#         read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class 
#         read_more.click()
#         description = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[1]/span').text
#     except:
#         description = "Description: N/A"
        
#     #Gather Mission - handling "Read More" button    
#     try:
#         read_more = driver.find_element_by_class_name('css-1tgo67c.e16x8fv00') #button class
#         read_more.click()
#         mission = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/div[2]').text
#     except:
#         mission = "Mission: N/A"
    
    
# ##---------------------------- Gathering Variables - Ratings pop-up --------------------------------##    
#     try: #webpage layout 1: (Google, Microsoft, Apple, etc)
#         driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[1]/div[2]').click()
#         time.sleep(4) #lettin page load
    
#         rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
#         rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
#         rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
#         rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
#         rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
#         rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
#         rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text
    
#         print(name, "\n", description, "\n", mission, "\n")
#         time.sleep(np.random.choice([x/10 for x in range(7,20)])) #some time to rest 
    
#     except: #webpage layout 2: (Cisco, Capital One, etc)
#         driver.get(u) #recalling url
    
#         driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[1]/div[2]').click()
#         time.sleep(4) #lettin page load
#         rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
#         rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
#         rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
#         rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
#         rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
#         rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
#         rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text
    
#         print(name, "\n", description, "\n", mission, "\n")
#         time.sleep(np.random.choice([x/10 for x in range(7,15)])) #some time to rest 
    



Microsoft 
 Our drive to change the world unites us!

Microsoft is the ideal place for people who have passion for their work and the desire to make an impact—in their careers, in the community and on the world. Microsoft is a unique company; and not just within the tech industry. Here, smart people thrive on their own terms and push their intelligence to its limit. The variety of job opportunities and career advancement at Microsoft is incredible and empowers you to constantly challenge yourself and chart your own course. 
 Mission: N/A 

Cisco Systems 
 Our edge doesn’t come from technology, but from our people! From entertainment, retail, healthcare, and education, to public and private sectors, smart cities, and everyday devices, we strive for an inclusive culture that has earned Cisco recognition as a multinational Great Place To Work. Leading in 12 of 18 markets, Cisco is building teams around the globe - expanding our technology solutions in the Mobile, Cloud, Security, IT, and 

In [None]:

# # THIS CODE WORKS - all elements except for mission statement and company review

# for u in url:
#     try:
#         driver.get(u)
#         source = driver.page_source
#         page_source = driver.page_source
#         name = 'n'
#         pw = 'pw'
    
# ##---------------------------------------- Handling login ------------------------------------------##
#         try: #login required
#             username = driver.find_element_by_id("userEmail")
#             password = driver.find_element_by_id("userPassword")
#             submit = driver.find_element_by_xpath('//*[@id="InlineLoginModule"]/div/div[2]/div/div[1]/div[3]/form/div[3]/div[1]/button')
#             username.send_keys(name)
#             password.send_keys(pw)
#             submit.click()
#             time.sleep(1) #lettin page load
#         except: #no login required
#             pass
        
# ##---------------------------------- Gathering Variables ---------------------------------##                
#         name = driver.find_element_by_xpath('//*[@id="EmpHeroAndEmpInfo"]/div[3]/div[2]').text
#         size = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[3]/div').text
#         headquarters = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[2]/div').text
#         industry = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[1]/ul/li[6]/div').text
#         try:
#             num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[3]/a').text
#         except: 
#             num_reviews = driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[3]/a').text        


# ##-------------------------------- Handling different page layouts --------------------------------##    
#         try: #webpage layout 1: (Google, Microsoft, Apple, etc)
#             driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[1]/div[2]').click()
#             time.sleep(4) #lettin page load
        
#             rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
#             rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
#             rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
#             rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
#             rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
#             rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
#             rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text
    
#             print(name, size, headquarters, industry, rating_overall, rating_DI, rating_CV, rating_WL, rating_SM, rating_CB, rating_CO, num_reviews)
#             time.sleep(np.random.choice([x/10 for x in range(7,20)])) #some time to rest 
    
#         except: #webpage layout 2: (Cisco, Capital One, etc)
#             driver.get(u) #recalling url
        
#             driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[1]/div[2]').click()
#             time.sleep(3) #lettin page load
#             rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
#             rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
#             rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
#             rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
#             rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
#             rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
#             rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text
        
#             print(name, size, headquarters, industry, rating_overall, rating_DI, rating_CV, rating_WL, rating_SM, rating_CB, rating_CO, num_reviews)
#             time.sleep(np.random.choice([x/10 for x in range(7,15)])) #some time to rest 
        
#     except: #fail safe for inevitable errors
#         print('ERROR: ', u)
#         time.sleep(10)


In [None]:

# # THIS CODE WORKS

# for u in url:
#     try:
#         driver.get(u)
#         source = driver.page_source
#         page_source = driver.page_source
#         name = 'n'
#         pw = 'pw'
    
# ##------------------------ Handling login --------------------------##
#         try: #login required
#             username = driver.find_element_by_id("userEmail")
#             password = driver.find_element_by_id("userPassword")
#             submit = driver.find_element_by_xpath('//*[@id="InlineLoginModule"]/div/div[2]/div/div[1]/div[3]/form/div[3]/div[1]/button')
#             username.send_keys(name)
#             password.send_keys(pw)
#             submit.click()
#         except: #no login required
#             pass

# ##---------------- Handling different page layouts ----------------##
    
#         try: #webpage layout 1: (Google, Microsoft, Apple, etc)
#             driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[1]/div[2]').click()
#             time.sleep(4)
        
#             rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
#             rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
#             rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
#             rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
#             rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
#             rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
#             rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text
    
#             print(rating_overall, rating_DI, rating_CV, rating_WL, rating_SM, rating_CB, rating_CO)
#             time.sleep(np.random.choice([x/10 for x in range(7,20)]))
    
#         except: #webpage layout 2: (Cisco, Capital One, etc)
#             driver.get(u) #recalling url
        
#             driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[1]/div[2]').click()
#             time.sleep(3)
#             rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
#             rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
#             rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
#             rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
#             rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
#             rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
#             rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text
        
#             print(rating_overall, rating_DI, rating_CV, rating_WL, rating_SM, rating_CB, rating_CO)
#             time.sleep(np.random.choice([x/10 for x in range(7,15)]))
        
#     except: #fail safe for inevitable errors
#         time.sleep(10)
#         pass


In [12]:
# url = ["https://www.glassdoor.com/Overview/Working-at-Google-EI_IE9079.11,17.htm"
#           "https://www.glassdoor.com/Overview/Working-at-Microsoft-EI_IE1651.11,20.htm"
# ]

# # WHY DON'T I GET GOOGLE SCRAPE HERE?
# for u in url:
#     try:
#         driver.get(u)
#         source = driver.page_source
#         page_source = driver.page_source
#         name = 'n'
#         pw = 'pw'
        
# ##------------------------ Handling login --------------------------##
#         try: #login required
#             username = driver.find_element_by_id("userEmail")
#             password = driver.find_element_by_id("userPassword")
#             submit = driver.find_element_by_xpath('//*[@id="InlineLoginModule"]/div/div[2]/div/div[1]/div[3]/form/div[3]/div[1]/button')
#             username.send_keys(name)
#             password.send_keys(pw)
#             submit.click()
#         except: #no login required
#             pass
        
#         name = driver.find_element_by_xpath('//*[@id="EmpHeroAndEmpInfo"]/div[3]/div[2]').text

# ##---------------- Handling different page layouts ----------------##
    
#         try: #webpage layout 1: (Google, Microsoft, Apple, etc)
#             driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[1]/div[2]').click()
#             time.sleep(4)
        
#             rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
#             try: #failsafe for companies with do DI rating
#                 rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
#             except:
#                 rating_DI = 'None'
#             rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
#             rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
#             rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
#             rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
#             rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text
    
#             print(name, rating_overall, rating_DI, rating_CV, rating_WL, rating_SM, rating_CB, rating_CO)
#             time.sleep(np.random.choice([x/10 for x in range(7,20)]))
    
#         except: #webpage layout 2: (Cisco, Capital One, etc)
#             driver.get(u) #recalling url
        
#             driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[1]/div[2]').click()
#             time.sleep(3)
#             rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
#             rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
#             rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
#             rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
#             rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
#             rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
#             rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text
        
#             print(name, rating_overall, rating_DI, rating_CV, rating_WL, rating_SM, rating_CB, rating_CO)
#             time.sleep(np.random.choice([x/10 for x in range(7,15)]))
        
        
#     except:
#         print('ERROR: ', u)
#         time.sleep(10)


Microsoft 4.4 4.4 4.3 4.1 4.0 4.1 4.1


In [None]:
# #group 1 code

# # Google
# url = "https://www.glassdoor.com/Overview/Working-at-Microsoft-EI_IE1651.11,20.htm"

# driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[1]/div[2]').click()
# time.sleep(2)
# rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
# rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
# rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
# rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
# rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
# rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
# rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text

# print(rating_overall, rating_DI, rating_CV, rating_WL, rating_SM, rating_CB, rating_CO)
        

In [None]:
# #group 2 code

# # Cisco:
# # url = "https://www.glassdoor.com/Overview/Working-at-Cisco-Systems-EI_IE1425.11,24.htm"
# # Capital One:
# #url = 'https://www.glassdoor.com/Overview/Working-at-Capital-One-EI_IE3736.11,22.htm'

# driver.get(url)

# driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[4]/div[1]/div[2]').click()
# time.sleep(3)
# rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
# rating_overall = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[1]/div/div[3]').text
# rating_DI = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[3]/div/div[3]').text
# rating_CV = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[2]/div/div[3]').text
# rating_WL = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[4]/div/div[3]').text
# rating_SM = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[5]/div/div[3]').text
# rating_CB = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[6]/div/div[3]').text
# rating_CO = driver.find_element_by_xpath('//*[@id="reviewDetailsModal"]/div[2]/div[2]/div/div/div/div[1]/div[1]/div/div[7]/div/div[3]').text

# time.sleep(np.random.choice([x/10 for x in range(7,15)]))
# print(rating_overall, rating_DI, rating_CV, rating_WL, rating_SM, rating_CB, rating_CO)
# #print(rating_overall)


In [None]:
# #Testing mission statement
# driver.find_element_by_xpath('//*[@id="EIOverviewContainer"]/div/div[3]/div[1]/div[2]/span').click()
# time.sleep(2)

URLs for testing:

In [3]:
# url = ["https://www.glassdoor.com/Overview/Working-at-Google-EI_IE9079.11,17.htm",
#     "https://www.glassdoor.com/Overview/Working-at-Microsoft-EI_IE1651.11,20.htm",
#     "https://www.glassdoor.com/Overview/Working-at-Cisco-Systems-EI_IE1425.11,24.htm",
#     "https://www.glassdoor.com/Overview/Working-at-Capital-One-EI_IE3736.11,22.htm",
#     "https://www.glassdoor.com/Overview/Working-at-Bank-of-America-EI_IE8874.11,26.htm",
#     "https://www.glassdoor.com/Overview/Working-at-Apple-EI_IE1138.11,16.htm",
#     "https://www.glassdoor.com/Overview/Working-at-Intel-Corporation-EI_IE1519.11,28.htm",
#     "https://www.glassdoor.com/Overview/Working-at-Orian-Engineers-EI_IE608061.11,26.htm",
#     "https://www.glassdoor.com/Overview/Working-at-US-Air-Force-EI_IE41283.11,23.htm"#no DI rating, description, or mission stateme

#       ]

In [9]:
# # looking for 'Overview' links from each main search page

# # Initiate main links
# elems = driver.find_elements_by_tag_name('a') ## all links located in 'a' tag

# links = []
# for elem in elems:
#     link = elem.get_attribute('href') ## returns every item with 'href' attribute
#     if 'Overview' in link:
#         links.append(link) ## each 'Overview' link added to list

In [3]:
# # Creating 'n' urls to scrape
# num_pages = 2 #change depending on how many pages you want to scrape

# url_root = 'https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=0&page='
# nums = [x+1 for x in range(num_pages)]

# url_mains = list(map(lambda n: url_root + str(n), nums))
# #url_mains #sanity check


['https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=0&page=1',
 'https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=0&page=2']

In [5]:
# for u in url_mains:
#     driver.get(u)
#     # Initiate main links
#     elems = driver.find_elements_by_tag_name('a') ## all links located in 'a' tag

#     links = []
#     for elem in elems:
#         link = elem.get_attribute('href') ## returns every item with 'href' attribute
#         if 'Overview' in link:
#             links.append(link) ## each 'Overview' link added to list
# # links #sanity check