# Glassdoor.com:  'Data Scientist' Job Search

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Get all 'data scientist' results from past 14 days for full-time positions located in NYC with salary info.

In [1]:
import pandas as pd
import numpy as np
import time
import os
from datetime import datetime as dt
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
import re
import json
from random import randint
from scipy.stats import gaussian_kde
import matplotlib.pyplot as plt
import html

%matplotlib inline

In [2]:
def get_links(URL):
    '''Collect individual links for relevant search results.'''
    chromedriver = ''.join([os.getcwd(), '/chromedriver'])
    os.environ["webdriver.chrome.driver"] = chromedriver
    driver = webdriver.Chrome(executable_path = chromedriver)
    try:
        driver.set_page_load_timeout(60)
        driver.get(URL)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        listy = []
        for x in soup.find_all('a', class_ = 'jobLink', href = re.compile('/partner/jobListing')):
            full_x = ''.join(['http://www.glassdoor.com', x['href']])
            listy.append(full_x)        
        driver.close()
        driver.quit()
    except TimeoutException: 
        ## try/exc is workaround, see https://bit.ly/2umrGrq
        try:
            driver.close()
        except:
            pass
        driver.quit() 
        print(URL)
        listy = None
    return listy

In [3]:
start = dt.now()
print(start)

base_url = [
    'https://www.glassdoor.com/Job/new-york-data-scientist-jobs-SRCH_IL.0,8_IC1132348_KO9,23_IP', 
    '.htm?radius=0&fromAge=14&minSalary=26000&jobType=fulltime'
]

count = 1
all_links = []
switch = False
while switch == False:
    time.sleep(1 + 1/randint(1, 3))
    URL = ''.join([base_url[0], str(count), base_url[1]])
    new_list = get_links(URL)
    if new_list:
        all_links.extend(new_list)
        print('.', end = '')
        count += 1
    else:
        switch = True 
        
end = dt.now()
print('\nelapsed: {}'.format(end - start))

df = pd.DataFrame(all_links, columns = ['url'])
df.drop_duplicates(inplace = True)
df.to_csv('assets/links.csv', index = False)
df.head()

2018-04-22 15:23:49.059477
.....................
elapsed: 0:03:26.567977


Unnamed: 0,url
0,http://www.glassdoor.com/partner/jobListing.ht...
2,http://www.glassdoor.com/partner/jobListing.ht...
4,http://www.glassdoor.com/partner/jobListing.ht...
6,http://www.glassdoor.com/partner/jobListing.ht...
8,http://www.glassdoor.com/partner/jobListing.ht...


In [4]:
df = pd.read_csv('assets/links.csv')
print(len(df))

1301


In [5]:
def unescape(s):
    '''Unescape some HTML to get rid of tags, etc.'''
    s = s.replace("\&amp;#034;", "\'")
    s = s.replace("&amp;#034;", "\'")
    s = s.replace("&amp;", "&")
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace("\\", " ")
    return s

In [6]:
def listing_parts(soup):
    '''Get listing info from scrape output.'''
    
    x = soup.find('script', {'type' : 'application/ld+json'}).text
    xx = x.encode().replace(b'\n', b'').replace(b'\t', b'').decode('utf-8')
    xxx = BeautifulSoup(unescape(xx), 'lxml').get_text(' ')
    info_dict = json.loads(xxx)
    
    ## title
    try:
        title = info_dict["title"]
    except:
        title = np.nan
        
    ## datePosted
    try:
        dateposted = info_dict["datePosted"]
    except:
        dateposted = np.nan
        
    ## validthrough
    try:
        validthrough = info_dict['validThrough']
    except:
        validthrough = np.nan
        
    ## industry
    try:
        industry = info_dict['industry']
    except:
        industry = np.nan
        
    ## organization name
    try:
        orgname = info_dict['hiringOrganization']['name']
    except:
        orgname = np.nan
        
    ## category
    try:
        cat = info_dict['occupationalCategory']
    except:
        cat = np.nan
    
    ## description
    try:
        descr = info_dict['description']
    except:
        descr = np.nan
    
    ## stars
    stars = np.nan
    for x in soup.findAll('span', class_ = 'compactStars margRtSm'):
        stars = float((x.text).lstrip().rstrip())
    
    ## salary
    jobid = np.nan
    minsal = np.nan
    maxsal = np.nan
    medsal = np.nan
    empid = np.nan
    jobtitleid = np.nan
    jobreqid = np.nan 
    
    for x in soup.findAll('i', {'class' : 'info infoSalEst _ok'}):
        try:
            jobid = x['data-job-id']
        except:
            pass
        try:
            minsal = x['data-displayed-min-salary']
        except:
            pass
        try:
            maxsal = x['data-displayed-max-salary']
        except:
            pass
        try:
            medsal = x['data-displayed-med-salary']
        except:
            pass
        try:
            empid = x['data-employer-id']
        except:
            pass
        try:
            jobtitleid = x['data-jobtitle-id']
        except:
            pass
        try:
            jobreqid = x['data-job-req-id']
        except:
            pass  
        
    listy = [
        title,
        dateposted,
        validthrough,
        industry,
        orgname,
        cat,
        descr,
        stars,
        jobid,
        minsal,
        maxsal,
        medsal,
        empid,
        jobtitleid,
        jobreqid
    ]
    lil_df = pd.DataFrame(listy)
    return lil_df

In [7]:
def get_listing(URL):
    '''Collect individual job posting info.'''
    chromedriver = ''.join([os.getcwd(), '/chromedriver'])
    os.environ["webdriver.chrome.driver"] = chromedriver
    driver = webdriver.Chrome(executable_path = chromedriver)
    try:
        driver.set_page_load_timeout(60)
        driver.get(URL)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        result = listing_parts(soup)       
        driver.close()
        driver.quit()
    except Exception as e: 
        print('\nposs expired listing: {}'.format(str(e)))
        ## try/exc is workaround, see https://bit.ly/2umrGrq
        time.sleep(3)
        try:
            driver.close()
        except Exception as e:
            print('get_listing 2: {}'.format(str(e)))
        driver.quit() 
        result = None
    return result

In [8]:
start = dt.now()
print(start)

big_df = pd.DataFrame()
urls = []
skipped = []
count = 0
print(len(df))
for url in df['url']:
    time.sleep(1/randint(1,3) + 5/randint(1, 5))
    try:
        lil_df = get_listing(url).T
        big_df = pd.concat([big_df, lil_df], axis = 0)
        urls.append(url)
    except Exception as e:
        skipped.append(url)
        print('url loop: {}'.format(str(e)))
    if count%100 == 0:
        print(count, end = '')
    elif count%10 == 0:
        print('.', end = '')
    count += 1
        
big_df.columns = [
    'title',
    'dateposted',
    'validthrough',
    'industry',
    'orgname',
    'cat',
    'descr',
    'stars',
    'jobid',
    'minsal',
    'maxsal',
    'medsal',
    'empid',
    'jobtitleid',
    'jobreqid'
]

big_df['url'] = urls
print(len(big_df))
big_df.head(2)
big_df.drop_duplicates(subset = ['jobid', 'validthrough', 'title'], inplace = True)
print('\n', len(big_df))
big_df.to_csv('assets/glassdoor_22april2018.csv', index = False)

end = dt.now()
print('\nelapsed: {}'.format(end - start))

2018-04-22 15:27:16.010025
1301
0.........100.........200.........300.........400.........500.........600.........700.........800.........900.........1000.........
poss expired listing: 'NoneType' object has no attribute 'text'
url loop: 'NoneType' object has no attribute 'T'
1100...
poss expired listing: 'NoneType' object has no attribute 'text'
url loop: 'NoneType' object has no attribute 'T'
....
poss expired listing: 'NoneType' object has no attribute 'text'
url loop: 'NoneType' object has no attribute 'T'
..1200
poss expired listing: 'NoneType' object has no attribute 'text'
url loop: 'NoneType' object has no attribute 'T'
.....
poss expired listing: 'NoneType' object has no attribute 'text'
url loop: 'NoneType' object has no attribute 'T'

poss expired listing: 'NoneType' object has no attribute 'text'
url loop: 'NoneType' object has no attribute 'T'
...
poss expired listing: 'NoneType' object has no attribute 'text'
url loop: 'NoneType' object has no attribute 'T'
.13001294

 60

In [9]:
skipped

['http://www.glassdoor.com/partner/jobListing.htm?pos=1802&ao=186496&s=58&guid=00000162eed27352bbfa14c228bc2031&src=GD_JOB_AD&t=SR&extid=1&exst=OL&ist=&ast=OL&vt=w&slr=true&rtp=0&cs=1_fe90273d&cb=1524425192937&jobListingId=2739078444',
 'http://www.glassdoor.com/partner/jobListing.htm?pos=1802&ao=186496&s=58&guid=00000162eed27352bbfa14c228bc2031&src=GD_JOB_AD&vt=w&rtp=0&cs=1_fe90273d&cb=1524425192985&jobListingId=2739078444',
 'http://www.glassdoor.com/partner/jobListing.htm?pos=1914&ao=175631&s=58&guid=00000162eed2985d9e63456f8a1ffb17&src=GD_JOB_AD&t=SR&extid=1&exst=OL&ist=&ast=OL&vt=w&slr=true&rtp=0&cs=1_8805b47a&cb=1524425202470&jobListingId=2728429085',
 'http://www.glassdoor.com/partner/jobListing.htm?pos=1914&ao=175631&s=58&guid=00000162eed2985d9e63456f8a1ffb17&src=GD_JOB_AD&vt=w&rtp=0&cs=1_8805b47a&cb=1524425202502&jobListingId=2728429085',
 'http://www.glassdoor.com/partner/jobListing.htm?pos=2026&ao=186496&s=58&guid=00000162eed2bfe993875553ca0f76c5&src=GD_JOB_AD&t=SR&extid=1&e