# Open Glassdoor Jobs
Beware: Glassdoor will return Data Science jobs for a Statistics query, and vice versa.

In [127]:
from scrapy import *
import pandas as pd

In [128]:
def remove_duplicates(ds_links, stats_links, ds_ids, stats_ids):
    """Drop the postings that were returned by both searches.

    An ID counts as shared when it occurs in both ``ds_ids`` and
    ``stats_ids``. Every link that contains a shared ID as a substring is
    removed from both link lists.

    Returns:
        The filtered ``(ds_links, stats_links)`` pair.
    """
    shared_ids = set(ds_ids) & set(stats_ids)
    if not shared_ids:
        # Nothing overlaps: hand the inputs back untouched.
        return ds_links, stats_links

    def is_unshared(link):
        # Substring test, so the ID may sit anywhere inside the link.
        return not any(shared in link for shared in shared_ids)

    kept_ds = [link for link in ds_links if is_unshared(link)]
    kept_stats = [link for link in stats_links if is_unshared(link)]
    return kept_ds, kept_stats



def get_text(link):
    """Return the readable text of the page at ``link`` as a single string.

    The page is obtained through ``get_soup`` (defined outside this cell).
    ``script`` and ``style`` elements are removed, then the remaining text is
    flattened: every line is stripped, split on double spaces, and the
    non-empty pieces are joined with single spaces.
    """
    page = get_soup(link)

    # Scripts and stylesheets carry no readable text; pull them out first.
    for tag in page(["script", "style"]):
        tag.extract()

    raw_text = page.get_text()

    pieces = []
    for line in raw_text.splitlines():
        # A double space separates phrases that share one line.
        for phrase in line.strip().split("  "):
            phrase = phrase.strip()
            if phrase:
                pieces.append(phrase)

    return ' '.join(pieces)



def parse_ttl_loc_gd(text):
    """Split flattened page text into a job title and a location.

    The text is expected to look like ``"<title> Job in <location> | ..."``.

    Args:
        text: Page text as a single string.

    Returns:
        A ``(job_title, location)`` tuple. The title is everything before
        the first ``' Job in '`` marker; the location is what follows that
        marker, cut at the first ``' | '``. Both are ``'NA'`` when the
        marker does not occur in ``text``.
    """
    parts = text.split(' Job in ')
    if len(parts) < 2:
        # Marker missing: this is not a regular job-posting page.
        return 'NA', 'NA'

    job_title = parts[0]
    # Only the segment between the first and second marker is considered.
    location = parts[1].split(' | ')[0]
    return job_title, location



def get_gd_df(links, delay=5):
    """Download each job posting and tabulate its text, title and location.

    Args:
        links: Sized collection of posting URLs. The listing ID is whatever
            follows ``ListingId=`` in the URL; a link without that marker is
            reported and skipped instead of aborting the whole crawl.
        delay: Seconds to wait between consecutive requests.

    Returns:
        A DataFrame indexed by listing ID with the columns ``Text``,
        ``Title`` and ``Location``. It is empty (but still has those
        columns) when no link could be processed.
    """
    # Imported locally so the function does not rely on a star import
    # elsewhere to put these modules in scope.
    import re
    import time

    listing_id_re = re.compile(r'ListingId=(.*)')
    total = len(links)
    texts = {}

    for position, link in enumerate(links, start=1):
        match = listing_id_re.search(link)
        if match is None:
            print('Skipping link {0} of {1}: no ListingId in {2}'.format(
                position, total, link))
            continue

        texts[match.group(1)] = get_text(link)

        if position % 10 == 0:
            print('Opened link {0} of {1}'.format(position, total))

        # Throttle the requests; no need to wait once the last page is in.
        if position < total:
            time.sleep(delay)

    # Build the frame in one step instead of concat/apply/drop/rename; this
    # also sidesteps the positional ``axis`` argument that newer pandas
    # versions reject in ``DataFrame.drop``.
    parsed = [parse_ttl_loc_gd(page_text) for page_text in texts.values()]
    return pd.DataFrame(
        {
            'Text': list(texts.values()),
            'Title': [title for title, _ in parsed],
            'Location': [location for _, location in parsed],
        },
        index=list(texts),
        columns=['Text', 'Title', 'Location'],
    )

### Get the Links to the JDs

In [129]:
# Crawl with crawl_gd's default query (presumably the Data Science search;
# confirm against crawl_gd's definition). Yields the links and their IDs.
ds_links, ds_ids = crawl_gd()

pulling links from page 1
Got 30 links from page 1
pulling links from page 2
Got 29 links from page 2
pulling links from page 3
Got 19 links from page 3
pulling links from page 4
Got 14 links from page 4
pulling links from page 5
Got 0 links from page 5


In [130]:
# Second crawl, this time with the query set to 'stats'.
stats_links, stats_ids = crawl_gd(q='stats')

pulling links from page 1
Got 30 links from page 1
pulling links from page 2
Got 24 links from page 2
pulling links from page 3
Got 28 links from page 3
pulling links from page 4
Got 21 links from page 4
pulling links from page 5
Got 27 links from page 5


### Remove Duplicates
Since we are searching Glassdoor for both Data Science jobs and Statistician jobs, some postings will show up in both result sets. This step removes every posting that was returned by both searches, so each remaining job belongs to only one of the two groups.

In [131]:
# Drop from both lists every link whose ID was returned by both searches.
ds_links, stats_links = remove_duplicates(ds_links,stats_links,ds_ids,stats_ids)

### Get the DataFrames

In [132]:
# Slow step: get_gd_df fetches every link and waits several seconds per request.
ds_df = get_gd_df(ds_links)

Opened link 10 of 80
Opened link 20 of 80
Opened link 30 of 80
Opened link 40 of 80
Opened link 50 of 80
Opened link 60 of 80
Opened link 70 of 80
Opened link 80 of 80


In [133]:
# Same fetch-and-tabulate pass for the links from the 'stats' query.
stats_df = get_gd_df(stats_links)

Opened link 10 of 118
Opened link 20 of 118
Opened link 30 of 118
Opened link 40 of 118
Opened link 50 of 118
Opened link 60 of 118
Opened link 70 of 118
Opened link 80 of 118
Opened link 90 of 118
Opened link 100 of 118
Opened link 110 of 118


In [136]:
# Display the table built from ds_links.
ds_df

Unnamed: 0,Text,Title,Location
1229635527,"Concurrency, Inc. - Brookfield, WI Project Man...","Concurrency, Inc. - Brookfield, WI Project Man...","Chicago, IL"
1308356035,Tech contingency Data Scientist| Statistician|...,Tech contingency Data Scientist| Statistician|...,"Chicago, IL"
1349690519,Société Générale Information Technology - Data...,Société Générale Information Technology - Data...,"Chicago, IL"
1421650344,Sears Data Scientist (Hoffman Estates Job in M...,Sears Data Scientist (Hoffman Estates,"Melrose Park, IL"
1421758966,Home Services - Parts Supply Chain Data Scient...,Home Services - Parts Supply Chain Data Scient...,"Melrose Park, IL"
1434129448,IBM Watson Analytics Data Scientist Job in Chi...,IBM Watson Analytics Data Scientist,"Chicago, IL"
1462351566,GE Transportation Lead Data Scientist Job in C...,GE Transportation Lead Data Scientist,"Chicago, IL"
1477728236,Uptake Data Scientist Programmer Intern *FALL ...,Uptake Data Scientist Programmer Intern *FALL ...,"Chicago, IL"
1490686856,Raise Marketplace Inc. Data Scientist Job in C...,Raise Marketplace Inc. Data Scientist,"Chicago, IL"
1493048803,IBM Watson Health Data Scientist Job in Chicag...,IBM Watson Health Data Scientist,"Chicago, IL"


In [135]:
# Display the table built from stats_links.
stats_df

Unnamed: 0,Text,Title,Location
1150672576,Slalom LLC. Digital Analytics Consultant Job i...,Slalom LLC. Digital Analytics Consultant,"Chicago, IL"
1150672610,"Slalom LLC. Data Analyst Job in Chicago, IL | ...",Slalom LLC. Data Analyst,"Chicago, IL"
1150672917,Slalom LLC. Business Intelligence and Analytic...,Slalom LLC. Business Intelligence and Analytic...,"Chicago, IL"
1152109459,Clarity Solution Group Data Analyst Job in Chi...,Clarity Solution Group Data Analyst,"Chicago, IL"
1437932634,Innovative Systems Group Statistical Analyst J...,Innovative Systems Group Statistical Analyst,"Chicago, IL"
1447299255,Job Details Skip Navigation Sign In to access ...,,
1486813529,Job Opportunity - Alliance DataGo to the main ...,,
1486813775,Job Opportunity - Alliance DataGo to the main ...,,
1489379587,Belvedere Trading Quantitative Analyst Job in ...,Belvedere Trading Quantitative Analyst,"Chicago, IL"
1493887570,Razorfish Careers - Join Our Agency and Redefi...,,


### Write to CSV

In [147]:
import datetime

now = datetime.datetime.now()
# Both files carry the same unpadded month_day_year stamp from the local clock.
date_parts = (now.month, now.day, now.year)
ds_df.to_csv('DS_Glassdoor_{0}_{1}_{2}.csv'.format(*date_parts))
stats_df.to_csv('STATS_Glassdoor_{0}_{1}_{2}.csv'.format(*date_parts))