# Open Glassdoor Jobs
Beware: Glassdoor will return data science jobs for a statistics query and vice versa.

In [25]:
from scrapy import *
import pandas as pd
import datetime as dt  

In [26]:
def remove_duplicates(ds_links, stats_links, ds_ids, stats_ids):
    """Remove links whose ID was returned by both searches.

    An ID counts as shared when it occurs in both ``ds_ids`` and
    ``stats_ids``.  Any link that contains a shared ID as a substring is
    dropped from *both* link lists.

    Returns the pair ``(ds_links, stats_links)`` after filtering.
    """
    shared_ids = list(set(ds_ids).intersection(stats_ids))
    if not shared_ids:
        # Nothing overlaps, so both lists are handed back untouched.
        return ds_links, stats_links

    def _without_shared(urls):
        # Keep a url only when none of the shared IDs occurs inside it.
        return [url for url in urls
                if not any(dup in url for dup in shared_ids)]

    kept_stats = _without_shared(stats_links)
    kept_ds = _without_shared(ds_links)
    return kept_ds, kept_stats



def get_text(link):
    """Return the visible text of the page at ``link`` as one string.

    The page is loaded with ``get_soup``; ``<script>`` and ``<style>``
    elements are removed before the text is extracted.  The text is then
    split into lines, each line is split again on double spaces, every
    piece is stripped, and the non-empty pieces are joined with single
    spaces.
    """
    soup = get_soup(link)

    # Scripts and stylesheets are not readable content: remove them.
    for tag in soup(["script", "style"]):
        tag.extract()

    raw_text = soup.get_text()

    pieces = []
    for raw_line in raw_text.splitlines():
        # A double space separates multiple headlines sharing one line.
        for phrase in raw_line.strip().split("  "):
            cleaned = phrase.strip()
            if cleaned:
                pieces.append(cleaned)

    return ' '.join(pieces)



def parse_ttl_loc_gd(text):
    """Split page text into a ``(job_title, location)`` pair.

    The title is everything before the first ``' Job in '`` marker.  The
    location is the segment that follows the marker, cut at the first
    ``' | '`` separator.  When the marker is absent, both values are the
    string ``'NA'``.
    """
    pieces = text.split(' Job in ')
    if len(pieces) < 2:
        # No ' Job in ' marker in the text: nothing to parse.
        return 'NA', 'NA'

    title = pieces[0]
    place = pieces[1].split(' | ')[0]
    return title, place



def get_gd_df(links, delay=5):
    """Open each job link and collect the page text in a DataFrame.

    Parameters
    ----------
    links : iterable of str
        Job-listing URLs.  The listing ID is whatever follows
        ``ListingId=`` in the URL; links without it are skipped.
    delay : int or float, optional
        Seconds to wait after a page request before the next link is
        processed (default 5).  No wait happens after the final link.

    Returns
    -------
    pandas.DataFrame
        Indexed by listing ID, with the columns ``Text`` (page text),
        ``Title`` and ``Location`` (from ``parse_ttl_loc_gd``), ``link``
        and ``date`` (today's date formatted as MM/DD/YYYY).  Links whose
        page could not be opened are left out.  An empty frame with the
        same columns is returned when nothing could be collected.
    """
    # Imported locally so the function does not depend on these names
    # leaking in through a star import.
    import re
    import time

    listing_id = re.compile(r'ListingId=(.*)')
    links = list(links)
    total = len(links)
    text = {}
    link_dict = {}

    for position, link in enumerate(links, start=1):
        match = listing_id.search(link)
        if match is None:
            # Without an ID the row cannot be keyed; skip, don't crash.
            print('Skipping link without a ListingId: {0}'.format(link))
        else:
            ID = match.group(1)
            try:
                page_text = get_text(link)
            except Exception as err:
                # Best effort: one bad page must not abort the crawl, but
                # KeyboardInterrupt still propagates so a run can be stopped.
                print('Could not open link {0}: {1}'.format(link, err))
            else:
                text[ID] = page_text
                link_dict[ID] = link

        if position % 10 == 0:
            print('Opened link {0} of {1}'.format(position, total))

        # Pause only after a real request, and never after the last link.
        if match is not None and position < total:
            time.sleep(delay)

    text_series = pd.Series(text, dtype=object)
    parsed = [parse_ttl_loc_gd(page_text) for page_text in text_series]
    ids = text_series.index

    # Build the columns explicitly rather than expanding a column of tuples
    # and dropping it by position, which recent pandas releases reject.
    df = pd.DataFrame({
        'Text': text_series,
        'Title': pd.Series([title for title, _ in parsed],
                           index=ids, dtype=object),
        'Location': pd.Series([location for _, location in parsed],
                              index=ids, dtype=object),
        'link': pd.Series(link_dict, dtype=object),
    })
    df['date'] = dt.datetime.today().strftime("%m/%d/%Y")
    return df

### Get the Links to the JDs

In [16]:
# Collect job links and their IDs with crawl_gd's default query
# (presumably the data-science search, given the variable names;
# crawl_gd is defined outside this section).
ds_links, ds_ids = crawl_gd()

pulling links from page 1
Got 29 links from page 1
pulling links from page 2
Got 25 links from page 2
pulling links from page 3
Got 27 links from page 3
pulling links from page 4
Got 24 links from page 4
pulling links from page 5
Got 0 links from page 5


In [17]:
# Same crawl, this time with the query set to 'stats'.
stats_links, stats_ids = crawl_gd(q='stats')

pulling links from page 1
Got 30 links from page 1
pulling links from page 2
Got 30 links from page 2
pulling links from page 3
Got 30 links from page 3
pulling links from page 4
Got 26 links from page 4
pulling links from page 5
Got 25 links from page 5


### Remove Duplicates
Since we are searching Glassdoor for both Data Science jobs and Statistician jobs, some postings will appear in both result sets. This step removes every posting that was returned by both searches from both lists, so that each remaining job belongs to exactly one group.

In [18]:
# Drop every link whose ID was returned by both searches; such links are
# removed from both lists.
ds_links, stats_links = remove_duplicates(ds_links,stats_links,ds_ids,stats_ids)

### Get the DataFrames

In [27]:
# NOTE(review): only the first 10 data-science links are opened here,
# while the stats cell passes the full list. Confirm the slice is
# intentional and not left over from testing.
ds_df = get_gd_df(ds_links[:10])

Opened link 10 of 10


In [20]:
# Open every remaining stats link. get_gd_df pauses between page
# requests, so this cell takes a while to finish.
stats_df = get_gd_df(stats_links)

Opened link 10 of 121
Opened link 20 of 121
Opened link 30 of 121
Opened link 40 of 121
Opened link 50 of 121
Opened link 60 of 121
Opened link 70 of 121
Opened link 80 of 121
Opened link 90 of 121
Opened link 100 of 121
Opened link 110 of 121
Opened link 120 of 121


In [28]:
# Display the data-science DataFrame.
ds_df

Unnamed: 0,Text,Title,Location,link,date
1150672634,"Slalom LLC. Data Scientist Job in Chicago, IL ...",Slalom LLC. Data Scientist,"Chicago, IL",http://www.glassdoor.com/partner/jobListing.ht...,11/04/2015
1569054278,APPLY NOW!HOME | ABOUT CNA | CAREERSGo to the ...,,,http://www.glassdoor.com/partner/jobListing.ht...,11/04/2015
1575309427,Ninjob | Employment Marketing Health Data Scie...,Ninjob | Employment Marketing Health Data Scie...,"Chicago, IL",http://www.glassdoor.com/partner/jobListing.ht...,11/04/2015
1600519722,Michael Resource Group LLC NLP Data Science He...,Michael Resource Group LLC NLP Data Science Head,"Chicago, IL",http://www.glassdoor.com/partner/jobListing.ht...,11/04/2015
1606821276,"KPMG Data Scientist Job in Chicago, IL | Glass...",KPMG Data Scientist,"Chicago, IL",http://www.glassdoor.com/partner/jobListing.ht...,11/04/2015
1618194058,Anthem Blue Cross Blue Shield Data Scientist J...,Anthem Blue Cross Blue Shield Data Scientist,"Chicago, IL",http://www.glassdoor.com/partner/jobListing.ht...,11/04/2015
1623590415,Trunk Club Data Engineer (Data Science) Job in...,Trunk Club Data Engineer (Data Science),"Chicago, IL",http://www.glassdoor.com/partner/jobListing.ht...,11/04/2015
1624662407,Ninjob | Employment Marketing PhD or Master's ...,Ninjob | Employment Marketing PhD or Master's ...,"Chicago, IL",http://www.glassdoor.com/partner/jobListing.ht...,11/04/2015
1624694832,Spectraforce Senior Data Scientist/Informatics...,Spectraforce Senior Data Scientist/Informatics...,"Chicago, IL",http://www.glassdoor.com/partner/jobListing.ht...,11/04/2015
1625792200,"iCrossing Manager, Insights and Data Science -...","iCrossing Manager, Insights and Data Science -...","Chicago, IL",http://www.glassdoor.com/partner/jobListing.ht...,11/04/2015


In [22]:
# Display the stats DataFrame.
stats_df

Unnamed: 0,Text,Title,Location,link
1150672610,"Slalom LLC. Data Analyst Job in Chicago, IL | ...",Slalom LLC. Data Analyst,"Chicago, IL",http://www.glassdoor.com/partner/jobListing.ht...
1152109459,Clarity Solution Group Data Analyst Job in Chi...,Clarity Solution Group Data Analyst,"Chicago, IL",http://www.glassdoor.com/partner/jobListing.ht...
1171897154,Old Mission Capital Quantitative Trading Inter...,Old Mission Capital Quantitative Trading Inter...,"Chicago, IL",http://www.glassdoor.com/partner/jobListing.ht...
1232249451,Theorem Clinical Research Senior Prinicipal St...,Theorem Clinical Research Senior Prinicipal St...,"Chicago, IL",http://www.glassdoor.com/partner/jobListing.ht...
1288388742,Uptake Predictive Analytics Architect Job in C...,Uptake Predictive Analytics Architect,"Chicago, IL",http://www.glassdoor.com/partner/jobListing.ht...
1313141349,Jump Trading Quantitative Researcher Job in Ch...,Jump Trading Quantitative Researcher,"Chicago, IL",http://www.glassdoor.com/partner/jobListing.ht...
1374077357,Business Intelligence and Analytics Associate ...,,,http://www.glassdoor.com/partner/jobListing.ht...
1447299255,Job Details Skip Navigation Sign In to access ...,,,http://www.glassdoor.com/partner/jobListing.ht...
1457548911,APPLY NOW!HOME | ABOUT CNA | CAREERSGo to the ...,,,http://www.glassdoor.com/partner/jobListing.ht...
1470552355,"Peak6 Investments, Inc. Director, Analytics Jo...","Peak6 Investments, Inc. Director, Analytics","Chicago, IL",http://www.glassdoor.com/partner/jobListing.ht...


### Write to CSV

In [147]:
import datetime

# Both files share a month_day_year suffix taken from the current date
# (numbers are not zero-padded).
today = datetime.datetime.now()
suffix = '{0}_{1}_{2}.csv'.format(today.month, today.day, today.year)

# Write one CSV per search into the working directory.
ds_df.to_csv('DS_Glassdoor_' + suffix)
stats_df.to_csv('STATS_Glassdoor_' + suffix)