# Open LinkedIn Jobs


In [13]:
from scrapy import *
import pandas as pd
import re
import datetime as dt  

In [14]:
def parse_ttl_loc_li(text):
    """Split a LinkedIn job page title into job title, location and company.

    The expected layout is ``"<title> at <company> in <location> - <suffix>"``,
    for example ``"Data Scientist at Uptake in Chicago - Job | LinkedIn"``.

    Parameters
    ----------
    text : str
        Page title text. Non-string values (e.g. ``None``) are tolerated and
        treated as unparseable.

    Returns
    -------
    tuple
        ``(job_title, location, company)`` -- note that location comes before
        company. If any of the three parts cannot be extracted, all three are
        returned as the string ``'NA'``.
    """
    try:
        at_parts = text.split(' at ')
        job_title = at_parts[0].strip()
        # Only the segment between the first and second ' at ' is inspected.
        company_and_location = at_parts[1].split(' in ')
        # Drop the trailing " - Job | LinkedIn"-style suffix from the location.
        location = company_and_location[1].split(' - ')[0].strip()
        company = company_and_location[0].strip()
    except (AttributeError, IndexError, TypeError):
        # IndexError: an expected delimiter is missing.
        # AttributeError / TypeError: `text` is not a str.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        job_title = 'NA'
        location = 'NA'
        company = 'NA'

    return job_title, location, company



def get_li_df(links, delay=5):
    """Fetch every LinkedIn job link and collect the parsed results in a DataFrame.

    For each link the job ID is taken from the ``/jobs2/view/<id>?`` part of
    the URL, the page text is fetched with ``get_text`` and then split into
    title, location and company with ``parse_ttl_loc_li``.

    NOTE(review): ``get_text`` and ``time`` are not imported explicitly in the
    visible cells; presumably they come from ``from scrapy import *`` -- confirm.

    Parameters
    ----------
    links : sequence of str
        Job URLs containing ``/jobs2/view/<id>?``.
    delay : int or float, optional
        Seconds to sleep after each successfully fetched link (default 5).

    Returns
    -------
    pandas.DataFrame
        Indexed by job ID, with columns ``Text``, ``Title``, ``Location``,
        ``Company``, ``link`` and ``date`` (today's date as ``MM/DD/YYYY``).
        Links that have no recognisable ID, or whose fetch raised, are omitted.
    """
    id_pattern = re.compile(r'/jobs2/view/(.*?)[?]')
    text = {}
    link_dict = {}
    total = len(links)

    for ct, link in enumerate(links, start=1):
        match = id_pattern.search(link)
        if match is None:
            # No job ID to key this link by; skip it rather than abort the run.
            continue
        ID = match.group(1)

        try:
            text[ID] = get_text(link)
            link_dict[ID] = link
        except Exception:
            # Best-effort scrape: a failed fetch drops this link only.
            # `Exception` (not a bare except) so Ctrl-C still stops a long run.
            continue

        if ct % 10 == 0:
            print('Opened link {0} of {1}'.format(ct, total))

        # Throttle requests between successful fetches.
        time.sleep(delay)

    series = pd.Series(text)
    # Build the three parsed columns in one step; this also works when nothing
    # was fetched and avoids the slow per-row `apply(pd.Series)` expansion.
    parsed = [parse_ttl_loc_li(page_text) for page_text in series]
    df = pd.DataFrame(parsed, columns=['Title', 'Location', 'Company'], index=series.index)
    df.insert(0, 'Text', series)
    df['link'] = pd.Series(link_dict)
    df['date'] = dt.datetime.today().strftime("%m/%d/%Y")
    return df

In [15]:
# Crawl LinkedIn search results twice: once with crawl_li's defaults and once
# with q='stats'. Each call is unpacked into a (links, ids) pair.
# NOTE(review): crawl_li is not defined in the visible cells; presumably it
# comes from `from scrapy import *` -- confirm.
ds_links, ds_ids = crawl_li()
stats_links, stats_ids = crawl_li(q='stats')

pulling links from page 1
Got 25 links from page 1
pulling links from page 2
Got 25 links from page 2
pulling links from page 3
Got 24 links from page 3
pulling links from page 4
Got 25 links from page 4
pulling links from page 5
Got 25 links from page 5
pulling links from page 6
Got 13 links from page 6
pulling links from page 7
Got 25 links from page 7
pulling links from page 8
Got 25 links from page 8
pulling links from page 9
Got 17 links from page 9
pulling links from page 10
Got 18 links from page 10
pulling links from page 1
Got 25 links from page 1
pulling links from page 2
Got 23 links from page 2
pulling links from page 3
Got 14 links from page 3
pulling links from page 4
Got 0 links from page 4
pulling links from page 5
Got 0 links from page 5
pulling links from page 6
Got 0 links from page 6
pulling links from page 7
Got 25 links from page 7
pulling links from page 8
Got 20 links from page 8
pulling links from page 9
Got 22 links from page 9
pulling links from page 10
Got 2

In [16]:
# Rebind both link lists to the output of remove_duplicates.
# NOTE(review): remove_duplicates is defined outside the visible cells;
# presumably it drops links that appear in both searches -- confirm.
# This cell overwrites its own inputs (ds_links, stats_links) while ds_ids and
# stats_ids stay unchanged, so re-running it alone feeds already-filtered
# links back in; re-run the crawl cell first.
ds_links, stats_links = remove_duplicates(ds_links,stats_links,ds_ids,stats_ids)

In [17]:
# Fetch and parse every link in ds_links into a DataFrame indexed by job ID.
# Slow cell: get_li_df sleeps after each successfully fetched link.
ds_df = get_li_df(ds_links)
# Display the resulting frame as the cell output.
ds_df

Opened link 10 of 205
Opened link 20 of 205
Opened link 30 of 205
Opened link 40 of 205
Opened link 50 of 205
Opened link 60 of 205
Opened link 70 of 205
Opened link 80 of 205
Opened link 90 of 205
Opened link 100 of 205
Opened link 110 of 205
Opened link 120 of 205
Opened link 130 of 205
Opened link 140 of 205
Opened link 150 of 205
Opened link 160 of 205
Opened link 170 of 205
Opened link 180 of 205
Opened link 190 of 205
Opened link 200 of 205


Unnamed: 0,Text,Title,Location,Company,link
32059243,Think Big Principal Data Scientist at Think Bi...,Think Big Principal Data Scientist,US-Illinois-Chicago,"Think Big, A Teradata Company",https://www.linkedin.com/jobs2/view/32059243?t...
38611321,Data Scientist at Uptake in Chicago - Job | Li...,Data Scientist,Chicago,Uptake,https://www.linkedin.com/jobs2/view/38611321?t...
64641085,Big Data Analytics Business Consultant at KPMG...,Big Data Analytics Business Consultant,"Chicago, IL",KPMG US,https://www.linkedin.com/jobs2/view/64641085?t...
64679092,Sr. Director Big Data Platforms at McDonald's ...,Sr. Director Big Data Platforms,US -Illinois -Oak Brook,McDonald's Corporation,https://www.linkedin.com/jobs2/view/64679092?t...
65726658,"Data Scientist II, Drug Discovery at AbbVie in...","Data Scientist II, Drug Discovery","Chicago, IL, US",AbbVie,https://www.linkedin.com/jobs2/view/65726658?t...
65993027,Principal Clinical Trials Data Scientist at Ab...,Principal Clinical Trials Data Scientist,"Chicago, IL, US",AbbVie,https://www.linkedin.com/jobs2/view/65993027?t...
68331598,Healthcare Data Analyst - Actuarial at Blue Cr...,Healthcare Data Analyst - Actuarial,IL,"Blue Cross and Blue Shield of Illinois, Montan...",https://www.linkedin.com/jobs2/view/68331598?t...
69338902,Big Data Software Engineer at Conversant Inc. ...,Big Data Software Engineer,"L5-Chicago, IL",Conversant Inc.,https://www.linkedin.com/jobs2/view/69338902?t...
69338903,"Big Data Software Engineer, Senior at Conversa...","Big Data Software Engineer, Senior","L5-Chicago, IL",Conversant Inc.,https://www.linkedin.com/jobs2/view/69338903?t...
69340446,Big Data Solutions Engineer at Conversant Inc....,Big Data Solutions Engineer,"L5-Chicago, IL",Conversant Inc.,https://www.linkedin.com/jobs2/view/69340446?t...


In [18]:
# Fetch and parse every link in stats_links into a DataFrame indexed by job ID.
# Slow cell: get_li_df sleeps after each successfully fetched link.
stats_df = get_li_df(stats_links)
# Display the resulting frame as the cell output.
stats_df

Opened link 10 of 132
Opened link 20 of 132
Opened link 30 of 132
Opened link 40 of 132
Opened link 50 of 132
Opened link 60 of 132
Opened link 70 of 132
Opened link 80 of 132
Opened link 90 of 132
Opened link 100 of 132
Opened link 110 of 132
Opened link 120 of 132
Opened link 130 of 132


Unnamed: 0,Text,Title,Location,Company,link
43245566,Statistical Pairs Trading Position at T3 Tradi...,Statistical Pairs Trading Position,"Chicago, IL, US",T3 Trading Group LLC,https://www.linkedin.com/jobs2/view/43245566?t...
55416914,Image Tuning Engineer at Motorola Mobility in ...,Image Tuning Engineer,US-Illinois-Chicago,Motorola Mobility,https://www.linkedin.com/jobs2/view/55416914?t...
55417825,"Director, Image Signal Processing Technology a...","Director, Image Signal Processing Technology",US-Illinois-Chicago,Motorola Mobility,https://www.linkedin.com/jobs2/view/55417825?t...
55417830,Principal Image Algorithm Engineer at Motorola...,Principal Image Algorithm Engineer,US-Illinois-Chicago,Motorola Mobility,https://www.linkedin.com/jobs2/view/55417830?t...
55825335,Research Database Manager-SAS at KP Recruiting...,Research Database Manager-SAS,"Buffalo Grove, IL, US",KP Recruiting Group,https://www.linkedin.com/jobs2/view/55825335?t...
58686294,Senior Statistical Analyst at AbbVie in Chicag...,Senior Statistical Analyst,"Chicago, IL, US",AbbVie,https://www.linkedin.com/jobs2/view/58686294?t...
62955150,Director - EV MTA at Nielsen in North America-...,Director - EV MTA,North America-US-IL-Evanston,Nielsen,https://www.linkedin.com/jobs2/view/62955150?t...
63872914,"Software Engineer at Groupon in Chicago, IL, U...",Software Engineer,"Chicago, IL, US",Groupon,https://www.linkedin.com/jobs2/view/63872914?t...
63897537,"Digital Media Analyst I at IRI in Chicago, IL ...",Digital Media Analyst I,"Chicago, IL",IRI,https://www.linkedin.com/jobs2/view/63897537?t...
64606031,Network Administrator at Ceannate Corp. in Gre...,Network Administrator,Greater Chicago Area,Ceannate Corp.,https://www.linkedin.com/jobs2/view/64606031?t...


In [13]:
# Write both result frames to CSV files in the current working directory,
# named <PREFIX>_LinkedIn_<month>_<day>_<year>.csv (month and day are plain
# integers, so they are not zero-padded).
# NOTE(review): `datetime` is already imported as `dt` in the first cell; this
# second import is redundant there but kept in case other cells use the name.
# NOTE(review): this cell's execution count is lower than the cells above it,
# so it was last run out of order -- verify with Restart & Run All.
import datetime
now = datetime.datetime.now()
ds_df.to_csv('DS_LinkedIn_{0}_{1}_{2}.csv'.format(now.month,now.day,now.year))
stats_df.to_csv('STATS_LinkedIn_{0}_{1}_{2}.csv'.format(now.month,now.day,now.year))
