# Steps: 

- Set up URLs, filters, and tracking
- Run the Google Chrome scrape

- Extract the data and create a data frame


In [10]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from datetime import datetime
import urllib.request

import pandas as pd



class AppURLopener(urllib.request.FancyURLopener):
    """
    URL opener whose `version` attribute (the user agent string sent by
    urllib openers) is set to a browser-like value.

    NOTE(review): nothing in the visible code instantiates this class, and
    `FancyURLopener` is deprecated in the standard library -- confirm
    whether it is still needed.
    """
    version = "Mozilla/5.0"



def is_good_response(resp):
    """
    Returns 'True' if the response seems to be HTML, 'False' otherwise.

    A response counts as HTML when its status code is 200 and its
    'Content-Type' header contains 'html' (case-insensitive). A response
    that has no 'Content-Type' header at all is reported as not HTML
    instead of raising a KeyError.
    """
    content_type = resp.headers.get('Content-Type')
    if content_type is None:
        return False
    return resp.status_code == 200 and 'html' in content_type.lower()


def log_error(e):
    """
    Reports an error by printing it to standard output.

    `e` may be a message string or an exception; it is converted with
    `str` before printing. Swap the body for real logging if needed.
    """
    message = str(e)
    print(message)


def get_names(url, timeout=10):
    """
    Downloads the page at `url` and returns it as a BeautifulSoup object.

    `timeout` is the number of seconds to wait for the server before
    giving up; without it a stalled connection would block forever.

    Returns None (after logging the reason) when the request fails, when
    the response is not a 200 HTML response, or when it has no content.
    """
    try:
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if not is_good_response(resp):
                log_error('Unexpected response from {0} : status {1}'.format(
                    url, resp.status_code))
                return None

            content = resp.content
            if content is None:
                log_error('Error retrieving contents at {}'.format(url))
                return None

            return BeautifulSoup(content, 'lxml')

    except RequestException as e:
        # Covers connection errors and timeouts alike.
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


In [11]:
def get_url(city, role):
    """
    Gets the Google Jobs search url for the city (ie 'San Francisco,
    California') and job role (ie 'Data Scientist').

    The words of `role` followed by the words of `city` are joined with
    '+' to form the query. Each word is percent-encoded (commas are kept
    as they are) so characters such as '#' or '&' cannot cut the query
    short, and an empty `role` no longer leaves a leading '+'.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote

    words = role.split() + city.split()
    search_query = '+'.join(quote(word, safe=',') for word in words)
    return (
        f'https://www.google.com/search?q={search_query}'
        '&ibp=htl;jobs#fpstate=tldetail'
        '&htidocid=bNuueDIcYnkfbPOvAAAAAA%3D%3D&htivrt=jobs'
    )



In [3]:
get_url('New York, New York', 'Data Engineer')

'https://www.google.com/search?q=Data+Engineer+New+York,+New+York&ibp=htl;jobs#fpstate=tldetail&htidocid=bNuueDIcYnkfbPOvAAAAAA%3D%3D&htivrt=jobs'

# For URL extraction

In [4]:
def get_divs(soup):
    """
    Extracts the job-card fields of a search-results page into a DataFrame.

    `soup` is the parsed page (a BeautifulSoup object) or None. None, which
    is what a failed download produces, gives an empty DataFrame with the
    usual columns instead of an AttributeError.

    Returns a DataFrame with the columns company, location, title,
    post_date, description and url, in that order (url is the sixth
    column).

    Raises ValueError when the page does not yield the same number of
    values for every column, since the rows could not be lined up.
    """
    columns = ['company', 'location', 'title', 'post_date', 'description', 'url']
    if soup is None:
        return pd.DataFrame(columns=columns)

    def class_texts(tag, css_class):
        # Text of every <tag> element whose class attribute is css_class.
        return [item.text for item in soup.find_all(tag, attrs={'class': css_class})]

    data = {
        'company': class_texts('h4', 'result-card__subtitle job-result-card__subtitle'),
        'location': class_texts('span', 'job-result-card__location'),
        'title': class_texts('span', 'screen-reader-text'),
        'post_date': [item.get('datetime') for item in soup.find_all('time')],
        'description': class_texts('p', 'job-result-card__snippet'),
        # Keep links with '/view' at offset 29, which is where it sits in
        # 'https://www.linkedin.com/jobs/view/...'.
        'url': [
            item.get('href')
            for item in soup.find_all('a', href=True)
            if item.get('href')[29:34] == '/view'
        ],
    }

    # Name the offending columns instead of leaving it to a generic
    # pandas error.
    lengths = {name: len(values) for name, values in data.items()}
    if len(set(lengths.values())) > 1:
        raise ValueError(
            'Scraped fields have mismatched lengths: {}'.format(lengths))

    # Make a dataframe object out of a dictionary of arrays
    return pd.DataFrame(data, columns=columns)
    
    



In [5]:
def add_full_desc(df):
    """
    Extracts full linkedin post info and adds 5 columns to the existing
    dataframe per each city/role.

    For every row the posting url is downloaded and parsed, and the
    columns full_desc, seniority_level, employment_type, job_function and
    industries are filled in. The url is read from the 'url' column when
    present, otherwise from the sixth column. A value is 'NA' when the
    posting could not be downloaded or does not list that criterion.

    The dataframe is modified in place and also returned.
    """
    # Maps each key produced by get_linkedin_info to the column it fills.
    criteria_columns = {
        'Seniority level': 'seniority_level',
        'Employment type': 'employment_type',
        'Job function': 'job_function',
        'Industries': 'industries',
    }
    full_job_desc = []
    criteria = {column: [] for column in criteria_columns.values()}

    # Look the url up by name; integer keys on a row are deprecated
    # positional access.
    urls = df['url'] if 'url' in df.columns else df.iloc[:, 5]

    for url in urls:
        soup = get_names(url)
        # A failed download gives None; treat it as a posting without info.
        info = get_linkedin_info(soup) if soup is not None else {}

        full_job_desc.append(info.get('full_desc', 'NA'))
        for key, column in criteria_columns.items():
            value = info.get(key)
            criteria[column].append('NA' if value is None else value)

    df['full_desc'] = full_job_desc
    for column, values in criteria.items():
        df[column] = values

    return df

In [6]:
def get_linkedin_info(soup):
    """
    Takes a soup object of a LinkedIn posting and turns it into a
    dictionary with the full description and the job criteria.

    The 'full_desc' entry is the text of every <p> element, each followed
    by ' \\n'. Every <h3 class="job-criteria__subheader"> adds an entry
    keyed by the header text (ie 'Seniority level', 'Industries') whose
    value is the text of all elements following the header, joined with
    ', '. A header with nothing after it maps to None instead of raising
    an AttributeError.
    """
    additional_info = {
        'full_desc': ''.join(item.text + ' \n' for item in soup.find_all('p')),
    }

    # Extracting Additional Categories from LinkedIn Post
    for header in soup.find_all('h3', attrs={'class': 'job-criteria__subheader'}):
        values = [sibling.text for sibling in header.next_siblings]
        additional_info[header.text] = ', '.join(values) if values else None

    return additional_info
            


In [13]:
# For testing google jobs

# Locations and job titles that are combined into Google Jobs searches.
cities = [
    'San Francisco, California',
    'Honolulu, Hawaii',
    'New York, New York',
]

roles = [
    'Data Scientist',
    'Data Analyst',
    # 'Business Analyst',
    # 'Business Intelligence',
    # 'Data Engineer',
    # 'Machine Learning Engineer',
    # 'Machine Learning Scientist',
    # 'Artificial Intelligence Researcher',
    # 'Statistical Modeler',
]

df_main = pd.DataFrame()

# Visit every (city, role) pair, cities outermost, and print the search
# url followed by the page that came back for it.
for city, role in [(c, r) for c in cities for r in roles]:
    url = get_url(city, role)
    print(url)
    soup = get_names(url)
    print(soup)
        
        

https://www.google.com/search?q=Data+Scientist+San+Francisco,+California&ibp=htl;jobs#fpstate=tldetail&htidocid=bNuueDIcYnkfbPOvAAAAAA%3D%3D&htivrt=jobs
<!DOCTYPE html>
<html><head><meta charset="utf-8"/><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"/><title>Jobs</title><script nonce="8rBOOD7/kwN4N/uGAG+TUg==">(function(){window.google={kEI:'vgo-XdS2CIaU0gKa3IawBQ',authuser:0};google.sn='web';google.kHL='en';google.jsfs='';})();(function(){google.lc=[];google.li=0;google.getEI=function(a){for(var b;a&&(!a.getAttribute||!(b=a.getAttribute("eid")));)a=a.parentNode;return b||google.kEI};google.getLEI=function(a){for(var b=null;a&&(!a.getAttribute||!(b=a.getAttribute("leid")));)a=a.parentNode;return b};google.https=function(){return"https:"==window.location.protocol};google.ml=function(){return null};google.time=function(){return(new Date).getTime()};google.log=function(a,b,e,c,g){if(a=google.logUrl(a,b,e,c,g)){b=new Image;var d=google.lc,f=go

<!DOCTYPE html>
<html><head><meta charset="utf-8"/><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"/><title>Jobs</title><script nonce="VsSYcijAwKi0vKelgW57tQ==">(function(){window.google={kEI:'vgo-XcDOE4y80wL7saCICQ',authuser:0};google.sn='web';google.kHL='en';google.jsfs='';})();(function(){google.lc=[];google.li=0;google.getEI=function(a){for(var b;a&&(!a.getAttribute||!(b=a.getAttribute("eid")));)a=a.parentNode;return b||google.kEI};google.getLEI=function(a){for(var b=null;a&&(!a.getAttribute||!(b=a.getAttribute("leid")));)a=a.parentNode;return b};google.https=function(){return"https:"==window.location.protocol};google.ml=function(){return null};google.time=function(){return(new Date).getTime()};google.log=function(a,b,e,c,g){if(a=google.logUrl(a,b,e,c,g)){b=new Image;var d=google.lc,f=google.li;d[f]=b;b.onerror=b.onload=b.onabort=function(){delete d[f]};google.vel&&google.vel.lu&&google.vel.lu(a);b.src=a;google.li=f+1}};google.logUrl=fun

<!DOCTYPE html>
<html><head><meta charset="utf-8"/><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"/><title>Jobs</title><script nonce="3SKLjaOFsgCGxVSMRmM2cw==">(function(){window.google={kEI:'vgo-XeLzI-fh0gKyzriADw',authuser:0};google.sn='web';google.kHL='en';google.jsfs='';})();(function(){google.lc=[];google.li=0;google.getEI=function(a){for(var b;a&&(!a.getAttribute||!(b=a.getAttribute("eid")));)a=a.parentNode;return b||google.kEI};google.getLEI=function(a){for(var b=null;a&&(!a.getAttribute||!(b=a.getAttribute("leid")));)a=a.parentNode;return b};google.https=function(){return"https:"==window.location.protocol};google.ml=function(){return null};google.time=function(){return(new Date).getTime()};google.log=function(a,b,e,c,g){if(a=google.logUrl(a,b,e,c,g)){b=new Image;var d=google.lc,f=google.li;d[f]=b;b.onerror=b.onload=b.onabort=function(){delete d[f]};google.vel&&google.vel.lu&&google.vel.lu(a);b.src=a;google.li=f+1}};google.logUrl=fun

In [16]:
soup

<!DOCTYPE html>
<html><head><meta charset="utf-8"/><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"/><title>Jobs</title><script nonce="3SKLjaOFsgCGxVSMRmM2cw==">(function(){window.google={kEI:'vgo-XeLzI-fh0gKyzriADw',authuser:0};google.sn='web';google.kHL='en';google.jsfs='';})();(function(){google.lc=[];google.li=0;google.getEI=function(a){for(var b;a&&(!a.getAttribute||!(b=a.getAttribute("eid")));)a=a.parentNode;return b||google.kEI};google.getLEI=function(a){for(var b=null;a&&(!a.getAttribute||!(b=a.getAttribute("leid")));)a=a.parentNode;return b};google.https=function(){return"https:"==window.location.protocol};google.ml=function(){return null};google.time=function(){return(new Date).getTime()};google.log=function(a,b,e,c,g){if(a=google.logUrl(a,b,e,c,g)){b=new Image;var d=google.lc,f=google.li;d[f]=b;b.onerror=b.onload=b.onabort=function(){delete d[f]};google.vel&&google.vel.lu&&google.vel.lu(a);b.src=a;google.li=f+1}};google.logUrl=fun

In [17]:
url

'https://www.google.com/search?q=Data+Analyst+New+York,+New+York&ibp=htl;jobs#fpstate=tldetail&htidocid=bNuueDIcYnkfbPOvAAAAAA%3D%3D&htivrt=jobs'

In [193]:
if __name__ == '__main__':
    # Scrape every city/role combination and stack the results in df_main.

    cities = ['San Francisco, California', 'Honolulu, Hawaii', 'New York, New York']

    roles = ['Data Scientist',
    'Data Analyst']
    #'Business Analyst',
    #'Business Intelligence',
    #'Data Engineer',
    #'Machine Learning Engineer',
    #'Machine Learning Scientist',
    #'Artificial Intelligence Researcher',
    #'Statistical Modeler']

    df_main = pd.DataFrame()
    for city in cities:
        for role in roles:
            url = get_url(city, role)
            print(url)
            soup = get_names(url)
            if soup is None:
                # get_names returns None when the download fails; skip this
                # search instead of crashing on the missing page.
                log_error('No page retrieved for {0} / {1}; skipping'.format(role, city))
                continue

            df = get_divs(soup)
            df = add_full_desc(df)

            df_main = pd.concat([df_main, df])
            print(df_main.shape)
        

https://www.linkedin.com/jobs/search?keywords=Data%20Scientist&location=San%20Francisco%2C%20California%2C%20United%20States&trk=homepage-basic_jobs-search-bar_search-submit&redirect=false&position=1&pageNum=0
(25, 11)
https://www.linkedin.com/jobs/search?keywords=Data%20Analyst&location=San%20Francisco%2C%20California%2C%20United%20States&trk=homepage-basic_jobs-search-bar_search-submit&redirect=false&position=1&pageNum=0
(50, 11)
https://www.linkedin.com/jobs/search?keywords=Data%20Scientist&location=Honolulu%2C%20Hawaii%2C%20United%20States&trk=homepage-basic_jobs-search-bar_search-submit&redirect=false&position=1&pageNum=0
(68, 11)
https://www.linkedin.com/jobs/search?keywords=Data%20Analyst&location=Honolulu%2C%20Hawaii%2C%20United%20States&trk=homepage-basic_jobs-search-bar_search-submit&redirect=false&position=1&pageNum=0
(93, 11)
https://www.linkedin.com/jobs/search?keywords=Data%20Scientist&location=New%20York%2C%20York%2C%20United%20States&trk=homepage-basic_jobs-search-bar_s

In [198]:
datetime.now().day

28

In [194]:
df_main.shape

(143, 11)

In [480]:
# Download one specific LinkedIn job posting; get_names returns the parsed
# page, or None if the request failed.
url = 'https://www.linkedin.com/jobs/view/data-analyst-at-kgs-technology-group-inc-1163865464?position=3&pageNum=0&trk=guest_job_search_job-result-card_result-card_full-click'
soup = get_names(url)




In [13]:
# First row, sixth column of df -- the "url" column when df was built by get_divs.
test_url = df.iloc[0, 5]

In [348]:
df = get_divs(soup)

In [349]:
df.head()

Unnamed: 0,company,location,title,post_date,description,url
0,Patch.com,"New York, New York",Data Analyst,2019-07-22,Patch is seeking a motivated and meticulous da...,https://www.linkedin.com/jobs/view/data-analys...
1,Infosys Limited,"New York, New York",Data Analyst,2019-07-23,Desired Skills & Experience. Experie...,https://www.linkedin.com/jobs/view/data-analys...
2,"KGS Technology Group, Inc","New York, New York, United States",Data Analyst,2019-06-28,We are looking for a passionate Data Analyst t...,https://www.linkedin.com/jobs/view/data-analys...
3,Jobs @ TheJobNetwork,"New York City, NY, US","Senior Data Operations Analyst, Data Analytics",2019-07-26,Some knowledge of advanced statistical techniq...,https://www.linkedin.com/jobs/view/senior-data...
4,Barclays Investment Bank,"Whippany, NJ, US",Data Analyst,2019-06-25,What will you be doing? What We’re Looking For...,https://www.linkedin.com/jobs/view/data-analys...


In [301]:
url
soup = get_names(url)

In [302]:
url

'https://www.linkedin.com/jobs/search?keywords=Data%20Analyst&location=%2C%20York%2C%20United%20States&trk=homepage-basic_jobs-search-bar_search-submit&redirect=false&position=1&pageNum=0'