# Indeed Job Postings

## Requirements:
- Pull job postings by specific town
 - Scrape job listings from specific search results
 - Iterate through all results for a town
 - Loop through specific towns
 - Combine results into single df, and save
- 

In [283]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display
import datetime as dt
import os
start_time = dt.datetime.now()

# Create the Timestamp object 
ts = pd.Timestamp(year = 2011,  month = 11, day = 21, hour = 10, second = 49, tz = 'US/Central')  
today = ts.today()
today

Timestamp('2020-04-21 10:16:09.153900')

## Job Listings Function

Scrapes specific job listings off an Indeed search results page.


In [284]:
def job_listings(town, start, df):
    new_url = 'https://www.indeed.com/jobs?l={},+MA&radius=0&sort=date&sr=directhire&start={}'.format(town,start)
    today = pd.to_datetime('today')
    #dupes = dupes
    new_start = start
    page = requests.get(new_url)
    html = BeautifulSoup(page.text, 'html.parser')
    #html.find('td', attrs={'id': 'resultsCol'})
    #html.find('div',attrs={'class','jobsearch-SerpJobCard'})
    results = html.find_all('div',attrs={'class','jobsearch-SerpJobCard'})
    
    job_already = 0
    new_jobs = 0
    
    for item in results:
        title = str(item.find('a',attrs={'class','jobtitle'}).contents)[4:-2]
        job_id = item['id']
        if job_id in df.index:
            job_already += 1
        else:
            new_jobs +=1
            try:
                company = str(item.find('span',attrs={'class','company'}).a.contents)[4:-2]
                company_rating = float(item.find('span',attrs={'class','ratingsContent'}).contents[0].strip())
                company_link = item.find('span',attrs={'class','company'}).a['href']
            except:
                try:
                    company = str(item.find('span',attrs={'class','company'}).contents)[4:-2]
                except:
                    company = None
                company_rating = None
                company_link = None
            df.loc[job_id,['title','company','company_link','company_rating', 'town', 'orig_date']] = [title, company,company_link,company_rating, town, today]
    if job_already == len(results):
        new_start += job_already
        return new_start, df
    else:
        new_start = len(df)
        return new_start, df

## Town Results function

Loops through pages returned for a specific town, then uses the Job listings func to return job posting results.

In [285]:
def town_results(town):
    df = pd.DataFrame(columns=['title','company','company_link','company_rating','town','orig_date'])
    url = "https://www.indeed.com/jobs?as_and=&as_phr=&as_any=&as_not=&as_ttl=&as_cmp=&jt=all&st=&sr=directhire&as_src=&salary=&radius=0&l={}%2C+MA&fromage=any&limit=10&sort=date&psf=advsrch&from=advancedsearch".format(town)
    page = requests.get(url)
    html = BeautifulSoup(page.text, 'html.parser')
        
    num_results = str(html.find('div', attrs={'id': 'searchCountPages'}).contents)
    results = int(num_results[num_results.find('of ')+3:num_results.find(' jobs')].replace(',',''))
    
    f = widgets.IntProgress(
        value=len(df),
        min=0,
        max=results,
        description='{}:'.format(town),
        bar_style='info',
        orientation='horizontal'
    )

    int_prog = widgets.HTML(
        value="{} out of {}".format(len(df),results)
    )
    
    display(f)
    display(int_prog)
    
    start = len(df)
    while start < results:
        start, df = job_listings(town, start, df)
        f.value = len(df)
        int_prog.value = "{} out of {}".format(len(df),results)
    return df

## Town Loop

Loops through target towns, then combines results into a single dataframe


In [286]:
towns = [
    'Framingham',
    'Natick',
    'Wayland',
    'Worcester',
    'Marlborough',
    'Southborough',
    'Westborough',
    'Hopkinton',
    'Wellesley',
    'Waltham',
    'Newton',
    'Shrewsbury',
    'Hudson',
    'Needham',
    'Weston',
    'Sudbury',
    'Berlin',
    'Northborough',
    'Milford',
    'Medway',
    'Milford',
    'Millis'
]

folder = os.listdir(os.getcwd() + '/listings')
town_listings_master = []

for town in towns:
    town_listings = town_results(town)
    town_listings_master.append(town_listings)
    
combined_towns = pd.concat(town_listings_master)

IntProgress(value=0, bar_style='info', description='Framingham:', max=1134)

HTML(value='0 out of 1134')

IntProgress(value=0, bar_style='info', description='Natick:', max=579)

HTML(value='0 out of 579')

IntProgress(value=0, bar_style='info', description='Wayland:', max=69)

HTML(value='0 out of 69')

IntProgress(value=0, bar_style='info', description='Worcester:', max=2001)

HTML(value='0 out of 2001')

IntProgress(value=0, bar_style='info', description='Marlborough:', max=524)

HTML(value='0 out of 524')

IntProgress(value=0, bar_style='info', description='Southborough:', max=133)

HTML(value='0 out of 133')

IntProgress(value=0, bar_style='info', description='Westborough:', max=546)

HTML(value='0 out of 546')

IntProgress(value=0, bar_style='info', description='Hopkinton:', max=144)

HTML(value='0 out of 144')

IntProgress(value=0, bar_style='info', description='Wellesley:', max=304)

HTML(value='0 out of 304')

IntProgress(value=0, bar_style='info', description='Waltham:', max=1345)

HTML(value='0 out of 1345')

IntProgress(value=0, bar_style='info', description='Newton:', max=851)

HTML(value='0 out of 851')

IntProgress(value=0, bar_style='info', description='Shrewsbury:', max=316)

HTML(value='0 out of 316')

IntProgress(value=0, bar_style='info', description='Hudson:', max=204)

HTML(value='0 out of 204')

IntProgress(value=0, bar_style='info', description='Needham:', max=426)

HTML(value='0 out of 426')

IntProgress(value=0, bar_style='info', description='Weston:', max=112)

HTML(value='0 out of 112')

IntProgress(value=0, bar_style='info', description='Sudbury:', max=92)

HTML(value='0 out of 92')

IntProgress(value=0, bar_style='info', description='Berlin:', max=34)

HTML(value='0 out of 34')

IntProgress(value=0, bar_style='info', description='Northborough:', max=163)

HTML(value='0 out of 163')

IntProgress(value=0, bar_style='info', description='Milford:', max=390)

HTML(value='0 out of 390')

IntProgress(value=0, bar_style='info', description='Medway:', max=63)

HTML(value='0 out of 63')

IntProgress(value=0, bar_style='info', description='Milford:', max=390)

HTML(value='0 out of 390')

IntProgress(value=0, bar_style='info', description='Millis:', max=37)

HTML(value='0 out of 37')

In [287]:
len(combined_towns)
combined_towns.head(5)


Unnamed: 0,title,company,company_link,company_rating,town,orig_date
p_b0ac2e4c103f9a95,"Division Manager, Community Care Program",Encompass Health,/cmp/Encompass-Health,3.5,Framingham,2020-04-21 10:16:10.308788
pj_b9bca235b9e699f2,Customer Service Associate - Fruit Expert,Edible Arrangements,/cmp/Edible-Arrangements,3.8,Framingham,2020-04-21 10:16:10.308788
pj_0ce7f8679de7329c,Temp Mother's Day Delivery Driver: No- Contact...,Edible Arrangements Framingham,/cmp/Edible-Arrangements,3.8,Framingham,2020-04-21 10:16:10.308788
p_2a7e8402fcb1b4b8,Accounts Payable Specialist,Rave Mobile Safety,/cmp/Rave-Mobile-Safety,3.8,Framingham,2020-04-21 10:16:10.308788
p_d7000f5d7fc84ab6,Project Manager,Voltech Electric,,,Framingham,2020-04-21 10:16:10.308788


### Save the combined listings to a csv

In [288]:
combined_towns['id_uncleaned'] = combined_towns.index
combined_towns['id_cleaned'] = combined_towns['id_uncleaned'].apply(lambda x: x.replace('p_','').replace('pj_',''))
file = os.getcwd() + '/listings/combined_listings_{}.csv'.format(str(today)[:10])
combined_towns.to_csv(file)


## Scrape the Descriptions for the job listings

This code block scrapes the job descriptions from the job detail pages.

In [289]:
f = widgets.IntProgress(
        value=0,
        min=0,
        max=len(combined_towns),
        description='progress',
        bar_style='info',
        orientation='horizontal'
    )

int_prog = widgets.HTML(
        value="{} out of {}".format(len(combined_with_desc),f.value)
    )

display(f)
display(int_prog)

def job_desc(id):
    url = 'https://www.indeed.com/viewjob?jk={}'.format(id)
    f.value += 1
    int_prog.value = "{} out of {}".format(f.value, len(combined_with_desc))
    page = requests.get(url)
    html = BeautifulSoup(page.text, 'html.parser')
    page_results = html.find_all('div',attrs={'class','jobsearch-jobDescriptionText'})
    if len(page_results) > 0:
        return page_results[0].get_text()
    
        
combined_with_desc = combined_towns.copy()
combined_with_desc['description'] = combined_with_desc['id_cleaned'].apply(job_desc)
combined_with_desc.to_csv(file)

IntProgress(value=0, bar_style='info', description='progress', max=8043)

HTML(value='7648 out of 0')

## Target listings with keywords

This code block flags the job listings with keywords in it

In [290]:
combined_with_desc['salesforce'] = False
combined_with_desc['sql'] = False
combined_with_desc['python'] = False
combined_with_desc['analyst'] = False
combined_with_desc['sales_operations'] = False
combined_with_desc['bus_operations'] = False
combined_with_desc['rev_operations'] = False

combined_with_desc['salesforce'] = combined_with_desc['description'].str.contains('salesforce',regex=True, case=False)
print('Salesforce done')
combined_with_desc['sql'] = combined_with_desc['description'].str.contains('sql',regex=True, case=False)
print('sql done')
combined_with_desc['python'] = combined_with_desc['description'].str.contains('python',regex=True, case=False)
print('python done')
combined_with_desc['analyst'] = combined_with_desc['title'].str.contains('analyst',regex=True, case=False)
print('analyst done')
combined_with_desc['sales_operations'] = combined_with_desc['description'].str.contains('sales operations',regex=True, case=False)
print('sales_ops done')
combined_with_desc['bus_operations'] = combined_with_desc['description'].str.contains('business operations',regex=True, case=False)
print('bus_ops done')
combined_with_desc['rev_operations'] = combined_with_desc['description'].str.contains('revenue operations',regex=True, case=False)
print('bus_ops done')

combined_with_desc['rev_operations'].value_counts()



Salesforce done
sql done
python done
analyst done
sales_ops done
bus_ops done
bus_ops done


False    8040
True        1
Name: rev_operations, dtype: int64

## Filter only targeted jobs

In [291]:
target_listings = combined_with_desc.loc[
    (combined_with_desc['salesforce'] == True) | 
    (combined_with_desc['sql'] == True) |
    (combined_with_desc['python'] == True) |
    (combined_with_desc['analyst'] == True) |
    (combined_with_desc['sales_operations'] == True) |
    (combined_with_desc['bus_operations'] == True) |
    (combined_with_desc['rev_operations'] == True)
].copy()

print('target_listings',len(target_listings))

jobs = target_listings['title'].unique()

excluded_jobs = [
    'Account Executive',
    'Account Manager',
    'Intern',
    'Business Development',
    'Account Development',
    'Board Certified Behavior',
    'Sales Manager',
    'Tax Accountant',
    'Loan',
    'Telemarketer',
    'Teacher',
    'Representative',
    'Executive Assistant'
    'Dental',
    'Consultant',
    'Thermodynamic',
    'Lab',
    'Tesla',
    'Neutronics',
    'Bi-Lingual'
]

final_target_listings = target_listings[
    (~target_listings['title'].str.contains('|'.join(excluded_jobs))) & 
    (~target_listings['company'].str.contains('Confidential'))
]


final_target_listings = final_target_listings.drop(['description'], axis = 1)

print(len(final_target_listings))


target_listings 484
293


Remove dupes

In [292]:
final_target_listings.drop_duplicates(subset ="id_cleaned", 
                     keep = False, inplace = True)

final_target_listings.head()

Unnamed: 0,title,company,company_link,company_rating,town,orig_date,id_uncleaned,id_cleaned,salesforce,sql,python,analyst,sales_operations,bus_operations,rev_operations
p_f3c2e0f5a6d70294,Health Economics & Outcomes Research,An-L-It-Iks Inc.,,,Framingham,2020-04-21 10:16:11.315318,p_f3c2e0f5a6d70294,f3c2e0f5a6d70294,False,False,True,False,False,False,False
p_b6d902a108815e50,LIMS Business Analyst/Admin,"Sequence, Inc.",,,Framingham,2020-04-21 10:16:12.950321,p_b6d902a108815e50,b6d902a108815e50,False,False,False,True,False,False,False
p_4fe0bdf131e4866b,SG- Research Associate II -563,NewAge Clinical,/cmp/Newage-Clinical,5.0,Framingham,2020-04-21 10:16:20.018413,p_4fe0bdf131e4866b,4fe0bdf131e4866b,False,False,True,False,False,False,False
pj_5256fa8510b25b69,Inside Sales Associate,Lytx,/cmp/Lytx,3.6,Framingham,2020-04-21 10:16:29.231979,pj_5256fa8510b25b69,5256fa8510b25b69,True,False,False,False,False,False,False
p_aa6c74dc8b26e25a,Customer Success Technical Specialist,"Globoforce, Inc.",/cmp/Workhuman,4.1,Framingham,2020-04-21 10:16:31.336435,p_aa6c74dc8b26e25a,aa6c74dc8b26e25a,True,False,False,False,False,False,False


## Score the jobs

If there's a keyword present, add 1.

If the company rating is over 4, add 1 -- if it's between 3-4, then add 0.5

In [293]:
def job_score(job):
    row = final_target_listings.loc[job]
    #print(row)
    final_target_listings.loc[job,'score'] = 0
    score = 0
    attrib = [
        row['salesforce'],
        row['sql'],
        row['python'],
        row['sales_operations'],
        row['bus_operations'],
        row['rev_operations']
    ]
    for at in list(attrib):
        if at == True:
            score += 1
    try:
        if row['company_rating'] >= 4:
            score += 1
        elif row['company_rating'] >= 3:
            score += 0.5
    except: pass
    return score

final_target_listings['score'] = final_target_listings['id_uncleaned'].apply(job_score)
final_target_listings = final_target_listings.sort_values('score', ascending = False)

### Exclude already saved jobs

In [295]:
saved_jobs = pd.read_excel('/Users/bgetman/Dropbox/Python/housing_prices/listings/saved_listings.xlsx')

saved_ids = saved_jobs['Unnamed: 1']

saved_ids

final_target_listings = final_target_listings[~final_target_listings['id_uncleaned'].isin(saved_ids)]

print('final_targets',len(final_target_listings))

final_targets 203


### Save target file

In [296]:
file = os.getcwd() + '/listings/target_listings_{}.csv'.format(str(today)[:10])

final_target_listings.to_csv(file)



In [297]:
#boston_listings, boston_dupes = town_results('boston')