In [12]:
# Import Packages 
import csv
from datetime import *
import numpy
import math
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from nltk import *
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
from nltk.corpus import stopwords

In [13]:
def get_job_results(url: str, info_id: str, timeout: float = 30):
    """Download a page and return the element whose HTML ``id`` is ``info_id``.

    Args:
        url: Address of the page to download.
        info_id: Value of the ``id`` attribute of the element to extract.
        timeout: Seconds to wait on the server before giving up. ``requests``
            waits indefinitely when no timeout is given, so one stalled
            connection would otherwise hang the whole scrape.

    Returns:
        The matching BeautifulSoup element, or ``None`` when the parsed page
        contains no element with that id.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup.find(id=info_id)

def collect_individual_search_info(site: str, individual_searches: list, total_results_frame, unique_results_frame, search_times: list, total_search_time: float, notes: str = None):
    """Build one row of diagnostics per search, append it to a CSV and return it.

    Args:
        site: Name of the job site; written to the ``Site`` column of every row.
        individual_searches: One ``[search_location, search_title]`` pair per search.
        total_results_frame: Postings before duplicate removal. Must contain the
            columns ``search_title``, ``search_location`` and ``description``.
        unique_results_frame: Postings after duplicate removal, same columns.
        search_times: One duration per entry of ``individual_searches`` (same order).
        total_search_time: Duration of the whole run; repeated on every row.
        notes: Optional free text; repeated on every row.

    Returns:
        The assembled DataFrame with the columns ``search_location``,
        ``search_title``, ``SearchTimes``, ``Site``, ``Date``, ``TotalResults``,
        ``UniqueResults``, ``TotalSearchTime`` and ``Notes``.

    Side effects:
        Appends the frame (index included, no header row) to
        ``Diagnostics/ScraperPerformancebySearch.csv``, creating the
        ``Diagnostics`` folder first if it does not exist.
    """
    import os  # local import: only needed to make sure the output folder exists

    key_cols = ['search_title', 'search_location']

    # Create framework for the individual search data.
    individual_searches_frame = pd.DataFrame(individual_searches, columns=['search_location', 'search_title'])
    individual_searches_frame['SearchTimes'] = search_times
    individual_searches_frame['Site'] = site
    individual_searches_frame['Date'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Attach the per-search result counts, first before and then after duplicate removal.
    for results_frame, count_col in [(total_results_frame, 'TotalResults'), (unique_results_frame, 'UniqueResults')]:
        counts = (
            results_frame[key_cols + ['description']]
            .groupby(key_cols, as_index=False)
            .count()
            .rename(columns={'description': count_col})
        )
        individual_searches_frame = pd.merge(individual_searches_frame, counts, on=key_cols, how='left')
        # A search with no matching rows found nothing, so its NaN becomes 0.
        # fillna replaces the former chained assignment (frame[col][mask] = 0),
        # which pandas does not guarantee to write back to the frame.
        individual_searches_frame[count_col] = individual_searches_frame[count_col].fillna(0)

    # Add the total search time to the data frame.
    individual_searches_frame['TotalSearchTime'] = total_search_time
    # Add any notes we have to the frame.
    individual_searches_frame['Notes'] = notes

    # Without the folder, to_csv would raise only after the whole scrape has finished.
    os.makedirs('Diagnostics', exist_ok=True)
    individual_searches_frame.to_csv('Diagnostics/ScraperPerformancebySearch.csv', mode='a', header=False)
    return individual_searches_frame
  

def preprocess_jobs_recent(initial_data):
    """Return a cleaned, de-duplicated copy of a scraped jobs frame.

    Steps:
        1. Replace newlines with spaces in every column except ``posted``.
        2. Keep the first row of each (``title``, ``company``, ``description``) group.
        3. Apply the two regex substitutions below to ``description``.
        4. Convert ``posted`` to a numeric column.

    Args:
        initial_data: DataFrame with at least the columns ``title``, ``company``,
            ``description`` and ``posted``. Every column other than ``posted``
            is accessed through ``.str``, so it must hold text.

    Returns:
        A new DataFrame; ``initial_data`` itself is left untouched.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    cleaned = initial_data.copy()
    # Iterate over the columns of the frame we were given (not a notebook global),
    # and compare names with != rather than the identity operator.
    for col in cleaned.columns:
        if col != 'posted':
            cleaned[col] = cleaned[col].str.replace('\n', ' ', regex=False)

    # .copy() gives an independent frame, so the column assignments below do not
    # raise SettingWithCopyWarning.
    processed_data = cleaned.drop_duplicates(subset=['title', 'company', 'description'], keep='first').copy()

    # regex=True is spelled out because the default of Series.str.replace differs
    # between pandas versions and both patterns are regular expressions.
    # NOTE(review): the first call turns "aB" into "a \.B" and the second call
    # deletes " \." again, so as written the pair leaves run-together words such
    # as "experienceResponsibilities" unchanged -- confirm the intended separator.
    processed_data['description'] = processed_data['description'].str.replace(r"([a-z])([A-Z])", r"\1 \.\2", regex=True)
    processed_data['description'] = processed_data['description'].str.replace(r' \\.', '', regex=True)
    processed_data['posted'] = pd.to_numeric(processed_data['posted'])
    return processed_data


def categorize_jobs(posting_number: int):
    """Ask the user how to file a posting and record its number accordingly.

    The answer is lower-cased and matched against the accepted spellings of
    each category; the posting number is appended to the matching module-level
    list (``yes``, ``no``, ``underqualified`` or ``other``). An answer that
    matches none of them records nothing.
    """
    answer = input('Is this a job you would want to apply to? (y/n/u/o)').lower()

    if answer in ('yes', 'yeah', 'y'):
        yes.append(posting_number)
        return
    if answer in ('no', 'nah', 'n'):
        no.append(posting_number)
        return
    if answer in ('u', 'underqualified'):
        underqualified.append(posting_number)
        return
    if answer in ('o', 'other', 'm', 'maybe'):
        other.append(posting_number)

def JobPostBisandTris(posting_number: int, n_results_per: int = 10, sorting: bool = True):
    '''
        Print the key bigrams and trigrams of one posting, then the posting itself.

        Reads the posting from the module-level frame `df` by `posting_number`.
        0. Build the stopword set: English stopwords plus the tokens of the company name
        1. Split the description into sentences
        2. Drop sentences containing any of the listed boilerplate phrases
        3. Tokenize the remaining sentences (lower-cased) and drop stopwords
        4. Keep purely alphabetic or numeric tokens whose length is not 2 or 3
        5. Print company, title, then the top bigrams and trigrams under three
           association measures (n_results_per from each measure)
        6. Print the entire job description and its link
        7. If `sorting` is true, prompt the user to categorize the posting
    '''
    boilerplate_phrases = ['equal opportunity', 'not discriminate', 'diversity', 'affirmative action', 'gender', 'eoe']

    english_stopwords = stopwords.words('english')
    company_tokens = word_tokenize(df['company'][posting_number].lower())
    stopset = set(english_stopwords + company_tokens)

    # Sentences that mention any boilerplate phrase are left out entirely.
    sentences = []
    for sentence in sent_tokenize(df.description[posting_number]):
        lowered = sentence.lower()
        if not any(phrase in lowered for phrase in boilerplate_phrases):
            sentences.append(sentence)

    words = []
    for sentence in sentences:
        for token in word_tokenize(sentence.lower()):
            if token in stopset:
                continue
            if not (token.isalpha() or token.isnumeric()):
                continue
            if 2 <= len(token) <= 3:
                continue
            words.append(token)

    bcf = BigramCollocationFinder.from_words(words)
    tcf = TrigramCollocationFinder.from_words(words)

    # Print company and job title
    print(df.company[posting_number])
    print(df.title[posting_number])
    print()

    # Same report for both n-gram sizes: heading, de-duplicated best n-grams, blank line.
    for heading, finder, measures in (('Bigrams', bcf, BigramAssocMeasures), ('Trigrams', tcf, TrigramAssocMeasures)):
        print(heading)
        best = []
        for scorer in (measures.likelihood_ratio, measures.raw_freq, measures.chi_sq):
            best.extend(finder.nbest(scorer, n_results_per))
        print(set(best))
        print()

    print(df.description[posting_number])
    print('Link:\n', df.link[posting_number])

    # When sorting postings to train an algo, ask the user to file this one.
    if sorting:
        categorize_jobs(posting_number)

    
def save_search_stats(site: str, total_returned_jobs: int, unique_jobs: int, search_time: float, total_searches: int, notes: str):
    """Append one summary row for a scrape to Diagnostics/ScraperPerformance.csv.

    The row holds, in order: site, total_returned_jobs, unique_jobs,
    search_time, total_searches, today's date, and the notes text with every
    comma swapped for a semicolon.
    """
    with open('Diagnostics/ScraperPerformance.csv', 'a+', newline='') as handle:
        row = [
            site,
            total_returned_jobs,
            unique_jobs,
            search_time,
            total_searches,
            date.today(),
            notes.replace(',', ';'),
        ]
        csv.writer(handle, delimiter=',').writerow(row)


In [14]:
###############################
# Initialize Empty Containers #
###############################

# Posting numbers sorted by hand into four buckets (to train the algo);
# categorize_jobs appends to these.
yes, no, underqualified, other = [], [], [], []

# Raw posting rows, one list per job site; the scraping cells append to these.
jobs_indeed, jobs_monster = [], []

In [15]:

####################
# Data Frame Columns
# Order matters: each scraped posting is stored as a list in exactly this order.
columns = [
    'search_title',
    'search_location',
    'location',
    'title',
    'company',
    'posted',
    'salary',
    'summary',
    'link',
    'description',
]

###############
# Job titles
# Every prefix is combined with every title; the empty prefix searches the bare title.
prefixes = ['entry level ', 'junior ', 'associate ', '']
titles = [
    'business analyst',
    'data analyst',
    'data scientist',
    'financial analyst',
    'macro analyst',
]

################
# Job Locations
search_locations = [
    # Cities
    'New York, NY',
    'Newark, NJ',
    'Princeton, NJ',
    'Jersey City, NJ',
    'Trenton, NJ',
    'Bridgewater, NJ',
    'Somerville, NJ',
    'Summit, NJ',
    'Morristown, NJ',
    'Edison, NJ',
    'Metuchen, NJ',
    'Hackensack, NJ',
    'Philadelphia, PA',
    'Stamford, CT',
    'Greenwich, CT',
    # States, spelled out
    'New York',
    'New Jersey',
    'New Hampshire',
    'Pennsylvania',
    'Connecticut',
    # States, abbreviated
    'NY',
    'NJ',
    'NH',
    'PA',
    'CT',
    'remote',
]

# One search per (prefix, title, location) combination.
n_searches = len(prefixes) * len(titles) * len(search_locations)


# Indeed

In [16]:
################################
################################
#### Scrape Indeed for Jobs ####
################################
################################

# Four Fors: Because who needs a head hunter anyways?
# 1. Search each location
#   2. Search each job title
#       3. Search each page of results
#           4. Get each job's description
# nlocations * ntitles * npages * njobdescriptions = niterations
#
# NOTE We sleep before hitting the page because if we over do it they won't give us the data
#
# NOTE Rows are appended to `jobs_indeed`, which is created in an earlier cell and
# is not reset here; re-running only this cell adds the same postings again.

#######################
# Change these fields #
#######################

#######################
# URL Fields
# Upper bound of the `start=` result offset; pages are requested in steps of 10
location_results = 100
# How far out should we look?
radius = 50


########################
# Performance Monitors #
########################
search_times_indeed = []
individual_searches = []

# Initialize these for performance monitoring
# ttm switches the progress printing below on or off
ttm = True
start_time = time.time()
counter = 1
# Defined up front so the final timing step still works when no search runs.
search_start = start_time

# Every prefix combined with every title, built once instead of once per location.
search_titles = [(s + t) for s in prefixes for t in titles]

# Main two iterators for the search
for search_location in search_locations:
    for search_title in search_titles:

        ####################################
        # Monitoring Chunk
        individual_searches.append([search_location, search_title])
        if counter > 1:
            # Close the timer of the previous search before starting this one.
            search_time = time.time() - search_start
            search_times_indeed.append(search_time)
            if ttm:
                print(f'The search took {search_time} seconds.\n')
        if ttm:
            print(f'Searching {search_location} for {search_title} positions \tSearch number {counter} of {n_searches}')
        counter += 1
        search_start = time.time()
        #####################################

        # URL-encode the query pieces once per search; they do not change per page.
        s = search_location.replace(' ', '%20').replace(',', '%2C')
        st = search_title.replace(' ', '%20')

        for nresults in range(0, location_results, 10):
            # Go to the url below and get the results from each page
            url = f'https://www.indeed.com/jobs?q={st}+%2420%2C000&l={s}+&radius={radius}&start={str(nresults)}'
            try:
                results = get_job_results(url, 'resultsCol')
            except Exception as err:
                # Best effort: skip a page that cannot be fetched. Catching Exception
                # (not a bare except) keeps Ctrl-C able to stop the run.
                if ttm:
                    print(f'Skipping results page {url}: {err}')
                continue
            # No results container on this page: try the next page
            if results is None:
                continue
            job_elems = results.find_all('div', class_='jobsearch-SerpJobCard')

            ##############################
            # FIND THE MOST RECENT JOB POSTS!!!
            for job in job_elems:
                # Make sure it's a valid post
                title_elem = job.find('h2', class_='title')
                if title_elem is None:
                    continue

                # Without a date we cannot tell whether the post is recent, so skip it
                # (previously a missing date span raised AttributeError).
                when_elem = job.find('span', class_='date')
                if when_elem is None:
                    continue
                when_text = when_elem.text.strip()

                # If it was posted 'days ago' we don't want it
                if 'days' in when_text:
                    continue

                title = title_elem.text.strip()

                # Company: skip the post when it is missing
                company_elem = job.find('span', class_='company')
                if company_elem is None:
                    continue
                company_elem = company_elem.text.strip()

                # The description page is reached through the card's first link;
                # skip cards that have no usable link.
                link_elem = job.find('a')
                link = link_elem.get('href') if link_elem is not None else None
                if not link:
                    continue

                #########################
                #   Location: Where is the job
                #   Salary: How much are they paying
                #   Summary: What's the provided job summary
                location_elem = job.find('span', class_='location')
                salary_elem = job.find('span', class_='salary')
                summary_elem = job.find('div', class_='summary')

                # If there isn't a location provided, denote that
                if location_elem is None:
                    location_elem = 'No Location Provided'
                else:
                    location_elem = location_elem.text.strip()

                # 'days' was filtered out above, so a remaining 'day' means a single
                # day; anything else is recorded as 0.
                posted = 1 if 'day' in when_text else 0

                # If there isn't a salary provided, denote that
                if salary_elem is None:
                    salary_elem = 'No Salary Provided'
                else:
                    salary_elem = salary_elem.text.strip()

                # If there isn't a job summary provided, denote that
                if summary_elem is None:
                    summary_elem = 'No Summary Provided'
                else:
                    summary_elem = summary_elem.text.strip()

                ##################################
                # Go to the Job Description Page #
                ##################################
                # Wait before hitting the page so they don't kick us out
                time.sleep(0.5)
                description_url = f'https://www.indeed.com{link}'
                try:
                    description_results = get_job_results(description_url, 'jobDescriptionText')
                except Exception as err:
                    if ttm:
                        print(f'Skipping posting {description_url}: {err}')
                    continue
                # If we screwed up locating the description, denote that
                if description_results is None:
                    description_text = 'No Description Found'
                else:
                    description_text = description_results.text.strip()

                # Create an array of info for this job posting (same order as `columns`)
                info = [
                    search_title,
                    search_location,
                    location_elem,
                    title,
                    company_elem,
                    posted,
                    salary_elem,
                    summary_elem,
                    description_url,
                    description_text
                ]

                # Append the job info array to the array of job posting arrays
                jobs_indeed.append(info)

# This adds the time for the final search (only if at least one search ran).
if counter > 1:
    search_time = time.time() - search_start
    search_times_indeed.append(search_time)

# Calculate the total search time.
total_search_time = time.time() - start_time

if ttm:
    print(f'Indeed search completed in {total_search_time} seconds. {len(jobs_indeed)} results found (including duplicates).')

# Convert the array of job arrays into a data frame.
indeed_0 = pd.DataFrame(jobs_indeed, columns=columns)


# Filter and modify the original and put it in a new variable.
indeed_1 = preprocess_jobs_recent(indeed_0)

# Save the monitoring data into their respective csv's for later analysis.
# collect_individual_search_info saves into a new csv which monitors performance of each individual search
collect_individual_search_info('Indeed', individual_searches, indeed_0, indeed_1, search_times_indeed, total_search_time, 'Recent Jobs Only')
save_search_stats('Indeed', len(jobs_indeed), len(indeed_1), total_search_time, n_searches, 'Recent Jobs Only')

# Tell us when we're done!
print(f'Returned {len(indeed_1)} unique entries from Indeed.')

               

ching New Hampshire for financial analyst positions 	Search number 359 of 520
The search took 2.6653237342834473 seconds.

Searching New Hampshire for macro analyst positions 	Search number 360 of 520
The search took 2.6184465885162354 seconds.

Searching Pennsylvania for entry level business analyst positions 	Search number 361 of 520
The search took 2.5526678562164307 seconds.

Searching Pennsylvania for entry level data analyst positions 	Search number 362 of 520
The search took 2.7577733993530273 seconds.

Searching Pennsylvania for entry level data scientist positions 	Search number 363 of 520
The search took 2.6325910091400146 seconds.

Searching Pennsylvania for entry level financial analyst positions 	Search number 364 of 520
The search took 2.541064739227295 seconds.

Searching Pennsylvania for entry level macro analyst positions 	Search number 365 of 520
The search took 2.474977970123291 seconds.

Searching Pennsylvania for junior business analyst positions 	Search number 366

# Monster

In [20]:

########################
# Performance Monitors #
########################
search_times_monster = []
individual_searches = []

# Initialize these for performance monitoring
# ttm switches the progress printing below on or off
ttm = True
start_time = time.time()
counter = 1
# Defined up front so the final timing step still works when no search runs.
search_start = start_time

# NOTE Rows are appended to `jobs_monster`, which is created in an earlier cell and
# is not reset here; re-running only this cell adds the same postings again.

for search_location in search_locations:
    for search_title in [(s + t) for s in prefixes for t in titles]:

        ####################################
        # Monitoring Chunk
        individual_searches.append([search_location, search_title])
        if counter > 1:
            # Close the timer of the previous search before starting this one.
            search_time = time.time() - search_start
            search_times_monster.append(search_time)
            if ttm:
                print(f'The search took {search_time} seconds.\n')
        if ttm:
            print(f'Searching {search_location} for {search_title} positions \tSearch number {counter} of {n_searches}')
        counter += 1
        search_start = time.time()
        #####################################

        sl = search_location.replace(' ', '-').replace(',', '__2C')
        st = search_title.replace(' ', '-')
        URL = f'https://www.monster.com/jobs/search/?q={st}&where={sl}&stpage=1&page=10'
        try:
            results = get_job_results(URL, 'ResultsContainer')
        except Exception as err:
            # Best effort: skip a search that cannot be fetched. Catching Exception
            # (not a bare except) keeps Ctrl-C able to stop the run.
            if ttm:
                print(f'Skipping search page {URL}: {err}')
            continue
        # The None check must come before results is used; previously find_all ran
        # first and raised AttributeError whenever the container was missing.
        if results is None:
            continue
        job_elems = results.find_all('section', class_='card-content')

        for job in job_elems:
            # If the title isn't present, continue to the next one
            title_elem = job.find('h2', class_='title')
            if title_elem is None:
                continue
            title = title_elem.text.strip()

            # Without a <time> element we cannot tell whether the post is recent
            time_elem = job.find('time')
            if time_elem is None:
                continue
            posted_text = time_elem.text.strip()
            # Anything posted 'days' ago is skipped; a remaining 'day' means a single
            # day, and anything else is recorded as 0.
            if 'days' in posted_text:
                continue
            posted = 1 if 'day' in posted_text else 0

            # If the company element isn't present continue to the next
            company_elem = job.find('div', class_='company')
            if company_elem is None:
                continue
            company = company_elem.text.strip()

            # If there isn't a job location provided, indicate that
            location_elem = job.find('div', class_='location')
            if location_elem is None:
                location = 'No Location Found'
            else:
                location = location_elem.text.strip()

            #####################
            # Get the link for the page with the full job description
            link_elem = job.find('a')
            link = link_elem.get('href') if link_elem is not None else None
            if not link:
                continue

            try:
                description_results = get_job_results(link, 'main-content')
            except Exception as err:
                if ttm:
                    print(f'Skipping posting {link}: {err}')
                continue

            salary = None
            job_type = None
            description_elem = None
            # The description page may lack the expected container; then every
            # detail falls back to its placeholder below.
            if description_results is not None:
                for detail in description_results.find_all('div', class_='detail-row'):
                    dt = detail.text.strip()
                    if 'Salary' in dt:
                        salary = dt
                    if 'Job Type' in dt:
                        job_type = dt
                description_elem = description_results.find('div', class_='job-description')

            if salary is None:
                salary = 'No Salary Provided'
            if job_type is None:
                job_type = 'No Job Type Provided'

            if description_elem is None:
                description = 'No Description Found'
            else:
                description = description_elem.text.strip()

            # Same order as `columns`. NOTE: the eighth slot is named 'summary' in
            # `columns`, but for Monster it holds the job type.
            item = [
                search_title,
                search_location,
                location,
                title,
                company,
                posted,
                salary,
                job_type,
                link,
                description
            ]
            jobs_monster.append(item)

# This adds the time for the final search (only if at least one search ran).
if counter > 1:
    search_time = time.time() - search_start
    search_times_monster.append(search_time)

total_search_time_monster = time.time() - start_time

if ttm:
    print(f'Monster search took {total_search_time_monster} seconds to retreive {len(jobs_monster)} total results (including some duplicates).')

# Convert array of arrays into data frame
monster_0 = pd.DataFrame(jobs_monster, columns=columns)

# filter and modify df going forward
monster_1 = preprocess_jobs_recent(monster_0)

# Save the monitoring data into their respective csv's for later analysis.
# collect_individual_search_info saves into a new csv which monitors performance of each individual search
# Pass this run's own total; the Indeed run's `total_search_time` was passed here before.
collect_individual_search_info('Monster', individual_searches, monster_0, monster_1, search_times_monster, total_search_time_monster, 'Recent Jobs Only')
save_search_stats('Monster', len(monster_0), len(monster_1), total_search_time_monster, n_searches, 'Recent Jobs Only')

print(f'Returned {len(monster_1)} unique entries from Monster.com')
    


 seconds.

Searching New Hampshire for macro analyst positions 	Search number 360 of 520
The search took 6.927181959152222 seconds.

Searching Pennsylvania for entry level business analyst positions 	Search number 361 of 520
The search took 0.1592864990234375 seconds.

Searching Pennsylvania for entry level data analyst positions 	Search number 362 of 520
The search took 0.18192481994628906 seconds.

Searching Pennsylvania for entry level data scientist positions 	Search number 363 of 520
The search took 6.576783895492554 seconds.

Searching Pennsylvania for entry level financial analyst positions 	Search number 364 of 520
The search took 6.425246238708496 seconds.

Searching Pennsylvania for entry level macro analyst positions 	Search number 365 of 520
The search took 6.454071283340454 seconds.

Searching Pennsylvania for junior business analyst positions 	Search number 366 of 520
The search took 0.20774245262145996 seconds.

Searching Pennsylvania for junior data analyst positions 	S

In [21]:
# Stack the cleaned results of both sites, then drop postings whose description
# text is identical (the same job listed on both sites or under several searches).
df_0 = pd.concat([indeed_1, monster_1], ignore_index=True)
df_1 = df_0.drop_duplicates(subset=['description'])
print('Both pre duplicate removal:', len(df_0), '\n', 'Post duplicate removal:', len(df_1))

# Independent copy rather than an alias: column assignments on `df` then neither
# write through to `df_1` nor raise SettingWithCopyWarning. Row labels are kept
# as they are in `df_1`, so they are not contiguous after the de-duplication.
df = df_1.copy()

Both pre duplicate removal: 204 
 Post duplicate removal: 177


In [22]:
# Swap every comma in the description text for a space.
description_without_commas = df['description'].str.replace(',', ' ')
df['description'] = description_without_commas

In [23]:
# Write the postings as a tab-separated file named after today's date (no index column).
output_path = f'data/{datetime.today().strftime("%Y-%m-%d")}_recentjobs.tsv'
df.to_csv(output_path, sep='\t', index=False)