In [16]:
# Import Packages 
import csv
from datetime import *
import numpy
import math
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

In [17]:
def get_job_results(url: str, info_id: str, timeout: float = 30):
    """Download a page and return the element whose ``id`` equals ``info_id``.

    Args:
        url: Address of the page to download.
        info_id: Value of the HTML ``id`` attribute to look up.
        timeout: Seconds to wait on the server before ``requests`` gives up.
            Without a timeout a stalled connection blocks the whole run.

    Returns:
        The matching BeautifulSoup element, or ``None`` when the page has no
        element with that id. This is a single element, not a list.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup.find(id=info_id)

def collect_individual_search_info(site: str, individual_searches: list, total_results_frame, unique_results_frame, search_times: list, total_search_time: float, notes: str = None, output_path: str = 'Diagnostics/ScraperPerformancebySearch.csv'):
    """Build one row of diagnostics per search, append them to a CSV and return them.

    Args:
        site: Name of the job site, stored in the ``Site`` column.
        individual_searches: ``[search_location, search_title]`` pairs, one per search.
        total_results_frame: Scraped jobs before duplicate removal; must have the
            ``search_title``, ``search_location`` and ``description`` columns.
        unique_results_frame: Scraped jobs after duplicate removal; same columns.
        search_times: Duration of each search, aligned with ``individual_searches``.
        total_search_time: Duration of the whole run, repeated on every row.
        notes: Optional free text repeated on every row.
        output_path: CSV file the rows are appended to (no header is written).

    Returns:
        The per-search DataFrame that was appended to ``output_path``.
    """
    key_cols = ['search_title', 'search_location']

    # Framework for the individual search data.
    searches_frame = pd.DataFrame(individual_searches, columns=['search_location', 'search_title'])
    searches_frame['SearchTimes'] = search_times
    searches_frame['Site'] = site
    searches_frame['Date'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Attach the per-search counts before and after duplicate removal.
    count_sources = (
        ('TotalResults', total_results_frame),
        ('UniqueResults', unique_results_frame),
    )
    for count_col, source in count_sources:
        counts = (
            source[key_cols + ['description']]
            .groupby(key_cols, as_index=False)
            .count()
            .rename(columns={'description': count_col})
        )
        searches_frame = pd.merge(searches_frame, counts, on=key_cols, how='left')
        # A search with no matching rows found nothing, so its count is 0.
        # fillna replaces the earlier chained-indexing assignment, which pandas
        # does not guarantee to write through to the frame.
        searches_frame[count_col] = searches_frame[count_col].fillna(0)

    searches_frame['TotalSearchTime'] = total_search_time
    searches_frame['Notes'] = notes
    searches_frame.to_csv(output_path, mode='a', header=False)
    return searches_frame
  

def preprocess_jobs_recent(initial_data):
    """Return a cleaned, de-duplicated copy of a scraped jobs frame.

    Newlines are replaced with spaces in every column except ``posted``, rows
    sharing a ``link`` are reduced to their first occurrence, and the
    ``description`` column goes through two regular-expression substitutions.
    The input frame is left untouched.

    Args:
        initial_data: DataFrame of scraped jobs with at least the ``link`` and
            ``description`` columns; every column other than ``posted`` is
            accessed through ``.str``.

    Returns:
        A new DataFrame with the cleaned rows.
    """
    # Work on a copy so the caller's raw frame is not modified.
    cleaned = initial_data.copy()
    # Use the argument's own columns; the old code read a notebook global here.
    for col in cleaned.columns:
        if col != 'posted':
            cleaned[col] = cleaned[col].str.replace('\n', ' ', regex=False)

    processed_data = cleaned.drop_duplicates(subset=['link'], keep='first').copy()

    # Both patterns are regular expressions, so request regex matching explicitly
    # instead of relying on the pandas default.
    # NOTE(review): the first substitution inserts ' \.' at each lowercase-to-
    # uppercase boundary and the second strips ' \' plus one character again, so
    # together they leave such boundaries unchanged. Confirm whether a separator
    # was meant to remain.
    processed_data['description'] = processed_data['description'].str.replace(
        re.compile(r"([a-z])([A-Z])"), r"\1 \.\2", regex=True
    )
    processed_data['description'] = processed_data['description'].str.replace(
        re.compile(r' \\.'), '', regex=True
    )
    return processed_data


    
def save_search_stats(site: str, total_returned_jobs: int, unique_jobs: int, search_time: float, total_searches: int, notes: str, path: str = 'Diagnostics/ScraperPerformance.csv'):
    """Append one row of run-level search statistics to a CSV file.

    The row holds, in order: site, total returned jobs, unique jobs, search
    time, number of searches, today's date and the notes.

    Args:
        site: Name of the job site.
        total_returned_jobs: Number of jobs found before duplicate removal.
        unique_jobs: Number of jobs left after duplicate removal.
        search_time: Duration of the whole run.
        total_searches: Number of searches performed.
        notes: Free text for the row; ``None`` is written as an empty field.
        path: CSV file the row is appended to.
    """
    # Commas in the notes are swapped for semicolons, as before. A missing
    # note no longer raises AttributeError.
    safe_notes = '' if notes is None else notes.replace(',', ';')
    row = [site, total_returned_jobs, unique_jobs, search_time, total_searches, date.today(), safe_notes]
    with open(path, 'a', newline='') as file:
        csv.writer(file, delimiter=',').writerow(row)


  for col in [col for col in indeed_0.columns if col is not 'posted']:


In [37]:

# ---------------------------------------------------------------
# Column layout used when the scraped rows become DataFrames.
# ---------------------------------------------------------------
columns = [
    'search_title',
    'search_location',
    'location',
    'title',
    'company',
    'posted',
    'salary',
    'summary',
    'link',
    'description',
]

# ---------------------------------------------------------------
# Job titles: each prefix is prepended to each title to form a query.
# Currently disabled prefixes: 'junior ', 'associate '
# ---------------------------------------------------------------
prefixes = ['entry level ', '']

# Currently disabled titles:
#   'data analyst', 'data scientist', 'data engineer',
#   'business analyst', 'financial analyst', 'macro analyst'
titles = ['data']

# ---------------------------------------------------------------
# Job locations.
# Currently disabled locations:
#   'New York, NY', 'Newark, NJ',
#   'Princeton, NJ', 'Jersey City, NJ', 'Trenton, NJ',
#   'Bridgewater, NJ', 'Somerville, NJ', 'Summit, NJ',
#   'Morristown, NJ', 'Edison, NJ', 'Metuchen, NJ',
#   'Hackensack, NJ',
#   'Philadelphia, PA',
#   'Stamford, CT', 'Greenwich, CT',
#   'Hartford, CT',
#   'New Haven, CT',
#   'Boston, MA',
#   'New York', 'New Jersey', 'New Hampshire', 'Pennsylvania', 'Connecticut',
#   'NY', 'NJ', 'NH', 'PA',
#   'CT', 'MA'
# ---------------------------------------------------------------
search_locations = ['remote']

# Number of searches: one per (prefix, title, location) combination.
n_searches = len(search_locations) * (len(prefixes) * len(titles))


In [28]:
# Maximum age of a posting, in days: the scraping loops below skip a posting
# when its parsed "days ago" number is greater than this value.
posted_date_cutoff = 7
# NOTE: input() returns a string, so convert it with int() before re-enabling
# the prompt below; the loops compare this value against numbers.
# posted_date_cutoff = input('What is our date cutoff for finding jobs? ')

# Indeed

In [47]:
# ==========================================================
# Scrape Indeed for jobs
# ==========================================================
#
# The search is four nested loops ("Four Fors: because who needs a
# head hunter anyways?"):
#   1. every location
#   2. every job title
#   3. every page of results
#   4. every job's description page
# so the work grows as
#   nlocations * ntitles * npages * njobdescriptions = niterations
#
# NOTE: we sleep before requesting a description page, because hitting
# the site too quickly gets the requests refused.

# ----- Accumulators filled by the search loop -----
jobs_indeed = []        # one list of fields per job posting
visited_links = set()   # description URLs already seen during this run

# ----- Performance monitors -----
individual_searches = []    # [location, title] of every search performed
search_times_indeed = []    # duration of each finished search
counter = 1                 # 1-based number of the next search
ttm = True                  # when True, progress messages are printed

# ----- Tunable URL fields -----
# Upper bound of the `start` offset; the loop pages through it 10 at a time.
location_results = 100
# Value sent as the `radius` query parameter ("how far out should we look?").
radius = 50

# Start the clock for the whole Indeed run.
start_time = time.time()

def _indeed_days_ago(when_text):
    """Turn the posting-age label of an Indeed card into a number of days.

    Returns the number found in the label, 0 for labels classified as
    "today"/"just posted", and -1 (after printing a notice) for text that
    has not been classified, so such entries stand out when sorting.
    """
    if not when_text:
        print(f'We have come across unclassified text in the posting date, it looks like:\t {when_text}.')
        return -1
    # Labels starting with "A" (e.g. "Active 3 days ago") carry the number after
    # the 7-character "Active " prefix; every other label leads with it.
    if when_text[0] == 'A':
        offset, zero_markers = 7, ('t', 'j')
    else:
        offset, zero_markers = 0, ('T', 'J')
    try:
        return pd.to_numeric(when_text[offset:offset + 2])
    except ValueError:
        # T for today and J for just posted both get a value of zero.
        if when_text[offset:offset + 1] in zero_markers:
            return 0
        print(f'We have come across unclassified text in the posting date, it looks like:\t {when_text}.')
        return -1


# Main two iterators for the search: every location, then every prefixed title.
for search_location in search_locations:
    for search_title in [(s + t) for s in prefixes for t in titles]:

        ####################################
        # Monitoring chunk: record the search and time the previous one.
        individual_searches.append([search_location, search_title])
        if counter > 1:
            search_time = time.time() - search_start
            search_times_indeed.append(search_time)
            if ttm:
                print(f'The search took {search_time} seconds.\n')
        if ttm:
            print(f'Searching {search_location} for {search_title} positions \tSearch number {counter} of {n_searches}')
        counter += 1
        search_start = time.time()
        #####################################

        # The encoded query pieces do not change from page to page.
        s = search_location.replace(' ', '%20').replace(',', '%2C')
        st = search_title.replace(' ', '%20')

        for nresults in range(0, location_results, 10):
            # Go to the url below and get the results from each page.
            url = f'https://www.indeed.com/jobs?q={st}+%2420%2C000&l={s}+&radius={radius}&start={str(nresults)}'
            try:
                results = get_job_results(url, 'resultsCol')
            except Exception:
                # Best effort: skip a page that fails to load. Unlike a bare
                # `except:`, this still lets Ctrl-C interrupt the run.
                continue
            # A page without a results container has nothing to parse.
            if results is None:
                continue
            job_elems = results.find_all('div', class_='jobsearch-SerpJobCard')

            for job in job_elems:
                # Make sure it's a valid post.
                title_elem = job.find('h2', class_='title')
                if title_elem is None:
                    continue
                title = title_elem.text.strip()

                # The link to the job description identifies postings already
                # scraped during this run; cards without a link are skipped.
                link_elem = job.find('a')
                link = link_elem.get('href') if link_elem is not None else None
                if not link:
                    continue
                description_url = f'https://www.indeed.com{link}'
                if description_url in visited_links:
                    continue
                visited_links.add(description_url)

                # The posting age is in the span with class "date"; without it
                # the posting cannot be checked against the cutoff.
                date_elem = job.find('span', class_='date')
                if date_elem is None:
                    continue
                when_elem = date_elem.text.strip()

                # A "+" in the label (e.g. "30+ days ago") is over our limit.
                if "+" in when_elem:
                    continue
                n_days_ago = _indeed_days_ago(when_elem)
                if n_days_ago > posted_date_cutoff:
                    continue

                # Company is required.
                company_elem = job.find('span', class_='company')
                if company_elem is None:
                    continue
                company_elem = company_elem.text.strip()

                # Location, salary and summary are optional: keep the stripped
                # text when present and None otherwise.
                location_elem = job.find('span', class_='location')
                if location_elem is not None:
                    location_elem = location_elem.text.strip()
                salary_elem = job.find('span', class_='salary')
                if salary_elem is not None:
                    salary_elem = salary_elem.text.strip()
                summary_elem = job.find('div', class_='summary')
                if summary_elem is not None:
                    summary_elem = summary_elem.text.strip()

                ##################################
                # Go to the Job Description Page #
                ##################################
                # Wait before hitting the page so they don't kick us out.
                time.sleep(2)
                try:
                    description_results = get_job_results(description_url, 'jobDescriptionText')
                except Exception:
                    continue
                # Reset the text for every job: when the description could not
                # be located, store None instead of reusing the previous job's text.
                if description_results is not None:
                    description_text = description_results.text.strip()
                else:
                    description_text = None

                # Fields in the same order as `columns`.
                info = [
                    search_title,
                    search_location,
                    location_elem,
                    title,
                    company_elem,
                    when_elem,
                    salary_elem,
                    summary_elem,
                    description_url,
                    description_text
                ]

                # Append the job info array to the array of job posting arrays.
                jobs_indeed.append(info)

# This adds the time for the final search.
search_time = time.time() - search_start
search_times_indeed.append(search_time)

# Calculate the total search time.
total_search_time = time.time() - start_time

if ttm:
    print(f'Indeed search completed in {total_search_time} seconds. {len(jobs_indeed)} results found (including duplicates).')

# Convert the array of job arrays into a data frame.
indeed_0 = pd.DataFrame(jobs_indeed, columns=columns)

# Build the file name once so every file written by this run carries the same
# date stamp.
indeed_file_name = datetime.today().strftime("%Y-%m-%d") + '_indeed.csv'
# NOTE(review): 'Z:/data/' is a hard-coded drive path; the run stops with an
# error when it is not available.
indeed_output_roots = ['data/', 'Z:/data/']

# Save raw data just in case.
for output_root in indeed_output_roots:
    indeed_0.to_csv(output_root + '0rawdata/' + indeed_file_name, index=False)

# Filter and modify the original and put it in a new variable.
indeed_1 = preprocess_jobs_recent(indeed_0)
for output_root in indeed_output_roots:
    indeed_1.to_csv(output_root + indeed_file_name, index=False)


# Save the monitoring data into their respective csv's for later analysis.
# collect_individual_search_info saves into a new csv which monitors performance of each individual search
# collect_individual_search_info('Indeed', individual_searches, indeed_0, indeed_1, search_times_indeed, total_search_time, 'Recent Jobs Only')
# save_search_stats('Indeed', len(jobs_indeed), len(indeed_1), total_search_time, n_searches, 'Recent Jobs Only')

# Tell us when we're done!
print(f'Returned {len(indeed_1)} unique entries from Indeed.')

               

Searching remote for entry level data positions 	Search number 1 of 2


KeyboardInterrupt: 

In [46]:
# Scratch cell used while working out how to parse the posting-age labels.
# The commented-out block below was an experiment with the exceptions raised
# by pd.to_numeric on non-numeric text.
# try:
#     pd.to_numeric(a)
# except ValueError:
#     print('letters not numbers')
# except:
#     print('hi')
# Not the last expression of the cell, so this length is computed but not shown.
len('Active 3 days ago.')

phrase = 'Active today'

# Index 7 is the first character after the 7-character "Active " prefix; the
# cell displays it ('t').
phrase[7]

't'

# Monster

In [48]:

# ----- Accumulators filled by the Monster search loop -----
visited_links = set()   # description links already seen during this run
jobs_monster = []       # one list of fields per job posting

# ----- Performance monitors -----
individual_searches = []    # [location, title] of every search performed
search_times_monster = []   # duration of each finished search
counter = 1                 # 1-based number of the next search
ttm = True                  # when True, progress messages are printed

# Start the clock for the whole Monster run.
start_time = time.time()

def _monster_days_ago(when_text):
    """Turn the posting-age label of a Monster card into a number of days.

    Returns 0 for labels starting with "P" or "J", None for labels starting
    with "+" (over the limit, to be skipped), the leading number otherwise,
    and -1 (after printing a notice) for text that cannot be parsed.
    """
    lead = when_text[:1]
    # NOTE(review): presumably "Posted today" / "Just posted" - confirm against the site.
    if lead in ('P', 'J'):
        return 0
    if lead == '+':
        return None
    try:
        if when_text:
            return pd.to_numeric(when_text[:2])
    except ValueError:
        pass
    print(f'We have come across unclassified text in the posting date, it looks like:\t {when_text}.')
    return -1


for search_location in search_locations:
    for search_title in [(s + t) for s in prefixes for t in titles]:

        ####################################
        # Monitoring chunk: record the search and time the previous one.
        individual_searches.append([search_location, search_title])
        if counter > 1:
            search_time = time.time() - search_start
            search_times_monster.append(search_time)
            if ttm:
                print(f'The search took {search_time} seconds.\n')
        if ttm:
            print(f'Searching {search_location} for {search_title} positions \tSearch number {counter} of {n_searches}')
        counter += 1
        search_start = time.time()
        #####################################

        sl = search_location.replace(' ', '-').replace(',', '__2C')
        st = search_title.replace(' ', '-')
        URL = f'https://www.monster.com/jobs/search/?q={st}&where={sl}&stpage=1&page=10'
        try:
            results = get_job_results(URL, 'ResultsContainer')
        except Exception:
            # Best effort: skip a search that fails to load. Unlike a bare
            # `except:`, this still lets Ctrl-C interrupt the run.
            continue
        if results is None:
            continue

        job_elems = results.find_all('section', class_='card-content')
        for job in job_elems:
            # If the title isn't present, continue to the next card.
            title_elem = job.find('h2', class_='title')
            if title_elem is None:
                continue
            title = title_elem.text.strip()

            # If it was posted over our limit we don't want it. Cards without
            # a <time> element cannot be checked and are skipped.
            time_elem = job.find('time')
            if time_elem is None:
                continue
            when_elem = _monster_days_ago(time_elem.text.strip())
            if when_elem is None or when_elem > posted_date_cutoff:
                continue

            #####################
            # Get the link for the page with the full job description, then
            # skip it when we have already been to that page (or there is none).
            link_elem = job.find('a')
            link = link_elem.get('href') if link_elem is not None else None
            if not link:
                continue
            if link in visited_links:
                continue
            visited_links.add(link)

            # Company and location are optional. Reset them for every card so a
            # missing element stores None instead of the previous card's value.
            company_elem = job.find('div', class_='company')
            company = company_elem.text.strip() if company_elem is not None else None

            location_elem = job.find('div', class_='location')
            location = location_elem.text.strip() if location_elem is not None else None

            # Wait some time, then go to the job description page and get the relevant info.
            time.sleep(2)
            try:
                description_results = get_job_results(link, 'main-content')
            except Exception:
                continue

            # Details stay None when the description page has no main content.
            salary = None
            job_type = None
            description = None
            if description_results is not None:
                # Get the job details (Salary, etc).
                for detail in description_results.find_all('div', class_='detail-row'):
                    dt = detail.text.strip()
                    if 'Salary' in dt:
                        salary = dt
                    if 'Job Type' in dt:
                        job_type = dt

                # Get the job description.
                description_elem = description_results.find('div', class_='job-description')
                if description_elem is not None:
                    description = description_elem.text.strip()

            # Bundle all the info. The list is later labelled with the shared
            # `columns`, so job_type ends up in the column named 'summary'.
            item = [
                search_title,
                search_location,
                location,
                title,
                company,
                when_elem,
                salary,
                job_type,
                link,
                description
            ]
            jobs_monster.append(item)

# This adds the time for the final search.
search_time = time.time() - search_start
search_times_monster.append(search_time)

# Calculate the total search time.
total_search_time_monster = time.time() - start_time

if ttm:
    print(f'Monster search took {total_search_time_monster} seconds to retreive {len(jobs_monster)} total results (including some duplicates).')

# Convert array of arrays into data frame.
monster_0 = pd.DataFrame(jobs_monster, columns=columns)

# Build the file name once so every file written by this run carries the same
# date stamp.
monster_file_name = datetime.today().strftime("%Y-%m-%d") + '_monster.csv'
# NOTE(review): 'Z:/data/' is a hard-coded drive path; the run stops with an
# error when it is not available.
monster_output_roots = ['data/', 'Z:/data/']

# Save raw data just in case.
for output_root in monster_output_roots:
    monster_0.to_csv(output_root + '0rawdata/' + monster_file_name, index=False)

# Filter and modify the data frame going forward.
monster_1 = preprocess_jobs_recent(monster_0)
for output_root in monster_output_roots:
    monster_1.to_csv(output_root + monster_file_name, index=False)

# Save the monitoring data into their respective csv's for later analysis.
# collect_individual_search_info saves into a new csv which monitors performance of each individual search
# (the Monster total is total_search_time_monster, not Indeed's total_search_time)
# collect_individual_search_info('Monster', individual_searches, monster_0, monster_1, search_times_monster, total_search_time_monster, 'Recent Jobs Only')
# save_search_stats('Monster', len(monster_0), len(monster_1), total_search_time_monster, n_searches, 'Recent Jobs Only')

print(f'Returned {len(monster_1)} unique entries from Monster.com')
    


Searching remote for entry level data positions 	Search number 1 of 2
The search took 2.044952154159546 seconds.

Searching remote for data positions 	Search number 2 of 2
Monster search took 3.428002119064331 seconds to retreive 0 total results (including some duplicates).
Returned 0 unique entries from Monster.com


In [51]:
# Disabled step: combine the Indeed and Monster frames and drop rows that
# share a description.
# df_0 = pd.concat([indeed_1, monster_1], ignore_index=True)
# df_1 = df_0.drop_duplicates(subset=['description'])
# # df = PreprocessJobs(df)
# print('Both pre duplicate removal:', len(df_0), '\n', 'Post duplicate removal:', len(df_1))

# df = df_1
# Debug inspection of values left over from the Monster cell. Only the last
# expression of a cell is displayed, so `results` is evaluated but not shown.
results
URL

'https://www.monster.com/jobs/search/?q=data&where=remote&stpage=1&page=10'

In [24]:
# df['description'] = df['description'].str.replace(',', ' ')

In [25]:
# df.to_csv('data/' + datetime.today().strftime("%Y-%m-%d") + '_recentjobs.tsv', sep='\t', index=False)