In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import re
import os
import time

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import spacy

from bs4 import BeautifulSoup
from urllib.request import urlopen, Request

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.keys import Keys

In [2]:
# Functions
def clear_auto(element):
    auto_input = str(element.get_attribute("value"))
    auto_length = len(auto_input)
    # Delete everything
    for i in range(auto_length):
        element.send_keys(Keys.BACKSPACE)
        
def indeed_query(title, location, dirty_jobs):
    # Go to Indeed homepage
    global browser
    browser.get("https://www.indeed.com/")
    print('Scraping:', title, '-', location)
    # Find input fields
    what = browser.find_element_by_name("q")
    what.send_keys(str(title))
    where = browser.find_element_by_name("l")
    clear_auto(where)
    where.send_keys(str(location))

    # Click Search
    button = browser.find_element_by_class_name("icl-Button")
    button.click()

    # Collect pages to scrape
    url = browser.current_url

    # Get soup for the results page
    html = urlopen(url)
    soup = BeautifulSoup(html, 'html.parser')

    # extract job results
    for link in soup.find_all('a', {'class':'jobtitle turnstileLink'}):
        dirty_jobs.append(link.attrs['href'])
        
def deep_indeed_query(title, location, dirty_jobs):
    # Go to Indeed homepage
    global browser
    browser.get("https://www.indeed.com/")
    print('Scraping:', title, '-', location)
    # Find input fields
    what = browser.find_element_by_name("q")
    what.send_keys(str(title))
    where = browser.find_element_by_name("l")
    clear_auto(where)
    where.send_keys(str(location))

    # Click Search
    button = browser.find_element_by_class_name("icl-Button")
    button.click()

    for i in range(3):
        # Collect pages to scrape
        url = browser.current_url

        # Get soup for the results page
        html = urlopen(url)
        soup = BeautifulSoup(html, 'html.parser')

        # extract job results
        for link in soup.find_all('a', {'class':'jobtitle turnstileLink'}):
            dirty_jobs.append(link.attrs['href'])
            
        # click on next page
        try:
            next_page = browser.find_element_by_class_name("np")
            next_page.click()
        except NoSuchElementException:
            break
        
def clean_jobs(jobs):
    # Find unique values
    jobs = list(set(jobs))
    
    # Clean clean clean
    for job in jobs:
        if job.startswith('/pagead'):
            jobs.remove(job)

    for job in jobs:
        if job.startswith('/pagead'):
            jobs.remove(job)

    for job in jobs:
        if job.startswith('/pagead'):
            jobs.remove(job)

    for job in jobs:
        if job.startswith('/pagead'):
            jobs.remove(job)

    for job in jobs:
        if job.startswith('/pagead'):
            jobs.remove(job)

    for job in jobs:
        if job.startswith('/pagead'):
            jobs.remove(job)
            
    # add prefix
    jobs = ['https://www.indeed.com' + job for job in jobs]
            
    return jobs

def indeed_crawl(titles, locations, dirty_jobs):

    # Loop locations
    for location in locations:
        # Loop titles
        for title in titles:
            deep_indeed_query(title, location, dirty_jobs)

    # Clean it up
    jobs = clean_jobs(dirty_jobs)
    return jobs

def clean_description(text):
    # Clean
    clean_text = BeautifulSoup(text, "lxml").text
    # clean_text = clean_text[2:]
    clean_text = re.sub(r'\\n', ' ', clean_text)
    clean_text = re.sub(r'/', ' ', clean_text)
    clean_text = re.sub(r'[^a-zA-Z ^0-9]', '', clean_text)
    
    return clean_text

def tokenize(text):
    # Tokenize
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if (token.is_stop != True) and (token.is_punct != True)]
    if ' ' in tokens:
        tokens.remove(' ')
    if '  ' in tokens:
        tokens.remove('  ')
    return tokens

def fit_for_nn(text_list):
    # Create a vocab and get word counts per doc
    sparse = tfidf.fit_transform(text)
    # send to df
    tfidf_dtm = pd.DataFrame(sparse.todense(), columns = tfidf.get_feature_names())
    return tfidf_dtm
    
def process_query_for_nn(string):
    # Create a vocab and get word counts per doc
    sparse = tfidf.transform([query])
    query_dtm = pd.DataFrame(sparse.todense(), columns = tfidf.get_feature_names())
    return query_dtm

In [3]:
def deep_indeed_query(title, location, dirty_jobs):
    # Go to Indeed homepage
    global browser
    browser.get("https://www.indeed.com/")
    print('Scraping:', title, '-', location)
    # Find input fields
    what = browser.find_element_by_name("q")
    what.send_keys(str(title))
    where = browser.find_element_by_name("l")
    clear_auto(where)
    where.send_keys(str(location))

    # Click Search
    button = browser.find_element_by_class_name("icl-Button")
    button.click()

    for i in range(3):
        # Collect pages to scrape
        url = browser.current_url

        # Get soup for the results page
        html = urlopen(url)
        soup = BeautifulSoup(html, 'html.parser')

        # extract job results
        for link in soup.find_all('a', {'class':'jobtitle turnstileLink'}):
            dirty_jobs.append(link.attrs['href'])
            
        # exit popup if it comes up
        try:
            popup_x = browser.find_element_by_class_name("icl-Icon icl-Icon--sm  icl-Icon--black close")
            popup_x.click()
        except NoSuchElementException:
            break
        
        # click on next page
        try:
            next_page = browser.find_element_by_class_name("np")
            next_page.click()
        except NoSuchElementException:
            break
            
# <svg role="img" class="icl-Icon icl-Icon--sm  icl-Icon--black close"

## Spin-Up Web Browser

In [10]:
# Disable auto-complete
profile = webdriver.FirefoxProfile()
profile.set_preference("browser.formfill.enable", "false")
    
# Create new Instance of Chrome in incognito mode
browser = webdriver.Firefox(executable_path='../../Selenium/geckodriver')

In [11]:
# Titles and locations
titles = ["Data Engineer", "Data Analyst", "Data Scientist"]
locations = ["Raleigh-Durham, NC", "Charlotte, NC", "Roanoke, VA", "Charlottesville, VA",
             "Greensboro, NC", "Winston-Salem, NC", "Annapolis, MD", "Philadelphia, PA",
             "Tyson's Corner, VA"]

In [12]:
# Instantiate dirty_jobs
dirty_jobs = []

# Crawl over indeed.com
jobs = indeed_crawl(titles, locations, dirty_jobs)

Scraping: Data Engineer - Raleigh-Durham, NC
Scraping: Data Analyst - Raleigh-Durham, NC
Scraping: Data Scientist - Raleigh-Durham, NC
Scraping: Data Engineer - Charlotte, NC
Scraping: Data Analyst - Charlotte, NC
Scraping: Data Scientist - Charlotte, NC
Scraping: Data Engineer - Roanoke, VA
Scraping: Data Analyst - Roanoke, VA
Scraping: Data Scientist - Roanoke, VA
Scraping: Data Engineer - Charlottesville, VA
Scraping: Data Analyst - Charlottesville, VA
Scraping: Data Scientist - Charlottesville, VA
Scraping: Data Engineer - Greensboro, NC
Scraping: Data Analyst - Greensboro, NC
Scraping: Data Scientist - Greensboro, NC
Scraping: Data Engineer - Winston-Salem, NC
Scraping: Data Analyst - Winston-Salem, NC
Scraping: Data Scientist - Winston-Salem, NC
Scraping: Data Engineer - Annapolis, MD
Scraping: Data Analyst - Annapolis, MD
Scraping: Data Scientist - Annapolis, MD
Scraping: Data Engineer - Philadelphia, PA


StaleElementReferenceException: Message: The element reference of <input id="text-input-what" class="icl-TextInput-control icl-TextInput-control--whatWhere" name="q" type="text"> is stale; either the element is no longer attached to the DOM, it is not in the current frame context, or the document has been refreshed


<button class="icl-Button icl-Button--primary icl-Button--md icl-WhatWhere-button" size="md" type="submit">Find Jobs</button>

In [13]:
texts = []

In [14]:
for job in jobs:
    html = urlopen(job)
    soup = BeautifulSoup(html, 'html.parser')
    
    texts.append(soup.find_all('div', {'class':'jobsearch-jobDescriptionText'}))

NameError: name 'jobs' is not defined

In [15]:
texts = [str(text)[1:-1] for text in texts]

In [None]:
# Send to df
df = pd.DataFrame(texts, columns = ['description'])
df['jobs'] = jobs
df.head()

In [None]:
# NLP Model
nlp = spacy.load("en_core_web_md")

In [None]:
# Clean and tokenize the descriptions
df['tokens'] = df['description'].apply(clean_description).apply(tokenize)

In [None]:
# send clean text to list
text = df['description'].apply(clean_description).tolist()

In [None]:
# Instantiate Vectorizer
tfidf = TfidfVectorizer(stop_words = 'english')

In [None]:
tfidf_dtm = fit_for_nn(text)

In [None]:
# Create a vocab and get word counts per doc
sparse = tfidf.fit_transform(text)

In [None]:
# send to df
tfidf_dtm = pd.DataFrame(sparse.todense(), columns = tfidf.get_feature_names())

## Model

In [None]:
# Instantiate model
nn = NearestNeighbors(n_neighbors=20, algorithm='ball_tree')
nn.fit(tfidf_dtm)

In [None]:
query = """I use python to collect and scrape data from the web. I can set up integrated data pipelines
        pipeline to collect data from different sources. I train machine learning models using sklearn, 
        and tensorflow with keras. BeautifulSoup and Selenium. BeautifulSoup and Selenium.
        BeautifulSoup and Selenium. BeautifulSoup and Selenium. I can give results to developers using Flask apps
        and Flask APIs API. I can access APIs API and RSS feeds. I can also use SQL, particularly ElephantSQL
        and Postgres. I like venture capital, finance and business consulting. I love to work with
        natural language processing. Looking for a junior or entry level entry-level or mid level mid-level
        venture capital, finance and business consulting venture capital, finance and business consulting
        venture capital, business data solutions, finance and business consulting, 
        create visualizations with tableau"""

# query = """I use knowledge of process and chemical engineering to help businesses optimize production,
#         often making use of Statistical Process Control.  Background in math, science, organic chemistry.
#         Interested in the environmental waste section, as chemistry forms the backbone of much of that work."""

query_dtm = process_query_for_nn(query)

In [None]:
# Query for closest neighbors
results = nn.kneighbors(query_dtm)[1][0].tolist()

In [None]:
# Send to list
job_urls = df['jobs'][results].tolist()

In [None]:
def make_clickable(val):
    return '<a href="{}">{}</a>'.format(val, val)

jobs_df = pd.DataFrame(job_urls)

jobs_df.style.format(make_clickable)

In [None]:
['<a href="{' + link +'}">{' + link + '}</a>' for link in job_urls]