In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import re
import os
import time

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import spacy

from bs4 import BeautifulSoup
from urllib.request import urlopen, Request

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.keys import Keys

In [2]:
# Functions
def clear_auto(element):
    auto_input = str(element.get_attribute("value"))
    auto_length = len(auto_input)
    # Delete everything
    for i in range(auto_length):
        element.send_keys(Keys.BACKSPACE)
        
def indeed_query(title, location, dirty_jobs):
    # Go to Indeed homepage
    global browser
    browser.get("https://www.indeed.com/")
    print('Scraping:', title, '-', location)
    # Find input fields
    what = browser.find_element_by_name("q")
    what.send_keys(str(title))
    where = browser.find_element_by_name("l")
    clear_auto(where)
    where.send_keys(str(location))

    # Click Search
    button = browser.find_element_by_class_name("icl-Button")
    button.click()

    # Collect pages to scrape
    url = browser.current_url

    # Get soup for the results page
    html = urlopen(url)
    soup = BeautifulSoup(html, 'html.parser')

    # extract job results
    for link in soup.find_all('a', {'class':'jobtitle turnstileLink'}):
        dirty_jobs.append(link.attrs['href'])
        
def deep_indeed_query(title, location, dirty_jobs):
    # Go to Indeed homepage
    global browser
    browser.get("https://www.indeed.com/")
    print('Scraping:', title, '-', location)
    # Find input fields
    what = browser.find_element_by_name("q")
    what.send_keys(str(title))
    where = browser.find_element_by_name("l")
    clear_auto(where)
    where.send_keys(str(location))

    # Click Search
    button = browser.find_element_by_class_name("icl-Button")
    button.click()

    for i in range(3):
        # Collect pages to scrape
        url = browser.current_url

        # Get soup for the results page
        html = urlopen(url)
        soup = BeautifulSoup(html, 'html.parser')

        # extract job results
        for link in soup.find_all('a', {'class':'jobtitle turnstileLink'}):
            dirty_jobs.append(link.attrs['href'])
            
        # click on next page
        try:
            next_page = browser.find_element_by_class_name("np")
            next_page.click()
        except NoSuchElementException:
            break
        
def clean_jobs(jobs):
    # Find unique values
    jobs = list(set(jobs))
    
    # Clean clean clean
    for job in jobs:
        if job.startswith('/pagead'):
            jobs.remove(job)

    for job in jobs:
        if job.startswith('/pagead'):
            jobs.remove(job)

    for job in jobs:
        if job.startswith('/pagead'):
            jobs.remove(job)

    for job in jobs:
        if job.startswith('/pagead'):
            jobs.remove(job)

    for job in jobs:
        if job.startswith('/pagead'):
            jobs.remove(job)

    for job in jobs:
        if job.startswith('/pagead'):
            jobs.remove(job)
            
    # add prefix
    jobs = ['https://www.indeed.com' + job for job in jobs]
            
    return jobs

def indeed_crawl(titles, locations, dirty_jobs):

    # Loop locations
    for location in locations:
        # Loop titles
        for title in titles:
            deep_indeed_query(title, location, dirty_jobs)

    # Clean it up
    jobs = clean_jobs(dirty_jobs)
    return jobs

def clean_description(text):
    # Clean
    clean_text = BeautifulSoup(text, "lxml").text
    # clean_text = clean_text[2:]
    clean_text = re.sub(r'\\n', ' ', clean_text)
    clean_text = re.sub(r'/', ' ', clean_text)
    clean_text = re.sub(r'[^a-zA-Z ^0-9]', '', clean_text)
    
    return clean_text

def tokenize(text):
    # Tokenize
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if (token.is_stop != True) and (token.is_punct != True)]
    if ' ' in tokens:
        tokens.remove(' ')
    if '  ' in tokens:
        tokens.remove('  ')
    return tokens

def fit_for_nn(text_list):
    # Create a vocab and get word counts per doc
    sparse = tfidf.fit_transform(text)
    # send to df
    tfidf_dtm = pd.DataFrame(sparse.todense(), columns = tfidf.get_feature_names())
    return tfidf_dtm
    
def process_query_for_nn(string):
    # Create a vocab and get word counts per doc
    sparse = tfidf.transform([query])
    query_dtm = pd.DataFrame(sparse.todense(), columns = tfidf.get_feature_names())
    return query_dtm

In [3]:
def deep_indeed_query(title, location, dirty_jobs):
    # Go to Indeed homepage
    global browser
    browser.get("https://www.indeed.com/")
    print('Scraping:', title, '-', location)
    # Find input fields
    what = browser.find_element_by_name("q")
    what.send_keys(str(title))
    where = browser.find_element_by_name("l")
    clear_auto(where)
    where.send_keys(str(location))

    # Click Search
    button = browser.find_element_by_class_name("icl-Button")
    button.click()

    for i in range(3):
        # Collect pages to scrape
        url = browser.current_url

        # Get soup for the results page
        html = urlopen(url)
        soup = BeautifulSoup(html, 'html.parser')

        # extract job results
        for link in soup.find_all('a', {'class':'jobtitle turnstileLink'}):
            dirty_jobs.append(link.attrs['href'])
            
        # exit popup if it comes up
        try:
            popup_x = browser.find_element_by_class_name("icl-Icon icl-Icon--sm  icl-Icon--black close")
            popup_x.click()
        except NoSuchElementException:
            break
        
        # click on next page
        try:
            next_page = browser.find_element_by_class_name("np")
            next_page.click()
        except NoSuchElementException:
            break
            
# <svg role="img" class="icl-Icon icl-Icon--sm  icl-Icon--black close"

## Spin-Up Web Browser

In [4]:
# Specifying incognito mode as you launch your browser[OPTIONAL]
option = webdriver.ChromeOptions()
option.add_argument("--incognito")
    
# Create new Instance of Chrome in incognito mode
browser = webdriver.Chrome(executable_path='../Documents/Web Drivers/chromedriver', chrome_options=option)

  


In [5]:
# Titles and locations
titles = ["Registered Behavior Technician", "Behavioral Sciences Intern", "Human Resources"]
locations = ["Raleigh-Durham, NC", "Charlotte, NC", "Roanoke, VA", "Charlottesville, VA",
             "Greensboro, NC", "Winston-Salem, NC", "Annapolis, MD", "Philadelphia, PA",
             "Pittsburgh, PA", "Herndon, VA", "Washington, DC", "Arlington, VA"]

In [6]:
# Instantiate dirty_jobs
dirty_jobs = []

# Crawl over indeed.com
jobs = indeed_crawl(titles, locations, dirty_jobs)

Scraping: Registered Behavior Technician - Raleigh-Durham, NC
Scraping: Behavioral Sciences Intern - Raleigh-Durham, NC
Scraping: Human Resources - Raleigh-Durham, NC
Scraping: Registered Behavior Technician - Charlotte, NC
Scraping: Behavioral Sciences Intern - Charlotte, NC
Scraping: Human Resources - Charlotte, NC
Scraping: Registered Behavior Technician - Roanoke, VA
Scraping: Behavioral Sciences Intern - Roanoke, VA
Scraping: Human Resources - Roanoke, VA
Scraping: Registered Behavior Technician - Charlottesville, VA
Scraping: Behavioral Sciences Intern - Charlottesville, VA
Scraping: Human Resources - Charlottesville, VA
Scraping: Registered Behavior Technician - Greensboro, NC
Scraping: Behavioral Sciences Intern - Greensboro, NC
Scraping: Human Resources - Greensboro, NC
Scraping: Registered Behavior Technician - Winston-Salem, NC
Scraping: Behavioral Sciences Intern - Winston-Salem, NC
Scraping: Human Resources - Winston-Salem, NC
Scraping: Registered Behavior Technician - Ann

<button class="icl-Button icl-Button--primary icl-Button--md icl-WhatWhere-button" size="md" type="submit">Find Jobs</button>

In [7]:
texts = []

In [8]:
for job in jobs:
    html = urlopen(job)
    soup = BeautifulSoup(html, 'html.parser')
    
    texts.append(soup.find_all('div', {'class':'jobsearch-jobDescriptionText'}))

In [9]:
texts = [str(text)[1:-1] for text in texts]

In [10]:
# Send to df
df = pd.DataFrame(texts, columns = ['description'])
df['jobs'] = jobs
df.head()

Unnamed: 0,description,jobs
0,"<div class=""jobsearch-jobDescriptionText"" id=""...",https://www.indeed.com/rc/clk?jk=7200746b122d7...
1,"<div class=""jobsearch-jobDescriptionText"" id=""...",https://www.indeed.com/rc/clk?jk=e7dd3396a371c...
2,"<div class=""jobsearch-jobDescriptionText"" id=""...","https://www.indeed.com/company/Bonitz,-Inc/job..."
3,"<div class=""jobsearch-jobDescriptionText"" id=""...",https://www.indeed.com/rc/clk?jk=8d9a483cbf945...
4,"<div class=""jobsearch-jobDescriptionText"" id=""...",https://www.indeed.com/rc/clk?jk=d306041351e8e...


In [11]:
# NLP Model
nlp = spacy.load("en_core_web_md")

In [12]:
# Clean and tokenize the descriptions
df['tokens'] = df['description'].apply(clean_description).apply(tokenize)

In [13]:
# send clean text to list
text = df['description'].apply(clean_description).tolist()

In [14]:
# Instantiate Vectorizer
tfidf = TfidfVectorizer(stop_words = 'english')

In [15]:
tfidf_dtm = fit_for_nn(text)

In [16]:
# Create a vocab and get word counts per doc
sparse = tfidf.fit_transform(text)

In [17]:
# send to df
tfidf_dtm = pd.DataFrame(sparse.todense(), columns = tfidf.get_feature_names())

## Model

In [28]:
# Instantiate model
nn = NearestNeighbors(n_neighbors=20, algorithm='ball_tree')
nn.fit(tfidf_dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                 radius=1.0)

In [29]:
# query = """I use python to collect and scrape data from the web. I can set up integrated data pipelines
#         pipeline to collect data from different sources. I train machine learning models using sklearn, 
#         and tensorflow with keras. BeautifulSoup and Selenium. BeautifulSoup and Selenium.
#         BeautifulSoup and Selenium. BeautifulSoup and Selenium. I can give results to developers using Flask apps
#         and Flask APIs API. I can access APIs API and RSS feeds. I can also use SQL, particularly ElephantSQL
#         and Postgres. I like venture capital, finance and business consulting. I love to work with
#         natural language processing. Looking for a junior or entry level entry-level or mid level mid-level
#         venture capital, finance and business consulting venture capital, finance and business consulting
#         venture capital, finance and business consulting venture capital, finance and business consulting"""

query = """behavioral sciences intern working with children or adults with disablities behavioral therapy
        graphic design personalized personal disabled"""

query_dtm = process_query_for_nn(query)

In [30]:
# Query for closest neighbors
results = nn.kneighbors(query_dtm)[1][0].tolist()

In [31]:
# Send to list
job_urls = df['jobs'][results].tolist()

In [32]:
def make_clickable(val):
    return '<a href="{}">{}</a>'.format(val, val)

jobs_df = pd.DataFrame(job_urls)

jobs_df.style.format(make_clickable)

Unnamed: 0,0
0,https://www.indeed.com/rc/clk?jk=fdeb02a81f93a65a&fccid=3a71a4d2f7990a25&vjs=3
1,https://www.indeed.com/rc/clk?jk=c3c9ee70a6cd9436&fccid=fd03a1a845b02aae&vjs=3
2,https://www.indeed.com/rc/clk?jk=d577cfddd0ae0757&fccid=36855770b7ed7408&vjs=3
3,https://www.indeed.com/rc/clk?jk=9aaa252f2a66548e&fccid=647dd81b1cc7dbb9&vjs=3
4,"https://www.indeed.com/company/Within-Me-Therapy,-LLC/jobs/Registered-Behavior-Technician-3937b3ddc15fd35d?fccid=caa68c1767899b25&vjs=3"
5,https://www.indeed.com/rc/clk?jk=66b2d8024633723d&fccid=1b2c71e3bba28fd8&vjs=3
6,https://www.indeed.com/rc/clk?jk=5b407721f1cf7761&fccid=59f0e79f9587b3be&vjs=3
7,https://www.indeed.com/rc/clk?jk=2b26b4bde669090f&fccid=1b2c71e3bba28fd8&vjs=3
8,https://www.indeed.com/rc/clk?jk=2877d8e5332b5bd8&fccid=dd977a85793e145b&vjs=3
9,https://www.indeed.com/rc/clk?jk=20b34b3780ac1204&fccid=4251c565f41817c0&vjs=3


In [33]:
['<a href="{' + link +'}">{' + link + '}</a>' for link in job_urls]

['<a href="{https://www.indeed.com/rc/clk?jk=fdeb02a81f93a65a&fccid=3a71a4d2f7990a25&vjs=3}">{https://www.indeed.com/rc/clk?jk=fdeb02a81f93a65a&fccid=3a71a4d2f7990a25&vjs=3}</a>',
 '<a href="{https://www.indeed.com/rc/clk?jk=c3c9ee70a6cd9436&fccid=fd03a1a845b02aae&vjs=3}">{https://www.indeed.com/rc/clk?jk=c3c9ee70a6cd9436&fccid=fd03a1a845b02aae&vjs=3}</a>',
 '<a href="{https://www.indeed.com/rc/clk?jk=d577cfddd0ae0757&fccid=36855770b7ed7408&vjs=3}">{https://www.indeed.com/rc/clk?jk=d577cfddd0ae0757&fccid=36855770b7ed7408&vjs=3}</a>',
 '<a href="{https://www.indeed.com/rc/clk?jk=9aaa252f2a66548e&fccid=647dd81b1cc7dbb9&vjs=3}">{https://www.indeed.com/rc/clk?jk=9aaa252f2a66548e&fccid=647dd81b1cc7dbb9&vjs=3}</a>',
 '<a href="{https://www.indeed.com/company/Within-Me-Therapy,-LLC/jobs/Registered-Behavior-Technician-3937b3ddc15fd35d?fccid=caa68c1767899b25&vjs=3}">{https://www.indeed.com/company/Within-Me-Therapy,-LLC/jobs/Registered-Behavior-Technician-3937b3ddc15fd35d?fccid=caa68c1767899b25