In [29]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import re
import os
import time

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import spacy

from bs4 import BeautifulSoup
from urllib.request import urlopen, Request

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys

In [126]:
# Functions
def clear_auto(element):
    auto_input = str(element.get_attribute("value"))
    auto_length = len(auto_input)
    # Delete everything
    for i in range(auto_length):
        element.send_keys(Keys.BACKSPACE)
        
def indeed_query(title, location, dirty_jobs):
    # Go to Indeed homepage
    global browser
    browser.get("https://www.indeed.com/")
    print('Scraping:', title, '-', location)
    # Find input fields
    what = browser.find_element_by_name("q")
    what.send_keys(str(title))
    where = browser.find_element_by_name("l")
    clear_auto(where)
    where.send_keys(str(location))

    # Click Search
    button = browser.find_element_by_class_name("icl-Button")
    button.click()

    # Collect pages to scrape
    url = browser.current_url

    # Get soup for the results page
    html = urlopen(url)
    soup = BeautifulSoup(html, 'html.parser')

    # extract job results
    for link in soup.find_all('a', {'class':'jobtitle turnstileLink'}):
        dirty_jobs.append(link.attrs['href'])
        
def clean_jobs(jobs):
    # Clean clean clean
    for job in jobs:
        if job.startswith('/pagead'):
            jobs.remove(job)

    for job in jobs:
        if job.startswith('/pagead'):
            jobs.remove(job)

    for job in jobs:
        if job.startswith('/pagead'):
            jobs.remove(job)

    for job in jobs:
        if job.startswith('/pagead'):
            jobs.remove(job)

    for job in jobs:
        if job.startswith('/pagead'):
            jobs.remove(job)

    for job in jobs:
        if job.startswith('/pagead'):
            jobs.remove(job)
            
    # add prefix
    jobs = ['https://www.indeed.com' + job for job in jobs]
            
    return jobs

def indeed_crawl(titles, locations, dirty_jobs):

    # Loop locations
    for location in locations:
        # Loop titles
        for title in titles:
            indeed_query(title, location, dirty_jobs)

    # Clean it up
    jobs = clean_jobs(dirty_jobs)
    return jobs

def clean_description(text):
    # Clean
    clean_text = BeautifulSoup(text, "lxml").text
    # clean_text = clean_text[2:]
    clean_text = re.sub(r'\\n', ' ', clean_text)
    clean_text = re.sub(r'/', ' ', clean_text)
    clean_text = re.sub(r'[^a-zA-Z ^0-9]', '', clean_text)
    
    return clean_text

def tokenize(text):
    # Tokenize
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if (token.is_stop != True) and (token.is_punct != True)]
    if ' ' in tokens:
        tokens.remove(' ')
    if '  ' in tokens:
        tokens.remove('  ')
    return tokens

def fit_for_nn(text_list):
    # Create a vocab and get word counts per doc
    sparse = tfidf.fit_transform(text)
    # send to df
    tfidf_dtm = pd.DataFrame(sparse.todense(), columns = tfidf.get_feature_names())
    return tfidf_dtm
    
def process_query_for_nn(string):
    # Create a vocab and get word counts per doc
    sparse = tfidf.transform([query])
    query_dtm = pd.DataFrame(sparse.todense(), columns = tfidf.get_feature_names())
    return query_dtm

## Spin-Up Web Browser

In [127]:
# Specifying incognito mode as you launch your browser[OPTIONAL]
option = webdriver.ChromeOptions()
option.add_argument("--incognito")
    
# Create new Instance of Chrome in incognito mode
browser = webdriver.Chrome(executable_path='../Documents/Web Drivers/chromedriver', chrome_options=option)

  


In [128]:
# Titles and locations
titles = ["Data Analyst", "Data Scientist"]
locations = ["Raleigh-Durham, NC", "Charlotte, NC", "Roanoke, VA", "Charlottesville, VA",
             "Greensboro, NC", "Winston-Salem, NC", "Annapolis, MD", "Philadelphia, PA",
             "Pittsburgh, PA", "Manassas, VA", "Herndon, VA", "Washington, DC", "Arlington, VA"]

In [129]:
# Instantiate dirty_jobs
dirty_jobs = []

# Crawl over indeed.com
jobs = indeed_crawl(titles, locations, dirty_jobs)

Scraping: Data Analyst - Raleigh-Durham, NC
Scraping: Data Scientist - Raleigh-Durham, NC
Scraping: Data Analyst - Charlotte, NC
Scraping: Data Scientist - Charlotte, NC
Scraping: Data Analyst - Roanoke, VA
Scraping: Data Scientist - Roanoke, VA
Scraping: Data Analyst - Charlottesville, VA
Scraping: Data Scientist - Charlottesville, VA
Scraping: Data Analyst - Greensboro, NC
Scraping: Data Scientist - Greensboro, NC
Scraping: Data Analyst - Winston-Salem, NC
Scraping: Data Scientist - Winston-Salem, NC
Scraping: Data Analyst - Annapolis, MD
Scraping: Data Scientist - Annapolis, MD
Scraping: Data Analyst - Philadelphia, PA
Scraping: Data Scientist - Philadelphia, PA
Scraping: Data Analyst - Pittsburgh, PA
Scraping: Data Scientist - Pittsburgh, PA
Scraping: Data Analyst - Manassas, VA
Scraping: Data Scientist - Manassas, VA
Scraping: Data Analyst - Herndon, VA
Scraping: Data Scientist - Herndon, VA
Scraping: Data Analyst - Washington, DC
Scraping: Data Scientist - Washington, DC
Scraping

<button class="icl-Button icl-Button--primary icl-Button--md icl-WhatWhere-button" size="md" type="submit">Find Jobs</button>

In [130]:
texts = []

In [131]:
for job in jobs:
    html = urlopen(job)
    soup = BeautifulSoup(html, 'html.parser')
    
    texts.append(soup.find_all('div', {'class':'jobsearch-jobDescriptionText'}))

In [132]:
texts = [str(text)[1:-1] for text in texts]

In [133]:
# Send to df
df = pd.DataFrame(texts, columns = ['description'])
df['jobs'] = jobs
df.head()

Unnamed: 0,description,jobs
0,"<div class=""jobsearch-jobDescriptionText"" id=""...",https://www.indeed.com/rc/clk?jk=3a77ced4e1f6c...
1,"<div class=""jobsearch-jobDescriptionText"" id=""...",https://www.indeed.com/rc/clk?jk=b20e16df83866...
2,"<div class=""jobsearch-jobDescriptionText"" id=""...",https://www.indeed.com/rc/clk?jk=bf57b8d389d71...
3,"<div class=""jobsearch-jobDescriptionText"" id=""...",https://www.indeed.com/company/Lakarya/jobs/Da...
4,"<div class=""jobsearch-jobDescriptionText"" id=""...",https://www.indeed.com/rc/clk?jk=49e950e1919c2...


In [134]:
# NLP Model
nlp = spacy.load("en_core_web_md")

In [135]:
# Clean and tokenize the descriptions
df['tokens'] = df['description'].apply(clean_description).apply(tokenize)

In [136]:
# send clean text to list
text = df['description'].apply(clean_description).tolist()

In [137]:
# Instantiate Vectorizer
tfidf = TfidfVectorizer(stop_words = 'english')

In [138]:
tfidf_dtm = fit_for_nn(text)

In [139]:
# Create a vocab and get word counts per doc
sparse = tfidf.fit_transform(text)

In [140]:
# send to df
tfidf_dtm = pd.DataFrame(sparse.todense(), columns = tfidf.get_feature_names())

## Model

In [141]:
# Instantiate model
nn = NearestNeighbors(n_neighbors=20, algorithm='ball_tree')
nn.fit(tfidf_dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                 radius=1.0)

In [148]:
query = """I use python to collect and scrape data from the web. I can set up integrated data pipelines
        pipeline to collect data from different sources. I train machine learning models using sklearn, 
        and tensorflow with keras. BeautifulSoup and Seleium. I can give results to developers using Flask apps
        and Flask APIs API. I can access APIs API and RSS feeds. I can also use SQL, particularly ElephantSQL
        and PostgreSQL. I like venture capital, finance and business consulting. Looking for a junior
        or entry level entry-level or mid level mid-level"""

query_dtm = process_query_for_nn(query)

In [149]:
# Query for closest neighbors
results = nn.kneighbors(query_dtm)[1][0].tolist()

In [150]:
df['jobs'][results].tolist()

['https://www.indeed.com/rc/clk?jk=b9ccc987c4c7ee97&fccid=8fd36856966c7aad&vjs=3',
 'https://www.indeed.com/rc/clk?jk=1e45d5b3889a4f68&fccid=4e041af1d0af1bc8&vjs=3',
 'https://www.indeed.com/rc/clk?jk=fb893c7289cb8935&fccid=a3163e1f583839d0&vjs=3',
 'https://www.indeed.com/company/Quantum-Technologies-Inc./jobs/Data-Analyst-c6475c487dca36d7?fccid=e169cdbdee4fdc2e&vjs=3',
 'https://www.indeed.com/rc/clk?jk=2986305e03bcd41c&fccid=dc2638a079b95f93&vjs=3',
 'https://www.indeed.com/company/Visual-Bridge/jobs/Technology-Consultant-14a052d192068fb2?fccid=22ade5f3d42c0d41&vjs=3',
 'https://www.indeed.com/rc/clk?jk=5f98296a165e5454&fccid=e8f18ca6180ec8da&vjs=3',
 'https://www.indeed.com/rc/clk?jk=5f98296a165e5454&fccid=e8f18ca6180ec8da&vjs=3',
 'https://www.indeed.com/rc/clk?jk=1a7ce6158404da8a&fccid=dfc44f3b8c44a6db&vjs=3',
 'https://www.indeed.com/rc/clk?jk=3d46376452926cef&fccid=a168335bbdcce5e0&vjs=3',
 'https://www.indeed.com/rc/clk?jk=ce6c891fea3c8615&fccid=4e041af1d0af1bc8&vjs=3',
 'http