In [1]:
#######################
####### Imports #######
#######################

# Flask App Imports
from flask import Flask, jsonify, request, json
from flask_restful import Api, reqparse
from flask_cors import CORS

# The usuals
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# System Imports
import re
import os
import time

# Selenium Imports
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.keys import Keys

# Other imports
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import spacy

# Functions (Local Imports)
from functions import (
    clear_auto, deep_indeed_query, clean_jobs,
    indeed_crawl, clean_description,
    fit_for_nn, transform_query_for_nn, parse
)

## Spin-Up Web Browser

In [2]:
# Build a Firefox profile with form auto-complete disabled.
profile = webdriver.FirefoxProfile()
# `browser.formfill.enable` is a boolean preference, so pass a real bool
# rather than the string "false".
profile.set_preference("browser.formfill.enable", False)

# Launch Firefox through geckodriver. The profile must be handed to the
# driver explicitly; otherwise the preference set above is never applied.
browser = webdriver.Firefox(
    firefox_profile=profile,
    executable_path='../../Selenium/geckodriver',
)

In [3]:
# Job titles to search for
titles = [
    "Data Engineer",
    "Data Analyst",
]

# Locations to search in (North Carolina and Virginia cities)
locations = [
    "Raleigh-Durham, NC",
    "Charlotte, NC",
    "Roanoke, VA",
    "Charlottesville, VA",
    "Greensboro, NC",
    "Winston-Salem, NC",
]

In [4]:
# Instantiate dirty_jobs
# Empty list handed to indeed_crawl as its third argument
# (presumably an accumulator for raw scrape results - confirm in functions.py).
dirty_jobs = []

# Crawl over indeed.com
# Drives the Selenium `browser` with the configured titles and locations;
# the return value is kept as `jobs` and later stored in df['jobs'].
jobs = indeed_crawl(titles, locations, dirty_jobs, browser)

Scraping: Data Engineer - Raleigh-Durham, NC
Scraping: Data Analyst - Raleigh-Durham, NC
Scraping: Data Engineer - Charlotte, NC
Scraping: Data Analyst - Charlotte, NC
Scraping: Data Engineer - Roanoke, VA
Scraping: Data Analyst - Roanoke, VA
Scraping: Data Engineer - Charlottesville, VA
Scraping: Data Analyst - Charlottesville, VA
Scraping: Data Engineer - Greensboro, NC
Scraping: Data Analyst - Greensboro, NC
Scraping: Data Engineer - Winston-Salem, NC
Scraping: Data Analyst - Winston-Salem, NC


<button class="icl-Button icl-Button--primary icl-Button--md icl-WhatWhere-button" size="md" type="submit">Find Jobs</button>

In [5]:
# Run the project's `parse` helper over the crawled `jobs`.
# NOTE(review): looks like it yields one description element per job - confirm in functions.py.
texts = parse(jobs)

In [6]:
# Stringify every parsed item and drop its first and last character
# (presumably the enclosing brackets of a list-like repr - TODO confirm).
texts = [as_text[1:-1] for as_text in map(str, texts)]

In [7]:
# Assemble the working frame: description text plus the matching job link
df = pd.DataFrame({'description': texts}).assign(jobs=jobs)
df.head()

Unnamed: 0,description,jobs
0,"<div class=""jobsearch-jobDescriptionText"" id=""...",https://www.indeed.com/rc/clk?jk=636c638aca5ed...
1,"<div class=""jobsearch-jobDescriptionText"" id=""...",https://www.indeed.com/rc/clk?jk=d0cad68fe6483...
2,"<div class=""jobsearch-jobDescriptionText"" id=""...",https://www.indeed.com/rc/clk?jk=7d12562c2822c...
3,"<div class=""jobsearch-jobDescriptionText"" id=""...",https://www.indeed.com/rc/clk?jk=4f5ada0860b7e...
4,"<div class=""jobsearch-jobDescriptionText"" id=""...",https://www.indeed.com/rc/clk?jk=6f004541197f5...


In [8]:
# NLP Model
# Load the spaCy "en_core_web_md" pipeline once at module level;
# `tokenize` calls this shared `nlp` object for every description.
nlp = spacy.load("en_core_web_md")

In [9]:
def tokenize(text):
    """Lemmatize ``text`` with spaCy and return the informative lemmas.

    Stop words, punctuation and whitespace-only tokens are dropped.

    Args:
        text: String to run through the module-level ``nlp`` pipeline.

    Returns:
        list[str]: Lemmas of the remaining tokens, in document order.
    """
    doc = nlp(text)
    # Whitespace is filtered here rather than with ``list.remove``: remove()
    # deletes only the FIRST matching entry, so repeated ' ' / '  ' tokens
    # (and any other whitespace run) used to leak into the result.
    return [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

In [10]:
# Clean every raw description first, then tokenize the cleaned text
df['tokens'] = (
    df['description']
    .apply(clean_description)
    .apply(tokenize)
)

In [11]:
# Cleaned descriptions as a plain Python list (the vectorizer's input)
text = (
    df['description']
    .apply(clean_description)
    .tolist()
)

In [12]:
# Instantiate Vectorizer
# Shared TF-IDF vectorizer using scikit-learn's built-in English stop-word list.
# It is fitted on the job descriptions and later reused to transform the query.
tfidf = TfidfVectorizer(stop_words = 'english')

In [13]:
def fit_for_nn(text_list, vectorizer=None):
    """Fit a TF-IDF vectorizer on ``text_list`` and return the document-term matrix.

    Note: this definition shadows the ``fit_for_nn`` imported from
    ``functions`` in the notebook's import cell.

    Args:
        text_list: Iterable of document strings used to learn the vocabulary.
        vectorizer: Optional object exposing ``fit_transform`` and
            ``get_feature_names_out`` (or the legacy ``get_feature_names``).
            Defaults to the notebook-level ``tfidf`` instance.

    Returns:
        pandas.DataFrame: One row per document and one column per vocabulary
        term, holding the TF-IDF weights.
    """
    vec = tfidf if vectorizer is None else vectorizer
    # Learn the vocabulary and compute the weights for every document.
    sparse = vec.fit_transform(text_list)
    # `get_feature_names` was removed in scikit-learn 1.2; prefer the new
    # accessor and fall back to the legacy one only on older installs.
    names_getter = getattr(vec, "get_feature_names_out", None) or vec.get_feature_names
    # `toarray()` gives a plain ndarray instead of the np.matrix from `todense()`.
    return pd.DataFrame(sparse.toarray(), columns=names_getter())

In [14]:
# Fit the shared vectorizer on the cleaned descriptions and keep the
# resulting document-term DataFrame for the nearest-neighbour model.
tfidf_dtm = fit_for_nn(text)

In [15]:
# Re-fit the shared vectorizer on `text` and keep the raw sparse TF-IDF matrix.
# NOTE(review): this repeats the fit_transform already performed inside fit_for_nn.
sparse = tfidf.fit_transform(text)

In [16]:
# send to df
# `get_feature_names` was removed in scikit-learn 1.2, so use the new accessor
# when available and the legacy one only on older installs. `toarray()` yields
# a plain ndarray rather than the np.matrix produced by `todense()`.
tfidf_dtm = pd.DataFrame(
    sparse.toarray(),
    columns=(
        tfidf.get_feature_names_out()
        if hasattr(tfidf, "get_feature_names_out")
        else tfidf.get_feature_names()
    ),
)

## Model

In [17]:
# Instantiate model
# Ball-tree nearest-neighbour index; n_neighbors=20 is the default number of
# matches returned by later kneighbors() calls.
nn = NearestNeighbors(n_neighbors=20, algorithm='ball_tree')
# Fit on the TF-IDF document-term frame (one row per job description).
# Left as a bare expression so the fitted estimator's repr is the cell output.
nn.fit(tfidf_dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                 radius=1.0)

In [18]:
def transform_query_for_nn(string, vectorizer=None):
    """Vectorize one query string with the already-fitted TF-IDF vocabulary.

    Note: this definition shadows the ``transform_query_for_nn`` imported
    from ``functions`` in the notebook's import cell.

    Args:
        string: The free-text query to transform.
        vectorizer: Optional fitted object exposing ``transform`` and
            ``get_feature_names_out`` (or the legacy ``get_feature_names``).
            Defaults to the notebook-level ``tfidf`` instance.

    Returns:
        pandas.DataFrame: A single-row frame with one column per vocabulary
        term, suitable for ``nn.kneighbors``.
    """
    vec = tfidf if vectorizer is None else vectorizer
    # Use the argument itself: the previous body ignored `string` and silently
    # vectorized whatever the global `query` happened to hold.
    sparse = vec.transform([string])
    # `get_feature_names` was removed in scikit-learn 1.2; prefer the new
    # accessor and fall back to the legacy one only on older installs.
    names_getter = getattr(vec, "get_feature_names_out", None) or vec.get_feature_names
    query_dtm = pd.DataFrame(sparse.toarray(), columns=names_getter())
    return query_dtm

In [19]:
# Candidate profile text to match against the scraped job descriptions.
# Exactly one `query` assignment is active; the commented-out blocks are
# alternative profiles kept for experimentation.

# query = """I use python python python python python python python python python python 
#         to collect and scrape data from the web. I can set up integrated data pipelines
#         pipeline to collect data from different sources. I train machine learning models using sklearn, 
#         and tensorflow with keras.  BeautifulSoup and Selenium. I can give results to developers using Flask apps
#         and Flask APIs API. I can access APIs API and RSS feeds. I can also use SQL SQL SQL SQL SQL SQL SQL SQL,
#         particularly ElephantSQL and Postgres."""

query = """
After finishing my physics degree, I decided to enroll at LambdaSchool 
where I learned how to aggregate, scrape, clean and process (ETL) data 
from a wide variety of sources. Python is still my main tool, but I have 
plenty of experience using Excel,  SQL and Postgres as well. 
I can use Flask to set up APIs to return results processed by 
machine learning models. I’ve dabbled in some machine learning myself, 
but creating illuminating visuals and writing up my results and conclusions 
has always been my forte, so I’d make an outstanding analyst.
"""

# query = """I use knowledge of process and chemical engineering to help businesses optimize production,
#         often making use of Statistical Process Control.  Background in math, science, organic chemistry.
#         Interested in the environmental waste section, as chemistry forms the backbone of much of that work."""

# query = """behavioral sciences intern working with children or adults with disablities behavioral therapy
#         graphic design personalized personal disabled"""

# Project the active query into the fitted TF-IDF feature space.
query_dtm = transform_query_for_nn(query)

In [20]:
# Row positions of the nearest job descriptions for the single query row;
# distances are not needed, so request the index array only.
results = nn.kneighbors(query_dtm, return_distance=False)[0].tolist()

In [21]:
# Look up the job links for the matched rows and collect them in a list
job_urls = df.loc[results, 'jobs'].tolist()

In [22]:
# Create links
def make_clickable(val):
    """Render ``val`` as an HTML anchor whose target and label are both ``val``.

    Args:
        val: The URL (or any value convertible to ``str``) to link to.

    Returns:
        str: ``<a href="...">...</a>`` markup with ``val`` HTML-escaped.
    """
    # Function-scope import keeps this cell runnable on its own.
    from html import escape

    # The URLs come from scraped pages and contain `&`; escaping keeps the
    # markup valid and stops a stray quote from breaking out of the href.
    safe = escape(str(val), quote=True)
    return '<a href="{}">{}</a>'.format(safe, safe)

# One-column frame holding the recommended job URLs.
jobs_df = pd.DataFrame(job_urls)

# Styler view: each cell is formatted through make_clickable for display.
jobs_df.style.format(make_clickable)

Unnamed: 0,0
0,https://www.indeed.com/rc/clk?jk=fdbc2972c6261496&fccid=b716e44d2c6283e7&vjs=3
1,https://www.indeed.com/rc/clk?jk=dde73137fc8000e4&fccid=24f8cd57633df8fb&vjs=3
2,https://www.indeed.com/rc/clk?jk=8e589b591bd1eb03&fccid=deb234f9dd3edcea&vjs=3
3,https://www.indeed.com/rc/clk?jk=3472ab3ebe13d440&fccid=9c05e44275709723&vjs=3
4,https://www.indeed.com/rc/clk?jk=066223d2f549f2d7&fccid=89db72b7484b93bb&vjs=3
5,https://www.indeed.com/rc/clk?jk=4fadb1849627c4e2&fccid=485155ca34f4e0c7&vjs=3
6,https://www.indeed.com/rc/clk?jk=2ad42fc464022d75&fccid=66c010dc97a69d7c&vjs=3
7,https://www.indeed.com/rc/clk?jk=8fd01da175904382&fccid=dfaee2d8a6e3af4d&vjs=3
8,https://www.indeed.com/rc/clk?jk=a07a1ad818d61c5f&fccid=f32a31323adb5fda&vjs=3
9,https://www.indeed.com/rc/clk?jk=12259cd5ee3f8fac&fccid=485155ca34f4e0c7&vjs=3
