In [1]:
#######################
####### Imports #######
#######################

# Flask App Imports
from flask import Flask, jsonify, request, json
from flask_restful import Api, reqparse
from flask_cors import CORS

# The usuals
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# System Imports
import re
import os
import time

# Selenium Imports
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.keys import Keys

# Other imports
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import spacy

# Functions (Local Imports)
from functions import (
    clear_auto, deep_indeed_query, clean_jobs,
    indeed_crawl, clean_description, tokenize,
    fit_for_nn, transform_query_for_nn, parse
)

## Spin-Up Web Browser

In [5]:
# Build a Firefox profile with form auto-complete disabled.
profile = webdriver.FirefoxProfile()
# "browser.formfill.enable" is a boolean preference, so pass a real bool;
# the string "false" would be written to the profile as a string value.
profile.set_preference("browser.formfill.enable", False)

# Launch a new Firefox instance through geckodriver. The profile has to be
# handed to the driver explicitly, otherwise the preference above is ignored.
browser = webdriver.Firefox(
    firefox_profile=profile,
    executable_path='../../Selenium/geckodriver',
)

In [6]:
# Search grid for the crawl: job titles and the locations to look them up in.
titles = [
    'Chemical Engineer',
    'Process Engineer',
    'Environmental Engineer',
]
locations = [
    'Raleigh-Durham, NC',
    'Charlotte, NC',
    'Roanoke, VA',
    'Charlottesville, VA',
    'Greensboro, NC',
    'Winston-Salem, NC',
    'Annapolis, MD',
]

In [7]:
# Empty list handed to the crawler as its third argument.
# NOTE(review): presumably indeed_crawl appends raw postings to it - confirm in functions.py.
dirty_jobs = []

# Crawl indeed.com for the title/location combinations using the open browser
# session; the cell output logs a "Scraping: <title> - <location>" line per pair.
jobs = indeed_crawl(titles, locations, dirty_jobs, browser)

Scraping: Chemical Engineer - Raleigh-Durham, NC
Scraping: Process Engineer - Raleigh-Durham, NC
Scraping: Environmental Engineer - Raleigh-Durham, NC
Scraping: Chemical Engineer - Charlotte, NC
Scraping: Process Engineer - Charlotte, NC
Scraping: Environmental Engineer - Charlotte, NC
Scraping: Chemical Engineer - Roanoke, VA
Scraping: Process Engineer - Roanoke, VA
Scraping: Environmental Engineer - Roanoke, VA
Scraping: Chemical Engineer - Charlottesville, VA
Scraping: Process Engineer - Charlottesville, VA
Scraping: Environmental Engineer - Charlottesville, VA
Scraping: Chemical Engineer - Greensboro, NC
Scraping: Process Engineer - Greensboro, NC
Scraping: Environmental Engineer - Greensboro, NC
Scraping: Chemical Engineer - Winston-Salem, NC
Scraping: Process Engineer - Winston-Salem, NC
Scraping: Environmental Engineer - Winston-Salem, NC
Scraping: Chemical Engineer - Annapolis, MD
Scraping: Process Engineer - Annapolis, MD
Scraping: Environmental Engineer - Annapolis, MD
Scrapi

<button class="icl-Button icl-Button--primary icl-Button--md icl-WhatWhere-button" size="md" type="submit">Find Jobs</button>

In [9]:
# Parse the crawled jobs; keep the untouched result under its own name.
texts_raw = parse(jobs)

# Stringify each parsed item and drop its first and last character.
# Deriving `texts` from `texts_raw` (instead of rebinding `texts` to itself)
# keeps this step idempotent: re-running it does not strip two more
# characters on every execution.
# NOTE(review): presumably the stripped characters are the enclosing "[" and "]"
# of a list repr - confirm against what parse() returns.
texts = [str(item)[1:-1] for item in texts_raw]

In [None]:
# Assemble the working frame in one expression: the stripped texts become the
# 'description' column and `jobs` is attached alongside as the 'jobs' column.
df = pd.DataFrame(texts, columns=['description']).assign(jobs=jobs)
df.head()

In [None]:
# Load spaCy's "en_core_web_md" pipeline; `nlp` is passed to tokenize() in the
# next cell. The model package must already be installed in the environment.
nlp = spacy.load("en_core_web_md")

In [None]:
# Run clean_description over every description first, then hand the cleaned
# strings and the spaCy pipeline to tokenize(); the result is stored per row.
cleaned_descriptions = df['description'].apply(clean_description).tolist()
df['tokens'] = tokenize(cleaned_descriptions, nlp)

In [None]:
# Cleaned descriptions as a plain Python list (the vectorizer input)
text = [clean_description(raw) for raw in df['description']]

In [None]:
# TF-IDF vectorizer configured with scikit-learn's built-in English stop-word
# list. It is only constructed here; fitting happens in a later cell.
tfidf = TfidfVectorizer(stop_words = 'english')

In [None]:
# NOTE(review): the `tfidf_dtm` produced here is overwritten a few cells below,
# where it is rebuilt from `tfidf` and `sparse`. This call looks redundant
# unless fit_for_nn has side effects - confirm in functions.py.
tfidf_dtm = fit_for_nn(text)

In [None]:
# Learn the vocabulary from the cleaned descriptions and compute the TF-IDF
# weighted document-term matrix (one row per document, sparse format).
sparse = tfidf.fit_transform(text)

In [None]:
# Column labels are the vectorizer's vocabulary terms. `get_feature_names` was
# removed in scikit-learn 1.2 in favour of `get_feature_names_out`; use the new
# accessor when it exists and fall back to the old one on earlier releases.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Dense document-term frame. `toarray()` yields a plain ndarray, whereas
# `todense()` returns the legacy np.matrix type.
tfidf_dtm = pd.DataFrame(sparse.toarray(), columns=feature_names)

## Model

In [None]:
# Nearest-neighbour index over the rows of the TF-IDF frame, using a ball tree.
# kneighbors() returns 20 matches per query unless told otherwise.
# NOTE(review): querying will fail if fewer than 20 documents were fitted - confirm
# the crawl always yields at least that many rows.
nn = NearestNeighbors(n_neighbors=20, algorithm='ball_tree')
nn.fit(tfidf_dtm)

In [None]:
# Free-text profile that is matched against the job descriptions.
# NOTE(review): the repeated phrases look deliberate (presumably to raise the
# weight of those terms in the TF-IDF vector) - confirm before de-duplicating.
query = """I use python to collect and scrape data from the web. I can set up integrated data pipelines
        pipeline to collect data from different sources. I train machine learning models using sklearn, 
        and tensorflow with keras. BeautifulSoup and Selenium. BeautifulSoup and Selenium.
        BeautifulSoup and Selenium. BeautifulSoup and Selenium. I can give results to developers using Flask apps
        and Flask APIs API. I can access APIs API and RSS feeds. I can also use SQL, particularly ElephantSQL
        and Postgres. I like venture capital, finance and business consulting. I love to work with
        natural language processing. Looking for a junior or entry level entry-level or mid level mid-level
        venture capital, finance and business consulting venture capital, finance and business consulting
        venture capital, finance and business consulting venture capital, finance and business consulting"""

# query = """I use knowledge of process and chemical engineering to help businesses optimize production,
#         often making use of Statistical Process Control.  Background in math, science, organic chemistry.
#         Interested in the environmental waste section, as chemistry forms the backbone of much of that work."""

# query = """behavioral sciences intern working with children or adults with disablities behavioral therapy
#         graphic design personalized personal disabled"""

# Vectorize the query with the same fitted `tfidf` vocabulary the model was
# trained on, and label the columns like the training frame so the layout
# matches what `nn` was fitted with. (`process_query_for_nn` is neither
# imported nor defined in this notebook, so calling it raises NameError.)
query_dtm = pd.DataFrame(
    tfidf.transform([query]).toarray(),
    columns=tfidf_dtm.columns,
)

In [None]:
# Row positions of the nearest neighbours for the single query row.
# Only the index array is requested, so the distances are never materialised.
results = nn.kneighbors(query_dtm, return_distance=False)[0].tolist()

In [None]:
# Look up the 'jobs' entries for the matched row labels, in neighbour order
job_urls = df.loc[results, 'jobs'].tolist()

In [None]:
# Create links
def make_clickable(val):
    """Return an HTML anchor whose target and visible text are both ``val``.

    ``val`` is converted with ``str()`` and HTML-escaped before it is
    interpolated, so characters such as ``&``, ``<`` or ``"`` in a scraped
    URL cannot break out of the ``href`` attribute or inject markup.
    """
    import html  # local import keeps this cell self-contained

    safe = html.escape(str(val), quote=True)
    return '<a href="{}">{}</a>'.format(safe, safe)

# Wrap the matched entries in a DataFrame so they can be styled.
jobs_df = pd.DataFrame(job_urls)

# Format every cell through make_clickable so the values render as HTML links.
jobs_df.style.format(make_clickable)