In [1]:
#######################
####### Imports #######
#######################

# Flask App Imports
from flask import Flask, jsonify, request, json
from flask_restful import Api, reqparse
from flask_cors import CORS

# The usuals
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# System Imports
import re
import os
import time

# Selenium Imports
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.keys import Keys

# Other imports
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import spacy

# Functions (Local Imports)
from functions import (
    clear_auto, deep_indeed_query, clean_jobs,
    indeed_crawl, clean_description,
    fit_for_nn, transform_query_for_nn, parse
)

## Spin-Up Web Browser

In [30]:
# Disable auto-complete
profile = webdriver.FirefoxProfile()
profile.set_preference("browser.formfill.enable", "false")
    
# Create new Instance of Chrome in incognito mode
browser = webdriver.Firefox(executable_path='../../Selenium/geckodriver')

In [31]:
# Titles and locations
titles = ["Data Analyst"]
locations = ["Raleigh-Durham, NC", "Charlotte, NC", "Roanoke, VA", "Charlottesville, VA",
             "Greensboro, NC", "Winston-Salem, NC", "Annapolis, MD", "Philadelphia, PA",
             "Tyson's Corner, VA"]

In [32]:
# Instantiate dirty_jobs
dirty_jobs = []

# Crawl over indeed.com
jobs = indeed_crawl(titles, locations, dirty_jobs, browser)

Scraping: Data Analyst - Raleigh-Durham, NC
Scraping: Data Analyst - Charlotte, NC
Scraping: Data Analyst - Roanoke, VA
Scraping: Data Analyst - Charlottesville, VA
Scraping: Data Analyst - Greensboro, NC
Scraping: Data Analyst - Winston-Salem, NC
Scraping: Data Analyst - Annapolis, MD
Scraping: Data Analyst - Philadelphia, PA
Scraping: Data Analyst - Tyson's Corner, VA


<button class="icl-Button icl-Button--primary icl-Button--md icl-WhatWhere-button" size="md" type="submit">Find Jobs</button>

In [33]:
texts = parse(jobs)

In [34]:
texts = [str(text)[1:-1] for text in texts]

In [35]:
# Send to df
df = pd.DataFrame(texts, columns = ['description'])
df['jobs'] = jobs
df.head()

Unnamed: 0,description,jobs
0,"<div class=""jobsearch-jobDescriptionText"" id=""...",https://www.indeed.com/rc/clk?jk=d5f1bba3e73b9...
1,"<div class=""jobsearch-jobDescriptionText"" id=""...",https://www.indeed.com/rc/clk?jk=2fd84d65f5d90...
2,"<div class=""jobsearch-jobDescriptionText"" id=""...",https://www.indeed.com/company/Direct-Results-...
3,"<div class=""jobsearch-jobDescriptionText"" id=""...",https://www.indeed.com/rc/clk?jk=a02b0697cf65d...
4,"<div class=""jobsearch-jobDescriptionText"" id=""...",https://www.indeed.com/rc/clk?jk=2bccc037fda63...


In [36]:
# NLP Model
nlp = spacy.load("en_core_web_md")

In [37]:
def tokenize(text):
    # Tokenize
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if (token.is_stop != True) and (token.is_punct != True)]
    if ' ' in tokens:
        tokens.remove(' ')
    if '  ' in tokens:
        tokens.remove('  ')
    return tokens

In [38]:
# Clean and tokenize the descriptions
df['tokens'] = df['description'].apply(clean_description).apply(tokenize)

In [39]:
# Object from Base Python
from collections import Counter

# The object `Counter` takes an iterable, but you can instaniate an empty one and update it. 
word_counts = Counter()

def count(docs):

        word_counts = Counter()
        appears_in = Counter()
        
        total_docs = len(docs)

        for doc in docs:
            word_counts.update(doc)
            appears_in.update(set(doc))

        temp = zip(word_counts.keys(), word_counts.values())
        
        wc = pd.DataFrame(temp, columns = ['word', 'count'])

        wc['rank'] = wc['count'].rank(method='first', ascending=False)
        total = wc['count'].sum()

        wc['pct_total'] = wc['count'].apply(lambda x: x / total)
        
        wc = wc.sort_values(by='rank')
        wc['cul_pct_total'] = wc['pct_total'].cumsum()

        t2 = zip(appears_in.keys(), appears_in.values())
        ac = pd.DataFrame(t2, columns=['word', 'appears_in'])
        wc = ac.merge(wc, on='word')

        wc['appears_in_pct'] = wc['appears_in'].apply(lambda x: x / total_docs)
        
        return wc.sort_values(by='rank')

In [40]:
# Use the Function
wc = count(df['tokens'])

In [41]:
import squarify
import matplotlib.pyplot as plt

wc_top20 = wc[wc['rank'] <= 500]

wc_top20['word']

169              datum
100               work
82            business
218         experience
38                data
155           analysis
114               team
77                    
153               Data
33             support
777        information
41                year
187             report
342             system
211            include
13          management
94             provide
185            process
862            require
83               skill
31            analytic
11             Analyst
113            quality
423           database
267           position
393            project
376                new
86             analyze
158            develop
36                need
68         development
143        opportunity
12              client
84             ability
364          knowledge
25             product
312        application
224               tool
533          reporting
381             degree
480           internal
739           research
1221       statistical
35         

In [28]:
pd.options.display.max_rows = 500

# Fit for NN

In [11]:
# send clean text to list
text = df['description'].apply(clean_description).tolist()

In [12]:
# Instantiate Vectorizer
tfidf = TfidfVectorizer(stop_words = 'english')

In [13]:
def fit_for_nn(text_list):
    # Create a vocab and get word counts per doc
    sparse = tfidf.fit_transform(text_list)
    # send to df
    tfidf_dtm = pd.DataFrame(sparse.todense(), columns = tfidf.get_feature_names())
    return tfidf_dtm

In [14]:
tfidf_dtm = fit_for_nn(text)

In [15]:
# Create a vocab and get word counts per doc
sparse = tfidf.fit_transform(text)

In [16]:
# send to df
tfidf_dtm = pd.DataFrame(sparse.todense(), columns = tfidf.get_feature_names())

## Model

In [18]:
# Instantiate model
nn = NearestNeighbors(n_neighbors=20, algorithm='ball_tree')
nn.fit(tfidf_dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                 radius=1.0)

In [19]:
def transform_query_for_nn(string):
    # Create a vocab and get word counts per doc
    sparse = tfidf.transform([query])
    query_dtm = pd.DataFrame(sparse.todense(), columns = tfidf.get_feature_names())
    return query_dtm

In [20]:
query = """I use python python python python python python python python python python 
        to collect and scrape data from the web. I can set up integrated data pipelines
        pipeline to collect data from different sources. I train machine learning models using sklearn, 
        and tensorflow with keras. BeautifulSoup and Selenium. BeautifulSoup and Selenium.
        BeautifulSoup and Selenium. BeautifulSoup and Selenium. I can give results to developers using Flask apps
        and Flask APIs API. I can access APIs API and RSS feeds. I can also use SQL SQL SQL SQL SQL SQL SQL SQL,
        particularly ElephantSQL and Postgres. I like venture capital, finance and business consulting. 
        I love to work with natural language processing. Looking for a junior or entry level entry-level 
        or mid level mid-level venture capital, health finance and business consulting venture capital,
        health finance and business consulting venture capital, health finance and business consulting venture capital, visualizations with tableau"""

# query = """I use knowledge of process and chemical engineering to help businesses optimize production,
#         often making use of Statistical Process Control.  Background in math, science, organic chemistry.
#         Interested in the environmental waste section, as chemistry forms the backbone of much of that work."""

# query = """behavioral sciences intern working with children or adults with disablities behavioral therapy
#         graphic design personalized personal disabled"""

query_dtm = transform_query_for_nn(query)

In [21]:
# Query for closest neighbors
results = nn.kneighbors(query_dtm)[1][0].tolist()

In [22]:
# Send to list
job_urls = df['jobs'][results].tolist()

In [23]:
# Create links
def make_clickable(val):
    return '<a href="{}">{}</a>'.format(val, val)

jobs_df = pd.DataFrame(job_urls)

jobs_df.style.format(make_clickable)

Unnamed: 0,0
0,https://www.indeed.com/rc/clk?jk=10387c12ef625fc9&fccid=5bd99dfa21c8a490&vjs=3
1,https://www.indeed.com/company/Genisis/jobs/-5ced8816de33773c?fccid=1f0dd0a134fe568e&vjs=3
2,https://www.indeed.com/rc/clk?jk=c593001454703c46&fccid=69bb9b3b22873348&vjs=3
3,https://www.indeed.com/rc/clk?jk=0aac237e87cc9b3a&fccid=679aea3b529f8f45&vjs=3
4,https://www.indeed.com/rc/clk?jk=09f7ae181a22f24f&fccid=e24a60f01d1882d1&vjs=3
5,https://www.indeed.com/rc/clk?jk=d74c68cd59b66889&fccid=035bef51686ff9d1&vjs=3
6,https://www.indeed.com/rc/clk?jk=0643f66f3dbbf86e&fccid=035bef51686ff9d1&vjs=3
7,https://www.indeed.com/company/Vertex-Technological-Solutions/jobs/Senior-Data-Scientist-a9fd53472533202f?fccid=0b9ac889d0071abe&vjs=3
8,https://www.indeed.com/rc/clk?jk=fb893c7289cb8935&fccid=a3163e1f583839d0&vjs=3
9,https://www.indeed.com/rc/clk?jk=b9ccc987c4c7ee97&fccid=8fd36856966c7aad&vjs=3
