# Scrape Indeed

### import libraries

In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
import re 
import urllib

### define scraping functions

Helper functions copied from class hw.  Make sure you copy the chrome driver into this directory!

In [2]:
#create a webdriver object and set options for headless browsing
options = Options()
options.headless = True
browser = webdriver.Chrome('./chromedriver',options=options)

#uses webdriver object to execute javascript code and get dynamically loaded webcontent
def get_js_soup(url,browser):
    browser.get(url)
    res_html = browser.execute_script('return document.body.innerHTML')
    soup = BeautifulSoup(res_html,'html.parser') #beautiful soup object to be used for parsing html content
    return soup

#tidies extracted text 
def process_bio(bio):
    bio = bio.encode('ascii',errors='ignore').decode('utf-8')       #removes non-ascii characters
    bio = re.sub('\s+',' ',bio)       #repalces repeated whitespace characters with single space
    return bio

''' More tidying
Sometimes the text extracted HTML webpage may contain javascript code and some style elements. 
This function removes script and style tags from HTML so that extracted text does not contain them.
'''
def remove_script(soup):
    for script in soup(["script", "style"]):
        script.decompose()
    return soup

#helper function to write lists to files
def write_lst(lst,file_):
    with open(file_,'w') as f:
        for l in lst:
            f.write(l)
            f.write('\n')
            
# main function that scrapes search result page            
def scrape_search_result_page(dir_url,page_result,browser):
    print ('-'*20,'Scraping indeed search result page '+ str(page_result)+'','-'*20)
    indeed_links = []
    #execute js on webpage to load faculty listings on webpage and get ready to parse the loaded HTML 
    soup = get_js_soup(dir_url,browser) 
    for link_holder in soup.find_all('div',class_='title'): #get list of all <div> of class 'photo nocaption'
        rel_link = link_holder.find('a')['href'] #get url
        #url returned is relative, so we need to add base url
        if rel_link != '':
            indeed_links.append('https://www.indeed.com' + rel_link) 
    print ('-'*20,'Found {} indeed search urls'.format(len(indeed_links)),'-'*20)
    return indeed_links

## Run scraper function

In [3]:
# build query
q = 'python developer' #job query string
l = 'New+York+State' #location of job
numPage = 20 #num pages to scrap links from
allLinks = [] # list to capture
start = 0 #pagnigation variable, page 1 = 0, page 2 = 10, page 3 = 30, etc

# loop over n number of pages
for page_result in range(numPage):
    start = page_result* 10 #increment the variable used to denote the next page
    search_result_url = 'https://www.indeed.com/jobs?q='+ q +'&l='+ l +'&start='+str(start) #build query string
    print(search_result_url)
    jobSearchResult = scrape_search_result_page(search_result_url,page_result, browser) # call scraper function
    allLinks.extend(jobSearchResult) #add to link
    

https://www.indeed.com/jobs?q=python developer&l=New+York+State&start=0
-------------------- Scraping indeed search result page 0 --------------------
-------------------- Found 19 indeed search urls --------------------
https://www.indeed.com/jobs?q=python developer&l=New+York+State&start=10
-------------------- Scraping indeed search result page 1 --------------------
-------------------- Found 19 indeed search urls --------------------
https://www.indeed.com/jobs?q=python developer&l=New+York+State&start=20
-------------------- Scraping indeed search result page 2 --------------------
-------------------- Found 19 indeed search urls --------------------
https://www.indeed.com/jobs?q=python developer&l=New+York+State&start=30
-------------------- Scraping indeed search result page 3 --------------------
-------------------- Found 18 indeed search urls --------------------
https://www.indeed.com/jobs?q=python developer&l=New+York+State&start=40
-------------------- Scraping indeed sea

### write to file for debugging

In [4]:
#Remove Duplicates
print(len(allLinks))
allLinks = list(set(allLinks))
print (len(allLinks))

376
374


In [5]:
print(allLinks)
job_urls_file = 'jobSearchResult' +q+'.txt'
# write to file
write_lst(allLinks,job_urls_file)

['https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0CEi_iD1h0QD-Oh4sSrQYRGe70UNTrrByMkq2RRcHzje4XUK-7OD9UueuijoYLffwwMh_KO5QDHI9jHVbatkc-epUwL4kA77enCtcgit0rekzHQH7ritBfkqg1-UtIhdqhk32EEbPSP-Hw-uE2drh1AS4HKS6071uHKjaBGyVrlzBK6M6uvq00aEyJNdrmfL5qvePUg53acyxcVLoTSiHK_KPhTKuzqpGbR9HPudlJhmwVvxo_-xvaKKCrTdB4BBHn98-tIXJBJxzqw_c6qU24aYgmvrVGfPkUdju4CyUqQC7mPpjkDn_kMRl59Uzl3rjGTfz1S47pqDTVpVjFZOJFh8AYUwxTNzRvFVvMrJ_20HulffQlc3bMlo-CX1cIpBGxDjcWWzJ114a2Hmh1hOCjYru_OKbD3lRxNeJ9igIrQf8KawPkl267Fltx5MoiyJLDuCenYAOO2lg==&p=12&fvj=1&vjs=3', 'https://www.indeed.com/rc/clk?jk=9f96291f6148ab66&fccid=3967a440d2d21bef&vjs=3', 'https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0CEi_iD1h0QD-Oh4sSrQYRGe70UNTrrByMkq2RRcHzje4XUK-7OD9UueuijoYLffwwMh_KO5QDHI-6Uf1rIZLYdA-G-Fu8AH_VJCzP-s-gBTY0IoDwH2V5x1PVcblMYHa6GftrTJLEIQxIcIxmgD8v9tQuDrSVHZ1BpAIiTZ6pJ0va0WwjJR8d3BqwR6-9ZBZkbmfisCI1UZBFbfLDEqmtIAg6hfaMdZhbSwaifaZGS__vZYty8sS21RrmdJiQKR8Z0sHN5Ep7D6blFRSIy8K6ppb4P_CfwA9GcrhMoxHVj8dwQNDMmGEhNTC36h8cb0YCSe98tWWV

## Scrape page data for each link

In [6]:
from gensim.parsing.preprocessing import remove_stopwords

In [7]:
homepage_found = False
page_data = ''
page_data_list = []
for link_num, indeed_url in enumerate(allLinks):
    print("Accessing link",link_num,"of",len(allLinks))
    try:
        page_soup = remove_script(get_js_soup(indeed_url,browser)) 
    except:
        print ('Could not access {}'.format(indeed_url))
     
    page_data = process_bio(page_soup.get_text(separator=' '))  #helper function from bio hw to clean up text
    
    #remove header
    page_data = page_data[189:] #the 189 slice removes the header of the indeed pages
    
    #remove footer
    footer_position = page_data.find('save job') #find the position of 'save job' which starts the footer
    trimStringBy = footer_position - len(page_data) #returns a negative number to trim the string by
    page_data = page_data[:trimStringBy] #drop footer
    page_data = remove_stopwords(page_data)
    page_data_list.append(page_data)
    

Accessing link 0 of 374
Accessing link 1 of 374
Accessing link 2 of 374
Accessing link 3 of 374
Accessing link 4 of 374
Accessing link 5 of 374
Accessing link 6 of 374
Accessing link 7 of 374
Accessing link 8 of 374
Accessing link 9 of 374
Accessing link 10 of 374
Accessing link 11 of 374
Accessing link 12 of 374
Accessing link 13 of 374
Accessing link 14 of 374
Accessing link 15 of 374
Accessing link 16 of 374
Accessing link 17 of 374
Accessing link 18 of 374
Accessing link 19 of 374
Accessing link 20 of 374
Accessing link 21 of 374
Accessing link 22 of 374
Accessing link 23 of 374
Accessing link 24 of 374
Accessing link 25 of 374
Accessing link 26 of 374
Accessing link 27 of 374
Accessing link 28 of 374
Accessing link 29 of 374
Accessing link 30 of 374
Accessing link 31 of 374
Accessing link 32 of 374
Accessing link 33 of 374
Accessing link 34 of 374
Accessing link 35 of 374
Accessing link 36 of 374
Accessing link 37 of 374
Accessing link 38 of 374
Accessing link 39 of 374
Accessing 

## Print page data and write to file for debug

**Footer still has some text at the end which isn't properly cleaned

In [8]:
print(page_data_list[1])
document_set = page_data_list
page_data_file = 'pageText' +q+'.txt'
write_lst(page_data_list,page_data_file)

Senior Software Developer - Quantitative Modeling KBRA - New York, NY 10022 Senior Software Developer - Quantitative Modeling We're seeking Senior Software Developer join quantitative modeling team Midtown office. The successful candidate member close-knit team data analysts, data scientists, software developers developing deploying quantitative models support bond rating credit analysis. About Team The Data Science Quantitative Modeling (DSQM) team supports KBRAs credit analysts building delivering financial predictive models. Ultimately, strive improve analysts workflows making faster accurate better models. Our software development team believes code craft, writing software creative endeavor, work makes organization successful. We believe small, empowered teams amazing things. We believe picking right tool job instead "because that's we've done." Our Quantitative Modeling team currently uses following tools: R Python RStudio Connect Docker & Jenkins SQL Server About Job The Senior S

## Summarization Using Text Rank

### import libraries

In [9]:
from gensim.summarization import keywords
from gensim.summarization.summarizer import summarize
from gensim.summarization import mz_keywords

In [10]:
# Create single document by concatenating all documents
all_documents = ""

for doc in page_data_list:
    all_documents += doc

In [11]:
#keywords
keywords(all_documents).split('\n')


['developer',
 'development',
 'developed',
 'develops',
 'developments',
 'experimenting',
 'experiment',
 'working',
 'work',
 'works',
 'developers developing deploying',
 'data',
 'develop execute',
 'technologies',
 'technologically',
 'applications',
 'application',
 'applicant',
 'publications experiments',
 'requires',
 'requirements',
 'required',
 'require',
 'requirement',
 'new',
 'news',
 'including',
 'includes',
 'included',
 'engineer',
 'engine',
 'engineerings',
 'engines',
 'businesses',
 'busy',
 'production',
 'product',
 'products',
 'productivity',
 'productive',
 'supports',
 'supportive',
 'supported',
 'supportable',
 'designation',
 'designs',
 'designers',
 'designer',
 'designed',
 'management',
 'managing',
 'managed',
 'manager',
 'managers',
 'manages',
 'managment',
 'solutions',
 'software',
 'builds',
 'python',
 'worked cross functional',
 'knowledge',
 'knowledgeable',
 'create experiences app web',
 'applicable roles',
 'tools',
 'tooling',
 'clien

In [12]:
print(summarize(all_documents, word_count  = 250))

KEY RESPONSIBILITIES AND DUTIES: Manage individual project priorities, deadlines, deliverables AGILE methodologies Develops high-quality, fully tested solutions meet business needs Serve development leader Charles River IMS multi-asset trading platform interfaces Communicate development status timely manner, including metric reporting Create appropriate technical documentation developed solutions operational support Collaborate IT business partners design, develop, test troubleshoot end end technical solutions Perform tasks high complexity modify processes, plans, designs needed Complete testing accordance company standards defined approved testing plans includes quality assurance performance testing Ability work team communicate effectively organizational levels written verbal communication skills Assists senior developers following SDLC/Agile change management processes Interacts IT teams required leverage enterprise services Participate paired programming code review sessions Abilit

In [13]:
print(mz_keywords(all_documents,scores=True,threshold=0.001))

  log_p = np.log2(p)
  h = np.nan_to_num(p * log_p).sum(axis=0)


[('qlikview', 0.005094031994070033), ('data', 0.004127387423669417), ('cornell', 0.00402822649362809), ('blinddata', 0.002746861731331279), ('university', 0.0026647475224960263), ('seen', 0.002378354409462888), ('arxiv', 0.002277972887597499), ('ibm', 0.002029007181506949), ('hadoop', 0.001843190164835043), ('google', 0.0016010457805691125), ('you', 0.0015187682905633835), ('rules', 0.0014030466712082964), ('business', 0.001379001199799871), ('development', 0.0013542603773150086), ('get', 0.001350309679583707), ('companies', 0.00134150880167336), ('code', 0.0013056205085047964), ('pyspark', 0.0012595557043921515), ('research', 0.0012416931800798752), ('contract', 0.001234887525996177), ('minimum', 0.0012015153601771802), ('software', 0.0011958927368942809), ('motesque', 0.0011840182121974849), ('trading', 0.0011744378321915806), ('spark', 0.0011386591190237433), ('marketing', 0.0010765197844600776), ('insurance', 0.0010448025304485569), ('online', 0.0010251317850764048), ('product', 0.

# Topic Modeling

### import libraries

In [32]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models import Phrases
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import LsiModel
from gensim.models import HdpModel
from gensim.models.wrappers import LdaMallet
from pprint import pprint


### tokenize the documents

In [15]:
docs = page_data_list

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

must download wordnet!!!

In [16]:
 # nltk.download('wordnet')

### lemmatize the documents

In [17]:
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

### compute bigrams

In [18]:
# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=10)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

### remove rare and common tokens

In [19]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.75)

In [20]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [21]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 1484
Number of documents: 374


## Build LDA Model

In [39]:
# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

In [40]:
top_topics = model.top_topics(corpus, topn=10) #, num_words=10)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

Average topic coherence: -0.7577.
[([(0.016457407, 'blinddata'),
   (0.013355694, 'challenge'),
   (0.01012295, 'company'),
   (0.009961982, 'role'),
   (0.009931401, 'help'),
   (0.009904657, 'location'),
   (0.009899793, 'talent'),
   (0.009886243, 'test'),
   (0.009852432, 'engineer'),
   (0.007913896, 'software')],
  -0.46815505190093104),
 ([(0.01682621, 'cornell'),
   (0.01441851, 'university'),
   (0.011264864, 'application'),
   (0.009779172, 'arxiv'),
   (0.008449792, 'cornell_university'),
   (0.007058739, 'position'),
   (0.0070001795, 'team'),
   (0.0057919268, 'online'),
   (0.005688391, 'backend'),
   (0.0056264284, 'applicant')],
  -0.5274462715617378),
 ([(0.012407964, 'development'),
   (0.012156849, 'you'),
   (0.011766175, 'application'),
   (0.011180819, 'software'),
   (0.010896422, 'design'),
   (0.010592988, 'team'),
   (0.009780526, 'web'),
   (0.00955148, 'product'),
   (0.009547641, 'strong'),
   (0.009385446, 'technology')],
  -0.5760017043470895),
 ([(0.0207

## Build LSI Model

In [41]:
# Build the LSI Model
lsi_model = LsiModel(corpus=corpus, id2word=id2word, num_topics=10, decay=0.5)

In [42]:
pprint(lsi_model.print_topics(-1))

[(0,
  '0.282*"data" + 0.213*"application" + 0.205*"team" + 0.156*"development" + '
  '0.153*"business" + 0.148*"design" + 0.139*"review" + 0.126*"technology" + '
  '0.122*"software" + 0.118*"job"'),
 (1,
  '0.307*"cornell" + -0.273*"data" + 0.254*"university" + 0.178*"arxiv" + '
  '0.154*"cornell_university" + -0.152*"business" + -0.110*"development" + '
  '0.103*"position" + 0.101*"legacy" + 0.099*"backend"'),
 (2,
  '0.277*"data" + -0.216*"blinddata" + -0.158*"company" + -0.151*"role" + '
  '-0.147*"challenge" + -0.144*"location" + -0.135*"talent" + '
  '-0.133*"engineer" + -0.125*"test" + -0.122*"help"'),
 (3,
  '0.232*"development" + -0.182*"build" + -0.168*"rule" + -0.168*"hadoop" + '
  '0.166*"skill" + -0.140*"pyspark" + -0.132*"challenge" + 0.130*"the" + '
  '-0.124*"data" + -0.118*"blinddata"'),
 (4,
  '-0.253*"you" + -0.212*"we" + 0.188*"dashboard" + -0.165*"technology" + '
  '-0.138*"seen" + -0.134*"opportunity" + -0.130*"product" + -0.121*"software" '
  '+ -0.117*"solution"

## Build HDP Model

HDP model determines number of topics automatically

In [54]:
hdp_model = HdpModel(corpus=corpus, id2word=dictionary)

In [55]:
pprint(hdpmodel.show_topics())

[(0,
  '0.011*blinddata + 0.008*challenge + 0.007*talent + 0.006*company + '
  '0.006*role + 0.006*test + 0.006*location + 0.006*help + 0.006*engineer + '
  '0.005*software + 0.005*review + 0.005*preferred + 0.005*time + '
  '0.004*platform + 0.004*project + 0.004*is + 0.004*score + 0.004*partner + '
  '0.004*this + 0.004*job'),
 (1,
  '0.011*cornell + 0.009*university + 0.007*application + 0.007*arxiv + '
  '0.006*cornell_university + 0.005*position + 0.005*team + 0.004*system + '
  '0.004*production + 0.004*legacy + 0.004*applicant + 0.004*online + '
  '0.004*backend + 0.004*website + 0.003*notice + 0.003*python_developer + '
  '0.003*visit + 0.003*environment + 0.003*qualification + 0.003*technology'),
 (2,
  '0.013*data + 0.009*development + 0.007*software + 0.006*team + '
  '0.005*required + 0.005*technology + 0.005*job + 0.005*skill + '
  '0.005*solution + 0.005*review + 0.004*the + 0.004*including + '
  '0.004*application + 0.004*design + 0.004*system + 0.004*time + '
  '0.004*p

### Visualize using pyLDAvis

Here we will visualize the topic models using pyLDAvis.
ref: http://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf

Example Notebook: https://markroxor.github.io/gensim/static/notebooks/gensim_news_classification.html#topic=0&lambda=1&term=

In [45]:
# make sure to pip install pyldavis to run visualization

In [51]:
import pyLDAvis.gensim
import warnings
warnings.filterwarnings('ignore')

In [52]:
pyLDAvis.enable_notebook()

In [53]:
pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)