# Scrape Indeed

### import libraries

In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
import re 
import urllib

### define scraping functions

Helper functions copied from the class homework. Make sure you copy the ChromeDriver executable (`chromedriver`) into this directory!

In [24]:
# Create the single Chrome webdriver object that the scraping cells below
# pass around, and set its options for headless browsing.
options = Options()
options.headless = True
# './chromedriver' is the driver binary expected in this notebook's directory.
# NOTE(review): newer Selenium releases changed how the driver path and the
# headless flag are supplied -- confirm against the installed version.
browser = webdriver.Chrome('./chromedriver',options=options)

def get_js_soup(url,browser):
    """Load ``url`` in ``browser`` and return the page body as a BeautifulSoup tree.

    The HTML is read back by executing JavaScript in the loaded page, so
    dynamically loaded web content is part of the parsed result.
    """
    browser.get(url)
    rendered_body = browser.execute_script('return document.body.innerHTML')
    # Parse with the built-in 'html.parser' backend.
    return BeautifulSoup(rendered_body,'html.parser')

def process_bio(bio):
    """Tidy extracted page text.

    Drops every non-ASCII character and replaces each run of whitespace
    (spaces, tabs, newlines) with a single space.

    bio: the raw text to clean.
    Returns the cleaned string. Leading and trailing whitespace is collapsed
    to one space each, not stripped.
    """
    # Encoding with errors='ignore' discards anything outside ASCII.
    bio = bio.encode('ascii',errors='ignore').decode('ascii')
    # Raw string: '\s' inside a normal literal is an invalid escape sequence.
    bio = re.sub(r'\s+',' ',bio)
    return bio

def remove_script(soup):
    """More tidying: strip script and style elements from a parsed page.

    Text extracted from an HTML page may contain JavaScript code and style
    rules. Every <script> and <style> tag is removed from ``soup`` in place
    so that extracted text does not contain them; the same ``soup`` object
    is returned.
    """
    unwanted_tags = soup(["script", "style"])
    for tag in unwanted_tags:
        tag.decompose()
    return soup

def write_lst(lst,file_):
    """Write each string in ``lst`` to ``file_``, one per line.

    lst: iterable of strings; each one is followed by a newline.
    file_: path of the output file, which is created or overwritten.
    """
    # Explicit UTF-8 keeps the output independent of the platform's default
    # encoding, which may be unable to represent non-ASCII entries.
    with open(file_,'w',encoding='utf-8') as f:
        f.writelines(entry + '\n' for entry in lst)
            
# main function that scrapes search result page
def scrape_search_result_page(dir_url,page_result,browser):
    """Collect the job-posting links from one Indeed search result page.

    dir_url: URL of the search result page to load.
    page_result: page index, used only in the progress messages.
    browser: webdriver object passed through to get_js_soup().
    Returns a list of absolute 'https://www.indeed.com...' URLs.
    """
    print ('-'*20,'Scraping indeed search result page '+ str(page_result)+'','-'*20)
    indeed_links = []
    # Execute JS on the page to load the job listings, then parse the HTML.
    soup = get_js_soup(dir_url,browser)
    # Each search result's link is looked up inside a <div class="title">.
    for link_holder in soup.find_all('div',class_='title'):
        anchor = link_holder.find('a')
        if anchor is None:
            # A title div without a link would otherwise raise a TypeError.
            continue
        rel_link = anchor.get('href','')
        # The url returned is relative, so the base url is prepended.
        if rel_link:
            indeed_links.append('https://www.indeed.com' + rel_link)
    print ('-'*20,'Found {} indeed search urls'.format(len(indeed_links)),'-'*20)
    return indeed_links

## Run scraper function

In [26]:
# build query
from urllib.parse import quote_plus

q = 'python developer' #job query string
l = 'New+York+State' #location of job (already '+'-encoded)
numPage = 20 #number of result pages to scrape links from
allLinks = [] #job links collected across all result pages
start = 0 #pagination variable, page 1 = 0, page 2 = 10, page 3 = 20, etc

# loop over n number of pages
for page_result in range(numPage):
    start = page_result * 10 #offset that selects the current result page
    # q contains a space, so it is URL-encoded here; q itself stays unchanged
    # because it is reused later to build the output file names.
    search_result_url = 'https://www.indeed.com/jobs?q=' + quote_plus(q) + '&l=' + l + '&start=' + str(start)
    print(search_result_url)
    jobSearchResult = scrape_search_result_page(search_result_url,page_result, browser) # call scraper function
    allLinks.extend(jobSearchResult) #add this page's links to the full list
    

https://www.indeed.com/jobs?q=python developer&l=New+York+State&start=0
-------------------- Scraping indeed search result page 0 --------------------
-------------------- Found 19 indeed search urls --------------------
https://www.indeed.com/jobs?q=python developer&l=New+York+State&start=10
-------------------- Scraping indeed search result page 1 --------------------
-------------------- Found 19 indeed search urls --------------------
https://www.indeed.com/jobs?q=python developer&l=New+York+State&start=20
-------------------- Scraping indeed search result page 2 --------------------
-------------------- Found 19 indeed search urls --------------------
https://www.indeed.com/jobs?q=python developer&l=New+York+State&start=30
-------------------- Scraping indeed search result page 3 --------------------
-------------------- Found 19 indeed search urls --------------------
https://www.indeed.com/jobs?q=python developer&l=New+York+State&start=40
-------------------- Scraping indeed sea

### write to file for debugging

In [29]:
#Remove Duplicates
print(len(allLinks))
# dict.fromkeys() drops repeats while keeping first-seen order; set() would
# return the links in an arbitrary order that can differ between runs.
allLinks = list(dict.fromkeys(allLinks))
print (len(allLinks))

378
378


In [30]:
# Show every collected link (debug output).
print(allLinks)
# The output file name embeds the query string q.
job_urls_file = 'jobSearchResult' +q+'.txt'
# write to file, one link per line
write_lst(allLinks,job_urls_file)

['https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0BnRJ3wYUmav3i0GCqa8Kn26U-Nr4Nk8ngV58FchGt9OAYkMEIkAdQE2-rtB9WcKYQstGFgxcDMjYUrMP9MBRDcdQ7uNUpOfWDoLARmaoFNTYm8yFr0MvFoP7gFyzmAhtcFuqoIl7hikXB85A5oJTrQ8XJLQu2xRdA7IfHCt0kfeRH9vao98ll75odDIWltiFQnY14yEOvOAPchLAukdQpzRr2JzLNA-COg4n-R7c9m_XDn8R8QraimPn0aFstMowMQInLuMosktbqWqZ6S3bFNXcDuB6_2PPU2fDq9jnvmwhiYV1IsaOZiPWu7x5i32OBxqIAXJJFarEV5SC0EgfzYR3CKlZ2q2sIjkXys9_uZ3R6-0Ogay92swPNEwrvxAkBPekCB2_LeDxo5METeIw9tUrkqvCST3rvcygdixkaT1gDnj7C7-ylGZ-WkSlfUXRKG_4vdm25T_g==&p=17&fvj=1&vjs=3', 'https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AdIa2ulEOeUIPnkUO8iQwr_GudB8eYTur8LIhA723bp6zN-UkGnrPbztmO8RTBHSsIhzlDH8JZFQF_Eykd9lOjDRda-FOVV28upeaFbIrk8Ci7jwWXirsnHtQH04oC75q7_9uCs6biW_IYFTkHVfTJhsu2MeL8OUPSeMXtNw8exhBZKpyrXeIPcSkifUao18mpyOm-abluAsL6-07w9BuAOGD5YNksM6QAfoSbSIkSQfQmb35BwXOBIQx9E55pccgr2FurSvc7Ti0tysnywVlpXzz2357fXYzpqyANCuJcIuAqYpReHONIg786FsdUbKVHolpYPiNbqrwMpYVjuV92QoOZWuasL7hgBzWx29IpfUgykjJybzXq20bd0iPiNbb6g2bfcqS1vHHH7IV-1NxTrDqvE

## Scrape page data for each link

In [None]:
from gensim.parsing.preprocessing import remove_stopwords

In [31]:
HEADER_LENGTH = 189  # the 189 slice removes the header of the indeed pages
FOOTER_MARKER = 'save job'  # text which starts the footer

page_data = ''
page_data_list = []
for link_num, indeed_url in enumerate(allLinks):
    print("Accessing link",link_num,"of",len(allLinks))
    try:
        page_soup = remove_script(get_js_soup(indeed_url,browser))
    except Exception:
        # Skip this posting: falling through would reuse the previous page's
        # soup (or fail with a NameError on the very first link).
        print ('Could not access {}'.format(indeed_url))
        continue

    page_data = process_bio(page_soup.get_text(separator=' '))  #helper function from bio hw to clean up text

    #remove header
    page_data = page_data[HEADER_LENGTH:]

    #remove footer
    footer_position = page_data.find(FOOTER_MARKER)
    # find() returns -1 when the marker is missing; only trim when it is
    # present, otherwise the whole posting would be cut down to ''.
    if footer_position != -1:
        page_data = page_data[:footer_position]
    page_data = remove_stopwords(page_data)
    page_data_list.append(page_data)
    

## Print page data and write to file for debug

**Note:** the footer still leaves some text at the end of each document that isn't properly cleaned.

In [32]:
# Spot-check one scraped posting (index 1, i.e. the second document).
print(page_data_list[1])
# document_set is another name for the same list object, not a copy.
document_set = page_data_list
# The output file name embeds the query string q.
page_data_file = 'pageText' +q+'.txt'
# Save the cleaned postings, one per line, for debugging.
write_lst(page_data_list,page_data_file)

Quant Developer - Stat Arb Millennium Management 17 reviews - New York, NY Millennium Management 17 reviews Read people saying working here. Millennium Management global investment management firm founded 1989 manages approximately $37.9 billion assets February 1, 2019. Millennium 2,800 employees offices United States, Europe Asia. Over 25+ years, mission remained constant: deliver alternative investment industrys highest quality returns investors, maintain commitment principles integrity, discipline excellence. What We Do: We employ global, multi-strategy investment approach, opportunistically engaging broad array trading investing strategies wide group diversified managers. Our specialized divisions built continually evolve core infrastructure platform. This enables trading teams pursue unique investment strategies independently, operating centrally-driven risk operational framework. Careers: Our firm harnesses entrepreneurial drive people, strive employ best industry. We offer oppor

## Summarization Using Text Rank

### import libraries

In [None]:
from gensim.summarization import keywords
from gensim.summarization.summarizer import summarize
from gensim.summarization import mz_keywords

In [37]:
# Create a single document by concatenating all documents.
# Join with a newline so the last word of one posting does not fuse with the
# first word of the next (plain += concatenation added no separator).
all_documents = '\n'.join(page_data_list)

In [38]:
# Extract keywords from the combined text; the result is a single string
# that is split on newlines into a list of keywords.
keywords(all_documents).split('\n')


['developer',
 'developed',
 'developments',
 'develops',
 'experience',
 'experiences',
 'experiments',
 'experiment',
 'working',
 'work',
 'works',
 'worked',
 'design develop',
 'teams',
 'technologies',
 'technology',
 'technologically',
 'applicable',
 'applicant',
 'applicability',
 'offer opportunity developing',
 'required',
 'requirements',
 'requires',
 'requirement',
 'require',
 'requiring',
 'news',
 'developers provide scalable solutions',
 'engineer',
 'engineering',
 'engine',
 'engineerings',
 'team help',
 'include',
 'includes',
 'included',
 'designing',
 'designs',
 'designers',
 'designed',
 'designer',
 'designation',
 'businesses',
 'busy',
 'solution',
 'management',
 'manages',
 'managers',
 'managing',
 'manager',
 'managment',
 'building',
 'builds',
 'supportive',
 'supported',
 'supports',
 'supportability',
 'supportable',
 'python',
 'product',
 'production',
 'productive',
 'productivity',
 'code',
 'coded',
 'tools',
 'tool',
 'tooling',
 'knowledgeab

In [39]:
# Print a summary of the combined text, requesting word_count=250.
print(summarize(all_documents, word_count  = 250))

Responsibilities Design develop cloud-based software products Design implement RESTful APIs Python Support maintain existing software products, applications interfaces Evaluate emerging technologies, support testing process, troubleshooting issues Working closely front-end UX developers provide scalable solutions Working Agile/SCRUM team setting Required Skills Experience Understanding Python best practices Strong knowledge Python web frameworks- Django Flask Strong knowledge building RESTful APIs Python Experience working Linux environments Cloud (AWS) experience greatly preferred Excellent communication skills verbal written Job Types: Full-time, Contract Salary: $125,000.00 $150,000.00 /year Contract Length: More 1 year Additional Compensation: Other forms Work Location: One location Benefits: Health insurance Dental insurance Vision insurance Paid time Parental leave This Company Describes Its Culture as: Innovative -- innovative risk-taking Aggressive -- competitive growth-oriente

In [41]:
# Print keywords together with their scores (scores=True), using a
# threshold of 0.001.
print(mz_keywords(all_documents,scores=True,threshold=0.001))

[('qlikview', 0.004841039477734628), ('cornell', 0.0041098921825062475), ('data', 0.00400213155708945), ('university', 0.0026121932210938013), ('trading', 0.0024988497791887717), ('ibm', 0.00248158164201507), ('arxiv', 0.002198352012593788), ('seen', 0.0021670370694164805), ('hadoop', 0.001979298795630938), ('machine', 0.0019468147794043551), ('you', 0.0018699687545113781), ('millennium', 0.0016401151465301879), ('pyspark', 0.0016162382902089802), ('rules', 0.0015814095372246784), ('investment', 0.0015674116561666684), ('cancer', 0.001555194430807668), ('contract', 0.001520893282805403), ('msk', 0.0014964042817210243), ('get', 0.001430278767197844), ('code', 0.0014227408636280458), ('learning', 0.0014126074686334484), ('development', 0.0013950115971655373), ('insurance', 0.0013120316437832866), ('build', 0.0013116042330658018), ('software', 0.0013034768937060619), ('management', 0.0012877146264036652), ('marketing', 0.0012714213302529333), ('companies', 0.0012321446823244319), ('portfo

# Topic Modeling

### import libraries

In [None]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models import Phrases
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import LsiModel
from pprint import pprint

### tokenize the documents

In [42]:
# Split the documents into lowercase word tokens.
tokenizer = RegexpTokenizer(r'\w+')
# Build a new list instead of aliasing page_data_list: assigning into
# docs[idx] would overwrite the raw strings in page_data_list with token
# lists, so this cell and the earlier ones could not be re-run.
docs = [tokenizer.tokenize(doc.lower()) for doc in page_data_list]

# Remove numbers (but not words that contain numbers) and words that are
# only one character.
docs = [[token for token in doc if not token.isnumeric() and len(token) > 1] for doc in docs]

The WordNet corpus must be downloaded before lemmatizing: uncomment and run `nltk.download('wordnet')` once.

In [43]:
 # nltk.download('wordnet')

### lemmatize the documents

In [59]:
# Replace every token with its WordNet lemma, document by document.
lemmatizer = WordNetLemmatizer()
docs = [list(map(lemmatizer.lemmatize, doc)) for doc in docs]

### compute bigrams

In [46]:
# Add bigrams to docs (min_count=10 is passed to the phrase model).
bigram = Phrases(docs, min_count=10)
for doc in docs:
    # Tokens containing '_' are treated as bigrams and added to the document.
    phrase_tokens = [token for token in bigram[doc] if '_' in token]
    doc.extend(phrase_tokens)

### remove rare and common tokens

In [47]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur in fewer than 20 documents, or in more than 75% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.75)

In [48]:
# Bag-of-words representation of the documents: one doc2bow() result per
# tokenized document, in the same order as docs.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [49]:
# Report the dictionary size (after filtering) and the number of documents.
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 1682
Number of documents: 378


## Build LDA Model

In [50]:
# Set training parameters.
num_topics = 10  # also used below to average the topic coherence
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
# The throwaway lookup must come first: presumably id2token is only filled in
# once the dictionary has been accessed -- confirm in the gensim docs.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

# Train the LDA topic model on the bag-of-words corpus.
# NOTE(review): no random seed is passed, so the topics presumably differ
# between runs -- confirm whether reproducibility is needed.
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

In [51]:
# Get each topic's top 10 words together with its coherence score.
top_topics = model.top_topics(corpus, topn=10)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
coherence_total = sum(entry[1] for entry in top_topics)
avg_topic_coherence = coherence_total / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

Average topic coherence: -1.0136.
[([(0.01676031, 'cornell'),
   (0.0145913055, 'university'),
   (0.011295941, 'application'),
   (0.009779364, 'arxiv'),
   (0.008383527, 'cornell_university'),
   (0.0071693403, 'position'),
   (0.005925425, 'backend'),
   (0.0058378163, 'online'),
   (0.0056311595, 'production'),
   (0.005628307, 'system')],
  -0.4918800217585634),
 ([(0.012622759, 'software'),
   (0.012479847, 'development'),
   (0.011185206, 'technology'),
   (0.00908064, 'we'),
   (0.00891945, 'application'),
   (0.008679699, 'you'),
   (0.008666556, 'skill'),
   (0.008262491, 'data'),
   (0.0077778473, 'review'),
   (0.0072583323, 'solution')],
  -0.5073619261711284),
 ([(0.07037405, 'data'),
   (0.025456788, 'learning'),
   (0.025060702, 'machine'),
   (0.024803711, 'machine_learning'),
   (0.013530113, 'solution'),
   (0.0118305255, 'model'),
   (0.01152141, 'engineer'),
   (0.01143821, 'business'),
   (0.011110936, 'develop'),
   (0.01095328, 'the')],
  -0.5905734656072598),
 

## Build LSI Model

In [57]:
# Build the LSI Model on the same corpus and id2word mapping used for LDA.
# NOTE(review): num_topics is hard-coded to 10 here rather than reusing the
# num_topics variable -- keep the two in sync if either changes.
lsi_model = LsiModel(corpus=corpus, id2word=id2word, num_topics=10, decay=0.5)

In [58]:
# Pretty-print the LSI topics (-1 presumably requests all topics -- confirm).
pprint(lsi_model.print_topics(-1))

[(0,
  '0.433*"data" + 0.174*"application" + 0.166*"business" + 0.165*"development" '
  '+ 0.133*"design" + 0.127*"technology" + 0.125*"code" + 0.121*"we" + '
  '0.117*"review" + 0.114*"solution"'),
 (1,
  '-0.346*"data" + 0.282*"cornell" + 0.237*"university" + 0.164*"arxiv" + '
  '0.158*"application" + 0.141*"cornell_university" + 0.120*"position" + '
  '-0.106*"business" + 0.096*"online" + 0.096*"applicant"'),
 (2,
  '0.170*"development" + -0.160*"hadoop" + -0.160*"rule" + 0.155*"skill" + '
  '-0.142*"build" + 0.137*"software" + -0.132*"pyspark" + -0.132*"cornell" + '
  '-0.122*"code" + -0.119*"application"'),
 (3,
  '-0.411*"data" + -0.219*"learning" + -0.219*"machine" + '
  '-0.219*"machine_learning" + 0.166*"design" + 0.163*"code" + '
  '0.147*"development" + 0.144*"skill" + -0.122*"cornell" + '
  '-0.114*"data_driven"'),
 (4,
  '-0.213*"you" + -0.212*"we" + 0.210*"dashboard" + 0.142*"design" + '
  '0.127*"business" + 0.125*"user" + 0.120*"required" + 0.116*"candidate" + '
  '-0.1