In [289]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import string 
from time import sleep
from collections import Counter
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
skill_set = ['r','python','java','c++','ruby','perl','matlab','javascript','scala','excel','tableau',
             'd3js','sas','spss','d3','hadoop','mapreduce','spark',
             'pig','hive','shark','zookeeper','flume','mahout',
             'sql','nosql','hase','cassandra','mongodb','docker','aws']

In [36]:
def url_split_join(arg):
    '''
    Input: job, city or state
    Output: text formatted for indeed search 
    '''
    arg = arg.split()
    return '+'.join(word for word in arg)

In [None]:
#url_split_join('Data Scientist')

In [37]:
def get_search_url(job,city = None, state = None): 
    
    '''
    Input: job and city and or state
    Output: url that directs to results page for the query 
    '''
    job = url_split_join(job)
    
    if city is None and state is None:
        site_list = ['http://www.indeed.com/jobs?q="', job, '"']
    
    elif city is not None and state is None:
        city = url_split_join(city)
        site_list = ['http://www.indeed.com/jobs?q=', job, '&l=', city]
        
    elif city is None and state is not None:
        state = url_split_join(state)
        site_list = site_list = ['http://www.indeed.com/jobs?q=', job, '&l=', state]
        
    else: 
        city = url_split_join(city)
        state = url_split_join(state)
        site_list = ['http://www.indeed.com/jobs?q=%22', job, '%22&l=', city,
                   '%2C+', state]
        
    return ''.join(site_list) 
    

In [390]:
search_url = get_search_url('Data Scientist','Budford','GA')
search_url

'http://www.indeed.com/jobs?q=%22Data+Scientist%22&l=Budford%2C+GA'

In [397]:
def get_job_urls(job,city = None, state = None):
    
    '''
    Goes through the result page for the query and return for urls for each organic job posting
    Input: query
    Output: list of all jobs urls features in the query 
    '''
    
    search_url = get_search_url(job,city,state) #gets results page 
    
    try:
        site = urlopen(search_url).read()
    except:
        return 'Invalid Search' #raises exception if search combination is invalid of if no jobs of that nature exist 
    
    soup = BeautifulSoup(site)
    
    if len(soup) == 0: # in case the default parser lxml doesn't work, try another one
        soup = BeautifulSoup(site, 'html5lib')
    
    
   #gets the total number (organic and sponsored) of job postings
    num_jobs = soup.find(id = 'searchCount').string 
    num_jobs = re.findall('\d+', num_jobs) 
    num_jobs = int("".join(num_jobs[1:]))
    
    #gets the number of page results
    if num_jobs > 10: 
        num_pages = num_jobs//10 
    else: 
        num_pages = 1
    
    page_urls = [] 
    
    #iterates over each page to get the urls within that page
    for i in range(num_pages): 

        start_num = str(i*10)  #page 1 starts at start = 0 , page 1 starts at 10 etc.
        page_url = ''.join([search_url,'&start=', start_num])  
        
        current_page = urlopen(page_url).read()
        page_soup = BeautifulSoup(current_page)
        
        if len(page_soup) == 0: # In case the default parser lxml doesn't work, try another one
            page_soup = BeautifulSoup(page_url, 'html5lib')
        
        results_col =  page_soup.find(id = 'resultsCol')  
        organic_tags = results_col.find_all('div', {'data-tn-component' : "organicJob"}) #gets tags for organic rearch results 
         
        urls  = [x.a.attrs.get('href') for x in organic_tags] #gets the url for the specific job 
        page_urls.append(urls)
        
        if len(urls) < 10: #necessary because sponsored jobs results included in num_jobs
            break 
    
    job_urls = ['https://www.indeed.com'+job for sublist in page_urls for job in sublist]
    
    return job_urls 
               

In [399]:
job_urls = get_job_urls("Data Scientist", 'Buford','GA')
#job_urls

In [371]:
def get_job_info(job_url):
    
    '''
    Input: url of indeed job posting
    Output: role, title, location and list of words in description
    '''
    
    #TO DO: 1) check that ds3 works 
    #       3) make sure general function is accounted for in ch for ch if not in punctuation 
    
    
    # THERE IS A DESCRIPTION SECTION 
    #<div class="jobsearch-JobComponent-description icl-u-xs-mt--md">
    #break this up better because you can go into title, company, location etc. 

    try:
        site =  urlopen(job_url).read() #opens and returns html
    except:
        return "url could not be opened and read" #CHECK THIS 
    
    soup = BeautifulSoup(site)
    
#     if len(soup) == 0: # In case the default parser lxml doesn't work, try another one
#         soup = BeautifulSoup(site, 'html5lib')
    
#     title = str(soup.title)
#     indexes = [i for i, ltr in enumerate(title) if ltr == '-'] #handles if multiple - which divide role and location
    
#     if len(indexes) == 2: 
#         role = title[title.find('>'):title.find('-')][1:].lower().split()
#         location = title[indexes[0]+1:indexes[1]].split()[:2] 
#         city = location[0]
#         state = location [1] #keeps failing 
    
#     else: 
#         role = title[7: indexes[-2]].replace('-'," ").split()
#         location = title[indexes[-2]+1: indexes[-1]].split()[:2] 
#         city = location[0]
#         state = location [1] #keeps failling
     
#     for obj in soup(['style','script','noscript','meta','title']): #removes all unnecessary objects 
#         obj.decompose()
        
#     words =  soup.get_text().split() + role #returns words in the text + title of job 
    
#     punctuation = string.punctuation
#     stop_words = stopwords.words('english')
    
#     words =[''.join(ch for ch in word if ch not in punctuation) for word in words] #gets rid of punctuation between words to enable joint word adjustment and genereal punctuation 
#     words = [re.sub(r"([a-z])([A-Z])", r"\1 \2",word).split() for word in words] #adjusts for joint word
    
#     words =[word.lower() for sublist in words for word in sublist] #flattens lists
#     words = [word for word in words if word not in stop_words] #gets rids of stop words
#     # in words get rid of first words because its the page info like find jobs etc. 
#     # also get rid of last words
#     return role, city, state, words[26:]
     

In [None]:
return 

In [381]:
role_1,city_1,state_1, words_1 = get_info(one_job_url)


In [385]:
one_job_url

'https://www.indeed.com/rc/clk?jk=a1db79011012cae4&fccid=4c9ec3daa2e076a1&vjs=3'

In [382]:
role_2,city_2,state_2, words_2 = get_info(two_job_url)

In [386]:
two_job_url

'https://www.indeed.com/rc/clk?jk=b0bf8ea49f266a49&fccid=131f8387a45b81d9&vjs=3'

In [383]:
#role
#city
words_1

['data',
 'scientist',
 'national',
 'vision',
 'inc362',
 'reviews',
 'duluth',
 'ganational',
 'vision',
 'inc362',
 'reviews',
 'read',
 'people',
 'saying',
 'working',
 'position',
 'description',
 'national',
 'vision',
 'believe',
 'everyone',
 'deserves',
 'see',
 'best',
 'live',
 'best',
 'help',
 'people',
 'making',
 'quality',
 'eye',
 'care',
 'eyewear',
 'affordable',
 'accessible',
 'national',
 'vision',
 'inc',
 'nvi',
 'one',
 'largest',
 'optical',
 'retailers',
 'united',
 'states',
 'offer',
 'innovative',
 'culture',
 'training',
 'priority',
 'hard',
 'work',
 'praised',
 'career',
 'growth',
 'reality',
 'right',
 'looking',
 'crm',
 'analytics',
 'analyst',
 'join',
 'growing',
 'team',
 'crm',
 'analytics',
 'analyst',
 'uses',
 'data',
 'manipulation',
 'data',
 'mining',
 'capabilities',
 'support',
 'analysis',
 'reporting',
 'needs',
 'crm',
 'loyalty',
 'marketing',
 'programs',
 'role',
 'analyze',
 'detailed',
 'customer',
 'data',
 'develop',
 'powerf

In [384]:
words_2

['reviews',
 'find',
 'salaries',
 'access',
 'millions',
 'salaries',
 'employers',
 'post',
 'job',
 'change',
 'countrywhatjob',
 'title',
 'keywords',
 'companywherecity',
 'state',
 'zip',
 'code',
 'find',
 'jobs',
 'advanced',
 'job',
 'search',
 'data',
 'scientist',
 'sun',
 'trust3619',
 'reviews',
 'atlanta',
 'ga',
 '30328sun',
 'trust3619',
 'reviews',
 'read',
 'people',
 'saying',
 'working',
 'performs',
 'sophisticated',
 'data',
 'analytics',
 'encompassing',
 'data',
 'mining',
 'inferential',
 'statistical',
 'analysis',
 'predictive',
 'analytics',
 'example',
 'identifies',
 'actionable',
 'insights',
 'various',
 'multiple',
 'sources',
 'data',
 'measurably',
 'improve',
 'business',
 'outcomes',
 'reduce',
 'business',
 'risk',
 'collects',
 'prepares',
 'data',
 'analysis',
 'performs',
 'exploratory',
 'advanced',
 'predictive',
 'andor',
 'modeling',
 'analytics',
 'identifies',
 'data',
 'relationships',
 'patterns',
 'trends',
 'provides',
 'consultation',

In [106]:
def clean_text(words,skill_set): 
    '''
    Input: list of words included in job posting 
    Output: 
        Desc = list of stemmed words included in job posting no including tecnical skills 
        skills = technical skills required for job 
    
    '''
    #TO DO      1) check whether the skills should be taken out 
    #           2) The description includes no skills because of the variety of languages can be used to do the same job
    #           3) consider whether you want to add back the skills
    
    skills = list(set([word for word in words if word in skill_set]))
    text_no_skills = [word for word in words if word not in skills]
    
    stemmer = SnowballStemmer('english')
    text_no_skills = [stemmer.stem(word) for word in text_no_skills]
    #text = desc_no_skills + skills 
    
    return text_no_skills, skills 
    

In [372]:
text_no_skills , skills = clean_text(words,skill_set)

In [373]:
#words
#skills

In [374]:
def get_data(job, skill_set, city = None, state = None):
    '''
    Input:
    Ouput: dataframe with revelant information about job
        Words used for similiarty 
        Skills and location used for filetring results
    '''
    
     #TO DO 1) how do i make this run faster 
    #     2) what is the bottle neck
    
    job_urls = get_job_urls(job,city,state) #if == invalid search
    
    job_info = []
    
    for url in job_urls:
        role,city,state,text = get_job_info(url) #if job info == cannot be open and read
        words, skills = clean_text(text,skill_set)
        job_info.append({'role':role,'city':city,'state':state,'desc': words, 'skills':skills,'url': url})
        
    return pd.DataFrame(job_info)

#here that dataframe needs to be with the constraints



In [375]:
data = get_data("Data Scientist", skill_set, 'Buford','GA', )

In [376]:
data.head()

Unnamed: 0,city,desc,role,skills,state,url
0,"Duluth,","[associ, data, scientist, west, rock1421, revi...","[associate, data, scientist]","[spss, python, spark, hadoop, java, r]",GA,https://www.indeed.com/rc/clk?jk=b8db29caa083b...
1,"Duluth,","[data, scientist, nation, vision, inc362, revi...","[data, scientist]","[sql, python, spss, sas, r, tableau]",GA,https://www.indeed.com/rc/clk?jk=a1db79011012c...
2,"Atlanta,","[data, scientist, sun, trust3619, review, atla...","[data, scientist]",[],GA,https://www.indeed.com/rc/clk?jk=b0bf8ea49f266...
3,"Alpharetta,","[data, scientist, verizon21964, review, alphar...","[data, scientist]","[sql, python, hadoop, hive, r, tableau]",GA,https://www.indeed.com/rc/clk?jk=8ab778c265cdf...
4,"Atlanta,","[consum, bank, data, scientist, sun, trust3619...","[consumer, banking, data, scientist]","[sql, excel]",GA,https://www.indeed.com/rc/clk?jk=503e464d73344...


In [239]:
#query 
urls = get_job_urls("Data Scientist","Boston","MA")
query_url = urls[0]

In [247]:
title,city,state,query_words = get_info(query_url) #need to fix title 

In [251]:
query, skills = clean_text(query_words,skill_set)

In [242]:
query

In [357]:
def get_top_sim(query, df):
    #TO DO 1) here is where I need to incorporate semantic similiarty 
    #      2) can't stem prior to doing similiarty so need to unstem and then resteam
    
    
    #need to get rid of main words
    
    query = [" ".join(query)]
    job_words  = df.desc.apply(lambda x: " ".join(x)).values.tolist() #this gives you a list of list of the jobs 
     
    vectorizer = TfidfVectorizer() #make sure these defaults are correct
    model = vectorizer.fit(job_words)
    
    query_tfidf = model.transform(query)
    job_tfidf = model.transform(job_words)
    
    cosine_sim = cosine_similarity(query_tfidf,job_tfidf)
    top_10_sim = cosine_sim.argsort()[0][::-1][0:10]
    
    return top_10_sim

In [358]:
indices = get_tfifd(query,data)

In [359]:
indices

array([ 3, 11, 20,  0,  5, 12, 22, 19, 26, 18])

In [351]:
def get_top_jobs(df,indices):
    return df.iloc[indices,:]

In [353]:
#get_top_jobs(data,indices)

# Notes
- how do you update relevant skills in case a new language comes up?
    - word associated with data science
    - essentailly i don't have to feed it in the skills_set
- which state / city has job posting with the skills that you have
    - If you're moving from one place to another what skills should you learn
- On the website track the skills per day, per week, per month so you can see how skills are changing over time 
- Given a certain set of skill, return the job postings that for which you have the highest percentage of those skills


## To Do 
1. Make sure that it is going through all the pages
2. Make sure it is getting all the links on each page
3. How do you want to organize
4. Are the less jobs in of 84 than actual jobs
5. How do you decrease run time 
6. Can I use this same webscrabing code to go through Udemy, Coursera, Udacity, Ed-Ex and determine which courses are the best?
7. Have some sort of counter when the website term 


In [None]:
'http://www.indeed.com/jobs?q=%22', final_job, '%22&l=', final_city,
                   '%2C+', state] 