In [10]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import string 
from time import sleep
from collections import Counter
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
skill_set = ['r','python','java','c++','ruby','perl','matlab','javascript','scala','excel','tableau',
             'd3js','sas','spss','d3','hadoop','mapreduce','spark',
             'pig','hive','shark','zookeeper','flume','mahout',
             'sql','nosql','hase','cassandra','mongodb','docker','aws']

In [12]:
def url_split_join(arg):
    '''
    Input: job or city
    Output: text formatted for indeed search 
    '''
    arg = arg.split()
    return '+'.join(word for word in arg)

In [13]:
#url_split_join('Data Scientist')

In [18]:
def get_search_url(job,city = None, state= None): 
    
    '''
    Input: job and city 
    Output: url that directs to results page for the query 
    '''
    job = url_split_join(job)
    
    if city is None and state is None:
        site_list = ['http://www.indeed.com/jobs?q="', job, '"']
    
    elif city is not None and state is None:
        city = url_split_join(city)
        site_list = ['http://www.indeed.com/jobs?q=', job, '&l=', city]
        
    elif city is None and state is not None:
        state = url_split_join(state)
        site_list = site_list = ['http://www.indeed.com/jobs?q=', job, '&l=', state]
        
    else: 
        city = url_split_join(city)
        state = url_split_join(state)
        site_list = ['http://www.indeed.com/jobs?q=%22', job, '%22&l=', city,
                   '%2C+', state]       
        
    return ''.join(site_list) 
    

In [230]:
search_url = get_search_url('Machine Learning Engineer','New York', 'NY')
search_url

'http://www.indeed.com/jobs?q=%22Machine+Learning+Engineer%22&l=New+York%2C+NY'

In [228]:
def get_job_urls(job,city = None, state = None):
    
    '''
    Goes through the result page for the query and return for urls for each organic job posting
    Input: query
    Output: list of all jobs urls features in the query 
    '''
    
    search_url = get_search_url(job,city, state) #gets results page 
    
    try:
        site = urlopen(search_url).read()
    except:
        return 'Invalid Search' #raises exception if search combination is invalid of if no jobs of that nature exist 
    
    soup = BeautifulSoup(site)
    
    if len(soup) == 0: # in case the default parser lxml doesn't work, try another one
        soup = BeautifulSoup(site, 'html5lib')
    
    
   #gets the total number (organic and sponsored) of job postings
    num_jobs = soup.find(id = 'searchCount').string 
    num_jobs = re.findall('\d+', num_jobs) 
    num_jobs = int("".join(num_jobs[1:]))
    
    #gets the number of page results
    if num_jobs > 10: 
        num_pages = num_jobs//10 
    else: 
        num_pages = 1
    
    page_urls = [] 
    
    #iterates over each page to get the urls within that page
    for i in range(num_pages): 

        start_num = str(i*10)  #page 1 starts at start = 0 , page 1 starts at 10 etc.
        page_url = ''.join([search_url,'&start=', start_num])  
        
        current_page = urlopen(page_url).read()
        page_soup = BeautifulSoup(current_page)
        
        if len(page_soup) == 0: # In case the default parser lxml doesn't work, try another one
            page_soup = BeautifulSoup(page_url, 'html5lib')
        
        results_col =  page_soup.find(id = 'resultsCol')  
        organic_tags = results_col.find_all('div', {'data-tn-component' : "organicJob"}) #gets tags for organic rearch results 
         
        urls  = [x.a.attrs.get('href') for x in organic_tags] #gets the url for the specific job 
        page_urls.append(urls)
        
        if len(urls) < 10: #necessary because sponsored jobs results included in num_jobs
            break 
    
    job_urls = ['https://www.indeed.com'+job for sublist in page_urls for job in sublist]
    
    return job_urls 
               

In [231]:
job_urls = get_job_urls("Machine Learning Engineer", 'New York','NY')
job_urls[1]

'https://www.indeed.com/rc/clk?jk=9abe0ba0d21aa953&fccid=c2f64c58662819c7&vjs=3'

In [216]:
def get_job_info(job_url):
    
    '''
    Input: url of indeed job posting
    Output: role, title, location and list of words in description
    '''
    
    #TO DO: 1) check that ds3 works 
 
    try:
        site =  urlopen(job_url).read() #opens and returns html
    except:
        return "url could not be opened and read" #CHECK THIS 
    
    soup = BeautifulSoup(site)
    
    if len(soup) == 0: # In case the default parser lxml doesn't work, try another one
        soup = BeautifulSoup(site, 'html5lib')
    
    #general job information
     
    try:
        job_title = soup.find('h3',{'class':"icl-u-xs-mb--xs icl-u-xs-mt--none jobsearch-JobInfoHeader-title"}).get_text()
    except:
        job_title = 0 
    
    try:
        company_name = soup.find('div',{'class':'icl-u-lg-mr--sm icl-u-xs-mr--xs'}).get_text() #do the other company name thing, do this for state
    except:
        company_name = 0
                                       
#     soup_title = str(soup.title)
#     indexes = [i for i, ltr in enumerate(soup_title) if ltr == '-'] #handles if multiple '-' which divide role and location
    
#     if len(indexes) == 2: 
#         location = soup_title[indexes[0]+1:indexes[1]].split()[:2] 
#         city = location[0]
        #state = location [1] 
    
#     else: 
#         location = soup_title[indexes[-2]+1: indexes[-1]].split()[:2] 
#         city = location[0]
       #state = location [1]

    try:
        company_info = soup.select('div.jobsearch-InlineCompanyRating.icl-u-xs-mt--xs.jobsearch-DesktopStickyContainer-companyrating')[0].text
        company_info = company_info.split('-')[-1]
        company_info = company_info.split(' ')
    
        info = []
    
        for i in company_info:
            try:
                i = int(i)
                info.append(i)
            except:
                info.append(i)
    
    location = [x for x in info if not isinstance(x, int)]
    city = ' '.join(location[:-1])
    state = location[-1]
    
    else:
        city = 'unavailable'
        state = 'unavailable'
       
    content = soup.find('div',{'class':'jobsearch-JobComponent-description icl-u-xs-mt--md'})
    words = content.get_text().split()
    
    punctuation = string.punctuation
    stop_words = stopwords.words('english')
    
    words =[''.join(ch for ch in word if ch not in punctuation) for word in words] #gets rid of punctuation between words to enable joint word adjustment and genereal punctuation 
    words = [re.sub(r"([a-z])([A-Z])", r"\1 \2",word).split() for word in words] #adjusts for joint words
    
    words =[word.lower() for sublist in words for word in sublist] #flattens lists
    job_description = [word for word in words if word not in stop_words and word not in punctuation] #gets rids of stop words

    return job_title, company_name, city[:-1], state, job_description


In [232]:
get_job_info('https://www.indeed.com/rc/clk?jk=9abe0ba0d21aa953&fccid=c2f64c58662819c7&vjs=3')

('Machine Learning Engineer',
 'Voodoo Manufacturing',
 'Brooklyn',
 'NY',
 ['voodoo',
  'manufacturing',
  'building',
  'digital',
  'factory',
  'make',
  'manufacturing',
  'fast',
  'affordable',
  'scalable',
  'software',
  'brooklynbased',
  'factory',
  '200',
  '3d',
  'printers',
  'make',
  'thousands',
  'plastic',
  'parts',
  'products',
  'astonishingly',
  'short',
  'periods',
  'time',
  'making',
  'manufacturing',
  'accessible',
  'helping',
  'companies',
  'individuals',
  'bridge',
  'gap',
  'prototyping',
  'highvolume',
  'production',
  'customers',
  'include',
  'intel',
  'universal',
  'studios',
  'sy',
  'fy',
  'channel',
  'autodesk',
  'viacom',
  'mattel',
  'many',
  'wellfunded',
  'topnotch',
  'investors',
  'including',
  'combinator',
  'w17',
  'batch',
  'general',
  'catalyst',
  'kpcb',
  'edge',
  'focused',
  'building',
  'elite',
  'team',
  'scale',
  'company',
  'coming',
  'years',
  'looking',
  'ambitious',
  'machine',
  'lear

In [218]:
#get_job_info('https://www.indeed.com/cmp/Vensiti-Inc.,/jobs/Data-Scientist-7fc1138f244364aa?q=%22Data+Scientist%22&vjs=3')

In [219]:
#get_job_info('https://www.indeed.com/cmp/Rivel-Research-Group/jobs/Entry-Level-Governance-Esg-Research-Associate-d4cf90c3cff672dd?sjdu=Zzi_VW2ygsY1fzh3Ma9ZsE4zIT1NTXCwgFBhdjeTC3PftXK_q41keFjEJWCY3RMoWfuflRu4OS7wJLBu2LQx-Q&tk=1d0a8l0480ma2003&vjs=3')

In [155]:
#get_job_info('https://www.indeed.com/viewjob?jk=891cfca2b16bf81e&tk=1d0a2hatg19t8000&from=serp&alid=3&advn=8782415643047922&sjdu=JS6l7gydYCZv8PVkhKxCPnl-QLnRktmQo81JznoYMm1mcmDdYzLAXfklnXERcEVjwletQFW-TwQlaPMsRh6LMg_FF59PQXBkVL66Sl86xFncyvSUuUf5UXvPvCrNA6veXuWVcy8oSAkuEAiehROlopkLfP5Md6JkycvfVYukEoQ')

In [101]:
def clean_job_description(job_description,skill_set): 
    '''
    Input: list of words included in job posting 
    Output: 
        Desc = list of stemmed words included in job posting no including tecnical skills 
        skills = technical skills required for job 
    
    '''
    #TO DO      Check whether the skills should be taken out 
    #           The description includes no skills because of the variety of languages can be used to do the same job
    #           Consider whether you want to add back the skills. Because maybe skills are very reflective of nature of job
    
    skills = list(set([word for word in job_description if word in skill_set]))
    text_no_skills = [word for word in job_description if word not in skills]
    
    stemmer = SnowballStemmer('english') #get rid if this if you're going to do semantic similary
    text_no_skills = [stemmer.stem(word) for word in text_no_skills]
    
    return text_no_skills, skills 
    

In [102]:
text_no_skills, skills = clean_job_description(job_description,skill_set)

In [103]:
#text_no_skills
#skills

In [108]:
def get_data(job, skill_set, city = None, state = None):
    '''
    Input:
    Ouput: dataframe with revelant information about job
        Words used for similiarty 
        Skills and location used for filetring results
    '''
    
     #TO DO 1) how do i make this run faster 
    

    job_urls = get_job_urls(job,city) 
    
    if job_urls == 'Invalid Search':
        return 'Invalid Search'
    
    job = []
    unreadable_count = 0 
    
    for url in job_urls:
        job_info = get_job_info(url)
        
        if job_info == "url could not be opened and read":
            unreadable_count += 1
        else:
            job_title, company_name, city, state, job_description = job_info
            text_no_skills, skills = clean_job_description(job_description,skill_set)
            job.append({'job_title':job_title,"company":company_name,'city':city, 'state': state,'desc': text_no_skills, 'skills':skills,'url': url}) 
        
    return pd.DataFrame(job), unreadable_count

#here that dataframe needs to be with the constraints



In [152]:
data_scientist_1, count_1 = get_data("Data Scientist", skill_set,'Buford','GA' )

In [153]:
data_scientist_1

Unnamed: 0,city,company,desc,job_title,skills,state,url
0,Alpharetta,Kemper,"[locat, alpharetta, ga, posit, summari, create...",Data Analytics Specialist,[],GA,https://www.indeed.com/rc/clk?jk=455afcff50032...
1,Alpharetta,CHEP,"[parttim, temporari, internship, chep, help, m...",Internship – Data Science,[],GA,https://www.indeed.com/rc/clk?jk=dd8f4cf0deba0...
2,Duluth,"National Vision, Inc.","[posit, descript, nation, vision, believ, ever...",Data Scientist,"[spss, r, tableau, sas, python, sql]",GA,https://www.indeed.com/rc/clk?jk=a1db79011012c...
3,Atlanta,SunTrust,"[perform, sophist, data, analyt, encompass, da...",Data Scientist,[],GA,https://www.indeed.com/rc/clk?jk=b0bf8ea49f266...
4,Alpharetta,Verizon,"[you'll, you'll, part, team, identifi, trend, ...",Data Scientist,"[r, tableau, hadoop, python, hive, sql]",GA,https://www.indeed.com/rc/clk?jk=8ab778c265cdf...
5,Duluth,WestRock,"[nyse, wrk, partner, custom, provid, different...",Associate Data Scientist,"[spark, java, r, hadoop, spss, python]",GA,https://www.indeed.com/rc/clk?jk=b8db29caa083b...
6,Alpharetta,Change Healthcare,"[market, analysi, research, industri, topic, i...",Data Science Analyst,"[perl, excel, java, hadoop, python, sql]",GA,https://www.indeed.com/company/Green/jobs/Data...
7,Atlanta,Arby's,"[purpos, posit, posit, critic, understand, dri...","Associate Data Scientist, Advanced Analytics","[r, sql, python]",GA,https://www.indeed.com/rc/clk?jk=77518cb3e9ed4...
8,Suwanee,"ARRIS Group, Inc.","[internship, data, analyt, intern, 18003239, d...",Data Analytics Intern,[python],GA,https://www.indeed.com/rc/clk?jk=b1f2cf67e21c3...
9,Alpharetta,Brielle Technologies,"[60, 70, hour, senior, big, data, engin, digit...",Data Scientists – Machine Learning,"[spark, java, r, tableau, hadoop, d3js, scala,...",GA,https://www.indeed.com/rc/clk?jk=b5900f9c349e1...


In [221]:
def dfs_to_csv(job,file_name,skill_set,path):
    '''
    Input: job as typed out in indeed 
    '''
    job_df, count = get_data(job,skill_set)
    job_df['job'] = job
    job_df.to_csv(path + file_name + '.csv')
    
    return count 
    
#     software_developer, count_2 = get_data ("Software Developer",skill_set)
#     software_developer['job'] = 'software developer'
    
#     machine_learning_enginner, count_3 = get_data("Machine Learning Engineer",skill_set)
#     machine_learning_enginner['job'] = 'machine learning enginner'
    
#     full_stack_developer, count_4 = get_data("Full Stack Developer",skill_set)
#     full_stack_developer['job'] = ' full_stack_developer'
    
#     bussiness_intelligence_analyst, count_5 = get_data('Business Intelligence Analyst',skill_set)
#     bussiness_intelligence_analyst['job'] = 'bussiness_intelligence_analyst'
    
#     developer, count_6 = get_data('Developer',skill_set)
#     developer['job'] = 'developer'
    
#     cloud_architect, count_7 = get_data('Cloud Architect',skill_set)
#     cloud_architect['job'] = 'cloud_architect'
    
#     data_engineer, count_8 = get_data("Data Engineer",skill_set)
#     data_engineer['job'] = 'data_engineer'
      
#   df.to_csv(path)

In [222]:
ds_count = dfs_to_csv('Data Scientist','data_scientist',skill_set,r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/')


In [223]:
ds_count

1

In [236]:
#l = []
#l[-1]

In [226]:
mle_count = dfs_to_csv('Machine Learning Engineer', 'machine_learning_engineer',skill_set, r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/' )

IndexError: list index out of range

In [225]:
#soft_eng_count = dfs_to_csv('Software Developer', 'softwate_developer',skill_set,r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/')

  ' that document to Beautiful Soup.' % decoded_markup


AttributeError: 'NoneType' object has no attribute 'find_all'

In [90]:
def get_query_info(query_url,skill_set):
    
    job_title, company_name, city, state, job_description = get_job_info(query_url)
    text_no_skills, skills = clean_job_description(job_description,skill_set)
    
    return job_title, company_name, city, state, text_no_skills, skills

In [91]:
query_url = get_job_urls('Data Scientist','Boston','MA')[0] #you have to deal with the .string thing 

In [692]:
job_title, company_name, city, state, text_no_skills, skills = get_query_info(query_url,skill_set) #need to fix the title string thing

In [697]:
def get_top_sim(query_words, df):
    #TO DO 1) here is where I need to incorporate semantic similiarty 
    #      2) can't stem prior to doing similiarty so need to unstem and then resteam
    

    
    query_words = [" ".join(query)]
    job_words = df.desc.apply(lambda x: " ".join(x)).values.tolist() #this gives you a list of list of the jobs 
     
    vectorizer = TfidfVectorizer() #make sure these defaults are correct
    model = vectorizer.fit(job_words)
    
    query_tfidf = model.transform(query_words)
    job_tfidf = model.transform(job_words)
    
    cosine_sim = cosine_similarity(query_tfidf,job_tfidf)
    top_10_sim = cosine_sim.argsort()[0][::-1][0:10]
    
    return cosine_sim, top_10_sim

In [699]:
cosine_sim, top_10_sim = get_top_sim(text_no_skills,data)

In [702]:
cosine_sim
top_10_sim

array([20, 22, 24, 13,  6, 23, 14,  8, 26, 18])

In [703]:
def get_top_jobs(df,indices):
    return df.iloc[indices,:]

In [705]:
get_top_jobs(data,top_10_sim)

Unnamed: 0,city,company,desc,job_title,skills,state,url
20,"Atlanta,",Epsilon,"[sr, data, scientist, analyt, practic, respons...",Senior Data Scientist,"[aws, python, spark, sql, hadoop, sas, java, s...",GA,https://www.indeed.com/rc/clk?jk=18fb5f0e261aa...
22,"Roswell,",MotoRad of America,"[job, summari, respons, analyz, larg, amount, ...",Insights and Analytics Manager,[],GA,https://www.indeed.com/company/MotoRad-of-Amer...
24,"Alpharetta,",ADP,"[unlock, career, potenti, technolog, adp, enjo...",Full Stack Developer,"[java, docker, excel]",GA,https://www.indeed.com/rc/clk?jk=37fafd1237590...
13,"Atlanta,",Cox Communications,"[descript, cox, communic, look, data, scientis...",Data Scientist,"[sql, python, spark, sas, java, scala, hive, r...",GA,https://www.indeed.com/rc/clk?jk=759ba16b59111...
6,"Alpharetta,",Equifax,"[job, number, 20765, posit, titl, extern, desc...","Data Scientist, Keying and Linking","[sql, r, sas]",GA,https://www.indeed.com/rc/clk?jk=337a80c92cae2...
23,"Atlanta,",Cox Communications,"[descript, cox, communic, look, director, data...","Director, Data Science","[sql, python, spark, sas, java, scala, hive, r...",GA,https://www.indeed.com/rc/clk?jk=280ddd1362b17...
14,"Atlanta,",Catalina Marketing,"[catalina, catalina, person, digit, media, con...",Senior Data Scientist,"[python, r]",GA,https://www.indeed.com/rc/clk?jk=f7d028a809b84...
8,"Atlanta,",Catalina Marketing,"[catalina, catalina, person, digit, media, con...",Data Scientist,"[python, r]",GA,https://www.indeed.com/rc/clk?jk=303a953bd67d7...
26,"Alpharetta,",ADP,"[adp, hire, princip, applic, develop, full, st...",Principal Application Developer,"[java, docker, excel, sql]",GA,https://www.indeed.com/rc/clk?jk=63c3ab3e6804c...
18,"Seattle,",KPMG,"[innov, collabor, shine, lighthous, –, kpmgs, ...","Sr. Associate, Data Scientist, NLP","[aws, python, sql]",WA,https://www.indeed.com/rc/clk?jk=062f6a71626aa...


# Notes
- how do you update relevant skills in case a new language comes up?
    - word associated with data science
    - essentailly i don't have to feed it in the skills_set
- which state / city has job posting with the skills that you have
    - If you're moving from one place to another what skills should you learn
- On the website track the skills per day, per week, per month so you can see how skills are changing over time 
- Given a certain set of skill, return the job postings that for which you have the highest percentage of those skills


## To Do 
1. Make sure that it is going through all the pages
2. Make sure it is getting all the links on each page
3. How do you want to organize
4. Are the less jobs in of 84 than actual jobs
5. How do you decrease run time 
6. Can I use this same webscrabing code to go through Udemy, Coursera, Udacity, Ed-Ex and determine which courses are the best?
7. Have some sort of counter when the website term 


In [None]:
'http://www.indeed.com/jobs?q=%22', final_job, '%22&l=', final_city,
                   '%2C+', state] 