In [405]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import string 
from time import sleep #don't need this here 
from collections import Counter #don't need this here
import pandas as pd
import numpy as np
from nltk.corpus import stopwords #don't need this here
from nltk.stem import SnowballStemmer #don't need this here
from sklearn.feature_extraction.text import TfidfVectorizer #don't need this here
from sklearn.metrics.pairwise import cosine_similarity #don't need this here

In [406]:
skill_set = ['r','python','java','c++','ruby','perl','matlab','javascript','scala','excel','tableau',
             'd3js','sas','spss','d3','hadoop','mapreduce','spark',
             'pig','hive','shark','zookeeper','flume','mahout',
             'sql','nosql','hase','cassandra','mongodb','docker','aws']

In [407]:
def url_split_join(arg):
    '''
    Input: job 
    Output: text formatted for indeed search 
    '''
    arg = arg.split()
    return '+'.join(word for word in arg)

In [408]:
url_split_join('"Could Architect"')

'"Could+Architect"'

In [409]:
def get_search_url(job): 
    
    '''
    Input: job 
    Output: url that directs to results page for the query 
    '''
    job = url_split_join(job)
    site_list = ['http://www.indeed.com/jobs?q="', job, '"']      
        
    return ''.join(site_list) 
    

In [410]:
get_search_url('"Could Architect"')

'http://www.indeed.com/jobs?q=""Could+Architect""'

In [411]:
def get_job_urls(job):
    
    '''
    Goes through the result page for the query and return for urls for each organic job posting
    Input: query
    Output: list of all jobs urls features in the query 
    '''
    
    search_url = get_search_url(job) #gets results page 
    
    try:
        site = urlopen(search_url).read()
    except:
        return 'Invalid Search' #raises exception if search combination is invalid of if no jobs of that nature exist 
    
    soup = BeautifulSoup(site)
    
    if len(soup) == 0: # in case the default parser lxml doesn't work, try another one
        soup = BeautifulSoup(site, 'html5lib')
           
    #gets the total number (organic and sponsored) of job postings
    
    try: 
        num_jobs = soup.find(id = 'searchCount').string 
        num_jobs = re.findall('\d+', num_jobs) 
        num_jobs = int("".join(num_jobs[1:]))
    
    except:
        num_jobs = 150000
    
    #gets the number of page results
    if num_jobs > 10: 
        num_pages = num_jobs//10 
    else: 
        num_pages = 1
    
    page_urls = [] 
    
    #iterates over each page to get the urls within that page
    for i in range(num_pages): 

        start_num = str(i*10)  #page 1 starts at start = 0 , page 1 starts at 10 etc.
        page_url = ''.join([search_url,'&start=', start_num]) 
    
        current_page = urlopen(page_url).read()
        page_soup = BeautifulSoup(current_page)
        
        if len(page_soup) == 0: # In case the default parser lxml doesn't work, try another one
            page_soup = BeautifulSoup(page_url, 'html5lib')
        
        try:
       
            results_col =  page_soup.find(id = 'resultsCol')  
            organic_tags = results_col.find_all('div', {'data-tn-component' : "organicJob"}) #gets tags for organic rearch results 
         
            urls  = [x.a.attrs.get('href') for x in organic_tags] #gets the url for the specific job 
            page_urls.append(urls)
        
            if len(urls) < 10: #necessary because sponsored jobs results included in num_jobs
                break  
        except:
            break 
    
    job_urls = ['https://www.indeed.com'+job for sublist in page_urls for job in sublist]
    
    return job_urls
               

In [412]:
get_job_urls('Could Architect')

[]

In [413]:
def get_job_info(job_url):
    
    '''
    Input: url of indeed job posting
    Output: role, title, location and list of words in description
    '''
 
    try:
        site =  urlopen(job_url).read() #opens and returns html
    except:
        return "url could not be opened and read"
    
    soup = BeautifulSoup(site)
    
    if len(soup) == 0: # in case the default parser lxml doesn't work, try another one
        soup = BeautifulSoup(site, 'html5lib')
    
    #general job information
     
    try:
        job_title = soup.find('h3',{'class':"icl-u-xs-mb--xs icl-u-xs-mt--none jobsearch-JobInfoHeader-title"}).get_text()
    except:
        job_title = 'unavailable' 
    
    try:
        company_name = soup.find('div',{'class':'icl-u-lg-mr--sm icl-u-xs-mr--xs'}).get_text() #do the other company name thing, do this for state
    except:
        company_name = 'unavailable'

    try:
        company_info = soup.select('div.jobsearch-InlineCompanyRating.icl-u-xs-mt--xs.jobsearch-DesktopStickyContainer-companyrating')[0].text
        company_info = company_info.split('-')[-1]
        company_info = company_info.split(' ')
    
        info = []
    
        for i in company_info:
            try:
                i = int(i)
                info.append(i)
            except:
                info.append(i)
    
        location = [x for x in info if not isinstance(x, int)]
        city = ' '.join(location[:-1])
        state = location[-1]
    
    except:
        city = 'unavailable'
        state = 'unavailable'
       
    try:
        content = soup.find('div',{'class':'jobsearch-JobComponent-description icl-u-xs-mt--md'})
        words = content.get_text().split()
    
        punctuation = string.punctuation
        stop_words = stopwords.words('english')
    
        words =[''.join(ch for ch in word if ch not in punctuation) for word in words] #gets rid of punctuation between words to enable joint word adjustment and genereal punctuation 
        words = [re.sub(r"([a-z])([A-Z])", r"\1 \2",word).split() for word in words] #adjusts for joint words
    
        words =[word.lower() for sublist in words for word in sublist] #flattens lists
        job_description = [word for word in words if word not in stop_words and word not in punctuation] #gets rids of stop words

    except:
        job_description = 'Unavailable'
    
    return job_title, company_name, city[:-1], state, job_description
        


In [414]:
get_job_info('https://www.indeed.com/jobs?q=%22%22Could+Architect%22%22&start=0')


('unavailable', 'unavailable', 'unavailabl', 'unavailable', 'Unavailable')

In [415]:
def clean_job_description(job_description,skill_set): 
    '''
    Input: list of words included in job posting 
    Output: 
        Desc = list of stemmed words included in job posting no including tecnical skills 
        skills = technical skills required for job 
    
    '''
    
    try:
        skills = list(set([word for word in job_description if word in skill_set]))
        text_no_skills = [word for word in job_description if word not in skills]
    
    except:
        skills = 'unavailable'
        text_no_skills = 'unavailable'
    
    return text_no_skills, skills 
    
    
    

In [416]:
text_no_skills, skills = clean_job_description(job_description,skill_set)

In [417]:
#text_no_skills
#skills

In [418]:
def get_data(job, skill_set, city = None, state = None):
    '''
    Input:
    Ouput: dataframe with revelant information about job
        Words used for similiarty 
        Skills and location used for filetring results
    '''
    
     #TO DO 1) how do i make this run faster 
    

    job_urls = get_job_urls(job) 
    
    if job_urls == 'Invalid Search':
        return 'Invalid Search'
    
    job = []
    unreadable_count = 0 
    
    for url in job_urls:
        job_info = get_job_info(url)
        
        if job_info == "url could not be opened and read":
            unreadable_count += 1
        else:
            job_title, company_name, city, state, job_description = job_info
            text_no_skills, skills = clean_job_description(job_description,skill_set)
            job.append({'job_title':job_title,"company":company_name,'city':city, 'state': state,'desc': text_no_skills, 'skills':skills,'url': url}) 
        
    return pd.DataFrame(job), unreadable_count

#here that dataframe needs to be with the constraints



In [419]:
def dfs_to_csv(job,file_name,skill_set,path):
    '''
    Input: job as typed out in indeed 
    '''
    job_df, count = get_data(job,skill_set)
    job_df['job'] = job
    job_df.to_csv(path + file_name + '.csv')
    
    return count 


In [290]:
#DONE
#ds_count = dfs_to_csv('Data Scientist','data_scientist',skill_set,r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/')



In [288]:
#DONE
#mle_count = dfs_to_csv('Machine Learning Engineer', 'machine_learning_engineer',skill_set, r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/' )



In [342]:
#DONE
#soft_developer_count = dfs_to_csv('Software Developer', 'softwate_developer',skill_set,r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/')



In [333]:
#DONE
#fsd_count = dfs_to_csv('Full Stack Developer', 'full_stack_developer',skill_set,r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/')



In [335]:
#DONE
#bia_count = dfs_to_csv('Business Intelligence Analyst','business_intelligence_analyst', skill_set,r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/')



In [336]:
#LOADING

#developer_count = dsf_to_csv('Developer', 'developer', skill_set, r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/' )



In [337]:
#DONE
#data_eng_count = dfs_to_csv("Data Engineer", 'data_engineer', skill_set, r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/')



In [339]:
#DONE
#data_arc_count = dfs_to_csv('Data Architect', 'data_architect',skill_set,r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/')



In [340]:
#LOADING

#dev_ops_count = dfs_to_csv('DevOps','dev_ops',skill_set, r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/')



In [420]:
#python_count = dfs_to_csv('Python','python',skill_set, r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/')

  ' that document to Beautiful Soup.' % decoded_markup


In [421]:
#python_count

1

In [422]:
def get_ideal_text(skill_set,ideal_url):
    
    _, _, _,_, job_description = get_job_info(ideal_url)
    text_no_skills, _ = clean_job_description(job_description,skill_set)
    
    #job_title, company_name, city[:-1], state, job_description
    
    return text_no_skills

In [423]:
get_ideal_text(skill_set,'https://www.indeed.com/viewjob?jk=eafcd4523a84b235&tk=1d0d2d912ah4o804&from=serp&vjs=3')

['applies',
 'developed',
 'subject',
 'matter',
 'knowledge',
 'solve',
 'common',
 'complex',
 'business',
 'issues',
 'within',
 'established',
 'guidelines',
 'recommends',
 'appropriate',
 'alternatives',
 'works',
 'problems',
 'diverse',
 'complexity',
 'scope',
 'may',
 'act',
 'team',
 'project',
 'leader',
 'providing',
 'direction',
 'team',
 'activities',
 'facilitates',
 'information',
 'validation',
 'team',
 'decision',
 'making',
 'process',
 'exercises',
 'independent',
 'judgment',
 'within',
 'generally',
 'defined',
 'policies',
 'practices',
 'identify',
 'select',
 'solution',
 'ability',
 'handle',
 'unique',
 'situations',
 'may',
 'seek',
 'advice',
 'order',
 'make',
 'decisions',
 'complex',
 'business',
 'issues',
 'responsibilities',
 'design',
 'implement',
 'qa',
 'deploy',
 'document',
 'web',
 'analytics',
 'solutions',
 'using',
 'tag',
 'managers',
 'analytics',
 'tools',
 'diagnose',
 'resolve',
 'complex',
 'analytics',
 'data',
 'collection',
 'ano

In [692]:
job_title, company_name, city, state, text_no_skills, skills = get_query_info(query_url,skill_set) #need to fix the title string thing

In [697]:
def get_top_sim(query_words, df):
    #TO DO 1) here is where I need to incorporate semantic similiarty 
    #      2) can't stem prior to doing similiarty so need to unstem and then resteam
     
    query_words = [" ".join(query)]
    job_words = df.desc.apply(lambda x: " ".join(x)).values.tolist() #this gives you a list of list of the jobs 
     
    vectorizer = TfidfVectorizer() #make sure these defaults are correct
    model = vectorizer.fit(job_words)
    
    query_tfidf = model.transform(query_words)
    job_tfidf = model.transform(job_words)
    
    cosine_sim = cosine_similarity(query_tfidf,job_tfidf)
    top_10_sim = cosine_sim.argsort()[0][::-1][0:10]
    
    return cosine_sim, top_10_sim

In [699]:
cosine_sim, top_10_sim = get_top_sim(text_no_skills,data)

In [702]:
cosine_sim
top_10_sim

array([20, 22, 24, 13,  6, 23, 14,  8, 26, 18])

In [703]:
def get_top_jobs(df,indices):
    return df.iloc[indices,:]

In [705]:
get_top_jobs(data,top_10_sim)

Unnamed: 0,city,company,desc,job_title,skills,state,url
20,"Atlanta,",Epsilon,"[sr, data, scientist, analyt, practic, respons...",Senior Data Scientist,"[aws, python, spark, sql, hadoop, sas, java, s...",GA,https://www.indeed.com/rc/clk?jk=18fb5f0e261aa...
22,"Roswell,",MotoRad of America,"[job, summari, respons, analyz, larg, amount, ...",Insights and Analytics Manager,[],GA,https://www.indeed.com/company/MotoRad-of-Amer...
24,"Alpharetta,",ADP,"[unlock, career, potenti, technolog, adp, enjo...",Full Stack Developer,"[java, docker, excel]",GA,https://www.indeed.com/rc/clk?jk=37fafd1237590...
13,"Atlanta,",Cox Communications,"[descript, cox, communic, look, data, scientis...",Data Scientist,"[sql, python, spark, sas, java, scala, hive, r...",GA,https://www.indeed.com/rc/clk?jk=759ba16b59111...
6,"Alpharetta,",Equifax,"[job, number, 20765, posit, titl, extern, desc...","Data Scientist, Keying and Linking","[sql, r, sas]",GA,https://www.indeed.com/rc/clk?jk=337a80c92cae2...
23,"Atlanta,",Cox Communications,"[descript, cox, communic, look, director, data...","Director, Data Science","[sql, python, spark, sas, java, scala, hive, r...",GA,https://www.indeed.com/rc/clk?jk=280ddd1362b17...
14,"Atlanta,",Catalina Marketing,"[catalina, catalina, person, digit, media, con...",Senior Data Scientist,"[python, r]",GA,https://www.indeed.com/rc/clk?jk=f7d028a809b84...
8,"Atlanta,",Catalina Marketing,"[catalina, catalina, person, digit, media, con...",Data Scientist,"[python, r]",GA,https://www.indeed.com/rc/clk?jk=303a953bd67d7...
26,"Alpharetta,",ADP,"[adp, hire, princip, applic, develop, full, st...",Principal Application Developer,"[java, docker, excel, sql]",GA,https://www.indeed.com/rc/clk?jk=63c3ab3e6804c...
18,"Seattle,",KPMG,"[innov, collabor, shine, lighthous, –, kpmgs, ...","Sr. Associate, Data Scientist, NLP","[aws, python, sql]",WA,https://www.indeed.com/rc/clk?jk=062f6a71626aa...


# Notes
- how do you update relevant skills in case a new language comes up?
    - word associated with data science
    - essentailly i don't have to feed it in the skills_set
- which state / city has job posting with the skills that you have
    - If you're moving from one place to another what skills should you learn
- On the website track the skills per day, per week, per month so you can see how skills are changing over time 
- Given a certain set of skill, return the job postings that for which you have the highest percentage of those skills


## To Do 
1. Make sure that it is going through all the pages
2. Make sure it is getting all the links on each page
3. How do you want to organize
4. Are the less jobs in of 84 than actual jobs
5. How do you decrease run time 
6. Can I use this same webscrabing code to go through Udemy, Coursera, Udacity, Ed-Ex and determine which courses are the best?
7. Have some sort of counter when the website term 


In [None]:
'http://www.indeed.com/jobs?q=%22', final_job, '%22&l=', final_city,
                   '%2C+', state] 