In [259]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import string 
from time import sleep #don't need this here 
from collections import Counter #don't need this here
import pandas as pd
import numpy as np
from nltk.corpus import stopwords #don't need this here
from nltk.stem import SnowballStemmer #don't need this here
from sklearn.feature_extraction.text import TfidfVectorizer #don't need this here
from sklearn.metrics.pairwise import cosine_similarity #don't need this here

In [278]:
skill_set = ['r','python','java','c++','ruby','perl','matlab','javascript','scala','excel','tableau',
             'd3js','sas','spss','d3','hadoop','mapreduce','spark',
             'pig','hive','shark','zookeeper','flume','mahout',
             'sql','nosql','hase','cassandra','mongodb','docker','aws']

In [279]:
def url_split_join(arg):
    '''
    Input: job 
    Output: text formatted for indeed search 
    '''
    arg = arg.split()
    return '+'.join(word for word in arg)

In [294]:
url_split_join('Software Developer')

'Software+Developer'

In [292]:
def get_search_url(job): 
    
    '''
    Input: job 
    Output: url that directs to results page for the query 
    '''
    job = url_split_join(job)
    site_list = ['http://www.indeed.com/jobs?q="', job, '"']      
        
    return ''.join(site_list) 
    

In [295]:
get_search_url('Software Developer')

'http://www.indeed.com/jobs?q="Software+Developer"'

In [331]:
def get_job_urls(job):
    
    '''
    Goes through the result page for the query and return for urls for each organic job posting
    Input: query
    Output: list of all jobs urls features in the query 
    '''
    
    search_url = get_search_url(job) #gets results page 
    
    try:
        site = urlopen(search_url).read()
    except:
        return 'Invalid Search' #raises exception if search combination is invalid of if no jobs of that nature exist 
    
    soup = BeautifulSoup(site)
    
    if len(soup) == 0: # in case the default parser lxml doesn't work, try another one
        soup = BeautifulSoup(site, 'html5lib')
    
    
   #gets the total number (organic and sponsored) of job postings
    
    num_jobs = soup.find(id = 'searchCount').string 
    num_jobs = re.findall('\d+', num_jobs) 
    num_jobs = int("".join(num_jobs[1:]))
    print(num_jobs)
    
    #gets the number of page results
    if num_jobs > 10: 
        num_pages = num_jobs//10 
    else: 
        num_pages = 1
    print(num_pages)
    
    page_urls = [] 
    
    #iterates over each page to get the urls within that page
    for i in range(num_pages): 

        start_num = str(i*10)  #page 1 starts at start = 0 , page 1 starts at 10 etc.
        page_url = ''.join([search_url,'&start=', start_num]) 
        print(page_url)
        
        current_page = urlopen(page_url).read()
        page_soup = BeautifulSoup(current_page)
        
        if len(page_soup) == 0: # In case the default parser lxml doesn't work, try another one
            page_soup = BeautifulSoup(page_url, 'html5lib')
        
        results_col =  page_soup.find(id = 'resultsCol')  
        organic_tags = results_col.find_all('div', {'data-tn-component' : "organicJob"}) #gets tags for organic rearch results 
         
        urls  = [x.a.attrs.get('href') for x in organic_tags] #gets the url for the specific job 
        page_urls.append(urls)
        
        if len(urls) < 10: #necessary because sponsored jobs results included in num_jobs
            break 
    
    job_urls = ['https://www.indeed.com'+job for sublist in page_urls for job in sublist]
    
    return job_urls 
               

In [318]:
#get_job_urls('Software Developer')

In [289]:
def get_job_info(job_url):
    
    '''
    Input: url of indeed job posting
    Output: role, title, location and list of words in description
    '''
    
    #TO DO: 1) check that ds3 works 
 
    try:
        site =  urlopen(job_url).read() #opens and returns html
    except:
        return "url could not be opened and read" #CHECK THIS 
    
    soup = BeautifulSoup(site)
    
    if len(soup) == 0: # In case the default parser lxml doesn't work, try another one
        soup = BeautifulSoup(site, 'html5lib')
    
    #general job information
     
    try:
        job_title = soup.find('h3',{'class':"icl-u-xs-mb--xs icl-u-xs-mt--none jobsearch-JobInfoHeader-title"}).get_text()
    except:
        job_title = 'unavailable' 
    
    try:
        company_name = soup.find('div',{'class':'icl-u-lg-mr--sm icl-u-xs-mr--xs'}).get_text() #do the other company name thing, do this for state
    except:
        company_name = 'unavailable'

    try:
        company_info = soup.select('div.jobsearch-InlineCompanyRating.icl-u-xs-mt--xs.jobsearch-DesktopStickyContainer-companyrating')[0].text
        company_info = company_info.split('-')[-1]
        company_info = company_info.split(' ')
    
        info = []
    
        for i in company_info:
            try:
                i = int(i)
                info.append(i)
            except:
                info.append(i)
    
        location = [x for x in info if not isinstance(x, int)]
        city = ' '.join(location[:-1])
        state = location[-1]
    
    except:
        city = 'unavailable'
        state = 'unavailable'
       
    try:
        content = soup.find('div',{'class':'jobsearch-JobComponent-description icl-u-xs-mt--md'})
        words = content.get_text().split()
    
        punctuation = string.punctuation
        stop_words = stopwords.words('english')
    
        words =[''.join(ch for ch in word if ch not in punctuation) for word in words] #gets rid of punctuation between words to enable joint word adjustment and genereal punctuation 
        words = [re.sub(r"([a-z])([A-Z])", r"\1 \2",word).split() for word in words] #adjusts for joint words
    
        words =[word.lower() for sublist in words for word in sublist] #flattens lists
        job_description = [word for word in words if word not in stop_words and word not in punctuation] #gets rids of stop words

    except:
        job_description = 'Unavailable'
    
    return job_title, company_name, city[:-1], state, job_description
        


In [271]:
#get_job_info('https://www.indeed.com/viewjob?jk=891cfca2b16bf81e&tk=1d0a2hatg19t8000&from=serp&alid=3&advn=8782415643047922&sjdu=JS6l7gydYCZv8PVkhKxCPnl-QLnRktmQo81JznoYMm1mcmDdYzLAXfklnXERcEVjwletQFW-TwQlaPMsRh6LMg_FF59PQXBkVL66Sl86xFncyvSUuUf5UXvPvCrNA6veXuWVcy8oSAkuEAiehROlopkLfP5Md6JkycvfVYukEoQ')

In [286]:
def clean_job_description(job_description,skill_set): 
    '''
    Input: list of words included in job posting 
    Output: 
        Desc = list of stemmed words included in job posting no including tecnical skills 
        skills = technical skills required for job 
    
    '''
    
    try:
        skills = list(set([word for word in job_description if word in skill_set]))
        text_no_skills = [word for word in job_description if word not in skills]
    
    except:
        skills = 'unavailable'
        text_no_skills = 'unavailable'
    
    return text_no_skills, skills 
    
    
    

In [102]:
text_no_skills, skills = clean_job_description(job_description,skill_set)

In [103]:
#text_no_skills
#skills

In [322]:
def get_data(job, skill_set, city = None, state = None):
    '''
    Input:
    Ouput: dataframe with revelant information about job
        Words used for similiarty 
        Skills and location used for filetring results
    '''
    
     #TO DO 1) how do i make this run faster 
    

    job_urls = get_job_urls(job) 
    
    if job_urls == 'Invalid Search':
        return 'Invalid Search'
    
    job = []
    unreadable_count = 0 
    
    for url in job_urls:
        job_info = get_job_info(url)
        
        if job_info == "url could not be opened and read":
            unreadable_count += 1
        else:
            job_title, company_name, city, state, job_description = job_info
            text_no_skills, skills = clean_job_description(job_description,skill_set)
            job.append({'job_title':job_title,"company":company_name,'city':city, 'state': state,'desc': text_no_skills, 'skills':skills,'url': url}) 
        
    return pd.DataFrame(job), unreadable_count

#here that dataframe needs to be with the constraints



In [152]:
data_scientist_1, count_1 = get_data("Data Scientist", skill_set,'Buford','GA' )

In [153]:
data_scientist_1

Unnamed: 0,city,company,desc,job_title,skills,state,url
0,Alpharetta,Kemper,"[locat, alpharetta, ga, posit, summari, create...",Data Analytics Specialist,[],GA,https://www.indeed.com/rc/clk?jk=455afcff50032...
1,Alpharetta,CHEP,"[parttim, temporari, internship, chep, help, m...",Internship – Data Science,[],GA,https://www.indeed.com/rc/clk?jk=dd8f4cf0deba0...
2,Duluth,"National Vision, Inc.","[posit, descript, nation, vision, believ, ever...",Data Scientist,"[spss, r, tableau, sas, python, sql]",GA,https://www.indeed.com/rc/clk?jk=a1db79011012c...
3,Atlanta,SunTrust,"[perform, sophist, data, analyt, encompass, da...",Data Scientist,[],GA,https://www.indeed.com/rc/clk?jk=b0bf8ea49f266...
4,Alpharetta,Verizon,"[you'll, you'll, part, team, identifi, trend, ...",Data Scientist,"[r, tableau, hadoop, python, hive, sql]",GA,https://www.indeed.com/rc/clk?jk=8ab778c265cdf...
5,Duluth,WestRock,"[nyse, wrk, partner, custom, provid, different...",Associate Data Scientist,"[spark, java, r, hadoop, spss, python]",GA,https://www.indeed.com/rc/clk?jk=b8db29caa083b...
6,Alpharetta,Change Healthcare,"[market, analysi, research, industri, topic, i...",Data Science Analyst,"[perl, excel, java, hadoop, python, sql]",GA,https://www.indeed.com/company/Green/jobs/Data...
7,Atlanta,Arby's,"[purpos, posit, posit, critic, understand, dri...","Associate Data Scientist, Advanced Analytics","[r, sql, python]",GA,https://www.indeed.com/rc/clk?jk=77518cb3e9ed4...
8,Suwanee,"ARRIS Group, Inc.","[internship, data, analyt, intern, 18003239, d...",Data Analytics Intern,[python],GA,https://www.indeed.com/rc/clk?jk=b1f2cf67e21c3...
9,Alpharetta,Brielle Technologies,"[60, 70, hour, senior, big, data, engin, digit...",Data Scientists – Machine Learning,"[spark, java, r, tableau, hadoop, d3js, scala,...",GA,https://www.indeed.com/rc/clk?jk=b5900f9c349e1...


In [245]:
def dfs_to_csv(job,file_name,skill_set,path):
    '''
    Input: job as typed out in indeed 
    '''
    job_df, count = get_data(job,skill_set)
    job_df['job'] = job
    job_df.to_csv(path + file_name + '.csv')
    
    return count 


In [290]:
ds_count = dfs_to_csv('Data Scientist','data_scientist',skill_set,r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/')

In [288]:
mle_count = dfs_to_csv('Machine Learning Engineer', 'machine_learning_engineer',skill_set, r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/' )

In [None]:
soft_developer_count = dfs_to_csv('Software Developer', 'softwate_developer',skill_set,r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/')

11047
1104
http://www.indeed.com/jobs?q="Software+Developer"&start=0
http://www.indeed.com/jobs?q="Software+Developer"&start=10
http://www.indeed.com/jobs?q="Software+Developer"&start=20
http://www.indeed.com/jobs?q="Software+Developer"&start=30
http://www.indeed.com/jobs?q="Software+Developer"&start=40
http://www.indeed.com/jobs?q="Software+Developer"&start=50
http://www.indeed.com/jobs?q="Software+Developer"&start=60
http://www.indeed.com/jobs?q="Software+Developer"&start=70
http://www.indeed.com/jobs?q="Software+Developer"&start=80
http://www.indeed.com/jobs?q="Software+Developer"&start=90
http://www.indeed.com/jobs?q="Software+Developer"&start=100
http://www.indeed.com/jobs?q="Software+Developer"&start=110
http://www.indeed.com/jobs?q="Software+Developer"&start=120
http://www.indeed.com/jobs?q="Software+Developer"&start=130
http://www.indeed.com/jobs?q="Software+Developer"&start=140
http://www.indeed.com/jobs?q="Software+Developer"&start=150
http://www.indeed.com/jobs?q="Software+D

http://www.indeed.com/jobs?q="Software+Developer"&start=1360
http://www.indeed.com/jobs?q="Software+Developer"&start=1370
http://www.indeed.com/jobs?q="Software+Developer"&start=1380
http://www.indeed.com/jobs?q="Software+Developer"&start=1390
http://www.indeed.com/jobs?q="Software+Developer"&start=1400
http://www.indeed.com/jobs?q="Software+Developer"&start=1410
http://www.indeed.com/jobs?q="Software+Developer"&start=1420
http://www.indeed.com/jobs?q="Software+Developer"&start=1430
http://www.indeed.com/jobs?q="Software+Developer"&start=1440
http://www.indeed.com/jobs?q="Software+Developer"&start=1450
http://www.indeed.com/jobs?q="Software+Developer"&start=1460
http://www.indeed.com/jobs?q="Software+Developer"&start=1470
http://www.indeed.com/jobs?q="Software+Developer"&start=1480
http://www.indeed.com/jobs?q="Software+Developer"&start=1490
http://www.indeed.com/jobs?q="Software+Developer"&start=1500
http://www.indeed.com/jobs?q="Software+Developer"&start=1510
http://www.indeed.com/jo

http://www.indeed.com/jobs?q="Software+Developer"&start=2710
http://www.indeed.com/jobs?q="Software+Developer"&start=2720
http://www.indeed.com/jobs?q="Software+Developer"&start=2730
http://www.indeed.com/jobs?q="Software+Developer"&start=2740
http://www.indeed.com/jobs?q="Software+Developer"&start=2750
http://www.indeed.com/jobs?q="Software+Developer"&start=2760
http://www.indeed.com/jobs?q="Software+Developer"&start=2770
http://www.indeed.com/jobs?q="Software+Developer"&start=2780
http://www.indeed.com/jobs?q="Software+Developer"&start=2790
http://www.indeed.com/jobs?q="Software+Developer"&start=2800
http://www.indeed.com/jobs?q="Software+Developer"&start=2810
http://www.indeed.com/jobs?q="Software+Developer"&start=2820
http://www.indeed.com/jobs?q="Software+Developer"&start=2830
http://www.indeed.com/jobs?q="Software+Developer"&start=2840
http://www.indeed.com/jobs?q="Software+Developer"&start=2850
http://www.indeed.com/jobs?q="Software+Developer"&start=2860
http://www.indeed.com/jo

http://www.indeed.com/jobs?q="Software+Developer"&start=4060
http://www.indeed.com/jobs?q="Software+Developer"&start=4070
http://www.indeed.com/jobs?q="Software+Developer"&start=4080
http://www.indeed.com/jobs?q="Software+Developer"&start=4090
http://www.indeed.com/jobs?q="Software+Developer"&start=4100
http://www.indeed.com/jobs?q="Software+Developer"&start=4110
http://www.indeed.com/jobs?q="Software+Developer"&start=4120
http://www.indeed.com/jobs?q="Software+Developer"&start=4130
http://www.indeed.com/jobs?q="Software+Developer"&start=4140
http://www.indeed.com/jobs?q="Software+Developer"&start=4150
http://www.indeed.com/jobs?q="Software+Developer"&start=4160
http://www.indeed.com/jobs?q="Software+Developer"&start=4170
http://www.indeed.com/jobs?q="Software+Developer"&start=4180
http://www.indeed.com/jobs?q="Software+Developer"&start=4190
http://www.indeed.com/jobs?q="Software+Developer"&start=4200
http://www.indeed.com/jobs?q="Software+Developer"&start=4210
http://www.indeed.com/jo

http://www.indeed.com/jobs?q="Software+Developer"&start=5410
http://www.indeed.com/jobs?q="Software+Developer"&start=5420
http://www.indeed.com/jobs?q="Software+Developer"&start=5430
http://www.indeed.com/jobs?q="Software+Developer"&start=5440
http://www.indeed.com/jobs?q="Software+Developer"&start=5450
http://www.indeed.com/jobs?q="Software+Developer"&start=5460
http://www.indeed.com/jobs?q="Software+Developer"&start=5470
http://www.indeed.com/jobs?q="Software+Developer"&start=5480
http://www.indeed.com/jobs?q="Software+Developer"&start=5490
http://www.indeed.com/jobs?q="Software+Developer"&start=5500
http://www.indeed.com/jobs?q="Software+Developer"&start=5510
http://www.indeed.com/jobs?q="Software+Developer"&start=5520
http://www.indeed.com/jobs?q="Software+Developer"&start=5530
http://www.indeed.com/jobs?q="Software+Developer"&start=5540
http://www.indeed.com/jobs?q="Software+Developer"&start=5550
http://www.indeed.com/jobs?q="Software+Developer"&start=5560
http://www.indeed.com/jo

http://www.indeed.com/jobs?q="Software+Developer"&start=6760
http://www.indeed.com/jobs?q="Software+Developer"&start=6770
http://www.indeed.com/jobs?q="Software+Developer"&start=6780
http://www.indeed.com/jobs?q="Software+Developer"&start=6790
http://www.indeed.com/jobs?q="Software+Developer"&start=6800
http://www.indeed.com/jobs?q="Software+Developer"&start=6810
http://www.indeed.com/jobs?q="Software+Developer"&start=6820
http://www.indeed.com/jobs?q="Software+Developer"&start=6830
http://www.indeed.com/jobs?q="Software+Developer"&start=6840
http://www.indeed.com/jobs?q="Software+Developer"&start=6850
http://www.indeed.com/jobs?q="Software+Developer"&start=6860
http://www.indeed.com/jobs?q="Software+Developer"&start=6870
http://www.indeed.com/jobs?q="Software+Developer"&start=6880
http://www.indeed.com/jobs?q="Software+Developer"&start=6890
http://www.indeed.com/jobs?q="Software+Developer"&start=6900
http://www.indeed.com/jobs?q="Software+Developer"&start=6910
http://www.indeed.com/jo

http://www.indeed.com/jobs?q="Software+Developer"&start=8110
http://www.indeed.com/jobs?q="Software+Developer"&start=8120
http://www.indeed.com/jobs?q="Software+Developer"&start=8130
http://www.indeed.com/jobs?q="Software+Developer"&start=8140
http://www.indeed.com/jobs?q="Software+Developer"&start=8150
http://www.indeed.com/jobs?q="Software+Developer"&start=8160
http://www.indeed.com/jobs?q="Software+Developer"&start=8170
http://www.indeed.com/jobs?q="Software+Developer"&start=8180
http://www.indeed.com/jobs?q="Software+Developer"&start=8190
http://www.indeed.com/jobs?q="Software+Developer"&start=8200
http://www.indeed.com/jobs?q="Software+Developer"&start=8210
http://www.indeed.com/jobs?q="Software+Developer"&start=8220
http://www.indeed.com/jobs?q="Software+Developer"&start=8230
http://www.indeed.com/jobs?q="Software+Developer"&start=8240
http://www.indeed.com/jobs?q="Software+Developer"&start=8250
http://www.indeed.com/jobs?q="Software+Developer"&start=8260
http://www.indeed.com/jo

http://www.indeed.com/jobs?q="Software+Developer"&start=9460
http://www.indeed.com/jobs?q="Software+Developer"&start=9470
http://www.indeed.com/jobs?q="Software+Developer"&start=9480
http://www.indeed.com/jobs?q="Software+Developer"&start=9490
http://www.indeed.com/jobs?q="Software+Developer"&start=9500
http://www.indeed.com/jobs?q="Software+Developer"&start=9510
http://www.indeed.com/jobs?q="Software+Developer"&start=9520
http://www.indeed.com/jobs?q="Software+Developer"&start=9530
http://www.indeed.com/jobs?q="Software+Developer"&start=9540
http://www.indeed.com/jobs?q="Software+Developer"&start=9550
http://www.indeed.com/jobs?q="Software+Developer"&start=9560
http://www.indeed.com/jobs?q="Software+Developer"&start=9570
http://www.indeed.com/jobs?q="Software+Developer"&start=9580
http://www.indeed.com/jobs?q="Software+Developer"&start=9590
http://www.indeed.com/jobs?q="Software+Developer"&start=9600
http://www.indeed.com/jobs?q="Software+Developer"&start=9610
http://www.indeed.com/jo

http://www.indeed.com/jobs?q="Software+Developer"&start=10800
http://www.indeed.com/jobs?q="Software+Developer"&start=10810
http://www.indeed.com/jobs?q="Software+Developer"&start=10820
http://www.indeed.com/jobs?q="Software+Developer"&start=10830
http://www.indeed.com/jobs?q="Software+Developer"&start=10840
http://www.indeed.com/jobs?q="Software+Developer"&start=10850
http://www.indeed.com/jobs?q="Software+Developer"&start=10860
http://www.indeed.com/jobs?q="Software+Developer"&start=10870
http://www.indeed.com/jobs?q="Software+Developer"&start=10880
http://www.indeed.com/jobs?q="Software+Developer"&start=10890
http://www.indeed.com/jobs?q="Software+Developer"&start=10900
http://www.indeed.com/jobs?q="Software+Developer"&start=10910
http://www.indeed.com/jobs?q="Software+Developer"&start=10920
http://www.indeed.com/jobs?q="Software+Developer"&start=10930
http://www.indeed.com/jobs?q="Software+Developer"&start=10940
http://www.indeed.com/jobs?q="Software+Developer"&start=10950
http://w

In [326]:
fsd_count = dfs_to_csv('Full Stack Developer', 'full_stack_developer',skill_set,r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/')

http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=0
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=10
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=20
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=30
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=40
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=50
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=60
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=70
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=80
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=90
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=100
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=110
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=120
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=130
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=140
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=150
http://www.indeed.c

http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=1320
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=1330
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=1340
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=1350
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=1360
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=1370
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=1380
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=1390
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=1400
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=1410
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=1420
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=1430
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=1440
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=1450
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=1460
http://www.indeed.com/jobs?q="Full+Stack+Developer"&sta

http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=2630
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=2640
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=2650
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=2660
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=2670
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=2680
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=2690
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=2700
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=2710
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=2720
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=2730
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=2740
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=2750
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=2760
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=2770
http://www.indeed.com/jobs?q="Full+Stack+Developer"&sta

http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=3940
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=3950
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=3960
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=3970
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=3980
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=3990
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=4000
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=4010
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=4020
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=4030
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=4040
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=4050
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=4060
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=4070
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=4080
http://www.indeed.com/jobs?q="Full+Stack+Developer"&sta

http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=5250
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=5260
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=5270
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=5280
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=5290
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=5300
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=5310
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=5320
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=5330
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=5340
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=5350
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=5360
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=5370
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=5380
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=5390
http://www.indeed.com/jobs?q="Full+Stack+Developer"&sta

http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=6560
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=6570
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=6580
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=6590
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=6600
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=6610
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=6620
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=6630
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=6640
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=6650
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=6660
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=6670
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=6680
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=6690
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=6700
http://www.indeed.com/jobs?q="Full+Stack+Developer"&sta

http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=7870
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=7880
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=7890
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=7900
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=7910
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=7920
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=7930
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=7940
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=7950
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=7960
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=7970
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=7980
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=7990
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=8000
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=8010
http://www.indeed.com/jobs?q="Full+Stack+Developer"&sta

http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=9180
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=9190
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=9200
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=9210
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=9220
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=9230
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=9240
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=9250
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=9260
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=9270
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=9280
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=9290
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=9300
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=9310
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=9320
http://www.indeed.com/jobs?q="Full+Stack+Developer"&sta

http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=10480
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=10490
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=10500
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=10510
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=10520
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=10530
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=10540
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=10550
http://www.indeed.com/jobs?q="Full+Stack+Developer"&start=10560


KeyboardInterrupt: 

In [None]:
bia_count = dfs_to_csv('Business Intelligence Analyst','business_intelligence_analyst', skill_set,r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/')

In [None]:
developer_count = dsf_to_csv('Developer', 'developer', skill_set, r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/' )

In [None]:
data_eng_count = dfs_to_csv("Data Engineer", 'data_engineer', skill_set, r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/')

In [None]:
cloud_arc_count = dfs_to_csv("Could Architect", 'cloud_architect',skill_set,r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/')

In [None]:
data_arc_count = dfs_to_csv('Data Architect', 'data_architect',skill_set,r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/')

In [None]:
dev_ops_count = dfs_to_csv('DevOps','dev_ops',skill_set, r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/')

In [None]:
software_eng_count = dfs_to_csv('Sofwate Engineer','software_engineer',skill_set,r'/Users/danielatejada 1/Desktop/Galvanize/capstone_project/')

In [None]:
query_url = get_job_urls('Data Scientist','Boston','MA')[0] #you have to deal with the .string thing 

In [692]:
job_title, company_name, city, state, text_no_skills, skills = get_query_info(query_url,skill_set) #need to fix the title string thing

In [697]:
def get_top_sim(query_words, df):
    #TO DO 1) here is where I need to incorporate semantic similiarty 
    #      2) can't stem prior to doing similiarty so need to unstem and then resteam
     
    query_words = [" ".join(query)]
    job_words = df.desc.apply(lambda x: " ".join(x)).values.tolist() #this gives you a list of list of the jobs 
     
    vectorizer = TfidfVectorizer() #make sure these defaults are correct
    model = vectorizer.fit(job_words)
    
    query_tfidf = model.transform(query_words)
    job_tfidf = model.transform(job_words)
    
    cosine_sim = cosine_similarity(query_tfidf,job_tfidf)
    top_10_sim = cosine_sim.argsort()[0][::-1][0:10]
    
    return cosine_sim, top_10_sim

In [699]:
cosine_sim, top_10_sim = get_top_sim(text_no_skills,data)

In [702]:
cosine_sim
top_10_sim

array([20, 22, 24, 13,  6, 23, 14,  8, 26, 18])

In [703]:
def get_top_jobs(df,indices):
    return df.iloc[indices,:]

In [705]:
get_top_jobs(data,top_10_sim)

Unnamed: 0,city,company,desc,job_title,skills,state,url
20,"Atlanta,",Epsilon,"[sr, data, scientist, analyt, practic, respons...",Senior Data Scientist,"[aws, python, spark, sql, hadoop, sas, java, s...",GA,https://www.indeed.com/rc/clk?jk=18fb5f0e261aa...
22,"Roswell,",MotoRad of America,"[job, summari, respons, analyz, larg, amount, ...",Insights and Analytics Manager,[],GA,https://www.indeed.com/company/MotoRad-of-Amer...
24,"Alpharetta,",ADP,"[unlock, career, potenti, technolog, adp, enjo...",Full Stack Developer,"[java, docker, excel]",GA,https://www.indeed.com/rc/clk?jk=37fafd1237590...
13,"Atlanta,",Cox Communications,"[descript, cox, communic, look, data, scientis...",Data Scientist,"[sql, python, spark, sas, java, scala, hive, r...",GA,https://www.indeed.com/rc/clk?jk=759ba16b59111...
6,"Alpharetta,",Equifax,"[job, number, 20765, posit, titl, extern, desc...","Data Scientist, Keying and Linking","[sql, r, sas]",GA,https://www.indeed.com/rc/clk?jk=337a80c92cae2...
23,"Atlanta,",Cox Communications,"[descript, cox, communic, look, director, data...","Director, Data Science","[sql, python, spark, sas, java, scala, hive, r...",GA,https://www.indeed.com/rc/clk?jk=280ddd1362b17...
14,"Atlanta,",Catalina Marketing,"[catalina, catalina, person, digit, media, con...",Senior Data Scientist,"[python, r]",GA,https://www.indeed.com/rc/clk?jk=f7d028a809b84...
8,"Atlanta,",Catalina Marketing,"[catalina, catalina, person, digit, media, con...",Data Scientist,"[python, r]",GA,https://www.indeed.com/rc/clk?jk=303a953bd67d7...
26,"Alpharetta,",ADP,"[adp, hire, princip, applic, develop, full, st...",Principal Application Developer,"[java, docker, excel, sql]",GA,https://www.indeed.com/rc/clk?jk=63c3ab3e6804c...
18,"Seattle,",KPMG,"[innov, collabor, shine, lighthous, –, kpmgs, ...","Sr. Associate, Data Scientist, NLP","[aws, python, sql]",WA,https://www.indeed.com/rc/clk?jk=062f6a71626aa...


# Notes
- how do you update relevant skills in case a new language comes up?
    - word associated with data science
    - essentailly i don't have to feed it in the skills_set
- which state / city has job posting with the skills that you have
    - If you're moving from one place to another what skills should you learn
- On the website track the skills per day, per week, per month so you can see how skills are changing over time 
- Given a certain set of skill, return the job postings that for which you have the highest percentage of those skills


## To Do 
1. Make sure that it is going through all the pages
2. Make sure it is getting all the links on each page
3. How do you want to organize
4. Are the less jobs in of 84 than actual jobs
5. How do you decrease run time 
6. Can I use this same webscrabing code to go through Udemy, Coursera, Udacity, Ed-Ex and determine which courses are the best?
7. Have some sort of counter when the website term 


In [None]:
'http://www.indeed.com/jobs?q=%22', final_job, '%22&l=', final_city,
                   '%2C+', state] 