## Fletcher

#### Analyzingdata scientist listings

In [1]:
# code inspired from https://jessesw.com/Data-Science-Skills/

from bs4 import BeautifulSoup # For HTML parsing
import urllib2 # Website connections
import re # Regular expressions
from time import sleep # To prevent overwhelming the server between connections
from collections import Counter # Keep track of our term counts
from nltk.corpus import stopwords # Filter out stopwords, such as 'the', 'or', 'and'
import pandas as pd # For converting results to a dataframe and bar chart plots
from pymongo import MongoClient
import ssl
%matplotlib inline

In [2]:
client = MongoClient()
db = client.dsbc
job_descs = db["job_descriptions"]

In [3]:
def clean_text(url):
    '''this function takes a url to a job posting as input and outputs cleaned text of job description'''
    try:
        page = urllib2.urlopen(url, timeout = 3)
        soup = BeautifulSoup(page)
    except:
        try:
            context = ssl._create_unverified_context()
            page = urllib2.urlopen(url, context=context)
            soup = BeautifulSoup(page)
        except:
            print "can't load webpage: " + url
            return 
    
    for script in soup(["script", "style"]):
        script.extract()  # Remove these two elements from the BS4 object

    text = soup.get_text() # Get the text from this
    lines = (line.strip() for line in text.splitlines()) # break into lines
    chunks = (phrase.strip() for line in lines for phrase in line.split("  ")) # break multi-headlines into a line each

    def chunk_space(chunk):
        chunk_out = chunk + ' ' # Need to fix spacing issue
        return chunk_out

    text = ''.join(chunk_space(chunk) for chunk in chunks if chunk).encode('utf-8') # Get rid of all blank lines and ends of line

    # Now clean out all of the unicode junk (this line works great!!!)

    try:
        text = text.decode('unicode_escape').encode('ascii', 'ignore') # Need this as some websites aren't formatted
    except:                                                            # in a way that this works, can occasionally throw
        print "Encoding error at: " + url                              # an exception
        return  

    text = re.sub("[^a-zA-Z.+0-9]"," ", text)  # Now get rid of any terms that aren't words (include 3 for d3.js)
                                                # Also include + for C++

    text = text.lower().split()  # Go to lower case and split them apart

    stop_words = set(stopwords.words("english")) # Filter out any stop words
    text = [w for w in text if not w in stop_words]

    return text

In [4]:
def get_count(soup):
    '''get number of job postings for each location'''
    
    search_count = soup.find("div", attrs = {"id": "searchCount"}).text
    search_count = str(search_count)
    search_count = search_count.split("of ")[1]
    search_count = int(search_count.replace(",", ""))
    return search_count

In [5]:
def get_soup(location, list_index):
    location = location.replace(" ", "+")
    url = 'http://www.indeed.com/jobs?q=Data+Scientist&l=' + location + '&jt=fulltime&start=' + str(list_index)
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page)
    return soup

In [6]:
def get_descriptions(location):
    '''takes as input location in "city, state" format
    and inserts document with company, job, title, and job description to mongodb database'''
    
    location_adj = location.replace(" ", "+")
    list_index = 0
    
    soup = get_soup(location_adj, list_index)
    
    jobs_count = get_count(soup) # gets number of job postings
    
    while (list_index < jobs_count) and (list_index < 1000):

        soup = get_soup(location_adj, list_index)

        job_links = soup.find_all("a", attrs = {"rel": "nofollow", "itemprop": "title"})
        company_names = soup.find_all("span", attrs = {"class": "company", "itemprop": "hiringOrganization"})

        for i in range(10):
            try:
                job_find = str(job_links[i]).split('href="')[1]
                job_find = job_find.split('"', 1)
                job_url = job_find[0]
                job_url = 'http://www.indeed.com' + job_url

                job_title = job_find[1].split('title="')[1].split('"')[0]
                description = clean_text(job_url)
                company_name = company_names[i].string.decode('unicode_escape').encode('ascii', 'ignore')

                document = {"title": job_title, "description": description, "company": company_name, "location": location}
                job_descs.insert(document)
            
            except:
                pass

        list_index += 10
        

In [17]:
get_descriptions("New York, NY")

can't load webpage: http://www.indeed.com/rc/clk?jk=fe1b6ed033c3da05
can't load webpage: http://www.indeed.com/rc/clk?jk=eea861a09c368b08
can't load webpage: http://www.indeed.com/rc/clk?jk=fe1b6ed033c3da05
can't load webpage: http://www.indeed.com/rc/clk?jk=f55817b461dd9296
can't load webpage: http://www.indeed.com/rc/clk?jk=1170cf8273017948
can't load webpage: http://www.indeed.com/rc/clk?jk=f4180461c1594e09
can't load webpage: http://www.indeed.com/rc/clk?jk=2b29f5bff996644e
can't load webpage: http://www.indeed.com/rc/clk?jk=ba5b4dd5ad709365
can't load webpage: http://www.indeed.com/rc/clk?jk=3d5a83c661dafe4f
can't load webpage: http://www.indeed.com/rc/clk?jk=2b29f5bff996644e
can't load webpage: http://www.indeed.com/rc/clk?jk=f4180461c1594e09
can't load webpage: http://www.indeed.com/rc/clk?jk=4aa6343fb5b422c0
can't load webpage: http://www.indeed.com/rc/clk?jk=2ce32905cd66b559
can't load webpage: http://www.indeed.com/rc/clk?jk=f2d2737308c588e0
can't load webpage: http://www.ind




Encoding error at: http://www.indeed.com/rc/clk?jk=a78122ab4cdf32b8
can't load webpage: http://www.indeed.com/rc/clk?jk=7f66cdc2dce1c308




can't load webpage: http://www.indeed.com/rc/clk?jk=0fdadd7fd4770cf7
can't load webpage: http://www.indeed.com/rc/clk?jk=a63323957a25d626
can't load webpage: http://www.indeed.com/rc/clk?jk=0b389db70735951d
can't load webpage: http://www.indeed.com/rc/clk?jk=e4b4f874ab3c0056
can't load webpage: http://www.indeed.com/rc/clk?jk=b43fb478c3e509ed
can't load webpage: http://www.indeed.com/rc/clk?jk=71c25183f9b53d4d
can't load webpage: http://www.indeed.com/rc/clk?jk=fd74f9fbbeea25a3
can't load webpage: http://www.indeed.com/rc/clk?jk=1e1edddbaf95cc7d
can't load webpage: http://www.indeed.com/rc/clk?jk=aad8a97bb15a81b3
can't load webpage: http://www.indeed.com/rc/clk?jk=475a8bc82d08d389
can't load webpage: http://www.indeed.com/rc/clk?jk=58c98b994957ab6d
can't load webpage: http://www.indeed.com/rc/clk?jk=65b7920d61928cb2
can't load webpage: http://www.indeed.com/rc/clk?jk=1d58ac87d2f04932
can't load webpage: http://www.indeed.com/rc/clk?jk=ec8291733817ffd7
can't load webpage: http://www.in




can't load webpage: http://www.indeed.com/rc/clk?jk=ba54833832d328b2


In [None]:
# get_descriptions("New York, NY")

get_descriptions("San Francisco, CA")
get_descriptions("Washington, DC")
get_descriptions("Seattle, WA")
get_descriptions("Chicago, IL")
get_descriptions("Dallas, TX")

# get_descriptions("Houston, TX")
# get_descriptions("Los Angeles, CA")
# get_descriptions("Cambridge, MA")
# get_descriptions("Boston, MA")

# # get_descriptions("San Diego, CA")
# get_descriptions("Atlanta, GA")
# get_descriptions("Philadelphia, PA")
# get_descriptions("Austin, TX")

can't load webpage: http://www.indeed.com/rc/clk?jk=33326ff7432fc2d9
can't load webpage: http://www.indeed.com/rc/clk?jk=a4909bca385e91dc
can't load webpage: http://www.indeed.com/rc/clk?jk=624de3123c537acd
can't load webpage: http://www.indeed.com/rc/clk?jk=5c98448bb0d835c8
can't load webpage: http://www.indeed.com/rc/clk?jk=de013f7192746ba4
can't load webpage: http://www.indeed.com/rc/clk?jk=d270e9e959ccf3cf
can't load webpage: http://www.indeed.com/rc/clk?jk=de013f7192746ba4
can't load webpage: http://www.indeed.com/rc/clk?jk=a4909bca385e91dc
can't load webpage: http://www.indeed.com/rc/clk?jk=d270e9e959ccf3cf
can't load webpage: http://www.indeed.com/rc/clk?jk=067876b9374d5365
can't load webpage: http://www.indeed.com/rc/clk?jk=5c98448bb0d835c8
can't load webpage: http://www.indeed.com/rc/clk?jk=dd516a887453524f
can't load webpage: http://www.indeed.com/rc/clk?jk=2c37351b4774025c
can't load webpage: http://www.indeed.com/rc/clk?jk=1350aea668613613
can't load webpage: http://www.ind