In [47]:
import pandas as pd
import numpy as np
import urllib
from bs4 import BeautifulSoup
import time
import re

In [48]:
# https://www.indeed.com/jobs?q=data+scientist+$20,000&l=New+York&start=30
# $20000 means 20k or up
# l = location
# start increases incrementally by 10. Starting at 00.

Trialing one page of Indeed to test if my functions work.

In [49]:
# Test one page of url
url = 'https://www.indeed.com/jobs?q=data+scientist&start=00'
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html, 'lxml')

In [50]:
rows = soup.find_all('span', {'class':'np'})
if len(rows) > 1:
    print 'pass'
elif 'Next' in rows[0].text:
    print 'break'
else:
    print 'what'

break


In [51]:
rows[0].text

u'Next\xa0\xbb'

#### Function making
Defining all my functions for each feature

In [52]:
# Trying to get all jobs titles first
job_list = []
for row in soup.find_all('div', {'id':True, 'data-jk':True}):
    jobtitle = row.a.text
    print jobtitle
    job_list.append(jobtitle)
    
# Convert to function
def job_func(soup, job_list=[]):
    for row in soup.find_all('div', {'id':True, 'data-jk':True}):
        jobtitle = row.a.text
        job_list.append(jobtitle)
    return job_list

Data Scientist
Data Scientist
Junior Data Scientist
Jr Data Scientist
Data Scientist
Data Scientist/Machine Learning Engineer
Data Scientist
Data Scientist
Data Science Analyst
Data Scientist
Data Scientist
Entry Level Data Scientist
Data Scientist (Product)
Data Scientist - Interactive Gaming
Data Scientist - Machine Learning


In [53]:
# Looking at location
location_list = []
for row in soup.find_all('div', {'id':True}):
    for header in row.find_all('span', {'class':'location'}):
        location = header.text
        print location
        location_list.append(location)
        
# Convert to function
def loc_func(soup, location_list=[]):
    for row in soup.find_all('div', {'id':True}):
        for header in row.find_all('span', {'class':'location'}):
            location = header.text
            location_list.append(location)
    return location_list

Durham, NC
Arlington, VA 22209
Tempe, AZ 85282
Chantilly, VA 20151
Kennesaw, GA
San Diego, CA 92129
Seattle, WA 98103 (Green Lake - Wallingford area)
Newark, DE
Silver Spring, MD 20910
Chicago, IL
Kansas City, MO
United States
New York, NY 10011 (Chelsea area)
San Francisco, CA
Palo Alto, CA


In [54]:
# Looking at salary if there is one
salary_list = []
for row in soup.find_all('div', {'id':True}):
    for block in row.find_all('td', {'class':'snip'}):
        try:
            salary =  block.find('span', {'class':'no-wrap'}).text
            print salary
            salary_list.append(salary)
        except:
            print 'no salary info'
            salary_list.append(np.nan)
            
def sal_func(soup, salary_list=[]):
    for row in soup.find_all('div', {'id':True}):
        for block in row.find_all('td', {'class':'snip'}):
            try:
                salary =  block.find('span', {'class':'no-wrap'}).text
                salary_list.append(salary)
            except:
                salary_list.append(np.nan)
                
    return salary_list

no salary info
no salary info
no salary info
no salary info
no salary info
no salary info
no salary info
$125,000 a year
no salary info
$110,000 - $125,000 a year
no salary info
no salary info
no salary info
no salary info
no salary info


In [55]:
# Looking at company
company_list = []
for row in soup.find_all('div', {'id':True}):
    for header in row.find_all('span', {'class':'company'}):
        company_list.append(header.text.replace('\n',''))

def comp_func(soup, company_list=[]):
    for row in soup.find_all('div', {'id':True}):
        for header in row.find_all('span', {'class':'company'}):
            company_list.append(header.text.replace('\n',''))
    return company_list

In [56]:
# Looking at days posted/sponsored. Will specify sponsored as np.nan
# Time is organized by: hours, days, and more than 30 days. 30+ days will be regarded as 30 days
time_list = []
for row in soup.find_all('div', {'id':True}):
    for header in row.find_all('div', {'class':'result-link-bar'}):
        try:
            time =  header.find('span', {'class':'date'}).text
            print time
            if 'days' in time:
                days = float(time.split()[0])
                time_list.append(round(days,2))
            elif 'hours' in time:
                hours = float(time.split()[0])/24.
                time_list.append(round(hours,2))
            else:
                time_list.append(30.0)
        except:
            print 'Sponsored'
            time_list.append(np.nan)
            
def time_func(soup, time_list=[]):
    for row in soup.find_all('div', {'id':True}):
        for header in row.find_all('div', {'class':'result-link-bar'}):
            try:
                time =  header.find('span', {'class':'date'}).text
                if 'days' in time:
                    days = float(time.split()[0])
                    time_list.append(days)
                elif 'hours' in time:
                    hours = float(time.split()[0])/24.
                    time_list.append(hours)
                else:
                    time_list.append(30.0)
            except:
                time_list.append(np.nan)
                
    return time_list

Sponsored
Sponsored
Sponsored
1 day ago
3 days ago
1 day ago
2 days ago
2 days ago
9 hours ago
1 day ago
5 days ago
1 day ago
1 day ago
Sponsored
Sponsored


In [57]:
# # Make dataframe of information
# # Lists are:
# '''job_list, location_list, salary_list, company_list, time_list, rating_list, summary_list
# rating_list is a tuple with two pieces of info: rating out of 5 and number of ratings'''

# # Split rating_list into 2 parts: star_list and review_list
# star_list = [i[0] for i in rating_list]
# review_list = [i[1] for i in rating_list]

In [58]:
# Get summary of job posting
summary_list = []
for row in soup.find_all('div', {'id':True}):
    for header in row.find_all('span', {'class':'summary'}):
        summary_list.append(header.text.replace('\n',''))

def sum_func(soup, summary_list=[]):
    for row in soup.find_all('div', {'id':True}):
        for header in row.find_all('span', {'class':'summary'}):
            summary_list.append(header.text.replace('\n',''))
    return summary_list

In [59]:
# Ratings and number of reviews
rating_list = []
for row in soup.find_all('div', {'id':True, 'data-jk':True}):
    try:
        # Get number of ratings
        ratings = row.find('span', {'class':'slNoUnderline'}).text
        
        # Get actual star count out of 5
        # width is a string with format 'width:##.#px'. Trying to pull out ##.# as a float
        width = row.find('span', {'style':True})['style']
        pixels = re.search('\d{2}.\d',width)
        stars = round(float(pixels.group())/60. * 5, 2)
        rating_list.append((stars, ratings))
        
    except:
        rating_list.append((np.nan, np.nan))

def rat_func(soup, rating_list=[], star_list=[], review_list=[]):
    for row in soup.find_all('div', {'id':True, 'data-jk':True}):
        try:
            # Get number of ratings
            ratings = row.find('span', {'class':'slNoUnderline'}).text

            # Get actual star count out of 5
            # width is a string with format 'width:##.#px'. Trying to pull out ##.# as a float
            width = row.find('span', {'style':True})['style']
            pixels = re.search('\d{2}.\d',width)
            stars = round(float(pixels.group())/60. * 5, 2)
            rating_list.append((stars, ratings))

        except:
            rating_list.append((np.nan, np.nan))
            
    star_list = [i[0] for i in rating_list]
    review_list = [i[1] for i in rating_list]
            
    return star_list, review_list

In [60]:
# df = pd.DataFrame(zip(job_func(soup), 
#                       loc_func(soup), 
#                       sal_func(soup), 
#                       comp_func(soup), 
#                       time_func(soup), 
#                       rat_func(soup)[0], 
#                       rat_func(soup)[1],
#                       sum_func(soup)), 
#                   columns=['Title', 
#                            'Location', 
#                            'Salary',
#                            'Company', 
#                            'Post_date', 
#                            'Rating', 
#                            'Reviews',
#                            'Summary'])
# df.head()

In [61]:
# Got some information, but df is not clean at all. Let's work easiest to hardest. 
# Reviews is first

def cleaner(row):
    try:
        # Replace commas with nothing
        new_row = row.replace(',', '')

        # Get of rid of word 'reviews'
        new_row = float(new_row.split()[0])
    except:
        new_row = np.nan    
    
    return new_row

In [62]:
# Make a cleaner for salary. Taking average of range of salary for computation purposes
def sal_cleaner(row):
    try:
        # Replace commas with nothing
        new_row = row.replace(',', '')

        # Get list of salary split by -
        split_list = new_row.split('a year')[0]
        dol_replace = split_list.replace('$', '')
        dol_split = dol_replace.split('-')
        
        if len(dol_split) > 1:
            new_row = 0.5 * (float(dol_split[0]) + float(dol_split[1]))
        else:
            new_row = float(dol_split[0])
    except:
        new_row = np.nan    
    
    return new_row

In [63]:
# We are going to eventually run this above code multiple times. So better to put in a function
def indeed_maker(soup):
    df = pd.DataFrame(zip(job_func(soup), 
                      loc_func(soup), 
                      sal_func(soup), 
                      comp_func(soup), 
                      time_func(soup), 
                      rat_func(soup)[0], 
                      rat_func(soup)[1],
                      sum_func(soup)), 
                  columns=['Title', 
                           'Location', 
                           'Salary',
                           'Company', 
                           'Post_date', 
                           'Rating', 
                           'Reviews',
                           'Summary'])
    
    df.Reviews = df.Reviews.map(cleaner)
    df.Salary = df.Salary.map(sal_cleaner)
    
    return df

### Time to run. Tried to run a while loop, but Indeed cuts off at page 100.
Even if there are more than 100 page worths, Indeed cuts off.
As a result, will have to search through several cities instead of searching without locations.

In [64]:
list_of_cities = ['Atlanta', 'Los+Angeles', 'Seattle', 'Portland', 'San+Francisco', 
                  'Washington%2C+DC', 'Boston', 'Austin', 'Dallas', 'Houston',
                  'Orlando', 'Philadelphia', 'Pittsburgh', 'Cincinnati', 'Miami',
                  'New+York+City', 'New+Jersey', 'Kansas+City', 'San Diego',
                 'San+Jose', 'Salt+Lake+City', 'Raleigh', 'Minneapolis', 'Oklahoma+City',
                 'St.+Louis', 'Detroit', 'Colombus', 'Cleveland', 'Indianapolis', 'Baltimore']

In [65]:
# Looking for two titles
for role in ['data+scientist', 'data+analyst']:
    # Looking at a list of cities
    for city in list_of_cities:
        # Looking through all pages Indeed will allow you to look through
        for page in np.linspace(0, 950, 20):
            # Page indicator
            start = str(int(page))

            # Web initializer
            url = 'https://www.indeed.com/jobs?q=' + role + '&l=' + city + '&limit=50&start=' + start
            html = urllib.urlopen(url).read()
            soup = BeautifulSoup(html, 'lxml')

            # Function that scrapes all relevant data and creates a dataframe        
            ind_df = indeed_maker(soup)
            
            # Break for loop if no next link is available
#             rows = soup.find_all('span', {'class':'np'})
#             if len(rows) > 1:
#                 pass
#             elif 'Previous' in rows[0].text:
#                 break

In [66]:
# Getting rid of row duplicates (sometimes reviews and rating will be different)
ind_df.drop_duplicates(subset=['Title', 'Location', 'Salary', 'Company', 'Summary'], 
                       inplace=True)
ind_df.describe()

Unnamed: 0,Salary,Post_date,Rating,Reviews
count,1871.0,20417.0,29441.0,29441.0
mean,86023.212453,13.945187,3.904609,1591.476648
std,38038.561864,9.07421,0.497423,5608.328746
min,10000.0,0.083333,1.25,2.0
25%,59889.75,5.0,3.55,27.0
50%,77500.0,12.0,3.7,156.0
75%,102500.0,22.0,4.35,768.0
max,275000.0,30.0,5.0,98543.0


In [67]:
ind_df.shape

(39020, 8)

In [68]:
ind_df.describe()

Unnamed: 0,Salary,Post_date,Rating,Reviews
count,1871.0,20417.0,29441.0,29441.0
mean,86023.212453,13.945187,3.904609,1591.476648
std,38038.561864,9.07421,0.497423,5608.328746
min,10000.0,0.083333,1.25,2.0
25%,59889.75,5.0,3.55,27.0
50%,77500.0,12.0,3.7,156.0
75%,102500.0,22.0,4.35,768.0
max,275000.0,30.0,5.0,98543.0


In [69]:
# Saving dataframe as a csv so that I don't have to run code again.
ind_df.to_csv('Indeed_0618', encoding='utf-8')

In [46]:
path = 'Indeed_0616'
df = pd.read_csv(path)

IOError: File Indeed_0616 does not exist

In [None]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [None]:
dropped.describe()