In [14]:
import requests
import bs4
from bs4 import BeautifulSoup

import re
import pandas as pd
import numpy as np

from IPython.core.display import HTML, Image

In [15]:
def get_soup_from_url(url):
    '''Returns Beautiful Soup for a given webpage'''
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,'lxml')
    return soup

def extract_location_from_result(result):
    loca = result.find('span', class_ = 'location').text
    location = loca.split(',')[0]
    return str(location)

def extract_company_from_result(result):
    company = result.find('span', class_ = 'company').text.encode('ascii','ignore').strip()
    return str(company)

def extract_jobtitle_from_result(result):
    jobtitle = result.find('a', class_ = 'jobtitle')
    if jobtitle != None:
        jobtitle = result.find('a',class_ = 'jobtitle').text.strip().encode('ascii','ignore')
    else:
        jobtitle = result.find('a').text.encode('ascii','ignore')
    return str(jobtitle)

def extract_salary_from_result(result):
    salary = result.find('nobr')
    if salary != None:
        salary = result.find('nobr').text
    else:
        salary = np.nan
    return str(salary)

def DollarDrop(x):
    y = x.split('$')
    z = y[1].replace(',','')
    return z

def SalarySplitter(x):
    a = x.strip().split(' ')
    tenor = a[-1]
    if tenor == 'year':
        if a[1] == '-':
            salary = (int(DollarDrop(a[0])) + int(DollarDrop(a[2])))/2
        else:
            salary = int(DollarDrop(a[0]))
    else:
        salary = np.nan
    return salary

In [16]:
url_template = "http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l={city}&start={start}"
max_results_per_city = 300

results = []

for city in set(['New+York', 'Chicago', 'Los+Angeles', 'Seattle', 'San+Francisco', 'Baltimore', 'Atlanta', 'Boston', 
                 'Dallas', 'Raleigh', 'Philadelphia']):
    URL1 = (url_template.replace('{city}',city))
    for start in range(0, max_results_per_city, 10):
        URL2 = URL1.replace('{start}',str(start))
        # Grab the results from the request (as above)
        # Append to the full set of results
        soup = get_soup_from_url(URL2)
        for element in soup.findAll('div',class_ = 'result'):
            miniResult=[]
            miniResult.append(extract_jobtitle_from_result(element))
            miniResult.append(extract_company_from_result(element))
            miniResult.append(extract_location_from_result(element))
            cityTag = city.replace('+',' ')
            miniResult.append(cityTag)
            miniResult.append(extract_salary_from_result(element))
            results.append(miniResult)

In [17]:
df = pd.DataFrame(results, columns  = ['JobTitle', 'CompanyName', 'City', 'CityGrouping', 'Salary'])

In [18]:
df['Salary'] = df['Salary'].apply(lambda x: SalarySplitter(x))

In [19]:
df.dropna(inplace = True)

In [26]:
df['CityGrouping'].value_counts()

Baltimore        33
New York         33
Chicago          28
Atlanta          21
Boston           19
San Francisco    18
Raleigh          12
Philadelphia     12
Dallas           12
Seattle          11
Los Angeles      11
Name: CityGrouping, dtype: int64

In [21]:
df.to_csv('SalaryInfo.csv')

In [22]:
workdf = pd.read_csv('SalaryInfo.csv')

In [31]:
SalaryMedian = workdf['Salary'].median()
SalaryMedian

106734.5

In [29]:
workdf['HighSalary']=[x>SalaryMedian for x in workdf['Salary']]

In [30]:
workdf.head()

Unnamed: 0.1,Unnamed: 0,JobTitle,CompanyName,City,CityGrouping,Salary,HighSalary
0,5,Statistician,Etech Hi Inc.,Chicago,Chicago,71000.0,False
1,18,Data Scientist,Workbridge Associates,Chicago,Chicago,95000.0,False
2,22,Machine Learning Data Scientist,All-In Analytics,Chicago,Chicago,132500.0,True
3,25,Mid-Level Data Scientist,Workbridge Associates,Chicago,Chicago,105000.0,False
4,47,Senior Data Scientist,Workbridge Associates,Chicago,Chicago,150000.0,True
