In [1]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm

In [2]:
URL = "https://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l=New+York&start=10"

In [3]:
r = requests.get(URL)

soup = BeautifulSoup(r.text, 'html.parser')

jobs = soup.find_all(name='div', attrs={'class':"result"})

In [4]:
def extract_company_from_result(html):
    try:
        return html.find("span",{"class":"company"}).text
    except:
        return None

In [5]:
def extract_salary_from_result(html):
    try:
        return html.find("span",{"class":"no-wrap"}).text
    except:
        return None

In [6]:
def extract_location_from_result(html):   
    try:
        return html.find("span",{'class':'location'}).text
    except:
        return None

In [7]:
def extract_job_from_result(html):
    try:
        return html.find("a").text
    except:
        return None

In [8]:
def extract_date_from_result(html):   
    try:
        return html.find("span",{'class':'date'}).text
    except:
        return None

In [9]:
url_template = "https://www.indeed.com/jobs?q=data+scientist&1={}&start=10"
max_results_per_city = 2000
results = []

for city in ['New York','New Jersey']:
    for start in tqdm(range(0, max_results_per_city, 10)):
        r1 = requests.get(url_template.format(city,start))

        soup = BeautifulSoup(r1.text, 'html.parser')

        jobs = soup.find_all(name='div', attrs={'class':"result"}) # Grab the results from the request (as above)
        results.extend(jobs) # Append to the full set of results

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [01:24<00:00,  2.36it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [01:27<00:00,  2.28it/s]


In [11]:
df = pd.DataFrame({"job_title":extract_job_from_result(result),
                   "company":extract_company_from_result(result),
                   "location":extract_location_from_result(result),
                   "salary":extract_salary_from_result(result),
                   "posted":extract_date_from_result(result), 
                   "date_extracted": pd.to_datetime('today')} for result in results)

In [12]:
df.replace(to_replace="\n",value="",regex=True,inplace=True)
df.head(50)

Unnamed: 0,job_title,company,location,salary,posted,date_extracted
0,,Triplebyte,,"$150,000 - $225,000 a year",16 days ago,2020-04-19 17:33:47.677877
1,Sr. Data Scientist/Machine Learning Engineer,aikiu,,"$70,000 - $110,000 a year",30+ days ago,2020-04-19 17:33:47.678875
2,Data Scientist,Seen by Indeed,,,23 days ago,2020-04-19 17:33:47.679872
3,Senior Data Scientist (Remote-friendly),Noom Inc.,,,30+ days ago,2020-04-19 17:33:47.680870
4,Data Scientist,Global Fishing Watch,,$45 - $65 an hour,16 days ago,2020-04-19 17:33:47.680870
5,Data Scientist,ClearOne Advantage,,"$70,000 - $80,000 a year",27 days ago,2020-04-19 17:33:47.681867
6,Data Scientist,Northrop Grumman,,,30+ days ago,2020-04-19 17:33:47.682864
7,Data Scientist,Andor Health,"Orlando, FL 32811 (Eagles Nest area)",,Today,2020-04-19 17:33:47.682864
8,Lead Data Scientist,CPNET,"Harrisburg, PA","$85,000 - $95,000 a year",16 days ago,2020-04-19 17:33:47.683862
9,Data Scientist/Analytics Consultant,Deloitte,"Arlington, VA 22209 (North Rosslyn area)",,5 days ago,2020-04-19 17:33:47.684858


In [13]:
df.to_csv("indeed_jobs.csv")

In [30]:
df_raw = df
df_raw['days posted'] = ''
df_raw

Unnamed: 0,job_title,company,location,salary,posted,date_extracted,days posted
0,,Triplebyte,,"$150,000 - $225,000 a year",16 days ago,2020-04-19 17:33:47.677877,
1,Sr. Data Scientist/Machine Learning Engineer,aikiu,,"$70,000 - $110,000 a year",30+ days ago,2020-04-19 17:33:47.678875,
2,Data Scientist,Seen by Indeed,,,23 days ago,2020-04-19 17:33:47.679872,
3,Senior Data Scientist (Remote-friendly),Noom Inc.,,,30+ days ago,2020-04-19 17:33:47.680870,
4,Data Scientist,Global Fishing Watch,,$45 - $65 an hour,16 days ago,2020-04-19 17:33:47.680870,
...,...,...,...,...,...,...,...
7595,Junior Data Scientist | TS/SCI,Connect Talent Solutions,"Springfield, MA",,30+ days ago,2020-04-19 17:33:51.965810,
7596,Data Scientist,West Creek Financial,"Washington, DC",,1 day ago,2020-04-19 17:33:51.966808,
7597,Data Scientist,Source Enterprises,"New York, NY",,30+ days ago,2020-04-19 17:33:51.966808,
7598,Data Scientist,Big Zeta,,"$100,000 a year",30+ days ago,2020-04-19 17:33:51.967808,


In [None]:
df[df['posted'].str.contains('day ago')] 