In [16]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm

In [17]:
URL = "https://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l=New+York&start=10"

In [18]:
r = requests.get(URL)

soup = BeautifulSoup(r.text, 'html.parser')

jobs = soup.find_all(name='div', attrs={'class':"result"})

In [27]:
def extract_company_from_result(html):
    try:
        return html.find("span",{"class":"company"}).text
    except:
        return None

In [28]:
def extract_salary_from_result(html):
    try:
        return html.find("span",{"class":"no-wrap"}).text
    except:
        return None

In [29]:
def extract_location_from_result(html):   
    try:
        return html.find("span",{'class':'location'}).text
    except:
        return None

In [30]:
def extract_job_from_result(html):
    try:
        return html.find("a").text
    except:
        return None

In [31]:
def extract_date_from_result(html):
    try:
        return html.find("span", {"class":"date"}).text
    except:
        return None

In [117]:
url_template = "https://www.indeed.com/jobs?q=data+scientist&1={}&start=10"
max_results_per_city = 2000
results = []

for city in ['New York','Chicago','Boston', 'San Francisco Bay Area', 'Seattle']:
    for start in tqdm(range(0, max_results_per_city, 10)):
        r1 = requests.get(url_template.format(city,start))

        soup = BeautifulSoup(r1.text, 'html.parser')

        jobs = soup.find_all(name='div', attrs={'class':"result"}) # Grab the results from the request (as above)
        results.extend(jobs) # Append to the full set of results

100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [02:01<00:00,  1.65it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [01:56<00:00,  1.71it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [02:04<00:00,  1.61it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [01:54<00:00,  1.74it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [01:59<00:00,  1.68it/s]


In [169]:
df = pd.DataFrame({"job_title":extract_job_from_result(result),
                   "company":extract_company_from_result(result),
                   "location":extract_location_from_result(result),
                   "salary":extract_salary_from_result(result),
                   "posted": extract_date_from_result(result),
                    "date": pd.to_datetime('today')} for result in results)

In [170]:
df.replace(to_replace="\n",value="",regex=True,inplace=True)
df.head(50)

Unnamed: 0,job_title,company,location,salary,posted,date
0,,Triplebyte,,"$150,000 - $225,000 a year",17 days ago,2020-04-20 21:54:52.594403
1,Sr. Data Scientist/Machine Learning Engineer,aikiu,,"$70,000 - $110,000 a year",30+ days ago,2020-04-20 21:54:52.595401
2,Data Scientist,Global Fishing Watch,,$45 - $65 an hour,17 days ago,2020-04-20 21:54:52.596409
3,Senior Data Scientist,TRANZACT,,"$90,000 - $125,000 a year",30+ days ago,2020-04-20 21:54:52.598393
4,Data Scientist,Seen by Indeed,,,24 days ago,2020-04-20 21:54:52.598393
5,Data Scientist,Northrop Grumman,,,30+ days ago,2020-04-20 21:54:52.599390
6,Machine Learning Engineer,Triplebyte,,"$150,000 - $250,000 a year",17 days ago,2020-04-20 21:54:52.600387
7,Lead Data Scientist,CPNET,"Harrisburg, PA","$85,000 - $95,000 a year",17 days ago,2020-04-20 21:54:52.600387
8,Data Scientist/Engineer (Open to remote Workers),Medidata Solutions,"New York, NY 10003 (Flatiron District area)",,Just posted,2020-04-20 21:54:52.601385
9,Data Scientist (all levels),IAMUS Consulting,"Linthicum Heights, MD 21090",,3 days ago,2020-04-20 21:54:52.601385


In [171]:
df[['posted','etc','etc.']] = df.posted.apply(lambda x: pd.Series(str(x).split(" ")))

In [172]:
df[['posted','1']] = df.posted.apply(lambda x: pd.Series(str(x).split("+")))

In [173]:
df = df.drop(['etc','etc.','1'], axis=1)
df

Unnamed: 0,job_title,company,location,salary,posted,date
0,,Triplebyte,,"$150,000 - $225,000 a year",17,2020-04-20 21:54:52.594403
1,Sr. Data Scientist/Machine Learning Engineer,aikiu,,"$70,000 - $110,000 a year",30,2020-04-20 21:54:52.595401
2,Data Scientist,Global Fishing Watch,,$45 - $65 an hour,17,2020-04-20 21:54:52.596409
3,Senior Data Scientist,TRANZACT,,"$90,000 - $125,000 a year",30,2020-04-20 21:54:52.598393
4,Data Scientist,Seen by Indeed,,,24,2020-04-20 21:54:52.598393
...,...,...,...,...,...,...
18995,Data Scientist,"Big Data Federation, Inc.","Santa Clara, CA 95050","$100,000 - $120,000 a year",18,2020-04-20 21:55:07.418923
18996,Freelance Data Scientist for an Award-Winning ...,Mixed In Key,"Miami, FL",,8,2020-04-20 21:55:07.419958
18997,"Scientist III, Quality Control - Data Review",Amneal Pharmaceuticals,"Piscataway, NJ 08854",,30,2020-04-20 21:55:07.420886
18998,Sr. Data Scientist/Machine Learning Engineer,aikiu,,"$70,000 - $110,000 a year",30,2020-04-20 21:55:07.422883


In [174]:
df['posted'].replace(to_replace ="Just", value =0)

0        17
1        30
2        17
3        30
4        24
         ..
18995    18
18996     8
18997    30
18998    30
18999    30
Name: posted, Length: 19000, dtype: object

In [175]:
df['date'] = df['date'].dt.date

In [176]:
df = df.dropna()

In [177]:
df

Unnamed: 0,job_title,company,location,salary,posted,date
7,Lead Data Scientist,CPNET,"Harrisburg, PA","$85,000 - $95,000 a year",17,2020-04-20
14,Data Scientist,"Big Data Federation, Inc.","Santa Clara, CA 95050","$100,000 - $120,000 a year",18,2020-04-20
23,Lead Data Scientist,CPNET,"Harrisburg, PA","$85,000 - $95,000 a year",17,2020-04-20
30,Data Scientist,Getronics,"Atlanta, GA",$50 - $55 an hour,20,2020-04-20
42,Lead Data Scientist,CPNET,"Harrisburg, PA","$85,000 - $95,000 a year",17,2020-04-20
...,...,...,...,...,...,...
18975,Data Scientist I - Remote,HMS Health Management Systems,"Irving, TX",$50 an hour,18,2020-04-20
18976,Data Scientist,"Big Data Federation, Inc.","Santa Clara, CA 95050","$100,000 - $120,000 a year",18,2020-04-20
18988,Lead Data Scientist,CPNET,"Harrisburg, PA","$85,000 - $95,000 a year",17,2020-04-20
18994,Data Scientist I - Remote,HMS Health Management Systems,"Irving, TX",$50 an hour,18,2020-04-20


In [178]:
df.to_csv('indeed_cleaned')

In [179]:
csv = pd.read_csv('indeed_cleaned', encoding='utf-8')

In [180]:
html = csv.to_html()

In [181]:
print(html)

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Unnamed: 0</th>
      <th>job_title</th>
      <th>company</th>
      <th>location</th>
      <th>salary</th>
      <th>posted</th>
      <th>date</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>7</td>
      <td>Lead Data Scientist</td>
      <td>CPNET</td>
      <td>Harrisburg, PA</td>
      <td>$85,000 - $95,000 a year</td>
      <td>17</td>
      <td>2020-04-20</td>
    </tr>
    <tr>
      <th>1</th>
      <td>14</td>
      <td>Data Scientist</td>
      <td>Big Data Federation, Inc.</td>
      <td>Santa Clara, CA 95050</td>
      <td>$100,000 - $120,000 a year</td>
      <td>18</td>
      <td>2020-04-20</td>
    </tr>
    <tr>
      <th>2</th>
      <td>23</td>
      <td>Lead Data Scientist</td>
      <td>CPNET</td>
      <td>Harrisburg, PA</td>
      <td>$85,000 - $95,000 a year</td>
      <td>17</td>
      <td>2020-04-20</td>
    </tr>
    <tr>
    

</table>
