In [5]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm

In [2]:
URL = "https://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l=New+York&start=10"

In [3]:
r = requests.get(URL)

soup = BeautifulSoup(r.text, 'html.parser')

jobs = soup.find_all(name='div', attrs={'class':"result"})

In [4]:
def extract_company_from_result(html):
    try:
        return html.find("span",{"class":"company"}).text
    except:
        return None

In [5]:
def extract_salary_from_result(html):
    try:
        return html.find("span",{"class":"no-wrap"}).text
    except:
        return None

In [6]:
def extract_location_from_result(html):   
    try:
        return html.find("span",{'class':'location'}).text
    except:
        return None

In [7]:
def extract_job_from_result(html):
    try:
        return html.find("a").text
    except:
        return None

In [8]:
def extract_date_from_result(html):
    try:
        return html.find("span", {"class":"date"}).text
    except:
        return None

In [9]:
url_template = "https://www.indeed.com/jobs?q=data+scientist&1={}&start=10"
max_results_per_city = 2000
results = []

for city in ['New York City','Chicago','Boston', 'San Francisco Bay Area', 'Seattle']:
    for start in tqdm(range(0, max_results_per_city, 10)):
        r1 = requests.get(url_template.format(city,start))

        soup = BeautifulSoup(r1.text, 'html.parser')

        jobs = soup.find_all(name='div', attrs={'class':"result"}) # Grab the results from the request (as above)
        results.extend(jobs) # Append to the full set of results

100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [01:53<00:00,  1.76it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [02:18<00:00,  1.44it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [02:03<00:00,  1.62it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [02:14<00:00,  1.49it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [01:57<00:00,  1.70it/s]


In [44]:
df = pd.DataFrame({"job_title":extract_job_from_result(result),
                   "company":extract_company_from_result(result),
                   "location":extract_location_from_result(result),
                   "salary":extract_salary_from_result(result),
                   "posted": extract_date_from_result(result),
                    "date": pd.to_datetime('today')} for result in results)

In [45]:
df.replace(to_replace="\n",value="",regex=True,inplace=True)
df.head(50)

Unnamed: 0,job_title,company,location,salary,posted,date
0,,Triplebyte,,"$150,000 - $225,000 a year",17 days ago,2020-04-20 22:36:56.544951
1,Sr. Data Scientist/Machine Learning Engineer,aikiu,,"$70,000 - $110,000 a year",30+ days ago,2020-04-20 22:36:56.544951
2,Senior Data Scientist,TRANZACT,,"$90,000 - $125,000 a year",30+ days ago,2020-04-20 22:36:56.545948
3,Machine Learning Engineer,Triplebyte,,"$150,000 - $250,000 a year",17 days ago,2020-04-20 22:36:56.545948
4,Data Scientist,Seen by Indeed,,,24 days ago,2020-04-20 22:36:56.546945
5,"Sr Data Scientist, Zoro",Zoro Tools,,,13 days ago,2020-04-20 22:36:56.547942
6,Data Scientist,Northrop Grumman,,,30+ days ago,2020-04-20 22:36:56.548940
7,Biomedical Data Scientist/Bioinformatics,Aspira Labs,"Trumbull, CT 06611",,4 days ago,2020-04-20 22:36:56.549936
8,Data Scientist- Call Center,Momentum Solar,"South Plainfield, NJ",,5 days ago,2020-04-20 22:36:56.549936
9,RWD Healthcare Data Scientist,W2O,"Florham Park, NJ",,6 days ago,2020-04-20 22:36:56.550963


In [46]:
df[['posted','etc','etc.']] = df.posted.apply(lambda x: pd.Series(str(x).split(" ")))

In [47]:
df[['posted','etc..']] = df.posted.apply(lambda x: pd.Series(str(x).split("+")))

In [48]:
df = df.drop(['etc','etc.','etc..'], axis=1)
df.head(15)

Unnamed: 0,job_title,company,location,salary,posted,date
0,,Triplebyte,,"$150,000 - $225,000 a year",17,2020-04-20 22:36:56.544951
1,Sr. Data Scientist/Machine Learning Engineer,aikiu,,"$70,000 - $110,000 a year",30,2020-04-20 22:36:56.544951
2,Senior Data Scientist,TRANZACT,,"$90,000 - $125,000 a year",30,2020-04-20 22:36:56.545948
3,Machine Learning Engineer,Triplebyte,,"$150,000 - $250,000 a year",17,2020-04-20 22:36:56.545948
4,Data Scientist,Seen by Indeed,,,24,2020-04-20 22:36:56.546945
5,"Sr Data Scientist, Zoro",Zoro Tools,,,13,2020-04-20 22:36:56.547942
6,Data Scientist,Northrop Grumman,,,30,2020-04-20 22:36:56.548940
7,Biomedical Data Scientist/Bioinformatics,Aspira Labs,"Trumbull, CT 06611",,4,2020-04-20 22:36:56.549936
8,Data Scientist- Call Center,Momentum Solar,"South Plainfield, NJ",,5,2020-04-20 22:36:56.549936
9,RWD Healthcare Data Scientist,W2O,"Florham Park, NJ",,6,2020-04-20 22:36:56.550963


In [49]:
df['posted'] = df['posted'].replace(to_replace ="Just", value =0)

In [50]:
df['posted'] = df['posted'].replace(to_replace ="Today", value =0)

In [51]:
df['date'] = df['date'].dt.date

In [52]:
df = df[df.job_title != '']

In [54]:
df

Unnamed: 0,job_title,company,location,salary,posted,date
1,Sr. Data Scientist/Machine Learning Engineer,aikiu,,"$70,000 - $110,000 a year",30,2020-04-20
2,Senior Data Scientist,TRANZACT,,"$90,000 - $125,000 a year",30,2020-04-20
3,Machine Learning Engineer,Triplebyte,,"$150,000 - $250,000 a year",17,2020-04-20
4,Data Scientist,Seen by Indeed,,,24,2020-04-20
5,"Sr Data Scientist, Zoro",Zoro Tools,,,13,2020-04-20
...,...,...,...,...,...,...
18995,Senior Data Scientist,TRANZACT,,"$90,000 - $125,000 a year",30,2020-04-20
18996,Data Scientist,Northrop Grumman,,,30,2020-04-20
18997,"Sr Data Scientist, Zoro",Zoro Tools,,,13,2020-04-20
18998,Machine Learning Engineer,Triplebyte,,"$150,000 - $250,000 a year",17,2020-04-20


In [55]:
df.to_csv('indeed_cleaned.csv')

In [19]:
csv = pd.read_csv('indeed_cleaned.csv', encoding='utf-8')
csv

Unnamed: 0.1,Unnamed: 0,job_title,company,location,salary,posted,date
0,1,Sr. Data Scientist/Machine Learning Engineer,aikiu,,"$70,000 - $110,000 a year",30,2020-04-20
1,2,Senior Data Scientist,TRANZACT,,"$90,000 - $125,000 a year",30,2020-04-20
2,3,Machine Learning Engineer,Triplebyte,,"$150,000 - $250,000 a year",17,2020-04-20
3,4,Data Scientist,Seen by Indeed,,,24,2020-04-20
4,5,"Sr Data Scientist, Zoro",Zoro Tools,,,13,2020-04-20
...,...,...,...,...,...,...,...
18003,18995,Senior Data Scientist,TRANZACT,,"$90,000 - $125,000 a year",30,2020-04-20
18004,18996,Data Scientist,Northrop Grumman,,,30,2020-04-20
18005,18997,"Sr Data Scientist, Zoro",Zoro Tools,,,13,2020-04-20
18006,18998,Machine Learning Engineer,Triplebyte,,"$150,000 - $250,000 a year",17,2020-04-20


In [21]:
csv = csv.drop(['Unnamed: 0'], axis=1)
csv

Unnamed: 0,job_title,company,location,salary,posted,date
0,Sr. Data Scientist/Machine Learning Engineer,aikiu,,"$70,000 - $110,000 a year",30,2020-04-20
1,Senior Data Scientist,TRANZACT,,"$90,000 - $125,000 a year",30,2020-04-20
2,Machine Learning Engineer,Triplebyte,,"$150,000 - $250,000 a year",17,2020-04-20
3,Data Scientist,Seen by Indeed,,,24,2020-04-20
4,"Sr Data Scientist, Zoro",Zoro Tools,,,13,2020-04-20
...,...,...,...,...,...,...
18003,Senior Data Scientist,TRANZACT,,"$90,000 - $125,000 a year",30,2020-04-20
18004,Data Scientist,Northrop Grumman,,,30,2020-04-20
18005,"Sr Data Scientist, Zoro",Zoro Tools,,,13,2020-04-20
18006,Machine Learning Engineer,Triplebyte,,"$150,000 - $250,000 a year",17,2020-04-20


In [22]:
html = csv.to_html()

In [23]:
print(html)

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>job_title</th>
      <th>company</th>
      <th>location</th>
      <th>salary</th>
      <th>posted</th>
      <th>date</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>Sr. Data Scientist/Machine Learning Engineer</td>
      <td>aikiu</td>
      <td>NaN</td>
      <td>$70,000 - $110,000 a year</td>
      <td>30</td>
      <td>2020-04-20</td>
    </tr>
    <tr>
      <th>1</th>
      <td>Senior Data Scientist</td>
      <td>TRANZACT</td>
      <td>NaN</td>
      <td>$90,000 - $125,000 a year</td>
      <td>30</td>
      <td>2020-04-20</td>
    </tr>
    <tr>
      <th>2</th>
      <td>Machine Learning Engineer</td>
      <td>Triplebyte</td>
      <td>NaN</td>
      <td>$150,000 - $250,000 a year</td>
      <td>17</td>
      <td>2020-04-20</td>
    </tr>
    <tr>
      <th>3</th>
      <td>Data Scientist</td>
      <td>Seen by Indeed</td>
      <td>NaN</td>

</table>


In [24]:
import pyperclip as clip

clip.copy(f"{html}")

In [65]:
sort_by_posted = csv.sort_values('posted')

In [66]:
sort_by_posted

Unnamed: 0.1,Unnamed: 0,job_title,company,location,salary,posted,date
1906,2009,Global Big Data Scientist,Abbott Laboratories,United States,,0,2020-04-20
3114,3284,Data Scientist/Engineer (Open to remote Workers),Medidata Solutions,"New York, NY 10003 (Flatiron District area)",,0,2020-04-20
2229,2350,Data Scientist/Engineer (Open to remote Workers),Medidata Solutions,"New York, NY 10003 (Flatiron District area)",,0,2020-04-20
2230,2351,Global Big Data Scientist,Abbott Laboratories,United States,,0,2020-04-20
3112,3282,Global Big Data Scientist,Abbott Laboratories,United States,,0,2020-04-20
...,...,...,...,...,...,...,...
8730,9210,Senior Data Scientist,TRANZACT,,"$90,000 - $125,000 a year",30,2020-04-20
8731,9211,Data Scientist,Northrop Grumman,,,30,2020-04-20
8733,9213,Machine Learning On-call Intern,Perspecta Labs,,,30,2020-04-20
8570,9041,Data Science Intern,Dolphin,"New York, NY 10014 (SoHo area)",$25 an hour,30,2020-04-20


In [8]:
html.replace('\n',' ', regex=True)
html

TypeError: replace() takes no keyword arguments