In [1]:
import csv
from datetime import datetime
from urllib.parse import quote, quote_plus, urljoin

import requests
from bs4 import BeautifulSoup


def get_url(position, location):
    """Build the Indeed India search URL for a job query.

    Args:
        position: Free-text job title / keywords, placed in the ``q`` parameter.
        location: Free-text location, placed in the ``l`` parameter.

    Returns:
        The search URL as a string.
    """
    template = 'https://in.indeed.com/jobs?q={}&l={}'
    # Percent-encode every reserved character, not only spaces, so inputs such
    # as 'C++' or 'R&D' cannot break or alter the query string. Spaces still
    # become '%20' in the position and '+' in the location, as before.
    position = quote(position, safe='')
    location = quote_plus(location)
    return template.format(position, location)


def get_record(card):
    """Extract job data from a single job card.

    Args:
        card: A parsed job-card element exposing ``find(name, attrs)``;
            ``main`` passes each ``div.job_seen_beacon`` element.

    Returns:
        A tuple ``(job_title, company, job_location, post_date, today,
        summary, job_salary)``. ``today`` is the extraction date formatted as
        ``YYYY-MM-DD``. Any field whose element is absent from the card is
        returned as ``''`` instead of raising, so one malformed card cannot
        abort the whole scrape.
    """

    def text_of(*find_args):
        """Return the text of the first matching element, or '' if none."""
        element = card.find(*find_args)
        return element.text if element is not None else ''

    job_title = text_of('h2', {'class': 'jobTitle'})
    job_location = text_of('div', {'class': 'companyLocation'})
    post_date = text_of('span', 'date')
    today = datetime.today().strftime('%Y-%m-%d')
    summary = text_of('li')

    # Salary is not present on every card.
    job_salary = text_of('div', {'class': 'metadata salary-snippet-container'})

    # The company name is taken from the first link in the card when there is
    # one; some cards carry it in a plain span instead.
    company_link = card.find('a')
    if company_link is not None:
        company = company_link.text
    else:
        company = text_of('span', {'class': 'companyName'})

    return (job_title, company, job_location, post_date, today, summary, job_salary)


def main(position, location, output_path='resultsindeed.csv', timeout=30):
    """Scrape all result pages for a job search and save them as CSV.

    Args:
        position: Job title / keywords to search for.
        location: Location to search in.
        output_path: Where the CSV is written. Defaults to
            ``'resultsindeed.csv'`` in the current working directory.
        timeout: Seconds to wait on each HTTP request before ``requests``
            raises, so a stalled connection cannot hang the notebook.

    The CSV has one header row followed by one row per job card, in the
    column order produced by ``get_record``.
    """
    base_url = 'https://in.indeed.com'
    records = []
    url = get_url(position, location)
    visited = set()

    # Follow the "Next" link page by page. Stop when there is no further page
    # or when the link leads back to a page already fetched, which would
    # otherwise loop forever.
    while url and url not in visited:
        visited.add(url)
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        for card in soup.find_all('div', 'job_seen_beacon'):
            records.append(get_record(card))

        next_link = soup.find('a', {'aria-label': 'Next'})
        href = next_link.get('href') if next_link is not None else None
        # urljoin resolves both site-relative and absolute hrefs; a missing
        # href ends pagination instead of failing on str + None.
        url = urljoin(base_url, href) if href else None

    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate', 'ExtractDate', 'Summary', 'Salary'])
        writer.writerows(records)


In [2]:
# Run the scraper for this query; main() walks the result pages and writes the CSV.
main(position='data scientist', location='india')

In [3]:
import pandas as pd

In [4]:
# Load the file main() writes, relative to the working directory, rather than
# a machine-specific absolute path that breaks on any other computer.
df = pd.read_csv('resultsindeed.csv')

In [5]:
# Preview the first five scraped rows.
df.head(n=5)

Unnamed: 0,JobTitle,Company,Location,PostDate,ExtractDate,Summary,Salary
0,Product: Data Scientist,Indeed,"Hyderabad, Telangana",Posted30+ days ago,2022-01-25,Proactively research data and identify opportu...,"₹34,20,000 - ₹51,40,000 a year"
1,Data Scientist (BE/ B-tech 4-6 yrs),CommerceIQ,"Bengaluru, Karnataka•Temporarily Remote",Posted30+ days ago,2022-01-25,Work on data science roadmap and build the cor...,"₹25,00,000 - ₹35,00,000 a year"
2,Senior Data Scientist,Indeed,"Hyderabad, Telangana",Posted30+ days ago,2022-01-25,Proactively research data and identify opportu...,"₹40,00,000 - ₹60,00,000 a year"
3,newAssociate Data Scientist,Shell,"Bengaluru, Karnataka+1 location",PostedToday,2022-01-25,"Gathers data, analyses and reports findings.",
4,Data Scientist-CAI,HDFC Bank,"Mumbai, Maharashtra",Posted30+ days ago,2022-01-25,"Hands-on experience in data wrangling (ETL), f...",


In [6]:
# (rows, columns) of the loaded frame.
df.shape

(1005, 7)

In [18]:
# Count missing values per column.
df.isna().sum()

JobTitle         0
Company          0
Location         0
PostDate         0
ExtractDate      0
Summary          0
Salary         909
dtype: int64