# Data Extraction - [indeed.com](https://za.indeed.com/jobs?q=python&l=Johannesburg,%20Gauteng&start=0)

Hi there, my name is Duane Muller, and this notebook is a demonstration of what I can do  
in data extraction using Python.  
This example is from https://za.indeed.com  

In this demo I am extracting a list of jobs that are posted on this site,  
filtered by a search term ("what") and a location ("where").

I am extracting the following:  
- Job title
- Job url
- Company name
- Job location
- Job summary
- Posted date
- Extracted date
- Job salary (when listed)

If this is a project that you would like me to help you with,   
then send me an invite on Upwork and we can discuss this matter further. 

## Required modules

In [None]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup


## Extract Raw html

In [None]:
# Search URL pattern: the first {} is filled with the "what" search term and
# the second {} with the "where" location. get_url() below defines its own
# identical local copy, so this module-level name is for reference only.
template = 'https://za.indeed.com/jobs?q={}&l={}'

def get_url(what, where):
    '''Generate a search url from the "What" and "Where".

    what  -- search term, e.g. 'Python'
    where -- location, e.g. 'Johannesburg Gauteng'

    Both values are URL-encoded before being placed in the query string, so
    spaces and reserved characters such as '#' or '&' (think 'C#' or 'R&D')
    stay part of the search value instead of cutting the query short.

    Returns the complete url as a string.
    '''
    # Imported locally so this function does not depend on the import cell.
    from urllib.parse import quote_plus

    template = 'https://za.indeed.com/jobs?q={}&l={}'
    return template.format(quote_plus(what), quote_plus(where))

url = get_url('Python', 'Johannesburg Gauteng')

# Download the raw html of the first results page. requests waits
# indefinitely by default, so a timeout keeps the notebook from hanging
# if the site never answers.
response = requests.get(url, timeout=30).text

# Parse the page and collect every <div> with the class "cardOutline";
# the cells below treat each one as a single job card.
soup = BeautifulSoup(response, 'html.parser')
cards = soup.find_all('div', 'cardOutline')

## Prototype of a single record

In [None]:
# Prototype: pull every field out of the first card only.
card = cards[0]

# The title and the link both hang off the anchor inside the card's <h2>.
atag = card.h2.a
title_tag = atag.span
job_title = title_tag.get('title')
job_url = 'https://za.indeed.com' + atag.get('href')

company_name = card.find('span', 'companyName').text
job_location = card.find('div', 'companyLocation').text

# Put the snippet on a single line by dropping its newlines.
job_summary = card.find('div', 'job-snippet').text
job_summary = job_summary.replace('\n', '')

# Tidy the date label: 'EmployerActive' becomes 'Active' and 'Posted'
# gets a trailing space.
post_date = card.find('span', 'date').text
post_date = post_date.replace('EmployerActive', 'Active')
post_date = post_date.replace('Posted', 'Posted ')

# Extraction date as YYYY-MM-DD.
today = format(datetime.today(), '%Y-%m-%d')

# The salary element is optional; find() returns None when it is missing,
# which surfaces as an AttributeError on .text.
try:
    job_salary = card.find('div', class_="metadata salary-snippet-container").text
except AttributeError:
    job_salary = ''
else:
    job_salary = job_salary.strip().replace('\xa0', '')

## Function that returns a record object

In [None]:
def get_record(card):
    '''Extract job data from a single card.

    card -- the parsed element of one job posting (a "cardOutline" div)

    Returns a tuple in this order:
    (job_title, company_name, job_location, post_date, today,
     job_summary, job_salary, job_url)
    where today is the extraction date as YYYY-MM-DD and job_salary is ''
    when the card shows no salary.

    Raises AttributeError if the card lacks any of the other elements.
    '''
    # Bind the anchor from *this* card. Without a local binding the function
    # would read a module-level atag and give every record the same url.
    atag = card.h2.a
    job_title = atag.span.get('title')
    job_url = 'https://za.indeed.com' + atag.get('href')

    company_name = card.find('span', 'companyName').text
    job_location = card.find('div', 'companyLocation').text

    # Put the snippet on a single line.
    job_summary = card.find('div', 'job-snippet').text.replace('\n', '')

    # Tidy the date label: 'EmployerActive' becomes 'Active' and 'Posted'
    # gets a trailing space.
    post_date = card.find('span', 'date').text
    post_date = post_date.replace('EmployerActive', 'Active').replace('Posted', 'Posted ')

    today = datetime.today().strftime('%Y-%m-%d')

    # The salary element is optional: find() returns None when it is absent.
    salary_tag = card.find('div', class_="metadata salary-snippet-container")
    if salary_tag is None:
        job_salary = ''
    else:
        job_salary = salary_tag.text.strip().replace('\xa0', '')

    record = (job_title, company_name, job_location, post_date, today, job_summary, job_salary, job_url)
    return record
# Run the extractor over every card found on the first results page.
records = []
for card in cards:
    records.append(get_record(card))

# Show the first extracted record as a quick sanity check.
records[0]

## Main function to extract all records

In [3]:
def main(what, where):
    '''Scrape every results page for a search and save the jobs to results.csv.

    what  -- search term, passed to get_url
    where -- location, passed to get_url

    Starting from the first results page, extracts a record from each job
    card and follows the "Next" link until a page has none. The records are
    then written, under a header row, to results.csv in the current working
    directory (an existing file is overwritten).

    Returns None. Errors from the request or from get_record propagate, but
    the records gathered up to that point are still written to the file.
    '''
    records = []
    url = get_url(what, where)
    try:
        while url is not None:
            # requests waits indefinitely by default; the timeout keeps a
            # stalled connection from hanging the whole run.
            response = requests.get(url, timeout=30).text
            soup = BeautifulSoup(response, 'html.parser')
            records.extend(get_record(card) for card in soup.find_all('div', 'cardOutline'))

            # find() returns None on the last page, which ends the loop.
            next_link = soup.find('a', {'aria-label': 'Next'})
            url = None if next_link is None else 'https://za.indeed.com' + next_link.get('href')
    finally:
        # Write once, after pagination has finished, so the final page is
        # included and a single-page search still produces a file.
        with open('results.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['JobTitle','Company','Location','PostDate','ExtractDate','Summary','Salary','JobUrl'])
            writer.writerows(records)

# Run the full extraction for Python jobs in Johannesburg, Gauteng;
# main() saves its output to results.csv.
main('Python', 'Johannesburg Gauteng')