In [235]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup

In [236]:
def get_url(vaga, local):
    """Gera uma url da vaga e local"""
    template = 'https://br.indeed.com/jobs?q={}&l={}'
    url = template.format(vaga, local)
    return url

In [237]:
url = get_url('python', 'Florianópolis SC')

## Extrair HTML

In [238]:
response = requests.get(url)

In [239]:
response

<Response [200]>

In [240]:
response.reason

''

In [241]:
soup = BeautifulSoup(response.text, 'html.parser')

In [242]:
cards = soup.findAll('div', class_="job_seen_beacon")

In [243]:
len(cards)

15

## Prototipar modelo com um único registro

In [244]:
card = cards[0]

In [245]:
atag = card.find('h2').find('span')

In [246]:
job_title = atag.get('title')

In [247]:
company = card.find('span', 'companyName').text

In [248]:
location_list = card.find('div', 'companyLocation').text.replace('•', ' ').split()

In [249]:
job_location = str(location_list[0] + ' ' + location_list[1])

In [250]:
job_summary = card.find('div', "job-snippet").text.strip()

In [251]:
post_date = card.find('span', 'date').text

In [252]:
today = datetime.today().strftime('%d-%m-%Y')

In [253]:
try:
    job_salary = card.find('div', 'salary-snippet').text.strip()
except AttributeError:
    job_salary = ''

## Generalizando o modelo em uma função

In [205]:
def get_record(card):
    """Extract job data from a single record"""
    atag = card.find('h2').find('span')
    job_title = atag.get('title')
    company = card.find('span', 'companyName').text
    location_list = card.find('div', 'companyLocation').text.replace('•', ' ').split()
    job_location = str(location_list[0] + ' ' + location_list[1])
    job_summary = card.find('div', "job-snippet").text.strip()
    post_date = card.find('span', 'date').text.strip()
    today = datetime.today().strftime('%d-%m-%Y')
    try:
        job_salary = card.find('div', 'salary-snippet').text.strip()
    except AttributeError:
        job_salary = ''
        
    record = (job_title, company, job_location, job_summary, post_date, today, job_salary)
    
    return record

In [230]:
records = []

for card in cards:
    record = get_record(card)
    records.append(record)

In [208]:
records[8]

('Estágio Desenvolvimento de Software',
 'GnTech',
 'Florianópolis, SC',
 'Desenvolvimento de software utilizando Python, Django, MongoDB, PostgreSQL, AWS, Azure, API Rest, entre outros.',
 'há 30+ dias',
 '16-11-2021',
 '')

## Acessando as próximas páginas

In [231]:
while True:
    try:
        url = 'https://br.indeed.com' + soup.find('a', {"aria-label": "Próxima"}).get('href')
    except AttributeError:
        break
    
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    cards = soup.findAll('div', class_="job_seen_beacon")
    
    for card in cards:
        record = get_record(card)
        records.append(record)

AttributeError: 'NoneType' object has no attribute 'text'

## Agregando tudo

In [260]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup

def get_url(vaga, local):
    """Gera uma url da vaga e local"""
    template = 'https://br.indeed.com/jobs?q={}&l={}'
    url = template.format(vaga, local)
    return url

def get_record(card):
    """Extract job data from a single record"""
    atag = card.find('h2').find('span')
    try:
        job_title = atag.get('title')
    except AttributeError:
        job_title = ''    
    try:
        company = card.find('span', 'companyName').text
    except AttributeError:
        company = ''
    try:
        location_list = card.find('div', 'companyLocation').text.replace('•', ' ').split()
    except AttributeError:
        location_list = ''
    try:
        job_location = str(location_list[0] + ' ' + location_list[1])
    except AttributeError:
        job_location = ''
    try:
        job_summary = card.find('div', "job-snippet").text.strip()
    except AttributeError:
        job_summary = ''
    try:
        post_date = card.find('span', 'date').text.strip()
    except AttributeError:
        post_date = ''
    try:
        today = datetime.today().strftime('%d-%m-%Y')
    except AttributeError:
        today = ''
    try:
        job_salary = card.find('div', 'salary-snippet').text.strip()
    except AttributeError:
        job_salary = ''
        
    record = (job_title, company, job_location, job_summary, post_date, today, job_salary)
    
    return record

def main(vaga, local):
    """Run the mains program routine"""
    records = []
    url = get_url(vaga, local)
    
    while True:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.findAll('div', class_="job_seen_beacon")
    
        for card in cards:
            record = get_record(card)
            records.append(record)
            
        try:
            url = 'https://br.indeed.com' + soup.find('a', {"aria-label": "Próxima"}).get('href')
        except AttributeError:
            break
            
    # Save the data
    with open('resultados.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Vaga', 'Empresa', 'Local', 'DataAnuncio', 'DataColeta', 'SumarioVaga', 'Salario'])
        writer.writerows(records)

In [261]:
# run the main program
main('python', 'Florianópolis, SC')