# Test File

###### Create a test file with Jupyter and try to scrape Seek.co.nz
###### Uses https://www.youtube.com/watch?v=eN_3d4JrL_w as a starting point

## Import and setup

In [None]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup

In [None]:
def get_url(position, location):
    """Return the Seek search URL for a job position within a location.

    Both values are inserted into the URL path unchanged, so they should
    already be in hyphenated form (e.g. 'data-scientist', 'New-Zealand').
    """
    return f'https://www.seek.co.nz/{position}-jobs/in-All-{location}'


In [None]:
# Build the search URL for data-scientist listings across New Zealand
url = get_url('data-scientist','New-Zealand')

## Extract raw html

In [None]:
# Download the search results page. Always pass a timeout: without one,
# requests can wait indefinitely on a stalled connection.
response = requests.get(url, timeout=10)

In [None]:
# Display the textual reason of the HTTP status (e.g. 'OK') to check the request worked
response.reason

In [None]:
# Parse the downloaded HTML with Python's built-in 'html.parser' backend
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# Collect every <article> element; each one is treated below as a single job card
cards = soup.find_all('article')

In [None]:
# Number of <article> elements found on this page
len(cards)

In [None]:
# Count the job containers inside the results wrapper div.
# find() returns None when the wrapper is absent, so guard before calling
# select() instead of failing with AttributeError.
section = soup.find('div', {'class':'_3MPUOLE'})
job_divs = section.select('div[data-search-sol-meta]') if section is not None else []
print(len(job_divs))

In [None]:
# find_all() takes a tag name, not a CSS selector, so passing
# 'div[data-search-sol-meta]' to it matches nothing. select() is the method
# that understands CSS attribute selectors.
jobs = soup.select('div[data-search-sol-meta]')

## Prototype the model with a single record

In [None]:
# Use the first card as the sample record to prototype the extraction against
card = cards[0]

In [None]:
# The link nested inside the card's <h1>; the title and URL are read from it below
atag = card.h1.a

In [None]:
# Text of the title link (.string is None when the tag has more than one child)
job_title = atag.string

In [None]:
# The href is appended to the site root — presumably it is site-relative; verify against the page
job_url = 'https://www.seek.co.nz' + atag.get('href')

In [None]:
# Company name from the link inside the span with this exact class string.
# The filter is spelled as an explicit 'class' entry: the original passed a
# one-element set, which only worked because Beautiful Soup treats any
# non-dict attrs value as a class filter.
company = card.find('span', {'class': "_3FrNV7v _3PZrylH E6m4BZb"}).a.text.strip()

In [None]:
# Display the whole text of the same span (not only its link) for comparison.
# The filter is an explicit 'class' entry rather than a bare set literal.
card.find('span', {'class': "_3FrNV7v _3PZrylH E6m4BZb"}).text.strip()

In [None]:
# Location text from the link inside the 'xxz8a1h' div
# NOTE(review): the class name looks build-generated — presumably unstable; confirm
location = card.find('div',{'class':'xxz8a1h'}).a.text

In [None]:
# Salary text; raises AttributeError when the card has no 'lwHBT6d' span (find returns None)
salary = card.find('span',{'class':'lwHBT6d'}).text

In [None]:
# NOTE(review): the class filter here is a dict ({'Eadjc1o': 'location'}), which looks
# like a typo for a plain class name — confirm what this lookup was meant to match
card.find_all('span',{'class':{'Eadjc1o' : 'location'}})

In [None]:
# Classification text, located by its data-automation attribute instead of a CSS class
jobcategory = card.find(attrs={"data-automation": "jobClassification"}).text

## Prototype the model with a single record V2

In [None]:
# Start again from the first card; this version looks fields up by data-automation attribute
card = cards[0]

In [None]:
# Job title text from the element marked data-automation="jobTitle"
job_title = card.find(attrs={"data-automation": "jobTitle"}).text

In [None]:
# The same jobTitle element also carries the link; its href is appended to the site root
job_url = 'https://www.seek.co.nz' + card.find(attrs={"data-automation": "jobTitle"}).get('href')

In [1009]:
# company is treated as optional: find() returns None when the card has no
# jobCompany element, and reading .text on None raises AttributeError
try:
    company = card.find(attrs={"data-automation": "jobCompany"}).text
except AttributeError:
    company = ''
    

In [1010]:
# Location text; unlike company there is no fallback, so a missing element raises AttributeError
location = card.find(attrs={"data-automation": "jobLocation"}).text

In [None]:
# salary is treated as optional: fall back to an empty string when the card
# has no jobSalary element (find returns None, so .text raises AttributeError)
try:
    job_salary = card.find(attrs={"data-automation": "jobSalary"}).text
except AttributeError:
    job_salary = ''

In [None]:
# Text of the element marked data-automation="jobClassification"
jobcategory = card.find(attrs={"data-automation": "jobClassification"}).text

In [None]:
# Text of the element marked data-automation="jobSubClassification"
jobsubcategory = card.find(attrs={"data-automation": "jobSubClassification"}).text

In [None]:
# Text of the element marked data-automation="jobShortDescription"
jobshortdescription = card.find(attrs={"data-automation": "jobShortDescription"}).text

In [None]:
# Combine the text of every <li> in the card into one string, separated by ' - '.
# str.join replaces the first-item flag loop and yields '' when there are no items.
bullet_points = ' - '.join(li.text for li in card.select('li'))

## Generalise the model with a function

In [None]:
def get_record(card):
    """Extract job data from a single record.

    Args:
        card: Parsed element for one job listing. It must provide
            ``find(attrs=...)`` and ``select(selector)``.

    Returns:
        A tuple of (job_title, company, location, jobshortdescription,
        bullet_points, job_salary, jobcategory, jobsubcategory, job_url).
        company and job_salary are '' when the card does not contain them.

    Raises:
        AttributeError: If the title, location, classification,
            sub-classification or short description element is missing.
    """
    # Work on the card that was passed in. The earlier version re-assigned
    # `card = cards[0]` here, so every call returned the first card's data.
    title_tag = card.find(attrs={"data-automation": "jobTitle"})
    job_title = title_tag.text
    job_url = 'https://www.seek.co.nz' + title_tag.get('href')

    # Company and salary are optional: find() returns None when absent.
    company_tag = card.find(attrs={"data-automation": "jobCompany"})
    company = company_tag.text if company_tag is not None else ''

    location = card.find(attrs={"data-automation": "jobLocation"}).text

    salary_tag = card.find(attrs={"data-automation": "jobSalary"})
    job_salary = salary_tag.text if salary_tag is not None else ''

    jobcategory = card.find(attrs={"data-automation": "jobClassification"}).text
    jobsubcategory = card.find(attrs={"data-automation": "jobSubClassification"}).text
    jobshortdescription = card.find(attrs={"data-automation": "jobShortDescription"}).text

    # All <li> texts in the card, joined into one ' - ' separated string.
    bullet_points = ' - '.join(li.text for li in card.select('li'))

    record = (job_title, company, location, jobshortdescription, bullet_points,
              job_salary, jobcategory, jobsubcategory, job_url)

    return record

In [None]:
# Extract one record per card on the current page
records = [get_record(card) for card in cards]

In [None]:
# Spot-check the first extracted record
print(records[0])

## Getting the next page

In [None]:
# Follow the "next page" link until there is none, appending each page's
# jobs to the records list that was started from the first page.
while True:
    next_link = soup.find(attrs={"data-automation": "page-next"})
    next_href = next_link.get('href') if next_link is not None else None
    if not next_href:
        # No next-page element (or it has no href): this was the last page.
        break

    url = 'https://www.seek.co.nz' + next_href
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    cards = soup.find_all('article')

    # The loop body must be indented; the earlier version left these two
    # lines at the same level as the `for`, which is an IndentationError.
    for card in cards:
        record = get_record(card)
        records.append(record)

In [None]:
# Total number of records collected across all pages
print(len(records))

## Putting it all together

In [1013]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import time

def get_url(position, location):
    """Build the Seek search URL for a position/location pair.

    The two values are placed into the URL path as given.
    """
    site_root = 'https://www.seek.co.nz'
    search_path = '{}-jobs/in-All-{}'.format(position, location)
    return site_root + '/' + search_path


def get_record(card):
    """Turn one job card into a flat tuple of its fields.

    The returned tuple is ordered as (title, company, location, short
    description, bullet points, salary, category, sub-category, url).
    Company and salary fall back to '' when the card lacks them; any other
    missing element raises AttributeError.
    """
    def tag_for(name):
        # Look an element up by its data-automation attribute.
        return card.find(attrs={"data-automation": name})

    def text_or_blank(name):
        # Text of an optional element, or '' when it is absent.
        try:
            return tag_for(name).text
        except AttributeError:
            return ''

    title_link = tag_for("jobTitle")
    job_title = title_link.text
    job_url = 'https://www.seek.co.nz' + title_link.get('href')

    company = text_or_blank("jobCompany")
    location = tag_for("jobLocation").text
    job_salary = text_or_blank("jobSalary")

    jobcategory = tag_for("jobClassification").text
    jobsubcategory = tag_for("jobSubClassification").text
    jobshortdescription = tag_for("jobShortDescription").text

    # Every <li> text in the card, joined with ' - '.
    bullet_points = ' - '.join(item.text for item in card.select('li'))

    return (
        job_title,
        company,
        location,
        jobshortdescription,
        bullet_points,
        job_salary,
        jobcategory,
        jobsubcategory,
        job_url,
    )

def main(position, location, output_path='results.csv', page_delay=1):
    """Scrape every result page for a search and save the jobs to a CSV file.

    Args:
        position: Job title slug passed to get_url (e.g. 'operations-analyst').
        location: Location slug passed to get_url (e.g. 'Auckland').
        output_path: Where the CSV is written. Defaults to 'results.csv'.
        page_delay: Seconds to wait between page requests. Defaults to 1.

    Raises:
        requests.HTTPError: If a results page comes back with an error status.
    """
    records = []
    url = get_url(position, location)
    print(url)

    # extract the job data, one results page at a time
    while True:
        response = requests.get(url, timeout=30)
        # Fail loudly on an error status; otherwise a blocked request would
        # parse to zero cards and silently produce an empty CSV.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        records.extend(get_record(card) for card in soup.find_all('article'))

        next_link = soup.find(attrs={"data-automation": "page-next"})
        next_href = next_link.get('href') if next_link is not None else None
        if not next_href:
            print('no more pages, saving data')
            break

        url = 'https://www.seek.co.nz' + next_href
        print(url)
        # Pause between page requests. Cards are parsed from HTML already
        # downloaded, so sleeping per card only slowed the run down.
        time.sleep(page_delay)

    # save the job data
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'ShortDesc', 'BulletPoints','Salary','Cat','SubCat','URL'])
        writer.writerows(records)
    print('saved and done')

In [1014]:
# run the main program: scrape operations-analyst jobs in Auckland and save them to CSV
main('operations-analyst','Auckland')

https://www.seek.co.nz/operations-analyst-jobs/in-All-Auckland
https://www.seek.co.nz/operations-analyst-jobs/in-All-Auckland?page=2
https://www.seek.co.nz/operations-analyst-jobs/in-All-Auckland?page=3
https://www.seek.co.nz/operations-analyst-jobs/in-All-Auckland?page=4
no more pages, saving data
saved and done
