# Test File

###### Create a test file with Jupyter and try to scrape Seek.co.nz
###### Used https://www.youtube.com/watch?v=eN_3d4JrL_w as a base
###### https://github.com/chrisgnorris/seeknz-scraper

## Import and setup

In [None]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
def get_url(position, location):
    """Build the Seek NZ search URL for a position slug and a location slug."""
    return 'https://www.seek.co.nz/{}-jobs/in-All-{}'.format(position, location)


In [None]:
url = get_url('data-scientist','New-Zealand')

## Extract raw html

In [None]:
response = requests.get(url)

In [None]:
response.reason

In [None]:
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
cards = soup.find_all('article')

In [None]:
len(cards)

In [None]:
# Count the job containers inside the results section.
# soup.find() returns None when no div has this class, so guard before
# calling .select() on the result.
section = soup.find('div', {'class':'_3MPUOLE'})
x = len(section.select('div[data-search-sol-meta]')) if section is not None else 0
print(x)

In [None]:
# find_all() treats a string argument as a tag name, so a CSS selector never
# matches there; select() is the CSS-selector API.
jobs = soup.select('div[data-search-sol-meta]')

## Prototype the model with a single record

In [None]:
card = cards[0]

In [None]:
atag = card.h1.a

In [None]:
job_title = atag.string

In [None]:
job_url = 'https://www.seek.co.nz' + atag.get('href')

In [None]:
company = card.find('span',{"_3FrNV7v _3PZrylH E6m4BZb"}).a.text.strip()

In [None]:
card.find('span',{"_3FrNV7v _3PZrylH E6m4BZb"}).text.strip()

In [None]:
location = card.find('div',{'class':'xxz8a1h'}).a.text

In [None]:
salary = card.find('span',{'class':'lwHBT6d'}).text

In [None]:
card.find_all('span',{'class':{'Eadjc1o' : 'location'}})

In [None]:
jobcategory = card.find(attrs={"data-automation": "jobClassification"}).text

## Prototype the model with a single record V2

In [None]:
card = cards[1]

In [None]:
job_title = card.find(attrs={"data-automation": "jobTitle"}).text

In [None]:
job_url = 'https://www.seek.co.nz' + card.find(attrs={"data-automation": "jobTitle"}).get('href')

In [None]:
try:
    company = card.find(attrs={"data-automation": "jobCompany"}).text
except AttributeError:
    company = ''
    

In [None]:
location = card.find(attrs={"data-automation": "jobLocation"}).text

In [None]:
try:
    job_salary = card.find(attrs={"data-automation": "jobSalary"}).text
except AttributeError:
    job_salary = ''

In [None]:
job_category = card.find(attrs={"data-automation": "jobClassification"}).text

In [None]:
job_subcategory = card.find(attrs={"data-automation": "jobSubClassification"}).text

In [None]:
job_short_description = card.find(attrs={"data-automation": "jobShortDescription"}).text

In [None]:
try:
    job_listing_date = card.find(attrs={"data-automation": "jobListingDate"}).text
except AttributeError:
    job_listing_date = 'Featured'

In [None]:
date_mined = datetime.today().strftime('%Y-%m-%d')

In [None]:
job_id = card.get('data-job-id')

In [None]:
# Join the text of every <li> on the card with ' - ' (empty string when there are none).
bullet_points = ' - '.join(li.text for li in card.select('li'))

## Generalise the model with a function

In [None]:
def get_record(card):
    """Extract job data from a single search-result card.

    Args:
        card: BeautifulSoup tag for one job listing; fields are located by
            their ``data-automation`` attribute.

    Returns:
        A tuple ``(job_id, job_title, company, location,
        job_short_description, bullet_points, job_salary, job_category,
        job_subcategory, job_url)``. ``company`` and ``job_salary`` are ``''``
        when the card has no such element.

    Raises:
        AttributeError: if the card lacks a title, location, classification,
            sub-classification or short-description element.
    """
    # Look the title element up once; it supplies both the text and the link.
    title_tag = card.find(attrs={"data-automation": "jobTitle"})
    job_title = title_tag.text
    job_url = 'https://www.seek.co.nz' + title_tag.get('href')

    # Company is optional on a card (find() returns None -> AttributeError).
    try:
        company = card.find(attrs={"data-automation": "jobCompany"}).text
    except AttributeError:
        company = ''

    location = card.find(attrs={"data-automation": "jobLocation"}).text

    # Salary is optional as well.
    try:
        job_salary = card.find(attrs={"data-automation": "jobSalary"}).text
    except AttributeError:
        job_salary = ''

    job_category = card.find(attrs={"data-automation": "jobClassification"}).text
    job_subcategory = card.find(attrs={"data-automation": "jobSubClassification"}).text
    job_short_description = card.find(attrs={"data-automation": "jobShortDescription"}).text
    job_id = card.get('data-job-id')

    # Text of every <li> on the card, separated by ' - '.
    bullet_points = ' - '.join(li.text for li in card.select('li'))

    return (job_id, job_title, company, location, job_short_description,
            bullet_points, job_salary, job_category, job_subcategory, job_url)

In [None]:
records = []

for card in cards:
    record = get_record(card)
    records.append(record)

In [None]:
print(records[0])

## Getting the next page

In [None]:
# Follow the "next page" link until there isn't one, collecting the records
# of every page into `records`.
while True:
    try:
        url = 'https://www.seek.co.nz' + soup.find(attrs={"data-automation": "page-next"}).get('href')
    except AttributeError:
        # soup.find() returned None: there is no next-page link, so stop.
        break

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    cards = soup.find_all('article')

    for card in cards:
        record = get_record(card)
        records.append(record)

In [None]:
print(len(records))

## Saving the results to a CSV file and loading it as a DataFrame to skip previously mined jobs

In [None]:
# Open the existing results file as the lookup "database".
try:
    df = pd.read_csv('results.csv')
except FileNotFoundError:
    print('No existing file found')
    # Bind an empty frame with the results.csv columns so the cells below
    # that read `df` do not fail with NameError.
    df = pd.DataFrame(columns=['ID', 'JobTitle', 'Company', 'Location', 'DateListed', 'DateMined',
                               'ShortDesc', 'BulletPoints', 'Salary', 'Cat', 'SubCat', 'URL'])


In [None]:
df

In [None]:
#https://stackoverflow.com/questions/60675117/returning-a-string-from-loc-query
#https://stackoverflow.com/questions/56260348/selecting-single-value-in-a-pandas-dataframe
lookup = df.loc[df['ID'] == '1111', 'ID'].values

In [None]:
lookup

In [None]:
if lookup.size > 0:
    print('Found')
else:
    print('Not found')

In [None]:
#Put together
def lookup_ID(jobid):
    """Report whether `jobid` appears in the ID column of results.csv.

    Returns 'Found' or 'Not found', or 'No existing csv file found' when
    results.csv does not exist.
    """
    try:
        existing = pd.read_csv('results.csv')
    except FileNotFoundError:
        return 'No existing csv file found'

    matches = existing.loc[existing['ID'] == jobid, 'ID'].values
    return 'Found' if matches.size > 0 else 'Not found'
        



In [None]:
lookup_ID(51160410)

## Putting it all together

In [504]:
import csv
import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

# `global` tells Python that a name used inside a function refers to the module-level variable.
# Using globals inside functions can cause issues: the problems I am having come from functions sharing global state.
# https://www.w3schools.com/python/python_variables_global.asp

def lookup_id(jobid):
    """Check the module-level DataFrame `df` for a row whose ID equals `jobid`.

    Prints the id and the outcome, then returns 'Found' or 'Not found'.
    """
    global df

    hits = df.loc[df['ID'] == jobid, 'ID'].values
    print(jobid)

    searchResult = 'Found' if hits.size > 0 else 'Not found'
    print(searchResult)
    return searchResult


def get_url(position, location):
    """Return the Seek NZ search URL for the given position and location slugs."""
    return f'https://www.seek.co.nz/{position}-jobs/in-All-{location}'


def _field_text(card, name, default=''):
    """Return the text of the card element with data-automation=`name`, or `default` if absent."""
    tag = card.find(attrs={"data-automation": name})
    return tag.text if tag is not None else default


def get_record(card):
    """Extract job data from a single card, skipping jobs already in results.csv.

    Reads the module-level flag `searchResult2`: unless it equals
    'No existing csv file found', the job id is first looked up with
    `lookup_id` and only ids reported as 'Not found' are mined.

    Args:
        card: BeautifulSoup tag for one job listing; fields are located by
            their ``data-automation`` attribute.

    Returns:
        A tuple ``(job_id, job_title, company, location, job_listing_date,
        date_mined, job_short_description, bullet_points, job_salary,
        job_category, job_subcategory, job_url)``, or ``None`` when the job is
        already stored or the card has no ``data-job-id`` attribute. Optional
        fields that are missing become ``''`` (``'Featured'`` for the listing
        date).

    Raises:
        AttributeError: if the card has no job-title element.
    """
    global searchResult2

    raw_id = card.get('data-job-id')
    if raw_id is None:
        # int(None) would raise TypeError; without an id the job can be
        # neither looked up nor de-duplicated, so skip the card.
        return None
    job_id = int(raw_id)

    searchResult = ''
    no_csv = searchResult2 == 'No existing csv file found'

    if not no_csv:  # There IS a CSV file: run the database lookup.
        searchResult = lookup_id(job_id)
        print('There is a CSV File')
        print('Lookup result was : ' + str(searchResult))

    # Mine the card only when there is no match or no CSV file at all.
    if not (no_csv or searchResult == 'Not found'):
        return None

    # The title element supplies both the text and the link; look it up once.
    title_tag = card.find(attrs={"data-automation": "jobTitle"})
    job_title = title_tag.text
    job_url = 'https://www.seek.co.nz' + title_tag.get('href')

    company = _field_text(card, "jobCompany")
    location = _field_text(card, "jobLocation")
    job_salary = _field_text(card, "jobSalary")
    job_category = _field_text(card, "jobClassification")
    job_subcategory = _field_text(card, "jobSubClassification")
    job_short_description = _field_text(card, "jobShortDescription")
    # Cards without a listing-date element are recorded as 'Featured'.
    job_listing_date = _field_text(card, "jobListingDate", 'Featured')

    date_mined = datetime.today().strftime('%Y-%m-%d')

    # Text of every <li> on the card, separated by ' - '.
    bullet_points = ' - '.join(li.text for li in card.select('li'))

    record = (job_id,job_title,company,location,job_listing_date,date_mined,job_short_description,
              bullet_points,job_salary,job_category,job_subcategory,job_url)
    print(searchResult)
    print('added')
    return record
    


def main(position,location):
    """Run the main program routine.

    Loads results.csv (if present) into the module-level `df`, walks every
    result page for the search, collects the records returned by
    `get_record`, and writes them to results.csv: a new file with a header
    row when none existed, otherwise appended to the existing file.

    Args:
        position: Position slug used in the search URL, e.g. 'operations-analyst'.
        location: Location slug used in the search URL, e.g. 'Auckland'.

    Raises:
        requests.HTTPError: if a result page responds with an error status.
        requests.Timeout: if a result page does not respond within 30 seconds.
    """
    # get_record() and lookup_id() read these two module-level names.
    global searchResult2
    global df
    searchResult2 = ''

    # Load the CSV as the lookup dataframe.
    try:
        df = pd.read_csv('results.csv')
    except FileNotFoundError:
        searchResult2 = 'No existing csv file found'
        print(searchResult2)

    records = []
    url = get_url(position, location)
    print(url)

    # Extract the job data page by page.
    while True:
        # A timeout keeps a stalled connection from hanging the run, and
        # raise_for_status() stops us parsing an error page as if it were results.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        for card in soup.find_all('article'):
            # NOTE(review): this sleeps once per card although no request is
            # made per card; presumably meant as request throttling - confirm.
            time.sleep(1)
            record = get_record(card)
            if record is not None:
                records.append(record)

        next_link = soup.find(attrs={"data-automation": "page-next"})
        next_href = next_link.get('href') if next_link is not None else None
        if not next_href:
            # No next-page link, or a link without an href: last page reached.
            print('no more pages, saving data')
            print(records)
            break
        url = 'https://www.seek.co.nz' + next_href
        print(url)

    # Save the job data; create the file with a header row if it did not exist.
    creating = searchResult2 == 'No existing csv file found'
    with open('results.csv', 'w' if creating else 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if creating:
            writer.writerow(['ID','JobTitle', 'Company', 'Location', 'DateListed', 'DateMined', 'ShortDesc', 'BulletPoints',
                             'Salary','Cat','SubCat','URL'])
        writer.writerows(records)
        print('saved and done')

In [505]:
# run the main program
main('operations-analyst','Auckland')

https://www.seek.co.nz/operations-analyst-jobs/in-All-Auckland
51160410
Found
There is a CSV File
Lookup result was : Found
51083770
Found
There is a CSV File
Lookup result was : Found
51131332
Found
There is a CSV File
Lookup result was : Found
51148039
Found
There is a CSV File
Lookup result was : Found
51148040
Found
There is a CSV File
Lookup result was : Found
51152044
Found
There is a CSV File
Lookup result was : Found
51094802
Found
There is a CSV File
Lookup result was : Found
51184763
Found
There is a CSV File
Lookup result was : Found
51187685
Found
There is a CSV File
Lookup result was : Found
51189018
Found
There is a CSV File
Lookup result was : Found
51185313
Found
There is a CSV File
Lookup result was : Found
51198266
Found
There is a CSV File
Lookup result was : Found
51130906
Found
There is a CSV File
Lookup result was : Found
51084001
Found
There is a CSV File
Lookup result was : Found
51146430
Found
There is a CSV File
Lookup result was : Found
51072950
Found
There i