In [2]:
import json
import csv
from tqdm.notebook import tqdm
import os
import pandas as pd
import time
import config
from datetime import date
# pip install careerjet-api-client
# go to __init__.py careerjet-api-client, correct the Except, e: and the from urlparse -> to -> from urllib.parse
from careerjet_api_client import CareerjetAPIClient

# Mandatory Search Params
- `affid` : Affiliate ID provided by Careerjet. Requires to open a Careerjet partner account http://www.careerjet.co.uk/partners.
- `user_ip` : IP address of the end-user to whom the search results will be displayed.
- `user_agent` : User agent of the end-user's browser.
- `url` : URL of page that will display the search results
## Search Params
Please note that each parameter is optional.

`keywords`: Keywords to match the title, content or company name of a job offer

`location`: Location of requested jobs

`sort`: Sort type. This can be: relevance (default) — sorted by decreasing relevancy, date — sorted by decreasing date and salary — sorted by decreasing salary.

`start_num`: Position of returned job postings within the entire result space. Should be >= 1 and <= Number of hits.

`pagesize`: Number of jobs returned in one call.

`page`: Page number of returned jobs within the entire result space. Should be >=1. If this value is set, it overrides start_num.

`contracttype`: Selected contract type.p — permanent job, c — contract, t — temporary, i — training, v — voluntary, none — all contract types.

`contractperiod`: Selected contract period. f — full time, p — part time, none — all contract periods.

## Locale code
The locale code needs to be supplied in the contructor of the API client. It defines the default location as well as the language in which the search results are returned. Each locale corresponds to a Careerjet site.

The default is 'en_GB'.

In [3]:
cj  =  CareerjetAPIClient("pt_PT")

result_json = cj.search({
                        'affid'       :  config.affid,
                        'user_ip'     : '11.22.33.44',
                        'url'         : 'http://www.example.com/jobsearch?sort=date&l=Portugal',
                        'user_agent'  : 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0',
                        'location'    : 'Portugal',
                        'sort'        : 'date', 
                        'pagesize'    : 99,
                        'page'        : 2,
                        'contracttype': None,
                      })

In [18]:
jobs = result_json['jobs']
jobs[0]['date'][:-13]

'Tue, 12 Oct 2021'

In [None]:
jobs = result_json['jobs']
n_hits = result_json['hits']
n_pages = result_json['pages']

page_size = 99
for page in range(1, n_pages+1):

    # Get json
    result_json = cj.search({
                        'affid'       :  config.affid,
                        'user_ip'     : '11.22.33.44',
                        'url'         : 'http://www.example.com/jobsearch?sort=date&l=Portugal',
                        'user_agent'  : 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0',
                        'location'    : 'Portugal',
                        'sort'        : 'date', 
                        'pagesize'    : page_size,
                        'page'        : page,
                        'contracttype': None,
                      })

    # Get jobs
    jobs += result_json['jobs']

    left_hits = n_hits - page_size
    if left_hits < 99:
        page_size = left_hits
    print(f"{left_hits} hits left...")

# Save Data
if os.path.exists('career_jet_jobs.json') == False:
    with open('career_jet_jobs.json', 'w', encoding='utf-8') as json_file:
        json.dump(jobs, json_file, indent=0, ensure_ascii=False)
    json_file.close()
else:
    with open('career_jet_jobs.json', 'a+', encoding='utf-8') as json_file:
        json.dump(jobs, json_file, indent=0, ensure_ascii=False)
    json_file.close()

# Define Functions

In [64]:
def save_data_to_json(file_name, data):
    # Save Data
    if os.path.exists(file_name+'.json') == False:
        with open(file_name+'.json', 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, indent=0, ensure_ascii=False)
        json_file.close()
    else:
        with open(file_name+'.json', 'a+', encoding='utf-8') as json_file:
            json.dump(data, json_file, indent=0, ensure_ascii=False)
        json_file.close()

In [65]:
def pandas_json_to_csv(file_name):
    """Convert json to csv using pandas, needs to be a structured json [{},{},...]
    :parameter: file_name -> 'example' do not add the '.json' """
    # Json to csv
    df = pd.read_json(file_name+'.json')
    df.to_csv(file_name+'.csv')

In [78]:
def main(file_name):
    
    page_size = 99
    # start with arbitrarily large number
    n_pages = 10000
    stop = False
    cj  =  CareerjetAPIClient("pt_PT")

    result_json = cj.search({
                            'affid'       :  config.affid,
                            'user_ip'     : '11.22.33.44',
                            'url'         : 'http://www.example.com/jobsearch?sort=date&l=Portugal',
                            'user_agent'  : 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0',
                            'location'    : 'Portugal',
                            'sort'        : 'date', 
                            'pagesize'    : 99,
                            'page'        : 1,
                            'contracttype': None,
                        })
    jobs = result_json['jobs']

    try:
        salary = str(jobs['salary_min']) + ' - ' + str(jobs['salary_max'])
    except:
        salary = jobs['salary']

    job_offers = [
        {
            'job_title': jobs['title'],
            'job_description': jobs['description'],
            'post_date': jobs['date'][:-13],
            'scrape_date': date.today().strftime("%d/%m/%Y"),
            'company': jobs['company'],
            'job_location': jobs['locations'],
            'job_category': '',
            'job_href': jobs['url'],
            'salary': salary
        }
    ]
    # Get number of pages
    n_pages = result_json['pages']

    for page in tqdm(range(2, n_pages+1)):
        # Wait time 
        time.sleep(0.5)

        # Get json
        result_json = cj.search({
                            'affid'       :  config.affid,
                            'user_ip'     : '11.22.33.44',
                            'url'         : 'http://www.example.com/jobsearch?sort=date&l=Portugal',
                            'user_agent'  : 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0',
                            'location'    : 'Portugal',
                            'sort'        : 'date', 
                            'pagesize'    : page_size,
                            'page'        : page,
                            'contracttype': None,
                        })
        # Get jobs
        jobs += result_json['jobs']

        try:
            salary = str(jobs['salary_min']) + ' - ' + str(jobs['salary_max'])
        except:
            salary = jobs['salary']

        job_offers += [
            {
                'job_title': jobs['title'],
                'job_description': jobs['description'],
                'post_date': jobs['date'][:-13],
                'scrape_date': date.today().strftime("%d/%m/%Y"),
                'company': jobs['company'],
                'job_location': jobs['locations'],
                'job_category': '',
                'job_href': jobs['url'],
                'salary': salary
            }
        ]

    # Save data to json
    save_data_to_json(file_name, jobs)
    # Convert data to csv
    pandas_json_to_csv(file_name)        

# Run Main

In [79]:
main("career_jet_jobs")

  0%|          | 0/386 [00:00<?, ?it/s]

In [81]:
df = pd.read_csv('career_jet_jobs.csv')

In [85]:
df.head()

Unnamed: 0.1,Unnamed: 0,locations,site,date,url,title,description,company,salary,salary_min,salary_type,salary_currency_code,salary_max
0,0,Lisboa,,2021-09-23 07:59:59+00:00,http://jobviewtrack.com/pt-pt/job-124d416c4912...,"Voice, email and chat Tech Advisor based in Li...",German Tech Advisor in Lisbon with double bo...,Lisbon Job Offers,,,,,
1,1,Lisboa,,2021-09-23 07:59:41+00:00,http://jobviewtrack.com/pt-pt/job-1b4a41684303...,DevSecOps Engineer,"The Company Hitachi Vantara, a wholly-owned ...",Hitachi,,,,,
2,2,Aveiro,,2021-09-23 07:59:27+00:00,http://jobviewtrack.com/pt-pt/job-194a41625d01...,Operadores de Produção / Operadores de máquina...,Descrição do emprego: Operadores de Produção...,Flexiplan,,,,,
3,3,Castelo Branco,,2021-09-23 07:59:19+00:00,http://jobviewtrack.com/pt-pt/job-1c4f41604210...,Motorista de pesados,Descrição do emprego: Anúncio de emprego: ...,STBB SA,,,,,
4,4,"Santa Marta de Portuzelo, Viana do Castelo",,2021-09-23 07:59:15+00:00,http://jobviewtrack.com/pt-pt/job-1f1c41625d01...,Operador de Máquinas de Costura,Condições Requeridas Habilitações Escolares ...,,&euro;665 per month,665.0,M,EUR,665.0
