# Job crawler - Gupy  

#### Load libraries

In [1]:
import os
import random
import time
import warnings
from datetime import date

import pandas as pd
import pandas_gbq
import requests
from google.cloud import bigquery
warnings. filterwarnings("ignore")

#### Load raw data

In [2]:
def normalize_columns(df):
    """Select the job columns of interest and rename them to snake_case.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw job listings. Must contain every column listed in ``selected``
        below; a missing column raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the selected columns, with the camelCase
        names replaced by their snake_case equivalents. ``df`` is not
        modified.
    """
    # Columns kept from the raw frame, in output order
    selected = [
        'term', 'date_extraction', 'name', 'description',
        'careerPageName', 'type', 'publishedDate',
        'applicationDeadline', 'isRemoteWork', 'city',
        'state', 'country', 'jobUrl', 'careerPageUrl',
    ]

    # camelCase source name -> snake_case output name
    renames = {
        'careerPageName': 'company_name',
        'publishedDate': 'published_date',
        'applicationDeadline': 'application_deadline',
        'isRemoteWork': 'is_remote_work',
        'jobUrl': 'job_url',
        'careerPageUrl': 'company_url',
    }

    # Return the renamed selection rather than renaming it in place:
    # an in-place rename on a column subset triggers pandas'
    # SettingWithCopyWarning.
    return df[selected].rename(columns=renames)

In [3]:
# Endpoint queried for job listings
url = "https://portal.api.gupy.io/api/job"

# Job-title terms to search for (each is sent as the `name` query parameter)
search_terms = ['Data', 'dados']

# Extraction date stamped on every fetched row
today = date.today()

# One frame per search term; concatenated once after the loop instead of
# growing a DataFrame on every iteration
term_frames = []

for term in search_terms:
    # NOTE(review): offset "1" is kept from the original; if the API's offset
    # is zero-based this skips the first result for each term - confirm.
    querystring = {"name": term, "offset": "1", "limit": "5000"}

    # Random 3-6 second pause before each request
    time.sleep(random.randint(3, 6))
    # A timeout keeps a stalled connection from hanging the notebook
    r = requests.get(url, params=querystring, timeout=30)
    # Stop on an HTTP error instead of trying to parse an error body
    r.raise_for_status()

    data = r.json()

    # The job records live under the top-level "data" key of the response
    df_term = pd.json_normalize(data, record_path=['data'])
    df_term['term'] = term
    df_term['date_extraction'] = today
    term_frames.append(df_term)

# Combine all terms, then select/rename the columns once on the full frame
df_jobs_raw = pd.concat(term_frames, ignore_index=True)
df_jobs_normalized = normalize_columns(df_jobs_raw)

# Work on a copy so the normalized frame stays available unchanged
df_jobs = df_jobs_normalized.copy()

display(df_jobs.head())

Unnamed: 0,term,date_extraction,name,description,company_name,type,published_date,application_deadline,is_remote_work,city,state,country,job_url,company_url
0,Data,2023-04-19,Analista Data Center JR - Virtualização (VMWare),SOBRE A EMPRESA:A Unidade de Negócio de Cloud ...,Telefónica Tech,vacancy_type_effective,2023-04-19T12:05:04.822Z,,False,Santana de Parnaíba,São Paulo,Brasil,https://ttech.gupy.io/job/eyJqb2JJZCI6NDcwMzc0...,https://ttech.gupy.io/eyJzb3VyY2UiOiJndXB5X3Bv...
1,Data,2023-04-19,ANALISTA II BIG DATA - GOIÂNIA/GO II || POI - ...,"Para compor ainda mais esse time de sucesso, b...",Evolução profissional a gente faz em casa,vacancy_type_effective,2023-04-18T19:14:00.918Z,,False,Goiânia,Goiás,Brasil,https://poi.gupy.io/job/eyJqb2JJZCI6NDY5ODUyNy...,https://poi.gupy.io/eyJzb3VyY2UiOiJndXB5X3Bvcn...
2,Data,2023-04-19,Estágio em Data Quality,Quer fazer parte de um time alta performance?&...,Neogrid Carreiras,vacancy_type_internship,2023-04-18T17:48:42.627Z,,True,Porto Alegre,Rio Grande do Sul,Brasil,https://neogridcarreiras.gupy.io/job/eyJqb2JJZ...,https://neogridcarreiras.gupy.io/eyJzb3VyY2UiO...
3,Data,2023-04-19,Assistente de Data Quality [LETT],Quer fazer parte de um time alta performance?&...,Neogrid Carreiras,vacancy_type_effective,2023-04-18T17:27:18.497Z,,True,São Paulo,São Paulo,Brasil,https://neogridcarreiras.gupy.io/job/eyJqb2JJZ...,https://neogridcarreiras.gupy.io/eyJzb3VyY2UiO...
4,Data,2023-04-19,Assistente de Data Quality (Manutenção),Quer fazer parte de um time alta performance?&...,Neogrid Carreiras,vacancy_type_effective,2023-04-18T17:25:43.453Z,,False,São Paulo,São Paulo,Brasil,https://neogridcarreiras.gupy.io/job/eyJqb2JJZ...,https://neogridcarreiras.gupy.io/eyJzb3VyY2UiO...


#### Understanding and cleaning data

In [4]:
# Summary statistics per column (count / unique / top / freq)
df_jobs.describe()

Unnamed: 0,term,date_extraction,name,description,company_name,type,published_date,application_deadline,is_remote_work,city,state,country,job_url,company_url
count,692,692,692,692,692,692,692,33,692,692,692,692,692,692
unique,2,1,599,666,360,8,685,16,2,98,17,8,685,364
top,dados,2023-04-19,Analista de Dados,Destinado a formados dos cursos de Sistemas de...,Stefanini Group,vacancy_type_effective,2023-02-28T01:13:17.152Z,2023-04-24T00:00:00.000Z,False,São Paulo,São Paulo,Brasil,https://peers.gupy.io/job/eyJqb2JJZCI6NDI4MTc4...,https://stefanini.gupy.io/eyJzb3VyY2UiOiJndXB5...
freq,547,692,17,4,19,518,2,5,436,240,344,605,2,19


##### Change datatypes

In [5]:
# Inspect dtypes and non-null counts before converting any column
df_jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 692 entries, 0 to 691
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   term                  692 non-null    object
 1   date_extraction       692 non-null    object
 2   name                  692 non-null    object
 3   description           692 non-null    object
 4   company_name          692 non-null    object
 5   type                  692 non-null    object
 6   published_date        692 non-null    object
 7   application_deadline  33 non-null     object
 8   is_remote_work        692 non-null    bool  
 9   city                  692 non-null    object
 10  state                 692 non-null    object
 11  country               692 non-null    object
 12  job_url               692 non-null    object
 13  company_url           692 non-null    object
dtypes: bool(1), object(13)
memory usage: 71.1+ KB


In [6]:
# Convert the date-like columns to datetime.
# No explicit `format` is passed: "%Y-%m-%d" does not describe the full
# ISO 8601 timestamps held in published_date / application_deadline
# (e.g. "2023-04-19T12:05:04.822Z"), and pandas versions that apply the
# format strictly reject them. Letting pandas parse the values handles both
# the plain dates and the timestamps.
cols = ['date_extraction', 'application_deadline', 'published_date']

for col in cols:
    df_jobs[col] = pd.to_datetime(df_jobs[col])

In [7]:
# Cast the low-cardinality text columns to the pandas category dtype
cols = ['term', 'type', 'city', 'state', 'country']

# One astype call with a column -> dtype mapping instead of a per-column loop
df_jobs = df_jobs.astype(dict.fromkeys(cols, 'category'))

In [8]:
# Re-inspect dtypes to confirm the datetime and category conversions
df_jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 692 entries, 0 to 691
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   term                  692 non-null    category           
 1   date_extraction       692 non-null    datetime64[ns]     
 2   name                  692 non-null    object             
 3   description           692 non-null    object             
 4   company_name          692 non-null    object             
 5   type                  692 non-null    category           
 6   published_date        692 non-null    datetime64[ns, UTC]
 7   application_deadline  33 non-null     datetime64[ns, UTC]
 8   is_remote_work        692 non-null    bool               
 9   city                  692 non-null    category           
 10  state                 692 non-null    category           
 11  country               692 non-null    category           
 12  job_url 

##### Check missing values

In [9]:
# Number of missing values in each column
df_jobs.isna().sum()

term                      0
date_extraction           0
name                      0
description               0
company_name              0
type                      0
published_date            0
application_deadline    659
is_remote_work            0
city                      0
state                     0
country                   0
job_url                   0
company_url               0
dtype: int64

##### Fixing duplicated values

In [10]:
# Count rows that are exact duplicates of an earlier row across all columns
df_jobs.duplicated().sum()

0

In [11]:
# Report how many rows repeat an earlier job_url, keep the first row per URL,
# then report again to confirm none remain
dup_before = df_jobs.duplicated(subset='job_url').sum()
print(f"Duplicated before: {dup_before}")

df_jobs = df_jobs.drop_duplicates(subset='job_url')

dup_after = df_jobs.duplicated(subset='job_url').sum()
print(f"Duplicated after: {dup_after}")

Duplicated before: 7
Duplicated after: 0


In [12]:
# Spot-check a random sample of 20 rows after de-duplication
df_jobs.sample(20)

Unnamed: 0,term,date_extraction,name,description,company_name,type,published_date,application_deadline,is_remote_work,city,state,country,job_url,company_url
678,dados,2023-04-19,Cientista de Dados,Vaga para nosso BANCO DE TALENTOSProcuramos Ci...,Brivia,vacancy_type_talent_pool,2021-06-04 22:19:29+00:00,NaT,True,Florianópolis,Santa Catarina,Brasil,https://brivia.gupy.io/job/eyJqb2JJZCI6OTQxMTE...,https://brivia.gupy.io/eyJzb3VyY2UiOiJndXB5X3B...
427,dados,2023-04-19,Pessoa Gerente de Dados SR,"Venha conosco imaginar o amanhã, cuidar do hoj...",Getnet Brasil,vacancy_type_effective,2023-03-16 14:29:08.421000+00:00,NaT,False,São Paulo,São Paulo,Brasil,https://getnet.gupy.io/job/eyJqb2JJZCI6NDQ2MjM...,https://getnet.gupy.io/eyJzb3VyY2UiOiJndXB5X3B...
459,dados,2023-04-19,Coordenador(a) de Dados - área de Prevenção a ...,A iuguSomos a primeira plataforma online do Br...,iugu,vacancy_type_effective,2023-03-09 15:17:04.837000+00:00,NaT,False,São Paulo,São Paulo,Brasil,https://iugu.gupy.io/job/eyJqb2JJZCI6NDMzMDY0N...,https://iugu.gupy.io/eyJzb3VyY2UiOiJndXB5X3Bvc...
413,dados,2023-04-19,PESSOA DESENVOLVEDORA - AUTOMAÇÃO SR (DADOS),AQUI SUA CARREIRA COMEÇA GRANDE.VENHA SER UMA ...,YDUQS - Vagas Tech,vacancy_type_effective,2023-03-20 18:31:16.218000+00:00,NaT,False,Rio de Janeiro,Rio de Janeiro,,https://yduqstech.gupy.io/job/eyJqb2JJZCI6NDQ2...,https://yduqstech.gupy.io/eyJzb3VyY2UiOiJndXB5...
551,dados,2023-04-19,[Dados e BI] Analista de dados,Estamos procurando um Analista de Dados para c...,Beep Saúde,vacancy_type_effective,2023-01-27 17:30:04.272000+00:00,NaT,False,Rio de Janeiro,Rio de Janeiro,Brasil,https://beepsaude.gupy.io/job/eyJqb2JJZCI6Mzk0...,https://beepsaude.gupy.io/eyJzb3VyY2UiOiJndXB5...
593,dados,2023-04-19,Advogado(a) Pleno | Privacidade e Proteção de ...,"Se você é apaixonado(a) por Direito, Tecnologi...",Peck Advogados,vacancy_type_associate,2022-12-23 22:02:07.228000+00:00,NaT,False,São Paulo,São Paulo,Brasil,https://peckadvogados.gupy.io/job/eyJqb2JJZCI6...,https://peckadvogados.gupy.io/eyJzb3VyY2UiOiJn...
263,dados,2023-04-19,Assistente de Base de Dados,"Aqui na Nelogica, desenvolvemos soluções para ...",Nelogica,vacancy_type_effective,2023-04-10 17:26:30.678000+00:00,NaT,False,São Paulo,São Paulo,Brasil,https://nelogica.gupy.io/job/eyJqb2JJZCI6NDYyN...,https://nelogica.gupy.io/eyJzb3VyY2UiOiJndXB5X...
76,Data,2023-04-19,Danone - Master Data Manager W3,We have an amazing opportunity in our Mexico C...,Oportunidad Laboral en Page Resourcing Mexico,vacancy_type_effective,2023-02-08 22:55:10.898000+00:00,NaT,False,,,México,https://pageoutsourcingmx.gupy.io/job/eyJqb2JJ...,https://pageoutsourcingmx.gupy.io/eyJzb3VyY2Ui...
600,dados,2023-04-19,Especialista Banco de Dados - Projetos,Vice-Presidência: VP B2BDiretoria: Unidade de ...,Vem Pra Vivo,vacancy_type_effective,2022-12-15 14:11:03.366000+00:00,NaT,False,Barueri,São Paulo,Brasil,https://vivo.gupy.io/job/eyJqb2JJZCI6MzY5ODA2O...,https://vivo.gupy.io/eyJzb3VyY2UiOiJndXB5X3Bvc...
20,Data,2023-04-19,Estágio - Data Science NE,Destinado a universitários dos cursos de Engen...,Visagio,vacancy_type_internship,2023-04-08 03:47:59.613000+00:00,NaT,False,Recife,Pernambuco,Brasil,https://visagio.gupy.io/job/eyJqb2JJZCI6NDYxOD...,https://visagio.gupy.io/eyJzb3VyY2UiOiJndXB5X3...


## Load it
Send the cleaned data to BigQuery.

In [17]:
# Authenticate with a service-account key file.
# The path is taken from the GOOGLE_APPLICATION_CREDENTIALS environment
# variable when it is set; the original local path is kept as the fallback so
# existing setups keep working.
SERVICE_ACCOUNT_JSON = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    r"D:\DataAnalytics\projects\jobs_tracker\job-tracker-384222-84c152151770.json",
)

# to_gbq below is not handed `client` or any credentials, so it resolves
# credentials on its own. Exporting the key path lets the default-credentials
# lookup find the same service account.
# NOTE(review): confirm against the installed pandas-gbq version.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = SERVICE_ACCOUNT_JSON

client = bigquery.Client.from_service_account_json(SERVICE_ACCOUNT_JSON)

# Append the cleaned jobs to the existing table
pandas_gbq.to_gbq(df_jobs, 'job_portal_data.gupy', project_id='job-tracker-384222', if_exists='append')

100%|██████████| 1/1 [00:00<00:00, 990.16it/s]
