# Job crawler - Gupy  

#### Load libraries

In [15]:
import requests
import pandas as pd
from datetime import date
import time
import random
from google.cloud import bigquery
import pandas_gbq
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas chained-assignment and deprecation
# warnings, so problems in later cells will not be reported.
warnings. filterwarnings("ignore")

#### Load raw data

In [16]:
def normalize_columns(df):
    """Select the job columns of interest and rename them to snake_case.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least every column listed in ``selected`` below.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the selected columns, in the listed order,
        with the camelCase API names replaced by snake_case names.
        ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If any of the selected columns is missing from ``df``.
    """
    selected = [
        'portal', 'term', 'date_extraction', 'name', 'description',
        'careerPageName', 'type', 'publishedDate',
        'applicationDeadline', 'isRemoteWork', 'city',
        'state', 'country', 'jobUrl', 'careerPageUrl',
    ]

    # Renaming columns
    renamed = {
        'careerPageName': 'company_name',
        'publishedDate': 'published_date',
        'applicationDeadline': 'application_deadline',
        'isRemoteWork': 'is_remote_work',
        'jobUrl': 'job_url',
        'careerPageUrl': 'company_url',
    }

    # rename() without inplace returns a fresh frame, so the column subset is
    # never mutated in place (which is what triggers SettingWithCopyWarning).
    return df[selected].rename(columns=renamed)

In [17]:
# defining the url to get the data
url = "https://portal.api.gupy.io/api/job"

# defining the job title to search for
search_terms = ['Data', 'dados']

# defining today to use in some cases
today = date.today()

# One frame per search term; they are concatenated once after the loop
# instead of re-copying the accumulated frame on every iteration.
term_frames = []

for term in search_terms:
    # NOTE(review): offset is "1" - if the API offset is zero-based this skips
    # the first result of every search; confirm against the Gupy API.
    querystring = {"name": term, "offset": "1", "limit": "5000"}

    # Random pause before each request to avoid hitting the API in a burst.
    time.sleep(random.randint(3, 6))
    # The timeout keeps the cell from hanging forever on a stalled connection.
    r = requests.get(url, params=querystring, timeout=30)
    # Fail with an explicit HTTP error instead of a confusing parsing error
    # further down when the API answers with a 4xx/5xx status.
    r.raise_for_status()

    data = r.json()

    df_term = pd.json_normalize(data, record_path=['data'])
    df_term['portal'] = "Gupy"
    df_term['term'] = term
    df_term['date_extraction'] = today

    term_frames.append(df_term)

# defining the dataframe to store the data
# (pd.concat raises ValueError if search_terms is empty)
df_jobs_raw = pd.concat(term_frames, ignore_index=True)

# Normalise once, on the complete raw frame, rather than on every iteration.
df_jobs_normalized = normalize_columns(df_jobs_raw)

df_jobs = df_jobs_normalized.copy()

display(df_jobs.head())

Unnamed: 0,portal,term,date_extraction,name,description,company_name,type,published_date,application_deadline,is_remote_work,city,state,country,job_url,company_url
0,Gupy,Data,2023-04-19,Analista Data Center JR - Virtualização (VMWare),SOBRE A EMPRESA:A Unidade de Negócio de Cloud ...,Telefónica Tech,vacancy_type_effective,2023-04-19T12:05:04.822Z,,False,Santana de Parnaíba,São Paulo,Brasil,https://ttech.gupy.io/job/eyJqb2JJZCI6NDcwMzc0...,https://ttech.gupy.io/eyJzb3VyY2UiOiJndXB5X3Bv...
1,Gupy,Data,2023-04-19,ANALISTA II BIG DATA - GOIÂNIA/GO II || POI - ...,"Para compor ainda mais esse time de sucesso, b...",Evolução profissional a gente faz em casa,vacancy_type_effective,2023-04-18T19:14:00.918Z,,False,Goiânia,Goiás,Brasil,https://poi.gupy.io/job/eyJqb2JJZCI6NDY5ODUyNy...,https://poi.gupy.io/eyJzb3VyY2UiOiJndXB5X3Bvcn...
2,Gupy,Data,2023-04-19,Estágio em Data Quality,Quer fazer parte de um time alta performance?&...,Neogrid Carreiras,vacancy_type_internship,2023-04-18T17:48:42.627Z,,True,Porto Alegre,Rio Grande do Sul,Brasil,https://neogridcarreiras.gupy.io/job/eyJqb2JJZ...,https://neogridcarreiras.gupy.io/eyJzb3VyY2UiO...
3,Gupy,Data,2023-04-19,Assistente de Data Quality [LETT],Quer fazer parte de um time alta performance?&...,Neogrid Carreiras,vacancy_type_effective,2023-04-18T17:27:18.497Z,,True,São Paulo,São Paulo,Brasil,https://neogridcarreiras.gupy.io/job/eyJqb2JJZ...,https://neogridcarreiras.gupy.io/eyJzb3VyY2UiO...
4,Gupy,Data,2023-04-19,Assistente de Data Quality (Manutenção),Quer fazer parte de um time alta performance?&...,Neogrid Carreiras,vacancy_type_effective,2023-04-18T17:25:43.453Z,,False,São Paulo,São Paulo,Brasil,https://neogridcarreiras.gupy.io/job/eyJqb2JJZ...,https://neogridcarreiras.gupy.io/eyJzb3VyY2UiO...


#### Understanding and cleaning data

In [18]:
# Summary statistics per column (count / unique / top / freq in the recorded
# output) to get a first feel for cardinality and missing values.
df_jobs.describe()

Unnamed: 0,portal,term,date_extraction,name,description,company_name,type,published_date,application_deadline,is_remote_work,city,state,country,job_url,company_url
count,692,692,692,692,692,692,692,692,33,692,692,692,692,692,692
unique,1,2,1,599,666,360,8,685,16,2,98,17,8,685,364
top,Gupy,dados,2023-04-19,Analista de Dados,Destinado a formados dos cursos de Sistemas de...,Stefanini Group,vacancy_type_effective,2023-02-28T01:13:17.152Z,2023-04-24T00:00:00.000Z,False,São Paulo,São Paulo,Brasil,https://peers.gupy.io/job/eyJqb2JJZCI6NDI4MTc4...,https://stefanini.gupy.io/eyJzb3VyY2UiOiJndXB5...
freq,692,547,692,17,4,19,518,2,5,436,240,344,605,2,19


##### Change datatypes

In [19]:
# Column dtypes and non-null counts before any type conversion.
df_jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 692 entries, 0 to 691
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   portal                692 non-null    object
 1   term                  692 non-null    object
 2   date_extraction       692 non-null    object
 3   name                  692 non-null    object
 4   description           692 non-null    object
 5   company_name          692 non-null    object
 6   type                  692 non-null    object
 7   published_date        692 non-null    object
 8   application_deadline  33 non-null     object
 9   is_remote_work        692 non-null    bool  
 10  city                  692 non-null    object
 11  state                 692 non-null    object
 12  country               692 non-null    object
 13  job_url               692 non-null    object
 14  company_url           692 non-null    object
dtypes: bool(1), object(14)
memory usage: 76.

In [33]:
# change columns to datetime
cols = ['date_extraction', 'application_deadline', 'published_date']

# The API timestamps are ISO-8601 strings with a time part and a "Z" suffix,
# some with fractional seconds and some without, so a fixed "%Y-%m-%d" format
# does not describe them. They are converted to timezone-aware UTC, while
# date_extraction (plain date objects) stays timezone-naive.
utc_cols = {'application_deadline', 'published_date'}

for col in cols:
    # Element-wise pd.Timestamp parsing needs no format string and copes with
    # the mixed layouts; missing values come out as NaT.
    parsed = df_jobs[col].map(pd.Timestamp)
    df_jobs[col] = pd.to_datetime(parsed, utc=col in utc_cols)

In [21]:
# Low-cardinality text columns are stored as pandas categoricals.
cols = ['term', 'type', 'city', 'state', 'country']

# Single astype call driven by a column -> dtype mapping.
df_jobs = df_jobs.astype({col: 'category' for col in cols})

In [34]:
# Inspect the converted published_date column.
# NOTE(review): the recorded output has 685 rows and execution count 34, so
# this cell was last run after the duplicate-removal cell further down;
# re-check with Restart & Run All.
df_jobs['published_date']

0     2023-04-19 12:05:04.822000+00:00
1     2023-04-18 19:14:00.918000+00:00
2     2023-04-18 17:48:42.627000+00:00
3     2023-04-18 17:27:18.497000+00:00
4     2023-04-18 17:25:43.453000+00:00
                    ...               
687          2021-03-01 22:36:58+00:00
688          2021-01-08 21:01:27+00:00
689          2020-12-05 21:45:14+00:00
690          2020-05-29 19:52:41+00:00
691          2020-03-31 19:21:48+00:00
Name: published_date, Length: 685, dtype: datetime64[ns, UTC]

In [31]:
# Re-check dtypes after the datetime / category conversions.
# NOTE(review): the recorded output shows 685 entries, i.e. it was captured
# after the duplicate removal below, not at this point of the notebook.
df_jobs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 685 entries, 0 to 691
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   portal                685 non-null    object             
 1   term                  685 non-null    category           
 2   date_extraction       685 non-null    datetime64[ns]     
 3   name                  685 non-null    object             
 4   description           685 non-null    object             
 5   company_name          685 non-null    object             
 6   type                  685 non-null    category           
 7   published_date        685 non-null    datetime64[ns, UTC]
 8   application_deadline  33 non-null     datetime64[ns, UTC]
 9   is_remote_work        685 non-null    bool               
 10  city                  685 non-null    category           
 11  state                 685 non-null    category           
 12  country 

##### Check missing values

In [23]:
# Number of missing values per column.
df_jobs.isna().sum()

portal                    0
term                      0
date_extraction           0
name                      0
description               0
company_name              0
type                      0
published_date            0
application_deadline    659
is_remote_work            0
city                      0
state                     0
country                   0
job_url                   0
company_url               0
dtype: int64

##### Fixing duplicated values

In [24]:
# check for duplicated rows
# (rows that are identical across every column; URL-level duplicates are
# handled separately in the next cell)
df_jobs.duplicated().sum()

0

In [25]:
# A job can be returned by more than one search term, so job_url is used as
# the uniqueness key; the first occurrence of each URL is kept.
dup_before = df_jobs.duplicated(subset='job_url').sum()
print(f"Duplicated before: {dup_before}")

# Rebind instead of mutating in place; the original index labels are kept.
df_jobs = df_jobs.drop_duplicates(subset='job_url')

dup_after = df_jobs.duplicated(subset='job_url').sum()
print(f"Duplicated after: {dup_after}")

Duplicated before: 7
Duplicated after: 0


In [26]:
# Random spot check of 20 rows; no random_state is passed, so the rows shown
# differ on every run.
df_jobs.sample(20)

Unnamed: 0,portal,term,date_extraction,name,description,company_name,type,published_date,application_deadline,is_remote_work,city,state,country,job_url,company_url
215,Gupy,dados,2023-04-19,Engenheiro de Dados Sênior,# Quem somos?&nbsp;📲A&nbsp;3CON&nbsp;é enabler...,3CON | IT & Digital,vacancy_type_effective,2023-04-14 14:08:44.801000+00:00,NaT,False,Rio de Janeiro,Rio de Janeiro,Brasil,https://trescon.gupy.io/job/eyJqb2JJZCI6NDY1NT...,https://trescon.gupy.io/eyJzb3VyY2UiOiJndXB5X3...
198,Gupy,dados,2023-04-19,Cientista de Dados Especialista I (Lojas e Fra...,O que é o&nbsp;gb.tech?&nbsp;&nbsp;Somos o tim...,Grupo Boticário,vacancy_type_effective,2023-04-14 20:49:51.321000+00:00,2023-04-28 00:00:00+00:00,True,,,,https://grupoboticario.gupy.io/job/eyJqb2JJZCI...,https://grupoboticario.gupy.io/eyJzb3VyY2UiOiJ...
270,Gupy,dados,2023-04-19,Engenheiro de Dados Sênior (Tech Lead),O QUE É SER UM MUTANT?Quando você decide ser M...,Mutant,vacancy_type_effective,2023-04-10 13:48:16.934000+00:00,NaT,True,São Paulo,São Paulo,Brasil,https://mutantbrvagas.gupy.io/job/eyJqb2JJZCI6...,https://mutantbrvagas.gupy.io/eyJzb3VyY2UiOiJn...
445,Gupy,dados,2023-04-19,Analista de Dados JR (Vaga afirmativa para pes...,Nós estamos procurando um talento para se torn...,Embraer,vacancy_type_effective,2023-03-13 18:57:28.031000+00:00,NaT,False,São José dos Campos,São Paulo,Brasil,https://embraer.gupy.io/job/eyJqb2JJZCI6NDM1MD...,https://embraer.gupy.io/eyJzb3VyY2UiOiJndXB5X3...
196,Gupy,dados,2023-04-19,ANALISTA DE DADOS SR - MOVIDA,"OLÁ, SEJA BEM VINDO AO MOVE CARREIRAS, RECRUTA...",Recrutamento Interno SIMPAR,vacancy_type_effective,2023-04-14 21:00:48.496000+00:00,2023-04-21 00:00:00+00:00,False,São Paulo,São Paulo,Brasil,https://recrutamentointernosimpar.gupy.io/job/...,https://recrutamentointernosimpar.gupy.io/eyJz...
396,Gupy,dados,2023-04-19,ENGENHEIRO DE DADOS PL,Fazer parte da Afinz é integrar uma empresa qu...,Afinz,vacancy_type_effective,2023-03-22 14:04:17.547000+00:00,NaT,True,Sorocaba,São Paulo,Brasil,https://afinz.gupy.io/job/eyJqb2JJZCI6NDQ4ODc3...,https://afinz.gupy.io/eyJzb3VyY2UiOiJndXB5X3Bv...
570,Gupy,dados,2023-04-19,Especialista Ciência de Dados,Integrar o time de dados de Cognição significa...,Sicredi,vacancy_type_effective,2023-01-16 12:16:22.586000+00:00,NaT,False,Porto Alegre,Rio Grande do Sul,Brasil,https://sicredi.gupy.io/job/eyJqb2JJZCI6MzkyMT...,https://sicredi.gupy.io/eyJzb3VyY2UiOiJndXB5X3...
522,Gupy,dados,2023-04-19,Analista de Migração de Dados,Estamos selecionando Analista de TI com o obje...,Larco Petroleo,vacancy_type_effective,2023-02-10 19:22:29.017000+00:00,NaT,False,Salvador,Bahia,Brasil,https://larcopetroleo.gupy.io/job/eyJqb2JJZCI6...,https://larcopetroleo.gupy.io/eyJzb3VyY2UiOiJn...
582,Gupy,dados,2023-04-19,Advogado(a) Pleno - Proteção de Dados e Privac...,"Se você é apaixonado(a) por Direito, Tecnologi...",Peck Advogados,vacancy_type_associate,2023-01-06 15:31:30.424000+00:00,NaT,False,São Paulo,São Paulo,Brasil,https://peckadvogados.gupy.io/job/eyJqb2JJZCI6...,https://peckadvogados.gupy.io/eyJzb3VyY2UiOiJn...
321,Gupy,dados,2023-04-19,Vaga Analista de Automação e Dados,Estamos procurando um analista para se juntar ...,evermart,vacancy_legal_entity,2023-03-31 22:11:55.983000+00:00,NaT,True,São Paulo,São Paulo,Brasil,https://evermart.gupy.io/job/eyJqb2JJZCI6NDU2M...,https://evermart.gupy.io/eyJzb3VyY2UiOiJndXB5X...


## Load it
Send the cleaned, de-duplicated data to BigQuery.

In [29]:
# use local json file to authenticate
# A forward slash is accepted as a path separator on Windows as well as on
# POSIX systems, whereas the backslash form only resolves on Windows.
SERVICE_ACCOUNT_JSON = "../job-tracker-384222-84c152151770.json"
client = bigquery.Client.from_service_account_json(SERVICE_ACCOUNT_JSON)

# NOTE(review): `client` is not passed to pandas_gbq, and no `credentials`
# argument is given below, so the upload presumably authenticates through
# pandas-gbq's default credential discovery rather than this service
# account - confirm, and pass credentials explicitly if that is not intended.
# if_exists='append' adds the rows to the existing table on every run.
pandas_gbq.to_gbq(df_jobs, 'job_portal_data.gupy', project_id='job-tracker-384222', if_exists='append')

100%|██████████| 1/1 [00:00<?, ?it/s]
