In [249]:
import pandas as pd
import numpy as np
import datetime as dt
from w3lib.html import remove_tags
import re

# Questions
- Check if cleaning the job title only in the main dataframe and after deleting exact duplicates yields more or less duplicates.
- For the duplicates, delete based on which subset? 
    - `subset=['job_title', 'job_description', 'company', 'job_location']`
    - `subset=['job_title', 'company', 'job_location']`
    - `subset=['job_title', 'job_description', 'company', 'job_location', 'post_date']`

In [250]:
import os
while not os.getcwd().endswith("Data Cleaning"):
    os.chdir("..")

os.getcwd()

'c:\\Users\\gilnr\\OneDrive - NOVASBE\\Work Project\\Thesis - Code\\Data Cleaning'

In [251]:
from pathlib import Path, PureWindowsPath

main_folder = PureWindowsPath("c:\\Users\\gilnr\\OneDrive - NOVASBE\\Work Project\\Thesis - Code")
MAIN_FOLDER = Path(main_folder)
DATA_FOLDER = MAIN_FOLDER / "Data"
DATA_FOLDER

WindowsPath('c:/Users/gilnr/OneDrive - NOVASBE/Work Project/Thesis - Code/Data')

# Load all datasets 

In [252]:
bons_empregos = pd.read_json(DATA_FOLDER / 'bons_empregos_jobs.json')
bons_empregos_2 = pd.read_json(DATA_FOLDER / 'BonsEmpregos.json', lines=True)
career_jet = pd.read_json(DATA_FOLDER / 'career_jet_api.json', lines=True)
carga_de_trabalhos = pd.read_json(DATA_FOLDER / 'CargaDeTrabalhos.json', lines=True)
emprego_xl_2 = pd.read_json(DATA_FOLDER / 'emprego_xl_jobs.json')
emprego_xl = pd.read_json(DATA_FOLDER / 'EmpregoXl.json', lines=True)
emprego_org = pd.read_json(DATA_FOLDER / 'EmpregoOrg.json')
itjobs = pd.read_json(DATA_FOLDER / 'itjobs_api.json', lines=True)
jooble = pd.read_json(DATA_FOLDER / 'jooble_api.json', lines=True)
landing_jobs = pd.read_json(DATA_FOLDER / 'landingjobs_api.json', lines=True)
net_empregos_2 = pd.read_json(DATA_FOLDER / 'net_empregos.json')
net_empregos = pd.read_json(DATA_FOLDER / 'NetEmpregos.json', lines=True)

In [253]:
dataframes = [bons_empregos, bons_empregos_2, career_jet, carga_de_trabalhos, emprego_xl, 
              emprego_xl_2, emprego_org, itjobs, jooble, landing_jobs, net_empregos, net_empregos_2]
total = 0
for i in dataframes:
    print(f'Number of job vacancies: {len(i)}')
    total += len(i)
print(f'Total job vacancies before processing {total}')

Number of job vacancies: 2576
Number of job vacancies: 2427
Number of job vacancies: 19602
Number of job vacancies: 400
Number of job vacancies: 40411
Number of job vacancies: 36318
Number of job vacancies: 99
Number of job vacancies: 10954
Number of job vacancies: 3253
Number of job vacancies: 1907
Number of job vacancies: 44202
Number of job vacancies: 54800
Total job vacancies before processing 216949


In [254]:
jobs_dfs = [bons_empregos, career_jet, carga_de_trabalhos, emprego_xl, emprego_org, itjobs, jooble, landing_jobs, net_empregos]
websites = ['Bons empregos', 'Career Jet', 'Carga de Trabalhos', 'Emprego XL', 'Emprego.org','ITjobs','Jooble','Landing Jobs','Net-empregos']
total = 0
for idx,i in enumerate(jobs_dfs):
    print(f'Number of job vacancies: {len(i)}, {websites[idx]}')
    total += len(i)
print(f'Total job vacancies before processing {total}')

Number of job vacancies: 2576, Bons empregos
Number of job vacancies: 19602, Career Jet
Number of job vacancies: 400, Carga de Trabalhos
Number of job vacancies: 40411, Emprego XL
Number of job vacancies: 99, Emprego.org
Number of job vacancies: 10954, ITjobs
Number of job vacancies: 3253, Jooble
Number of job vacancies: 1907, Landing Jobs
Number of job vacancies: 44202, Net-empregos
Total job vacancies before processing 123404


# Data Cleaning
- For each website there are specific categories that we need to attend. Some require filtering for job location, others cleaning the job description, and the post date.

## Drop Job Vacancies
What makes a unique job vacancy?
- For our analysis it will be: [job_title, job_description, company, job_location]

# General Functions

In [255]:
def copy_df(dataframe):
   return dataframe.copy()

def replacenan(dataframe):
    dataframe.replace('nan', np.nan, inplace=True)
    return dataframe
    
def dropNullJobs(dataframe):
    """
    Drop null values that make an online job vacancy unusable for analysis.
    The subset to drop is: ['post_date', 'job_title', 'job_description']
    """
    dataframe.dropna(subset=['post_date', 'job_title', 'job_description'], inplace=True)
    return dataframe

# remove duplicates
def removeDupes(dataframe, subset=['job_title', 'job_description', 'company', 'job_location']):
    dataframe = dataframe.sort_values(by='post_date').drop_duplicates(subset=subset, keep='last')
    return dataframe

def listToRows(dataframe, column):
    return dataframe.explode(column)

def removeTags(dataframe, column_list):
    for i in column_list:
        dataframe[i] = dataframe[i].apply(remove_tags)
    return dataframe

# Description
def clean_text(text):
    to_replace = ['\r', '\n', '•']
    replace = [' ', ' ', '\n']

    for idx, val in enumerate(to_replace):
        text = text.replace(val, replace[idx])
    text = text.strip()
    return text

def cleanDescription(dataframe, column_list):
    for i in column_list:
        dataframe[i] = dataframe[i].apply(lambda x: clean_text(x))
    return dataframe

def invertDate(x):
    if type(x) == float:
        return np.nan
    date = x.split('-')
    return date[2].strip()+'-'+date[1].strip()+'-'+date[0].strip()

def pipeInvertDate(dataframe, function=invertDate):
    dataframe['post_date'] = dataframe['post_date'].apply(lambda x: function(x))
    return dataframe

# Date Related Functions

In [256]:
def postDatePreprocess(dataframe, sep=" "):
    dataframe['post_date'] = dataframe['post_date'].apply(lambda x: x.split(sep)[0]) 
    return dataframe

def postDateFillNa(dataframe):
    dataframe['post_date'] = dataframe['post_date'].ffill(limit=1).bfill()
    return dataframe

# convert portuguese months to numbers
def longToShortDate(x, sep):
    months = ['janeiro', 'fevereiro','março', 'abril', 'maio', 'junho', 'julho', 'agosto', 'setembro', 'outubro', 'novembro', 'dezembro']
    months_dic = {value:idx+1 for idx, value in enumerate(months)}
    date = [i.strip() for i in x.split(sep)]
    return f'{date[0]}/{months_dic[date[1]]}/{date[2]}'

# convert to datetime object
def convertToDatetime(dataframe, function, sep=' '):
    # Remove comma from date
    dataframe['post_date'] = dataframe['post_date'].apply(lambda x: str(x).lower().replace(',',''))
    dataframe['post_date'] = dataframe['post_date'].apply(lambda x: dt.datetime.strptime(function(x, sep), "%d/%m/%Y"))
    return dataframe

# Convert Scrape date to datetime
def toDatetime(dataFrame, columns_list):
    for i in columns_list:
        dataFrame[i] = pd.to_datetime(dataFrame[i])
    return dataFrame
    
def notDateToNan(x):
    if re.findall('(0[1-9]|[12][0-9]|3[01])[-](0[1-9]|1[012])[-](19|20)\d\d', str(x)) != []:
        return x
    else:
        return np.nan

def applyFuncToColumn(dataframe, function=notDateToNan, columns_list=['post_date']):
    for i in columns_list:
        dataframe[i] = dataframe[i].apply(lambda x: function(x))
    return dataframe

## Define `Clean Job Title` Functions

In [257]:
import functools
from typing import Callable

ComposableFunction = Callable[[str], str]

def compose(*functions: ComposableFunction) -> ComposableFunction:
    return functools.reduce(lambda f, g: lambda x: g(f(x)), functions)

def cleanJobChars(x: str) -> str:
    # Capitalize the job title
    x = x.lower()
    stop_chars = ['(m/f)', 'm/f', '-', ' - ', ' – ', '(remote)', ' / ', '(', 'para']
    ex = False
    for i in stop_chars:
        if i in x:
            if i == '-':
                for exception in ['-se', '-o', '-a', '-os', '-as', 'e-', '-e']:
                    if exception in x:
                        ex = True
                if ex == False:
                    x = x.split(i)[0]
                pass
            else:
                x = x.split(i)[0]     
    return x.strip()

def replaceGenderWords(x: str) -> str:
    gender_words = ['/a', '/o', '/as', '/os', '/e', '/es']
    for i in gender_words:
        x = x.replace(i, '')
    return x

def replaceCommonFillers(x: str) -> str:
    fillers = ['recruta-se', 'oferta de emprego:', 'oferta:', 'oferta de emprego', 'oferta', 'precisa-se', 'precisas-se']
    for i in fillers:
        x = x.replace(i, '')
    return x

cleanJobTitle = compose(replaceCommonFillers, replaceGenderWords, cleanJobChars)

# How to use:
# applyFuncToColumn(bons_empregos, function=cleanJobTitle, columns_list=['job_title'])

# Total jobs grouped by Year and Month

In [258]:
def totalJobsByYearMonth(dataframe):
    dataframe['post_year'] = dataframe['post_date'].dt.year
    dataframe['post_month'] = dataframe['post_date'].dt.month
    return pd.DataFrame(dataframe.groupby(['post_year', 'post_month'])['job_title'].count()).sort_values(by=['post_year', 'post_month'], ascending=False)

## Bons Empregos
- Specific functions:
    - `getPortugalLocation`

In [259]:
def getPortugalLocation(dataframe):
    # Get only job offers in Portugal
    dataframe = dataframe.loc[dataframe['job_location'] != 'Estrangeiro'].copy()
    return dataframe

In [260]:
bons_empregos_clean = (bons_empregos.
                    pipe(copy_df).
                    pipe(replacenan).
                    pipe(postDateFillNa).
                    pipe(dropNullJobs).
                    pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
                    pipe(toDatetime, ['scrape_date']).
                    pipe(getPortugalLocation).
                    pipe(convertToDatetime, longToShortDate).
                    pipe(removeDupes)
)

print(f'Previous shape: {bons_empregos.shape}\nCurrent shape:{bons_empregos_clean.shape}')
bons_empregos_clean.head()

Previous shape: (2576, 8)
Current shape:(1988, 8)


Unnamed: 0,job_location,job_category,job_description,job_title,post_date,scrape_date,job_href,company
2575,Porto,Outros,"Palavras-chave: Java, Jenkins, Micro-serviços\...",java developer,2020-10-06,2021-05-10,https://www.bonsempregos.com/oferta-emprego/ja...,Dellent
2569,Porto,Outros,A Keller Williams é uma empresa de consultoria...,gestor,2020-10-06,2021-05-10,https://www.bonsempregos.com/oferta-emprego/ge...,
2570,Porto,Outros,Empresa do ramo da Engenharia Eletrotécnica/Me...,engenheiro mecânico,2020-10-06,2021-05-10,https://www.bonsempregos.com/oferta-emprego/en...,
2574,Porto,Outros,Operador de ETAR (M/F) - Braga\nO grupo dst de...,operador de etar,2020-10-06,2021-05-10,https://www.bonsempregos.com/oferta-emprego/op...,grupo
2567,Braga,Comercial e Serviços,A Remax foi escolhida pela revista Exame a Mel...,consultor imobiliário,2020-10-07,2021-05-10,https://www.bonsempregos.com/oferta-emprego/co...,


In [261]:
totalJobsByYearMonth(bons_empregos_clean)

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title
post_year,post_month,Unnamed: 2_level_1
2021,10,10
2021,9,223
2021,8,74
2021,7,218
2021,6,209
2021,5,208
2021,4,203
2021,3,190
2021,2,139
2021,1,141


## Career Jet

In [262]:
# convert job location to list
career_jet['job_location'] = career_jet['job_location'].apply(lambda x: x.split(','))

In [263]:
career_jet_clean = (career_jet.
                    pipe(copy_df).
                    pipe(replacenan).
                    pipe(postDateFillNa).
                    pipe(dropNullJobs).
                    pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
                    pipe(toDatetime, ['scrape_date', 'post_date']).
                    pipe(listToRows, 'job_location').
                    pipe(removeDupes)
)
print(f'Previous shape: {career_jet.shape}\nCurrent shape:{career_jet_clean.shape}')
career_jet_clean.head()

Previous shape: (19602, 9)
Current shape:(2978, 9)


Unnamed: 0,job_title,job_description,post_date,scrape_date,company,job_location,job_category,job_href,salary
6336,assistente call center,A Vertente Humana é uma empresa que presta ser...,2021-10-21,2021-10-22,,Viana do Castelo,,http://jobviewtrack.com/pt-pt/job-481b416c5e17...,
6337,chefe de receção vilamoura,Chefe de Receção Vilamoura (m/f) Referencia:...,2021-10-21,2021-10-22,Michael Page,Faro,,http://jobviewtrack.com/pt-pt/job-4c48416e4501...,
6338,eletricista,Realizar a manutenção diária em conformidade c...,2021-10-21,2021-10-22,Talenter,Portugal,,http://jobviewtrack.com/pt-pt/job-481e41684101...,
6339,motorista de pesado c+e,"A Vertente Humana, empresa de Trabalho Temporá...",2021-10-21,2021-10-22,Vertente Humana,Setúbal,,http://jobviewtrack.com/pt-pt/job-494d41604210...,
6341,controller de gestão,Ligamos grandes profissionais a grandes empres...,2021-10-21,2021-10-22,Adecco,Porto,,http://jobviewtrack.com/pt-pt/job-1913416e420a...,


In [264]:
assert career_jet_clean.post_date.dtypes == career_jet_clean.scrape_date.dtypes

In [265]:
totalJobsByYearMonth(career_jet_clean)

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title
post_year,post_month,Unnamed: 2_level_1
2021,10,2978


## Carga de Trabalhos

In [266]:
carga_de_trabalhos_clean = (carga_de_trabalhos.
                    pipe(copy_df).
                    pipe(replacenan).
                    pipe(postDateFillNa).
                    pipe(dropNullJobs).
                    pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
                    pipe(toDatetime, ['scrape_date']).
                    pipe(convertToDatetime, longToShortDate, '/').
                    pipe(removeDupes)
)

print(f'Previous shape: {carga_de_trabalhos.shape}\nCurrent shape:{carga_de_trabalhos_clean.shape}')
carga_de_trabalhos_clean.head()

Previous shape: (400, 7)
Current shape:(84, 7)


Unnamed: 0,job_description,job_title,post_date,scrape_date,company,job_location,job_href
319,Estamos à procura de um/uma Tax &amp; Accounti...,tax &amp; accounting software specialist,2021-08-28,2021-10-30,PHC Software,Oeiras,http://www.cargadetrabalhos.net/2021/08/28/tax...
398,Estamos à procura de um/uma support trainee or...,support trainee,2021-08-28,2021-10-30,PHC Software,Oeiras ou Porto,http://www.cargadetrabalhos.net/2021/08/28/sup...
316,Estamos à procura de um/uma Customer Functiona...,customer functional analyst,2021-08-29,2021-10-30,PHC Software,Oeiras ou Porto,http://www.cargadetrabalhos.net/2021/08/29/cus...
397,Estamos à procura de um/uma junior training co...,junior training consultant,2021-08-29,2021-10-30,PHC Software,Oeiras ou Porto,http://www.cargadetrabalhos.net/2021/08/29/jun...
77,Somos uma produtora de som sediada em Lisboa e...,casting vozes polonesas,2021-08-30,2021-10-25,Anónimo,Lisboa,http://www.cargadetrabalhos.net/2021/08/30/cas...


In [267]:
totalJobsByYearMonth(carga_de_trabalhos_clean)

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title
post_year,post_month,Unnamed: 2_level_1
2021,10,38
2021,9,34
2021,8,12


## Emprego XL

In [268]:
emprego_xl_clean = (emprego_xl.
                    pipe(copy_df).
                    pipe(replacenan).
                    pipe(postDateFillNa).
                    pipe(applyFuncToColumn).
                    pipe(pipeInvertDate).
                    pipe(dropNullJobs).
                    pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
                    pipe(toDatetime, ['scrape_date', 'post_date']).
                    # # pipe(convertToDatetime, longToShortDate, '/').
                    pipe(removeDupes)
)

print(f'Previous shape: {emprego_xl.shape}\nCurrent shape:{emprego_xl_clean.shape}')
emprego_xl_clean.head()

Previous shape: (40411, 7)
Current shape:(23758, 7)


Unnamed: 0,job_description,job_title,post_date,scrape_date,company,job_href,job_location
39090,"A rede CENTURY 21®, fundada nos EUA em 1971, é...",novos talentos comerciais,2020-11-02,2021-10-26,,https://www.empregoxl.com/emprego/434009/novos...,Lisboa
39087,Ser Gestor (a) Imobiliário (a) é acompanhar fa...,já considerou a actividade imobiliária?,2020-11-02,2021-10-26,Remax Metrópole,https://www.empregoxl.com/emprego/434006/ja-co...,Lisboa
39085,"A RE/MAX Liberty, presente no mercado imobiliá...",admitimos 2 profissionais,2020-11-02,2021-10-26,Remax Liberty,https://www.empregoxl.com/emprego/434000/admit...,Braga
39147,A ManpowerGroup Portugal encontra-se a recruta...,apoio ao cliente 09h,2020-11-02,2021-10-26,ManpowerGroup,https://www.empregoxl.com/emprego/433985/apoio...,Lisboa
39146,DESCRICÃO/FUNCÃOPrecisa-se administrativa com ...,coordenadora loja,2020-11-02,2021-10-26,Re/Max Soul,https://www.empregoxl.com/emprego/433983/coord...,Lisboa


In [269]:
totalJobsByYearMonth(emprego_xl_clean)

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title
post_year,post_month,Unnamed: 2_level_1
2021,10,1954
2021,9,1833
2021,8,1642
2021,7,1706
2021,6,2260
2021,5,2360
2021,4,1937
2021,3,1840
2021,2,1350
2021,1,1758


## Emprego.Org `SCRAPE AGAIN WITH SCRAPY FOR THE CORRECT FIELDS`

In [270]:
emprego_org.head()

Unnamed: 0,job_title,job_description,post_date,scrape_date,job_location,job_href,salary,company
0,Consultor(a) Imobiliário(a) – MaisConsultores ...,Gostava de poder mudar a sua vida? De poder te...,2021-10-26 / Viewed 21 times,26/10/2021,Setúbal,https://empregos.org/view.php?job_id=2796901&t...,1250 EUR,
1,Oferta de emprego: Empregado de Refeitório (M/...,,2021-10-26 / Viewed 2544 times,26/10/2021,Viana Do Castelo Viana Do Castelo,https://empregos.org/view.php?job_id=2790308&t...,Não especificado,Grupo Trivalor
2,Abertura Nova Filial- Porto/ Gondomar:,Somos uma multinacional de comércio e serviços...,2021-10-25 / Viewed 1969 times,26/10/2021,Porto Porto,https://empregos.org/view.php?job_id=2796890&t...,12000 EUR,
3,Oferta: Operador Fabril:Grupo Constant,,2021-10-26 / Viewed 2521 times,26/10/2021,Viana Do Castelo Viana Do Castelo,https://empregos.org/view.php?job_id=2790310&t...,Não especificado,Grupo Constant
4,Oferta de emprego: Empregada de Limpeza (M/F) ...,,2021-10-26 / Viewed 2505 times,26/10/2021,Viana Do Castelo Viana Do Castelo,https://empregos.org/view.php?job_id=2790312&t...,Não especificado,Eurofirms E.T.T.


In [271]:
emprego_org_clean = (emprego_org.
                    pipe(copy_df).
                    pipe(replacenan).
                    pipe(postDateFillNa).
                    pipe(dropNullJobs).
                    pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
                    pipe(postDatePreprocess, '/').
                    pipe(toDatetime, ['scrape_date', 'post_date']).
                    pipe(removeDupes)
)

print(f'Previous shape: {emprego_org.shape}\nCurrent shape:{emprego_org_clean.shape}')
emprego_org_clean.head()

Previous shape: (99, 8)
Current shape:(23, 8)


Unnamed: 0,job_title,job_description,post_date,scrape_date,job_location,job_href,salary,company
2,abertura nova filial,Somos uma multinacional de comércio e serviços...,2021-10-25,2021-10-26,Porto Porto,https://empregos.org/view.php?job_id=2796890&t...,12000 EUR,
84,consultor negócios,Pare de adiar o seu sucesso! Encerre 2021 com ...,2021-10-26,2021-10-26,Porto,https://empregos.org/view.php?job_id=2796747&t...,25.000 Ano EUR,Maxgroup
85,gestor comercial,GESTOR COMERCIAL (M/F)\r\n\r\nO Grupo Century2...,2021-10-26,2021-10-26,Lisboa Lisboa,https://empregos.org/view.php?job_id=2796749&t...,Não especificado,century21
92,reforço de equipa,Somos uma empresa MULTINACIONAL em forte expan...,2021-10-26,2021-10-26,Maia Porto,https://empregos.org/view.php?job_id=2796760&t...,10000 EUR,Sumptuoso Crescer unip. lda
93,diretor comercial,"Oportunidade de integrar um projeto de futuro,...",2021-10-26,2021-10-26,Braga Braga,https://empregos.org/view.php?job_id=2796757&t...,1750 EUR,"Sábia Visão - Mediação Imobiliária, Lda"


## ITJOBS

In [272]:
itjobs_clean = (itjobs.
                    pipe(copy_df).
                    pipe(listToRows, 'job_location').
                    pipe(replacenan).
                    pipe(postDateFillNa).
                    pipe(dropNullJobs).
                    pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
                    pipe(toDatetime, ['scrape_date', 'post_date']).
                    pipe(removeDupes)
)

print(f'Previous shape: {itjobs.shape}\nCurrent shape:{itjobs_clean.shape}')
itjobs_clean.head()

Previous shape: (10954, 9)
Current shape:(6186, 9)


Unnamed: 0,job_title,job_description,post_date,scrape_date,company,job_location,job_category,job_ref,salary
3889,golang developer,Golang Developer (W/M) - Lisboa Your connectio...,2021-09-17 09:11:02,2021-10-22,Aubay,Lisboa,,https://www.itjobs.pt/oferta/405750/golang-dev...,
3888,programador pl/sql,"Com 29 anos de experiência, a Rumos Serviços a...",2021-09-17 09:11:28,2021-10-22,Rumos Serviços,Lisboa,,https://www.itjobs.pt/oferta/405805/programado...,
3887,cobol mainframe junior,Cobol Mainframe Junior (m/f) - Lisboa A Sysma...,2021-09-17 09:11:34,2021-10-22,SYSMATCH - Consultores de Sistemas de Informação,Lisboa,,https://www.itjobs.pt/oferta/402281/cobol-main...,
3886,java maven developer,Java Maven Developer (m/f) Porto A Match Prof...,2021-09-17 09:11:38,2021-10-22,Match Profiler,Porto,,https://www.itjobs.pt/oferta/402752/java-maven...,
3884,sharepoint developer,O que pretendemos:- Licenciatura em Engenharia...,2021-09-17 09:11:44,2021-10-22,Noesis Portugal - Consultadoria em Sistemas de...,Porto,,https://www.itjobs.pt/oferta/404516/sharepoint...,


In [273]:
totalJobsByYearMonth(itjobs_clean)

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title
post_year,post_month,Unnamed: 2_level_1
2021,10,4150
2021,9,2036


## Jooble

In [274]:
jooble_clean = (jooble.
                    pipe(copy_df).
                    pipe(replacenan).
                    pipe(postDateFillNa).
                    pipe(dropNullJobs).
                    pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
                    pipe(toDatetime, ['scrape_date', 'post_date']).
                    pipe(removeTags, ['job_title']).
                    pipe(removeDupes)
)

print(f'Previous shape: {jooble.shape}\nCurrent shape:{jooble_clean.shape}')
jooble_clean.head()

Previous shape: (3253, 9)
Current shape:(1729, 9)


Unnamed: 0,job_title,job_description,post_date,scrape_date,company,job_location,job_category,job_ref,salary
723,rececionista de hotel,"A TIMING, Empresa de Trabalho Temporário e Ge...",2021-10-22,2021-10-22,Timing Portugal,Lisboa,,https://pt.jooble.org/desc/-724743558640476978...,
725,assistente de contabilidade,Descrição da Função: Reportando ao responsável...,2021-10-22,2021-10-22,"Hortisopa, Lda",Sintra,,https://pt.jooble.org/away/3112330984152892454...,
728,operador de caixa,"\r\n\r\n Adecco, empresa multinacional especia...",2021-10-22,2021-10-22,Adecco,Aveiro,,https://pt.jooble.org/desc/940066953020983867?...,
729,assistente dentária,A Experiência que gostávamos que tivesse\n\r\n...,2021-10-22,2021-10-22,OralMED Saúde,Lisboa,,https://pt.jooble.org/desc/-463883459362016540...,
732,secretária jurídica sénior,O nosso cliente é uma conceituada Sociedade de...,2021-10-22,2021-10-22,Michael Page Portugal,Lisboa,,https://pt.jooble.org/away/2953382909130229843...,


In [275]:
totalJobsByYearMonth(jooble_clean)

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title
post_year,post_month,Unnamed: 2_level_1
2021,10,1729


## Landing Jobs IT

In [276]:
landing_jobs_clean = (landing_jobs.
                    pipe(copy_df).
                    pipe(replacenan).
                    pipe(postDateFillNa).
                    pipe(dropNullJobs).
                    pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
                    pipe(postDatePreprocess, 'T').
                    pipe(toDatetime, ['scrape_date', 'post_date']).
                    # pipe(removeTags, 'job_title').
                    pipe(removeDupes)
)

print(f'Previous shape: {landing_jobs.shape}\nCurrent shape:{landing_jobs_clean.shape}')
landing_jobs_clean.head()

Previous shape: (1907, 9)
Current shape:(302, 9)


Unnamed: 0,job_title,job_description,post_date,scrape_date,company,job_location,job_category,job_ref,salary
1284,back-end developer,"We seek flexible people, open to explore diffe...",2018-11-16,2021-10-30,Sky Technology Centre – Portugal,Lisbon,,https://landing.jobs/at/sky-technology-centre-...,
727,devops engineer,"We hope you haveLinux systems administration, ...",2020-03-27,2021-10-30,Blip,Porto,,https://landing.jobs/at/blip/devops-engineer-i...,
726,devops manager,We hope you haveAt least 5 years of proven tra...,2020-03-27,2021-10-30,Blip,Porto,,https://landing.jobs/at/blip/devops-manager,
5,back-end developer,We hope you haveAt least 3 years working with ...,2020-03-27,2021-10-22,Blip,Porto,,https://landing.jobs/at/blip/back-end-develope...,
6,senior back-end developer,We hope you haveAt least 6 years working with ...,2020-03-27,2021-10-22,Blip,Porto,,https://landing.jobs/at/blip/senior-back-end-d...,


In [277]:
totalJobsByYearMonth(landing_jobs_clean)

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title
post_year,post_month,Unnamed: 2_level_1
2021,10,5
2021,9,68
2021,8,23
2021,7,5
2021,6,14
2021,5,50
2021,4,19
2021,3,38
2021,2,18
2021,1,4


## Net Empregos

In [278]:
net_empregos_clean = (net_empregos.
                    pipe(copy_df).
                    pipe(replacenan).
                    pipe(pipeInvertDate).
                    pipe(postDateFillNa).
                    pipe(dropNullJobs).
                    # two pipes are needed beacause - for some reason, the function was not replacing some words it should
                    pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
                    pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
                    pipe(cleanDescription, ['job_title']).
                    pipe(toDatetime, ['scrape_date', 'post_date']).
                    pipe(removeDupes)
)

print(f'Previous shape: {net_empregos.shape}\nCurrent shape:{net_empregos_clean.shape}')
net_empregos_clean.head()

Previous shape: (44202, 8)
Current shape:(34806, 8)


Unnamed: 0,job_description,job_title,post_date,scrape_date,company,job_location,job_category,job_href
44201,A Ankix é uma empresa de competências tecnológ...,iosndroid developer,2021-10-22,2021-10-30,Ankix,Lisboa,Informática ( Programação ),https://www.net-empregos.com/7806923/ios-andro...
44184,A Ankix é uma empresa de competências tecnológ...,java developer,2021-10-22,2021-10-30,Ankix,( Todas as Zonas ),Informática ( Programação ),https://www.net-empregos.com/7807795/java-deve...
44185,A Ankix é uma empresa de competências tecnológ...,consultor liferay senior,2021-10-22,2021-10-30,Ankix,Lisboa,Informática ( Programação ),https://www.net-empregos.com/7806945/consultor...
44186,A Ankix é uma empresa de competências tecnológ...,.net core angular,2021-10-22,2021-10-30,Ankix,Lisboa,Informática ( Programação ),https://www.net-empregos.com/7806942/-net-core...
44187,A Ankix é uma empresa de competências tecnológ...,.net,2021-10-22,2021-10-30,Ankix,Lisboa,Informática ( Programação ),https://www.net-empregos.com/7806938/-net-m-f-...


In [279]:
totalJobsByYearMonth(net_empregos_clean)

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title
post_year,post_month,Unnamed: 2_level_1
2021,10,34806


# Add Website Column to all dataframes before concat

In [280]:
jobs_dfs = [bons_empregos_clean, career_jet_clean, carga_de_trabalhos_clean, emprego_xl_clean, emprego_org_clean, itjobs_clean, jooble_clean, landing_jobs_clean, net_empregos_clean]
websites = ['Bons empregos', 'Career Jet', 'Carga de Trabalhos', 'Emprego XL', 'Emprego.org','ITjobs','Jooble','Landing Jobs','Net-empregos']

# Add column with website name
for idx, value in enumerate(jobs_dfs):
    value['website'] = websites[idx]

# Concat All dataframes into one for data Deduplication

In [281]:
neworder = ['job_title','job_description','company','job_location','job_category','salary', 'post_date', 'scrape_date','job_href', 'website']

df = pd.concat([i.reindex(columns=neworder) for i in jobs_dfs])

# Validate that the concatenation is happening properly
assert len(df) == sum(len(i) for i in jobs_dfs)

In [282]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71854 entries, 2575 to 11454
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   job_title        71854 non-null  object        
 1   job_description  71854 non-null  object        
 2   company          67053 non-null  object        
 3   job_location     71377 non-null  object        
 4   job_category     47989 non-null  object        
 5   salary           5098 non-null   object        
 6   post_date        71854 non-null  datetime64[ns]
 7   scrape_date      71854 non-null  datetime64[ns]
 8   job_href         63637 non-null  object        
 9   website          71854 non-null  object        
dtypes: datetime64[ns](2), object(8)
memory usage: 6.0+ MB


## Pipeline

In [283]:
def cleanCompany(dataframe):
    def capitalize(x):
        try:
            return x.capitalize()
        except AttributeError:
            return ''
    dataframe['company'] = dataframe['company'].apply(lambda x: capitalize(x))
    return dataframe

In [291]:
df_clean = (df.
            pipe(copy_df).
            pipe(replacenan).sort_values(by='post_date').
            pipe(postDateFillNa).
            pipe(dropNullJobs).
            pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
            pipe(cleanCompany).
            pipe(cleanDescription, ['job_title', 'job_description']).
            pipe(removeDupes, ['job_title', 'job_description','company', 'job_location'])
)
df_clean.reset_index(drop=True, inplace=True)

print(f'Previous shape: {df.shape}\nCurrent shape:{df_clean.shape}')

Previous shape: (71854, 10)
Current shape:(71817, 10)


In [296]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71817 entries, 0 to 71816
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   job_title        71817 non-null  object        
 1   job_description  71817 non-null  object        
 2   company          71817 non-null  object        
 3   job_location     71340 non-null  object        
 4   job_category     47980 non-null  object        
 5   salary           5097 non-null   object        
 6   post_date        71817 non-null  datetime64[ns]
 7   scrape_date      71817 non-null  datetime64[ns]
 8   job_href         63607 non-null  object        
 9   website          71817 non-null  object        
dtypes: datetime64[ns](2), object(8)
memory usage: 5.5+ MB


In [297]:
print(f'There is a total of {len(df_clean)} jobs of which {df_clean.job_title.nunique()} have unique titles')

There is a total of 71817 jobs of which 24457 have unique titles


In [298]:
df_clean.describe()

  df_clean.describe()
  df_clean.describe()


Unnamed: 0,job_title,job_description,company,job_location,job_category,salary,post_date,scrape_date,job_href,website
count,71817,71817,71817.0,71340,47980.0,5097.0,71817,71817,63607,71817
unique,24457,63730,12153.0,701,68.0,193.0,4909,5,62609,9
top,comercial,A Multipessoal é uma empresa de referência no ...,,Lisboa,,,2021-10-30 00:00:00,2021-10-30 00:00:00,http://jobviewtrack.com/pt-pt/job-1e1341795f05...,Net-empregos
freq,593,97,5977.0,31572,11188.0,3723.0,9365,37069,4,34804
first,,,,,,,2018-11-16 00:00:00,2021-05-10 00:00:00,,
last,,,,,,,2021-10-30 17:12:31,2021-10-30 00:00:00,,


In [299]:
df_clean.head()

Unnamed: 0,job_title,job_description,company,job_location,job_category,salary,post_date,scrape_date,job_href,website
0,back-end developer,"We seek flexible people, open to explore diffe...",Sky technology centre – portugal,Lisbon,,,2018-11-16,2021-10-30,,Landing Jobs
1,senior qa engineer,We hope you haveAt least 6 years working as a ...,Blip,Porto,,,2020-03-27,2021-10-22,,Landing Jobs
2,qa engineer,We hope you haveAt least 3 years working as a ...,Blip,Porto,,,2020-03-27,2021-10-22,,Landing Jobs
3,devops engineer,"We hope you haveLinux systems administration, ...",Blip,Porto,,,2020-03-27,2021-10-30,,Landing Jobs
4,back-end developer,We hope you haveAt least 3 years working with ...,Blip,Porto,,,2020-03-27,2021-10-22,,Landing Jobs


In [300]:
totalJobsByYearMonth(df_clean)

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title
post_year,post_month,Unnamed: 2_level_1
2021,10,45684
2021,9,4184
2021,8,1749
2021,7,1926
2021,6,2483
2021,5,2613
2021,4,2159
2021,3,2068
2021,2,1504
2021,1,1903


In [301]:
with open(DATA_FOLDER / 'full_data_clean.json', 'w', encoding='utf-8') as file:
    df_clean.to_json(file, force_ascii=False, orient='records')