In [100]:
import pandas as pd
import numpy as np
import datetime as dt
from w3lib.html import remove_tags
import re

# Questions
- Check if cleaning the job title only in the main dataframe and after deleting exact duplicates yields more or less duplicates.
- For the duplicates, delete based on which subset? 
    - `subset=['job_title', 'job_description', 'company', 'job_location']`
    - `subset=['job_title', 'company', 'job_location']`
    - `subset=['job_title', 'job_description', 'company', 'job_location', 'post_date']`

In [101]:
from pathlib import Path, PureWindowsPath

main_folder = PureWindowsPath("c:\\Users\\gilnr\\OneDrive - NOVASBE\\Work Project\\Code")
MAIN_FOLDER = Path(main_folder)
DATA_FOLDER = MAIN_FOLDER / "Data"
DATA_FOLDER

WindowsPath('c:/Users/gilnr/OneDrive - NOVASBE/Work Project/Code/Data')

# Load all datasets 

In [102]:
bons_empregos = pd.read_json(DATA_FOLDER / 'bons_empregos_jobs.json')
bons_empregos_2 = pd.read_json(DATA_FOLDER / 'BonsEmpregos.json', lines=True)
career_jet = pd.read_json(DATA_FOLDER / 'career_jet_api.json', lines=True)
carga_de_trabalhos = pd.read_json(DATA_FOLDER / 'CargaDeTrabalhos.json', lines=True)
emprego_xl_2 = pd.read_json(DATA_FOLDER / 'emprego_xl_jobs.json')
emprego_xl = pd.read_json(DATA_FOLDER / 'EmpregoXl.json', lines=True)
emprego_org = pd.read_json(DATA_FOLDER / 'EmpregoOrg.json', lines=True)
itjobs = pd.read_json(DATA_FOLDER / 'itjobs_api.json', lines=True)
jooble = pd.read_json(DATA_FOLDER / 'jooble_api.json', lines=True)
landing_jobs = pd.read_json(DATA_FOLDER / 'landingjobs_api.json', lines=True)
net_empregos_2 = pd.read_json(DATA_FOLDER / 'net_empregos.json')
net_empregos = pd.read_json(DATA_FOLDER / 'NetEmpregos.json', lines=True)

In [103]:
dataframes = [bons_empregos, bons_empregos_2, career_jet, carga_de_trabalhos, emprego_xl, 
              emprego_xl_2, emprego_org, itjobs, jooble, landing_jobs, net_empregos, net_empregos_2]
total = 0
for i in dataframes:
    print(f'Number of job vacancies: {len(i)}')
    total += len(i)
print(f'Total job vacancies before processing {total}')

Number of job vacancies: 2576
Number of job vacancies: 2427
Number of job vacancies: 45738
Number of job vacancies: 6315
Number of job vacancies: 45664
Number of job vacancies: 36318
Number of job vacancies: 198
Number of job vacancies: 24814
Number of job vacancies: 7483
Number of job vacancies: 4547
Number of job vacancies: 200768
Number of job vacancies: 54800
Total job vacancies before processing 431648


In [104]:
jobs_dfs = [bons_empregos, career_jet, carga_de_trabalhos, emprego_xl, emprego_org, itjobs, jooble, landing_jobs, net_empregos]
websites = ['Bons empregos', 'Career Jet', 'Carga de Trabalhos', 'Emprego XL', 'Emprego.org','ITjobs','Jooble','Landing Jobs','Net-empregos']
total = 0
for idx,i in enumerate(jobs_dfs):
    print(f'Number of job vacancies: {len(i)}, {websites[idx]}')
    total += len(i)
print(f'Total job vacancies before processing {total}')

Number of job vacancies: 2576, Bons empregos
Number of job vacancies: 45738, Career Jet
Number of job vacancies: 6315, Carga de Trabalhos
Number of job vacancies: 45664, Emprego XL
Number of job vacancies: 198, Emprego.org
Number of job vacancies: 24814, ITjobs
Number of job vacancies: 7483, Jooble
Number of job vacancies: 4547, Landing Jobs
Number of job vacancies: 200768, Net-empregos
Total job vacancies before processing 338103


# Data Cleaning
- For each website there are specific categories that we need to attend. Some require filtering for job location, others cleaning the job description, and the post date.

## Drop Job Vacancies
What makes a unique job vacancy?
- For our analysis it will be: [job_title, job_description, company, job_location]

# General Functions

In [105]:
def copy_df(dataframe):
   return dataframe.copy()

def replacenan(dataframe):
    dataframe.replace('nan', np.nan, inplace=True)
    dataframe.replace('', np.nan, inplace=True)
    return dataframe
    
def dropNullJobs(dataframe):
    """
    Drop null values that make an online job vacancy unusable for analysis.
    The subset to drop is: ['post_date', 'job_title', 'job_description']
    """
    dataframe.dropna(subset=['post_date', 'job_title', 'job_description'], inplace=True)
    return dataframe

# remove duplicates
def removeDupes(dataframe, subset=['job_title', 'job_description', 'company', 'job_location', 'post_date']):
    dataframe = dataframe.sort_values(by='post_date').drop_duplicates(subset=subset, keep='last')
    return dataframe

def listToRows(dataframe, column):
    return dataframe.explode(column)

def removeTags(dataframe, column_list):
    for i in column_list:
        dataframe[i] = dataframe[i].apply(remove_tags)
    return dataframe

# Description
def clean_text(text):
    to_replace = ['\r', '\n', '•']
    replace = [' ', ' ', '\n']

    for idx, val in enumerate(to_replace):
        text = text.replace(val, replace[idx])
    text = text.strip()
    return text

def cleanDescription(dataframe, column_list):
    for i in column_list:
        dataframe[i] = dataframe[i].apply(lambda x: clean_text(x))
    return dataframe

def invertDate(x):
    if type(x) == float:
        return np.nan
    date = x.split('-')
    return date[2].strip()+'-'+date[1].strip()+'-'+date[0].strip()

def pipeInvertDate(dataframe, function=invertDate):
    dataframe['post_date'] = dataframe['post_date'].apply(lambda x: function(x))
    return dataframe

# Date Related Functions

In [106]:
def postDatePreprocess(dataframe, sep=" "):
    dataframe['post_date'] = dataframe['post_date'].apply(lambda x: x.split(sep)[0]) 
    return dataframe

def postDateFillNa(dataframe):
    dataframe['post_date'] = dataframe['post_date'].ffill(limit=1).bfill()
    return dataframe

# convert portuguese months to numbers
def longToShortDate(x, sep):
    months = ['janeiro', 'fevereiro','março', 'abril', 'maio', 'junho', 'julho', 'agosto', 'setembro', 'outubro', 'novembro', 'dezembro']
    months_dic = {value:idx+1 for idx, value in enumerate(months)}
    date = [i.strip() for i in x.split(sep)]
    # return f'{date[0]}/{months_dic[date[1]]}/{date[2]}'
    return f'{date[2]}/{months_dic[date[1]]}/{date[0]}'


# convert to datetime object
def convertToDatetime(dataframe, function, sep=' '):
    # Remove comma from date
    dataframe['post_date'] = dataframe['post_date'].apply(lambda x: str(x).lower().replace(',',''))
    dataframe['post_date'] = dataframe['post_date'].apply(lambda x: dt.datetime.strptime(function(x, sep), "%Y/%m/%d"))
    return dataframe

# Convert Scrape date to datetime
def toDatetime(dataFrame, columns_list, dayfirst=False):
    for i in columns_list:
        dataFrame[i] = pd.to_datetime(dataFrame[i], dayfirst=dayfirst)
    return dataFrame
    
def notDateToNan(x):
    if re.findall('(0[1-9]|[12][0-9]|3[01])[-](0[1-9]|1[012])[-](19|20)\d\d', str(x)) != []:
        return x
    else:
        return np.nan

def applyFuncToColumn(dataframe, function=notDateToNan, columns_list=['post_date']):
    for i in columns_list:
        dataframe[i] = dataframe[i].apply(lambda x: function(x))
    return dataframe

## Define `Clean Job Title` Functions

In [107]:
import functools
from typing import Callable

ComposableFunction = Callable[[str], str]

def compose(*functions: ComposableFunction) -> ComposableFunction:
    return functools.reduce(lambda f, g: lambda x: g(f(x)), functions)

def cleanJobChars(x: str) -> str:
    # Capitalize the job title
    x = x.lower()
    stop_chars = ['(m/f)', 'm/f', '-', ':', ' - ', ' – ', '(remote)', ' / ', '(', ' para ', '_']
    hyphen_exceptions = ['-se', '-o', '-a', '-os', '-as', 'e-', '-e']
    title_position_exceptions = ['(junior)', '(senior)']
    
    def exception_handle(x: str, exception_list: list):
        ex = False
        for exception in exception_list:
            if exception in x:
                ex = True
            if ex == False:
                aux = x.split(stop)
                for val in aux:
                    if val != stop and val != '':
                        x = val
                        break
        return x
    
    for stop in stop_chars:
        if stop in x:
            if stop == '-':
                exception_handle(x, hyphen_exceptions)
            if stop == '(':
                exception_handle(x, title_position_exceptions)
            if stop == '_':
                x = x.split(stop)[0]
            else:  
                aux = x.split(stop)
                for val in aux:
                    if val != stop and val != '':
                        x = val
                        break
    return x.strip()

def replaceGenderWords(x: str) -> str:
    gender_words = ['/a', '/o', '/as', '/os', '/e', '/es']
    for i in gender_words:
        x = x.replace(i, '')
    return x

def replaceCommonFillers(x: str) -> str:
    fillers = ['recruta-se para', 'recruta-se', 'oferta de emprego:', 'oferta:', 'oferta de emprego', 'oferta', 'precisa-se', 
               'precisas-se', 'part-time']
    for i in fillers:
        x = x.replace(i, '')
    return x

cleanJobTitle = compose(replaceCommonFillers, replaceGenderWords, cleanJobChars)

# How to use:
# applyFuncToColumn(bons_empregos, function=cleanJobTitle, columns_list=['job_title'])

# Total jobs grouped by Year and Month

In [108]:
def totalJobsByYearMonth(dataframe):
    dataframe['post_year'] = dataframe['post_date'].dt.year
    dataframe['post_month'] = dataframe['post_date'].dt.month
    return pd.DataFrame(dataframe.groupby(['post_year', 'post_month'])['job_title'].count()).sort_values(by=['post_year', 'post_month'], ascending=False)

## Bons Empregos
- Specific functions:
    - `getPortugalLocation`

In [109]:
def getPortugalLocation(dataframe):
    # Get only job offers in Portugal
    dataframe = dataframe.loc[dataframe['job_location'] != 'Estrangeiro'].copy()
    return dataframe

In [119]:
bons_empregos_clean = (bons_empregos.
                    pipe(copy_df).
                    pipe(replacenan).
                    pipe(postDateFillNa).
                    pipe(dropNullJobs).
                    # pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
                    pipe(toDatetime, columns_list=['scrape_date'], dayfirst=True).
                    pipe(getPortugalLocation).
                    pipe(convertToDatetime, longToShortDate).
                    pipe(removeDupes, subset=['job_title','job_location', 'job_description', 'company', 'job_href'])
                    # pipe(removeDupes, subset=['job_title','job_location', 'job_description', 'company'])
)

print(f'Previous shape: {bons_empregos.shape}\nCurrent shape:{bons_empregos_clean.shape}')
bons_empregos_clean.head()

Previous shape: (2576, 8)
Current shape:(2359, 8)


Unnamed: 0,job_location,job_category,job_description,job_title,post_date,scrape_date,job_href,company
2575,Porto,Outros,"Palavras-chave: Java, Jenkins, Micro-serviços\...",Java Developer - Lisboa,2020-10-06,2021-10-05,https://www.bonsempregos.com/oferta-emprego/ja...,Dellent
2569,Porto,Outros,A Keller Williams é uma empresa de consultoria...,Gestor(a) de Negócios,2020-10-06,2021-10-05,https://www.bonsempregos.com/oferta-emprego/ge...,
2570,Porto,Outros,Empresa do ramo da Engenharia Eletrotécnica/Me...,Engenheiro Mecânico (m/f) – Estágio Profission...,2020-10-06,2021-10-05,https://www.bonsempregos.com/oferta-emprego/en...,
2574,Porto,Outros,Operador de ETAR (M/F) - Braga\nO grupo dst de...,Operador de ETAR (M/F) - Braga,2020-10-06,2021-10-05,https://www.bonsempregos.com/oferta-emprego/op...,grupo
2572,Porto,Outros,"O Grupo Remax Negócios, presente no mercado im...",Gestor de Clientes (m/f),2020-10-06,2021-10-05,https://www.bonsempregos.com/oferta-emprego/ge...,Grupo


In [111]:
totalJobsByYearMonth(bons_empregos_clean)

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title
post_year,post_month,Unnamed: 2_level_1
2021,10,10
2021,9,226
2021,8,82
2021,7,243
2021,6,241
2021,5,244
2021,4,243
2021,3,229
2021,2,170
2021,1,159


## Career Jet

In [112]:
# convert job location to list
career_jet['job_location'] = career_jet['job_location'].apply(lambda x: x.split(','))

In [123]:
career_jet_clean = (career_jet.
                    pipe(copy_df).
                    pipe(replacenan).
                    pipe(postDateFillNa).
                    pipe(dropNullJobs).
                    # pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
                    pipe(toDatetime, columns_list=['scrape_date', 'post_date'], dayfirst=True).
                    pipe(listToRows, 'job_location').
                    pipe(removeDupes)
                    # pipe(removeDupes, subset=['job_title','job_location', 'job_description', 'company', 'post_date'])
)

print(f'Previous shape: {career_jet.shape}\nCurrent shape:{career_jet_clean.shape}')
career_jet_clean.head()

Previous shape: (45738, 9)
Current shape:(8674, 9)


Unnamed: 0,job_title,job_description,post_date,scrape_date,company,job_location,job_category,job_href,salary
6336,Assistente Call Center (m/f) – Teletrabalho - ...,A Vertente Humana é uma empresa que presta ser...,2021-10-21,2021-10-22,,Viana do Castelo,,http://jobviewtrack.com/pt-pt/job-481b416c5e17...,
6337,Chefe de Receção Vilamoura,Chefe de Receção Vilamoura (m/f) Referencia:...,2021-10-21,2021-10-22,Michael Page,Faro,,http://jobviewtrack.com/pt-pt/job-4c48416e4501...,
6338,Eletricista (M/F),Realizar a manutenção diária em conformidade c...,2021-10-21,2021-10-22,Talenter,Portugal,,http://jobviewtrack.com/pt-pt/job-481e41684101...,
6339,Motorista de Pesado C+E (m/f)- Setubal,"A Vertente Humana, empresa de Trabalho Temporá...",2021-10-21,2021-10-22,Vertente Humana,Setúbal,,http://jobviewtrack.com/pt-pt/job-494d41604210...,
6341,Controller de Gestão,Ligamos grandes profissionais a grandes empres...,2021-10-21,2021-10-22,Adecco,Porto,,http://jobviewtrack.com/pt-pt/job-1913416e420a...,


In [67]:
assert career_jet_clean.post_date.dtypes == career_jet_clean.scrape_date.dtypes

In [68]:
totalJobsByYearMonth(career_jet_clean)

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title
post_year,post_month,Unnamed: 2_level_1
2021,12,1454
2021,11,4196
2021,10,2858


## Carga de Trabalhos

In [127]:
carga_de_trabalhos_clean = (carga_de_trabalhos.
                    pipe(copy_df).
                    pipe(replacenan).
                    pipe(postDateFillNa).
                    pipe(dropNullJobs).
                    # pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
                    pipe(toDatetime, columns_list=['scrape_date'], dayfirst=True).
                    pipe(convertToDatetime, longToShortDate, '/').
                    # pipe(removeDupes)
                    pipe(removeDupes, subset=['job_title','job_location', 'job_description', 'company', 'job_href'])
                    
)

print(f'Previous shape: {carga_de_trabalhos.shape}\nCurrent shape:{carga_de_trabalhos_clean.shape}')
carga_de_trabalhos_clean.head()

Previous shape: (6315, 7)
Current shape:(5921, 7)


Unnamed: 0,job_description,job_title,post_date,scrape_date,company,job_location,job_href
6314,Somos um projecto editorial e queremos recruta...,procuramos designer gráfico (a),2020-01-01,2021-11-12,SCM Supply Chain Magazine,Alcochete,https://www.cargadetrabalhos.net/2020/01/01/pr...
6313,Procuramos Designer Gráfico para trabalho de c...,designer gráfico,2020-01-01,2021-11-12,EA Produções,"Lisboa, Benfica",https://www.cargadetrabalhos.net/2020/01/01/de...
6312,Somos um estúdio de Design Full Service em exp...,comercial / business acquisition,2020-01-01,2021-11-12,Raio Criativo,Lisboa,https://www.cargadetrabalhos.net/2020/01/01/co...
6311,"As senior Software Engineer and technologist, ...",lead developer (m/f),2020-01-02,2021-11-12,Yomp – Digital Design &amp; Development,Guimarães,https://www.cargadetrabalhos.net/2020/01/02/le...
6310,A Agroop é uma empresa de AgriTech que está a ...,marketeer digital,2020-01-02,2021-11-12,Agroop,Sintra (junto ao Taguspark),https://www.cargadetrabalhos.net/2020/01/02/ma...


In [70]:
totalJobsByYearMonth(carga_de_trabalhos_clean)

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title
post_year,post_month,Unnamed: 2_level_1
2021,11,178
2021,10,329
2021,9,313
2021,8,261
2021,7,261
2021,6,298
2021,5,356
2021,4,280
2021,3,284
2021,2,227


## Emprego XL

In [71]:
emprego_xl_clean = (emprego_xl.
                    pipe(copy_df).
                    pipe(replacenan).
                    pipe(postDateFillNa).
                    pipe(applyFuncToColumn).
                    pipe(pipeInvertDate).
                    pipe(dropNullJobs).
                    # pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
                    pipe(toDatetime, columns_list=['scrape_date', 'post_date'], dayfirst=True).
                    # # pipe(convertToDatetime, longToShortDate, '/').
                    pipe(removeDupes)
)

print(f'Previous shape: {emprego_xl.shape}\nCurrent shape:{emprego_xl_clean.shape}')
emprego_xl_clean.head()

Previous shape: (45664, 7)
Current shape:(26297, 7)


Unnamed: 0,job_description,job_title,post_date,scrape_date,company,job_href,job_location
39090,"A rede CENTURY 21®, fundada nos EUA em 1971, é...",Novos Talentos Comerciais (m/f),2020-11-02,2021-10-26,,https://www.empregoxl.com/emprego/434009/novos...,Lisboa
39092,"A CENTURY21 Confiança, é um Grupo de 5 lojas ,...",Ex Assessor(a) p/comercial Century21 Confiança,2020-11-02,2021-10-26,century21 confianca povoa de varzim,https://www.empregoxl.com/emprego/433873/ex-as...,Porto
39087,Ser Gestor (a) Imobiliário (a) é acompanhar fa...,Já considerou a Actividade Imobiliária?,2020-11-02,2021-10-26,Remax Metrópole,https://www.empregoxl.com/emprego/434006/ja-co...,Lisboa
39085,"A RE/MAX Liberty, presente no mercado imobiliá...",Admitimos 2 profissionais (m/f),2020-11-02,2021-10-26,Remax Liberty,https://www.empregoxl.com/emprego/434000/admit...,Braga
39147,A ManpowerGroup Portugal encontra-se a recruta...,Apoio ao Cliente 09h,2020-11-02,2021-10-26,ManpowerGroup,https://www.empregoxl.com/emprego/433985/apoio...,Lisboa


In [72]:
totalJobsByYearMonth(emprego_xl_clean)

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title
post_year,post_month,Unnamed: 2_level_1
2021,12,288
2021,11,1870
2021,10,1935
2021,9,1867
2021,8,1674
2021,7,1735
2021,6,2304
2021,5,2400
2021,4,1973
2021,3,1905


## Emprego.Org

In [73]:
emprego_org_clean = (emprego_org.
                    pipe(copy_df).
                    pipe(replacenan).
                    pipe(postDateFillNa).
                    pipe(dropNullJobs).
                    # pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
                    pipe(postDatePreprocess, '/').
                    pipe(toDatetime, columns_list=['scrape_date'], dayfirst=True).
                    pipe(toDatetime, ['post_date']).
                    pipe(removeDupes)
)

print(f'Previous shape: {emprego_org.shape}\nCurrent shape:{emprego_org_clean.shape}')
emprego_org_clean.head()

Previous shape: (198, 8)
Current shape:(122, 8)


Unnamed: 0,job_title,job_description,post_date,scrape_date,job_location,job_href,salary,company
2,Abertura Nova Filial- Porto/ Gondomar:,Somos uma multinacional de comércio e serviços...,2021-10-25,2021-10-26,Porto Porto,https://empregos.org/view.php?job_id=2796890&t...,12000 EUR,
85,GESTOR COMERCIAL (M/F):century21,GESTOR COMERCIAL (M/F)\r\n\r\nO Grupo Century2...,2021-10-26,2021-10-26,Lisboa Lisboa,https://empregos.org/view.php?job_id=2796749&t...,Não especificado,century21
92,Reforço de Equipa - Trofa (m/f):Sumptuoso Cres...,Somos uma empresa MULTINACIONAL em forte expan...,2021-10-26,2021-10-26,Maia Porto,https://empregos.org/view.php?job_id=2796760&t...,10000 EUR,Sumptuoso Crescer unip. lda
94,Gestor de Vendas/ Formação Gratuita (m/f):Sábi...,"Com forte visibilidade no mercado, o Grupo RE/...",2021-10-26,2021-10-26,Braga Braga,https://empregos.org/view.php?job_id=2796758&t...,1750 EUR,"Sábia Visão - Mediação Imobiliária, Lda"
84,Consultor Negócios - Porto:Maxgroup,Pare de adiar o seu sucesso! Encerre 2021 com ...,2021-10-26,2021-10-26,Porto,https://empregos.org/view.php?job_id=2796747&t...,25.000 Ano EUR,Maxgroup


In [74]:
totalJobsByYearMonth(emprego_org_clean)

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title
post_year,post_month,Unnamed: 2_level_1
2021,12,99
2021,10,23


## ITJOBS

In [75]:
def simplifyDate(x):
    return dt.datetime.strptime(x.split(' ')[0], '%Y-%m-%d')
# simplifyDate('2021-09-17 09:11:28')

In [76]:
itjobs_clean = (itjobs.
                    pipe(copy_df).
                    pipe(listToRows, 'job_location').
                    pipe(replacenan).
                    pipe(postDateFillNa).
                    pipe(dropNullJobs).
                    # pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
                    pipe(applyFuncToColumn, function=simplifyDate, columns_list=['post_date']).
                    pipe(toDatetime, columns_list=['scrape_date'], dayfirst=True).
                    pipe(toDatetime, ['post_date']).
                    # pipe(.apply(lambda x: dt.datetime.strftime('%Y-%m-%d'))).
                    pipe(removeDupes)
)

print(f'Previous shape: {itjobs.shape}\nCurrent shape:{itjobs_clean.shape}')
itjobs_clean.head()

Previous shape: (24814, 9)
Current shape:(9652, 9)


Unnamed: 0,job_title,job_description,post_date,scrape_date,company,job_location,job_category,job_ref,salary
3818,Oracle Fusion Middleware Administrator,"Dellent Consulting is a Portuguese consulting,...",2021-09-17,2021-10-22,Dellent Consulting,Porto,,https://www.itjobs.pt/oferta/405759/oracle-fus...,
3823,Node.JS Developer,Madiff builds and delivers International Remot...,2021-09-17,2021-10-22,Madiff,,,https://www.itjobs.pt/oferta/405876/node-js-de...,
3824,Java Developer - Senior,A DECODE é uma empresa onde podes ser tu mesmo...,2021-09-17,2021-10-22,Decode,Leiria,,https://www.itjobs.pt/oferta/405464/java-devel...,
3824,Java Developer - Senior,A DECODE é uma empresa onde podes ser tu mesmo...,2021-09-17,2021-10-22,Decode,Lisboa,,https://www.itjobs.pt/oferta/405464/java-devel...,
3824,Java Developer - Senior,A DECODE é uma empresa onde podes ser tu mesmo...,2021-09-17,2021-10-22,Decode,Porto,,https://www.itjobs.pt/oferta/405464/java-devel...,


In [77]:
totalJobsByYearMonth(itjobs_clean)

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title
post_year,post_month,Unnamed: 2_level_1
2021,12,381
2021,11,4393
2021,10,3277
2021,9,1601


## Jooble

In [78]:
jooble_clean = (jooble.
                    pipe(copy_df).
                    pipe(replacenan).
                    pipe(postDateFillNa).
                    pipe(dropNullJobs).
                    # pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
                    pipe(toDatetime, columns_list=['scrape_date', 'post_date'], dayfirst=True).
                    pipe(removeTags, ['job_title']).
                    pipe(removeDupes)
)

print(f'Previous shape: {jooble.shape}\nCurrent shape:{jooble_clean.shape}')
jooble_clean.head()

Previous shape: (7483, 9)
Current shape:(3711, 9)


Unnamed: 0,job_title,job_description,post_date,scrape_date,company,job_location,job_category,job_ref,salary
728,Operador de Caixa (M/F) - Mamodeiro,"\r\n\r\n Adecco, empresa multinacional especia...",2021-10-22,2021-10-22,Adecco,Aveiro,,https://pt.jooble.org/desc/940066953020983867?...,
729,Assistente Dentária,A Experiência que gostávamos que tivesse\n\r\n...,2021-10-22,2021-10-22,OralMED Saúde,Lisboa,,https://pt.jooble.org/desc/-463883459362016540...,
719,Administrativo (m/f) [refª 912901],"MANPOWERGROUP Portugal, Líder mundial em servi...",2021-10-22,2021-10-22,ManpowerGroup,Lisboa,,https://pt.jooble.org/desc/-387992539291138291...,
734,Administrativa Comercial (m/f) - Alcochete,\r\n A Kelly Services é uma empresa de Gestão ...,2021-10-22,2021-10-22,Kelly Services Portugal,Setúbal,,https://pt.jooble.org/desc/-621383893912223577...,
740,Rececionista (noturno) (M/F) - Fátima,"Adecco, empresa multinacional especializada n...",2021-10-22,2021-10-22,Adecco,Ourém,,https://pt.jooble.org/desc/46539090233946467?c...,


In [79]:
totalJobsByYearMonth(jooble_clean)

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title
post_year,post_month,Unnamed: 2_level_1
2021,12,1051
2021,11,1944
2021,10,716


## Landing Jobs IT

In [81]:
landing_jobs_clean = (landing_jobs.
                    pipe(copy_df).
                    pipe(replacenan).
                    pipe(postDateFillNa).
                    pipe(dropNullJobs).
                    # pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
                    pipe(postDatePreprocess, 'T').
                    pipe(toDatetime, columns_list=['scrape_date'], dayfirst=True).
                    pipe(toDatetime, ['post_date']).
                    pipe(removeDupes)
)

print(f'Previous shape: {landing_jobs.shape}\nCurrent shape:{landing_jobs_clean.shape}')
landing_jobs_clean.head()

Previous shape: (4547, 9)
Current shape:(767, 9)


Unnamed: 0,job_title,job_description,post_date,scrape_date,company,job_location,job_category,job_ref,salary
1909,Back-end Developer,"We seek flexible people, open to explore diffe...",2018-11-16,2021-11-02,Sky Technology Centre – Portugal,Lisbon,,https://landing.jobs/at/sky-technology-centre-...,
2591,Senior QA Engineer,We hope you haveAt least 6 years working as a ...,2020-03-27,2021-11-12,Blip,Porto,,https://landing.jobs/at/blip/senior-qa-enginee...,
2589,DevOps Engineer,"We hope you haveLinux systems administration, ...",2020-03-27,2021-11-12,Blip,Porto,,https://landing.jobs/at/blip/devops-engineer-i...,
2588,DevOps Manager,We hope you haveAt least 5 years of proven tra...,2020-03-27,2021-11-12,Blip,Porto,,https://landing.jobs/at/blip/devops-manager,
2587,Senior Back-end Developer,We hope you haveAt least 6 years working with ...,2020-03-27,2021-11-12,Blip,Porto,,https://landing.jobs/at/blip/senior-back-end-d...,


In [91]:
landing_jobs[landing_jobs.duplicated(subset=['job_title','job_description','job_location','company','post_date'])]

Unnamed: 0,job_title,job_description,post_date,scrape_date,company,job_location,job_category,job_ref,salary
89,Linux/Unix System Administrator,Experience with&nbsp;Linux/Unix&nbsp;focused o...,2020-12-09T15:50:02.806Z,22/10/2021,Oramix,Lisbon,,https://landing.jobs/at/oramix-sistemas-de-inf...,
97,Software Engineer (React),A university degree in Computer Science or a r...,2020-12-22T09:52:20.057Z,22/10/2021,Adnovum Portugal,Lisbon,,https://landing.jobs/at/adnovum-portugal/softw...,
98,Front-end Developer,At least 3 years of creating front end user in...,2021-06-16T08:42:10.183Z,22/10/2021,Amyris,Lisbon,,https://landing.jobs/at/amyris/front-end-devel...,30000 - 40000
99,Senior Back-end Developer,"Design, develop, test, document and maintain e...",2021-06-16T08:42:02.208Z,22/10/2021,Amyris,Lisbon,,https://landing.jobs/at/amyris/senior-back-end...,36000 - 55000
100,Back-end Engineer,2+ years of experience working on a cloud-nati...,2021-07-06T10:34:17.879Z,22/10/2021,SingleStore,Lisbon,,https://landing.jobs/at/singlestore/back-end-e...,
...,...,...,...,...,...,...,...,...,...
4541,Software Developer - Content Consumption,About YouYou have experience developing softwa...,2021-07-29T09:57:31.439Z,05/12/2021,Springer Nature,Lisbon,,https://landing.jobs/at/springer-nature/softwa...,
4542,Front-end Engineer,Write high-quality code and use the latest tec...,2021-07-29T15:15:51.877Z,05/12/2021,Fidel,Lisbon,,https://landing.jobs/at/fidel/front-end-engine...,
4544,Senior BI Developer,&nbsp;5-8 years of experience as a developer o...,2021-08-04T09:13:39.089Z,05/12/2021,Hitachi Vantara,Lisbon,,https://landing.jobs/at/hitachi-vantara/senior...,
4545,DevOps / Cloud / Infrastructure Engineer,"The Principal DevOps engineer, within the SRE ...",2021-08-06T09:01:31.510Z,05/12/2021,Wolters Kluwer / Basecone,Porto,,https://landing.jobs/at/wolters-kluwer/devops-...,


In [37]:
totalJobsByYearMonth(landing_jobs_clean)

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title
post_year,post_month,Unnamed: 2_level_1
2021,11,47
2021,10,109
2021,9,73
2021,8,104
2021,7,72
2021,6,24
2021,5,50
2021,4,21
2021,3,57
2021,2,31


## Net Empregos

In [38]:
net_empregos_clean = (net_empregos.
                    pipe(copy_df).
                    pipe(replacenan).
                    pipe(pipeInvertDate).
                    pipe(postDateFillNa).
                    pipe(dropNullJobs).
                    # two pipes are needed beacause - for some reason, the function was not replacing some words it should
                    pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
                    pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
                    pipe(cleanDescription, ['job_title']).
                    pipe(toDatetime, columns_list=['scrape_date', 'post_date'], dayfirst=True).
                    pipe(removeDupes)
)

print(f'Previous shape: {net_empregos.shape}\nCurrent shape:{net_empregos_clean.shape}')
net_empregos_clean.head()

Previous shape: (122188, 8)
Current shape:(58609, 8)


Unnamed: 0,job_description,job_title,post_date,scrape_date,company,job_location,job_category,job_href
44201,A Ankix é uma empresa de competências tecnológ...,iosndroid developer,2021-10-22,2021-10-30,Ankix,Lisboa,Informática ( Programação ),https://www.net-empregos.com/7806923/ios-andro...
42396,A AC Consulting encontra-se a recrutar Emprega...,empregado,2021-10-23,2021-10-30,AC Consulting,Lisboa,Restauração / Bares / Pastelarias,https://www.net-empregos.com/7775646/empregado...
42394,Somos líderes no mercado da entrega de Sushi a...,sushiman,2021-10-23,2021-10-30,Sushi em tua casa - Porto,Porto,Restauração / Bares / Pastelarias,https://www.net-empregos.com/7778008/sushiman-...
42392,"A Vincci Hoteles, é uma cadeia Internacional d...",copeira,2021-10-23,2021-10-30,Vincci Hoteles,Porto,Restauração / Bares / Pastelarias,https://www.net-empregos.com/7776747/copeira-m...
42391,Precisa-se de colaborador para desempenhar fun...,precisa,2021-10-23,2021-10-30,Farmões LDA,Lisboa,Farmácia / Biotecnologia,https://www.net-empregos.com/7777988/precisa-s...


In [39]:
totalJobsByYearMonth(net_empregos_clean)

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title
post_year,post_month,Unnamed: 2_level_1
2021,11,36000
2021,10,22609


# Add Website Column to all dataframes before concat

In [40]:
jobs_dfs = [bons_empregos_clean, career_jet_clean, carga_de_trabalhos_clean, emprego_xl_clean, emprego_org_clean, itjobs_clean, jooble_clean, landing_jobs_clean, net_empregos_clean]
websites = ['Bons empregos', 'Career Jet', 'Carga de Trabalhos', 'Emprego XL', 'Emprego.org','ITjobs','Jooble','Landing Jobs','Net-empregos']

# Add column with website name
for idx, value in enumerate(jobs_dfs):
    value['website'] = websites[idx]

# Concat All dataframes into one for data Deduplication

In [41]:
neworder = ['job_title','job_description','company','job_location','job_category','salary', 'post_date', 'scrape_date','job_href', 'website']

df = pd.concat([i.reindex(columns=neworder) for i in jobs_dfs])

# Validate that the concatenation is happening properly
assert len(df) == sum(len(i) for i in jobs_dfs)

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 109553 entries, 2575 to 122187
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   job_title        109553 non-null  object        
 1   job_description  109553 non-null  object        
 2   company          102645 non-null  object        
 3   job_location     108610 non-null  object        
 4   job_category     60597 non-null   object        
 5   salary           1970 non-null    object        
 6   post_date        109553 non-null  datetime64[ns]
 7   scrape_date      109553 non-null  datetime64[ns]
 8   job_href         97619 non-null   object        
 9   website          109553 non-null  object        
dtypes: datetime64[ns](2), object(8)
memory usage: 9.2+ MB


## Pipeline

In [43]:
def cleanCompany(dataframe):
    def capitalize(x):
        try:
            return x.capitalize()
        except AttributeError:
            return ''
    dataframe['company'] = dataframe['company'].apply(lambda x: capitalize(x))
    return dataframe

In [44]:
df_clean = (df.
            pipe(copy_df).
            pipe(replacenan).sort_values(by='post_date').
            pipe(postDateFillNa).
            pipe(dropNullJobs).
            pipe(applyFuncToColumn, function=cleanJobTitle, columns_list=['job_title']).
            pipe(cleanCompany).
            pipe(cleanDescription, ['job_title', 'job_description']).
            pipe(removeDupes, ['job_title', 'job_description','company', 'job_location'])
)
df_clean.reset_index(drop=True, inplace=True)

print(f'Previous shape: {df.shape}\nCurrent shape:{df_clean.shape}')

Previous shape: (109553, 10)
Current shape:(109465, 10)


In [45]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109465 entries, 0 to 109464
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   job_title        109465 non-null  object        
 1   job_description  109465 non-null  object        
 2   company          109465 non-null  object        
 3   job_location     108530 non-null  object        
 4   job_category     60580 non-null   object        
 5   salary           1970 non-null    object        
 6   post_date        109465 non-null  datetime64[ns]
 7   scrape_date      109465 non-null  datetime64[ns]
 8   job_href         97551 non-null   object        
 9   website          109465 non-null  object        
dtypes: datetime64[ns](2), object(8)
memory usage: 8.4+ MB


In [46]:
print(f'There is a total of {len(df_clean)} jobs of which {df_clean.job_title.nunique()} have unique titles')

There is a total of 109465 jobs of which 32415 have unique titles


In [47]:
df_clean.describe()

  df_clean.describe()


Unnamed: 0,job_title,job_description,company,job_location,job_category,salary,post_date,scrape_date,job_href,website
count,109465,109465,109465.0,108530,60580,1970,109465,109465,97551,109465
unique,32415,96353,18362.0,1744,67,265,668,9,94079,9
top,precisa,A Multipessoal é uma empresa de referência no ...,,Lisboa,Indústria / Produção,665 - 665,2021-11-17 00:00:00,2021-10-26 00:00:00,http://jobviewtrack.com/pt-pt/job-191241685e10...,Net-empregos
freq,2145,212,6908.0,44538,5244,492,11326,24523,4,58592
first,,,,,,,2018-11-16 00:00:00,2021-10-05 00:00:00,,
last,,,,,,,2021-11-17 00:00:00,2021-11-18 00:00:00,,


In [48]:
df_clean.head()

Unnamed: 0,job_title,job_description,company,job_location,job_category,salary,post_date,scrape_date,job_href,website
0,back,"We seek flexible people, open to explore diffe...",Sky technology centre – portugal,Lisbon,,,2018-11-16,2021-10-30,,Landing Jobs
1,procuramos designer gráfico,Somos um projecto editorial e queremos recruta...,Scm supply chain magazine,Alcochete,,,2020-01-01,2021-11-12,https://www.cargadetrabalhos.net/2020/01/01/pr...,Carga de Trabalhos
2,designer gráfico,Procuramos Designer Gráfico para trabalho de c...,Ea produções,"Lisboa, Benfica",,,2020-01-01,2021-11-12,https://www.cargadetrabalhos.net/2020/01/01/de...,Carga de Trabalhos
3,comercial,Somos um estúdio de Design Full Service em exp...,Raio criativo,Lisboa,,,2020-01-01,2021-11-12,https://www.cargadetrabalhos.net/2020/01/01/co...,Carga de Trabalhos
4,lead developer,"As senior Software Engineer and technologist, ...",Yomp – digital design &amp; development,Guimarães,,,2020-01-02,2021-11-12,https://www.cargadetrabalhos.net/2020/01/02/le...,Carga de Trabalhos


In [49]:
totalJobsByYearMonth(df_clean)

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title
post_year,post_month,Unnamed: 2_level_1
2021,11,46458
2021,10,32370
2021,9,4018
2021,8,2072
2021,7,2249
2021,6,2788
2021,5,2947
2021,4,2427
2021,3,2353
2021,2,1742


In [50]:
# with open(DATA_FOLDER / 'full_data_clean.json', 'w', encoding='utf-8') as file:
#     df_clean.to_json(file, force_ascii=False, orient='records', date_format='iso', date_unit='s')

# Testing the ESCO project Functions

In [262]:
test = pd.read_json(DATA_FOLDER / 'full_data_clean.json')

In [263]:
locations = pd.read_excel('freguesias-metadata.xlsx')
locations.drop(columns=['nivel', 'dicofre','brasao'], inplace=True)
# locations['dist_conc_freg'] = locations['distrito'] + ' , ' + locations['concelho'] + ' , ' + locations['freguesia']
locations.head()

Unnamed: 0,distrito,concelho,freguesia
0,Aveiro,Águeda,Aguada de Cima
1,Aveiro,Águeda,Fermentelos
2,Aveiro,Águeda,Macinhata do Vouga
3,Aveiro,Águeda,Valongo do Vouga
4,Aveiro,Águeda,União das freguesias de Águeda e Borralha


# Generate Exact Match Dictionary

In [264]:
from fold_to_ascii import fold as ascii_fold

def normalizeLocationDict(location: str) -> str:
    lowercased_ascii = ascii_fold(location.lower(), 'REMOVE_ME').replace('REMOVE_ME', '').split(',')[0] # split at , and pick first component
    only_alpha = re.sub(r'[^a-z]', ' ', lowercased_ascii)
    sem_unioes = re.sub(r'uniao d\w+ freguesias d\w+ ', '', only_alpha)
    remove_duplicate_spaces = re.sub(r'\s+', ' ', sem_unioes).strip()
    return remove_duplicate_spaces

locations = pd.read_excel('freguesias-metadata.xlsx')

locations_dict = {}
ambiguos = set()
for loc in locations.itertuples():
    for campo in [loc.freguesia, loc.concelho, loc.distrito]:
        normalizedLocation = normalizeLocationDict(campo)
        if normalizedLocation in locations_dict and locations_dict[normalizedLocation] != loc.distrito.lower():
            ambiguos.add(normalizedLocation)
        else:
            locations_dict[normalizedLocation] = loc.distrito.lower()
        
# print(sorted(set(locations_dict.keys())))

locations_dict[normalizeLocationDict('ereira e lapa')]

'santarém'

In [265]:
'cadaval' in ambiguos

False

In [266]:
import functools
from typing import Callable
from strsimpy.jaro_winkler import JaroWinkler


ComposableFunction = Callable[[str], str]

def compose(*functions: ComposableFunction) -> ComposableFunction:
    return functools.reduce(lambda f, g: lambda x: g(f(x)), functions)
    
def normalizeLocation(location:str) -> str:
    if location is None:
        return ''
    lowercased_ascii = ascii_fold(location.lower(), 'REMOVE_ME').replace('REMOVE_ME', '').split(',')
    if len(lowercased_ascii) == 1:
        only_alpha = re.sub(r'[^a-z]', ' ', lowercased_ascii[0])
        remove_duplicate_spaces = re.sub(r'\s+', ' ', only_alpha).strip()
        return remove_duplicate_spaces
    else:
        return [re.sub(r'\s+', ' ', re.sub(r'[^a-z]', ' ', v)).strip() for v in lowercased_ascii]


def commonLocationFillers(x:str) -> str:
    portugal_in_other_words = ['todo o pais','todos o pais', 'qualquer zona',
                               'todas as zonas', 'trabalho de casa', 'qualquer']
    for i in portugal_in_other_words: 
        if i in x:
            x = x.replace(i,'portugal') 
    return x.strip()

def replaceWithEmpty(x:str) -> str:
    to_replace = ['ilha de ', 'zona de ', 'e etc']
    for i in to_replace: 
        if i in x:
            x = x.replace(i,'') 
    return x.strip()
    

def matchLocation(x:str) -> str:
    if x in locations_dict: # exact match
        location = locations_dict[x] 
        return location
    elif x in ['portugal', 'remote']:
        return x
    else:
        jarowinkler = JaroWinkler()
        loc, sim = max([(locations_dict[loc], jarowinkler.similarity(x, loc)) for loc in locations_dict], key=lambda x: x[1])
        if sim >= 0.6:
            return loc
        else: # no similarity match
            return f"NOT FOUND - {x} , {loc}, {sim}"
        

cleanJobLocation = compose(commonLocationFillers, replaceWithEmpty, matchLocation)

# How to use:
# applyFuncToColumn(test, function=cleanJobTitle, columns_list=['job_title'])

In [256]:
clean_test = (test.
              pipe(applyFuncToColumn, function=normalizeLocation, columns_list=["job_location"]).
              pipe(listToRows, column="job_location").
              pipe(applyFuncToColumn, function=cleanJobLocation, columns_list=["job_location"]).
              pipe(removeDupes, ['job_title', 'job_description','company', 'job_location']))

print(f'Previous shape: {test.shape}\nCurrent shape:{clean_test.shape}')

Previous shape: (109480, 10)
Current shape:(108193, 10)


In [260]:
totalJobsByYearMonth(df_clean)

Unnamed: 0_level_0,Unnamed: 1_level_0,job_title
post_year,post_month,Unnamed: 2_level_1
2021,11,46458
2021,10,32370
2021,9,4018
2021,8,2072
2021,7,2249
2021,6,2788
2021,5,2947
2021,4,2427
2021,3,2353
2021,2,1742


In [259]:
print(matchLocation('Lisboa'))
print(matchLocation('benfica'))
print(matchLocation('bragança (portugal)'))
print(matchLocation('porto e etc'))
print(matchLocation('leiria as'))
print(matchLocation('aveiro district'))

lisboa
lisboa
bragança
porto
leiria
aveiro


In [298]:
import functools
from typing import Callable

ComposableFunction = Callable[[str], str]

def compose(*functions: ComposableFunction) -> ComposableFunction:
    return functools.reduce(lambda f, g: lambda x: g(f(x)), functions)

def normalizeTitle(location:str) -> str or list:
    if location is None:
        return ''
    lowercased_ascii = ascii_fold(location.lower(), 'REMOVE_ME').replace('REMOVE_ME', '').split(',')
    if len(lowercased_ascii) == 1:
        only_alpha = re.sub(r'[^a-z]', ' ', lowercased_ascii[0])
        remove_duplicate_spaces = re.sub(r'\s+', ' ', only_alpha).strip()
        return remove_duplicate_spaces
    else:
        return [re.sub(r'\s+', ' ', re.sub(r'[^a-z]', ' ', v)).strip() for v in lowercased_ascii][0] #keep first

def cleanJobChars(x: str) -> str:
    # Capitalize the job title
    x = x.lower()
    stop_chars = ['m f',' para ']
    
    for stop in stop_chars:
        if stop in x:
            aux = x.split(stop)
            for val in aux:
                if val != stop and val != '':
                    x = val
                    break
    return x.strip()

def replaceCommonFillers(x: str) -> str:
    fillers = ['recruta-se para', 'recruta-se', 'oferta de emprego:', 'oferta:', 'oferta de emprego', 'oferta', 'precisa-se', 
               'precisas-se', 'part-time']
    for i in fillers:
        x = x.replace(i, '')
    return x

cleanJobTitle = compose(normalizeTitle, replaceCommonFillers, cleanJobChars)

In [299]:
test.job_title.apply(lambda x: cleanJobTitle(x))

0                                    back end developer
1                         procuramos designer grafico a
2                                      designer grafico
3                        comercial business acquisition
4         area de marketing e comunicacao gestor junior
                              ...                      
112550                            consultor imobiliario
112551                      distribuidor na area do gas
112552        auxiliar de acao educativa escola palmela
112553                           operadores de producao
112554           gerente de loja burger king porto alto
Name: job_title, Length: 112555, dtype: object