In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
test = pd.read_json('full_data_with_esco_similarity_titles.json')    

In [3]:
with open("esco_dictionary.json", 'r', encoding='utf-8') as file:
    esco_dict = json.load(file)

In [4]:
esco_dict['diretor tecnico']

{'occupationUri': 'http://data.europa.eu/esco/occupation/00030d09-2b3a-4efd-87cc-c4ea39d27c34',
 'iscoGroup': 2166,
 'occupation': 'diretor tecnico',
 'skills': ['organizar ensaios',
  'avaliar os riscos de uma produção de artes performativas',
  'coordenar as atividades com outros departamentos artísticos',
  'adaptar-se às necessidades criativas dos artistas',
  'negociar questões de saúde e segurança com terceiros',
  'promover saúde e segurança',
  'coordenar as equipas técnicas das produções artísticas',
  'técnicas teatrais']}

In [5]:
mask = test['esco_similarity_match'].str.contains('NOT FOUND', case=False, na=False)
print(test[~mask][['job_title','esco_similarity_match']].shape)
test[~mask][['job_title','esco_similarity_match']].head()

(37211, 2)


Unnamed: 0,job_title,esco_similarity_match
1,procuramos designer grafico a,designer grafico
2,designer grafico,designer grafico
4,area de marketing e comunicacao gestor junior,gestor de marketing
6,marketeer digital,marketeer
12,gestor marketing e e commerce,gestor de marketing


# The Model

In [6]:
# import re

# def ngrams(string, n=3):
#     string = re.sub(r'[,-./]|\sBD',r'', string)
#     ngrams = zip(*[string[i:] for i in range(n)])
#     return [''.join(ngram) for ngram in ngrams]

In [7]:
# from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

job_titles = [i for i in esco_dict]
vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 3))
tf_idf_matrix = vectorizer.fit_transform(job_titles)

In [46]:
feature_names = vectorizer.get_feature_names_out()
m = vectorizer.transform([""])
for i in m.nonzero()[1]:
    print(f"{i} - {feature_names[i]}\n\t\t{m[0,i]}")

In [10]:
import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)
    
    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))

In [47]:
import time
t1 = time.time()
matches = awesome_cossim_top(m, tf_idf_matrix.transpose(), 10, 0)
t = time.time()-t1
print(m.shape)
print(tf_idf_matrix.shape)
print("SELFTIMED:", t)
matches.nonzero()

IndexError: Out of bounds on buffer access (axis 0)

In [13]:
for i in matches.nonzero()[1]:
    print(f"{i} - {job_titles[i]}\n\t\t{matches[0,i]}")
best_match = job_titles[matches.nonzero()[1][0]]
best_match

11145 - advogado empresa
		0.7975335365133421
6867 - advogado
		0.7145417364007014
11146 - advogada empresa advogado empresa
		0.6027085931097672
9035 - trabalhadora empresa mudancas trabalhador empresa mudancas
		0.2973130678196964
9034 - trabalhador empresa mudancas
		0.2871932896210978
6868 - advogado solicitador solicitadora advogada
		0.2187481960198304
3859 - diretor geral gerente empresa diretor comercial diretora comercial diretora geral
		0.12357243223785598


'advogado empresa'

In [14]:
esco_dict[best_match]

{'occupationUri': 'http://data.europa.eu/esco/occupation/fdfce14e-992d-4ff4-9f9d-7a353c75654e',
 'iscoGroup': 2611,
 'occupation': 'advogado empresa',
 'skills': ['rastrear transações financeiras',
  'fusões e aquisições',
  'prestar aconselhamento jurídico sobre investimentos',
  'legislação farmacêutica',
  'comércio internacional',
  'legislação em matéria de cuidados de saúde',
  'legislação ferroviária',
  'direito da comunicação social',
  'compilar documentos jurídicos',
  'direito urbanístico',
  'negociar em processos jurídicos',
  'proteger os interesses do cliente',
  'direito da propriedade intelectual',
  'analisar provas legais',
  'apresentar argumentos persuasivamente',
  'utilizar técnicas de consultoria',
  'demonstrações financeiras',
  'legislação relativa ao transporte rodoviário',
  'apresentar argumentos jurídicos',
  'moderar negociações',
  'interpretar a lei',
  'direito contratual',
  'legislação agrícola',
  'interpretar demonstrações financeiras',
  'gestão

# Model Functions

In [114]:
import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    try:
        ct.sparse_dot_topn(
            M, N, np.asarray(A.indptr, dtype=idx_dtype),
            np.asarray(A.indices, dtype=idx_dtype),
            A.data,
            np.asarray(B.indptr, dtype=idx_dtype),
            np.asarray(B.indices, dtype=idx_dtype),
            B.data,
            ntop,
            lower_bound,
            indptr, indices, data)
    except IndexError:
        return "NOT FOUND"

    return csr_matrix((data,indices,indptr),shape=(M,N))

In [127]:
from sklearn.feature_extraction.text import TfidfVectorizer

def nlpModel(esco_dict):
    # Get the list of all job known job titles in the ESCO database
    known_job_titles = [i for i in esco_dict]
    vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 3))
    tf_idf_matrix = vectorizer.fit_transform(known_job_titles)
    return vectorizer, tf_idf_matrix, known_job_titles

vectorizer, tf_idf_matrix, known_job_titles = nlpModel(esco_dict)

def getMatches(vectorizer, tf_idf_matrix, known_job_titles, esco_dict, text, ret_score=False, threshold=0.6):    
    m = vectorizer.transform([text])
    matches = awesome_cossim_top(m, tf_idf_matrix.transpose(), 1, 0)
    try:
        best_match, score = known_job_titles[matches.nonzero()[1][0]], matches[0,matches.nonzero()[1][0]]
    except:
        return 'NOT FOUND', 'NOT FOUND', 'NOT FOUND', 'NOT FOUND' 
    if score >= threshold:
        return best_match, esco_dict[best_match]['skills'], esco_dict[best_match]['iscoGroup'], score
    return 'NOT FOUND', 'NOT FOUND', 'NOT FOUND', 'NOT FOUND'
    # if score >= 0.6 and ret_score == False:
    # #     return best_match, esco_dict[best_match]['skills'], esco_dict[best_match]['iscoGroup']
    # elif score >= 0.6 and ret_score == True:
    #     return best_match, esco_dict[best_match]['skills'], esco_dict[best_match]['iscoGroup'], score

In [37]:
vectorizer, tf_idf_matrix, known_job_titles = nlpModel(esco_dict)

best_match, skills, isco_group, score = getMatches(vectorizer, tf_idf_matrix, known_job_titles, 
                               esco_dict, 'consultor para trabalhar como financeiro', ret_score=True)
print(best_match, score)

consultor financeiro 0.7343019766987195


# Define Model

In [116]:
from tqdm.notebook import tqdm

In [117]:
vectorizer, tf_idf_matrix, known_job_titles = nlpModel(esco_dict)

best_matches = []
skills_list = []
scores = []
isco_groups = []

for text in tqdm(test['job_title']):
    best_match, skills, isco_group, score = getMatches(vectorizer, tf_idf_matrix, known_job_titles, 
                               esco_dict, text, ret_score=True)
    best_matches.append(best_match)
    skills_list.append(skills)
    scores.append(score)
    isco_groups.append(isco_group)

  0%|          | 0/112555 [00:00<?, ?it/s]

In [118]:
test['similarity_titles'] = best_matches
test['similarity_scores'] = scores
test['skills'] = skills_list
test['iscoGroup'] = isco_groups

In [124]:
# mask = test['esco_similarity_match'].str.contains('NOT FOUND', case=False, na=False)
# print(test[~mask][['job_title','esco_similarity_match']].shape)
# test[~mask][['job_title','esco_similarity_match']].head()
test.loc[test['similarity_titles'] != 'NOT FOUND'][['job_title','similarity_titles', 'esco_similarity_match']]

Unnamed: 0,job_title,similarity_titles,esco_similarity_match
1,procuramos designer grafico a,designer grafico,designer grafico
2,designer grafico,designer grafico,designer grafico
6,marketeer digital,marketeer,marketeer
12,gestor marketing e e commerce,gestor marketing,gestor de marketing
17,designer grafico,designer grafico,designer grafico
...,...,...,...
112538,precisa se comercial comissionista produtos ca...,cabeleireiro,cabeleireiro
112540,tecnico eletromecanico,eletromecanico,eletromecanico
112541,assistente dentaria,assistente dentaria assistente dentario,assistente dentario
112543,operador de armazem,operador armazem,operador de armazem


In [121]:
mask = test['esco_similarity_match'].str.contains('NOT FOUND', case=False, na=False)
print(test[~mask][['job_title','esco_similarity_match']].shape)
test[~mask][['job_title','esco_similarity_match']].head()

(37211, 2)


Unnamed: 0,job_title,esco_similarity_match
1,procuramos designer grafico a,designer grafico
2,designer grafico,designer grafico
4,area de marketing e comunicacao gestor junior,gestor de marketing
6,marketeer digital,marketeer
12,gestor marketing e e commerce,gestor de marketing


In [128]:
best_match, skills, isco_group, score = getMatches(vectorizer, tf_idf_matrix, known_job_titles, 
                           esco_dict, 'back end developer', ret_score=True, threshold=0)
best_match

'user interface developers ui developers front end developer user interface engineer ui programmer ui developer'