In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Load the job-posting data into a DataFrame; later cells read its
# 'job_title' and 'esco_similarity_match' columns.
test = pd.read_json('full_data_with_esco_similarity_titles.json')    

In [3]:
# Read the ESCO dictionary (occupation title -> occupation record) from disk.
with open("esco_dictionary.json", mode='r', encoding='utf-8') as esco_file:
    esco_dict = json.load(esco_file)

In [102]:
# Inspect one ESCO record to see its structure (keys shown in the output:
# occupationUri, iscoGroup, occupation, skills).
esco_dict['diretor tecnico']

{'occupationUri': 'http://data.europa.eu/esco/occupation/00030d09-2b3a-4efd-87cc-c4ea39d27c34',
 'iscoGroup': 2166,
 'occupation': 'diretor tecnico',
 'skills': ['organizar ensaios',
  'avaliar os riscos de uma produção de artes performativas',
  'coordenar as atividades com outros departamentos artísticos',
  'adaptar-se às necessidades criativas dos artistas',
  'negociar questões de saúde e segurança com terceiros',
  'promover saúde e segurança',
  'coordenar as equipas técnicas das produções artísticas',
  'técnicas teatrais']}

In [4]:
# Rows whose similarity match is flagged as 'NOT FOUND' (case-insensitive;
# missing values count as not flagged).
mask = test['esco_similarity_match'].str.contains('NOT FOUND', case=False, na=False)

# Keep only the rows that did get a match, restricted to the two columns of interest.
matched_titles = test[~mask][['job_title','esco_similarity_match']]
print(matched_titles.shape)
matched_titles.head()

(37211, 2)


Unnamed: 0,job_title,esco_similarity_match
1,procuramos designer grafico a,designer grafico
2,designer grafico,designer grafico
4,area de marketing e comunicacao gestor junior,gestor de marketing
6,marketeer digital,marketeer
12,gestor marketing e e commerce,gestor de marketing


# The Model

In [5]:
import re

def ngrams(string, n=3):
    """Split ``string`` into overlapping character n-grams.

    Before splitting, every ',', '-', '.' and '/' is removed, as is any
    whitespace character immediately followed by 'BD'.

    Parameters
    ----------
    string : str
        Text to split.
    n : int, default 3
        Length of each n-gram.

    Returns
    -------
    list of str
        The n-grams in order of appearance; empty when the cleaned text is
        shorter than ``n`` (or when ``n`` is not positive).
    """
    # The hyphen is escaped so the class lists four literal characters
    # instead of relying on the ','..'.' range happening to contain '-'.
    cleaned = re.sub(r'[,\-./]|\sBD', '', string)
    # zip over n progressively shifted copies of the text: the i-th tuple
    # holds the n consecutive characters starting at position i.
    shifted = [cleaned[offset:] for offset in range(n)]
    return [''.join(chars) for chars in zip(*shifted)]

In [66]:
# from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# The corpus is the set of ESCO occupation titles, i.e. the dictionary keys
# in their iteration order; row k of the matrix corresponds to job_titles[k].
job_titles = list(esco_dict)
# TF-IDF over 1- to 3-grams of the titles.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=1)
tf_idf_matrix = vectorizer.fit_transform(job_titles)

# job_titles_field = [i for i in esco_dict]
# vectorizer2 = TfidfVectorizer(min_df=1, analyzer=ngrams)
# tf_idf_matrix2 = vectorizer2.fit_transform(job_titles_field)

In [107]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Vectorise one sample job title with the fitted vocabulary and list the
# vocabulary terms it actually hits, together with their TF-IDF weights.
m = vectorizer.transform(["procuramos para empresa senhor martim advogado"])
for i in m.nonzero()[1]:
    print(f"{i} - {feature_names[i]}\n\t\t{m[0,i]}")

39712 - empresa
		0.6995928151013778
2254 - advogado
		0.7145417364007014


In [104]:
import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Sparse product ``A @ B`` that stores at most ``ntop`` entries per row.

    The heavy lifting is delegated to ``ct.sparse_dot_topn``, which fills
    pre-allocated CSR buffers in place.

    Parameters
    ----------
    A, B : scipy sparse matrices
        Operands; both are converted to CSR. The result has as many rows
        as ``A`` and as many columns as ``B``.
    ntop : int
        Number of result slots reserved per row of ``A``.
    lower_bound : number, default 0
        Forwarded unchanged to ``ct.sparse_dot_topn``
        (presumably the minimum score to keep -- TODO confirm against
        the sparse_dot_topn documentation).

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(A.shape[0], B.shape[1])`` built from the
        buffers populated by the kernel.
    """
    # Both operands must be CSR; per the original note, this costs nothing
    # when they already are.
    left = A.tocsr()
    right = B.tocsr()
    n_rows = left.shape[0]
    n_cols = right.shape[1]

    index_dtype = np.int32

    # Room for ntop results for each row of the left operand.
    capacity = n_rows * ntop

    # Output buffers in CSR layout, written by the kernel below.
    out_indptr = np.zeros(n_rows + 1, dtype=index_dtype)
    out_indices = np.zeros(capacity, dtype=index_dtype)
    out_data = np.zeros(capacity, dtype=left.dtype)

    ct.sparse_dot_topn(
        n_rows,
        n_cols,
        np.asarray(left.indptr, dtype=index_dtype),
        np.asarray(left.indices, dtype=index_dtype),
        left.data,
        np.asarray(right.indptr, dtype=index_dtype),
        np.asarray(right.indices, dtype=index_dtype),
        right.data,
        ntop,
        lower_bound,
        out_indptr,
        out_indices,
        out_data,
    )

    return csr_matrix((out_data, out_indices, out_indptr), shape=(n_rows, n_cols))

In [105]:
import time
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() is wall-clock time and can jump or be too
# coarse for a call that takes only a few milliseconds.
t1 = time.perf_counter()
# Top 10 ESCO titles for the query vector `m`, with a lower bound of 0.
matches = awesome_cossim_top(m, tf_idf_matrix.transpose(), 10, 0)
t = time.perf_counter() - t1
print(m.shape)
print(tf_idf_matrix.shape)
print("SELFTIMED:", t)
matches.nonzero()

(1, 139565)
(11205, 139565)
SELFTIMED: 0.007221698760986328


(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([2135, 7007,    0, 5255, 3222, 6902, 7482, 2900, 1477, 2629]))

In [106]:
# Column indices of the stored matches; each one indexes into job_titles.
matched_cols = matches.nonzero()[1]
for col in matched_cols:
    print(f"{col} - {job_titles[col]}\n\t\t{matches[0,col]}")
# Show the title belonging to the first listed match.
job_titles[matched_cols[0]]

2135 - responsavel gabinete apoio aluno
		0.33312628532250527
7007 - tecnico qualidade
		0.1351384556376851
0 - diretor tecnico
		0.13504435131196307
5255 - tecnico instrumentos
		0.12686975790606694
3222 - tecnico marketing
		0.1264668759788738
6902 - tecnico gas
		0.12349426766142861
7482 - tecnico video
		0.12108978063738408
2900 - tecnico informatica
		0.12108978063738408
1477 - tecnico agricola
		0.11983033340356429
2629 - tecnico hardware
		0.11938777962786025


'responsavel gabinete apoio aluno'