In [1]:
! pip install --user -r requirements.txt
! pip install --user tensorflow

from IPython.display import clear_output
clear_output()

In [3]:
import os
import s3fs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf

In [4]:
from data_import.bdd import PostGre_SQL_DB
bdd = PostGre_SQL_DB()

DATA PREPARATION
================

In [244]:
# Get BI data
data_sql = '''SELECT rs_x, actet_x, actet_c, profi_x, profs_x, profa_x from rp_final_2017 WHERE actet_c IS NOT NULL limit 100000'''
act_data = bdd.read_from_sql(data_sql)
act_data

Unnamed: 0,rs_x,actet_x,actet_c,profi_x,profs_x,profa_x
0,ANETT AUVERGNE,EMPLOYEE DE BLANCHISSERIE,7729Z,,EMPLOYEE DE BLANCHISSERIE,
1,PLUSIEURS FAMILLES,ASSISTANTE MATERNELLE,8891A,,ASSISTANTE MATERNELLE,
2,LISA ANGOULEME,EDUCATION NATIONNALE,8520Z,,AVS,
3,LA CHARENTE LIBRE,PRESSE,5813Z,,EMPLOYE DE PRESSE,
4,LABOFFICE,LABORATOIRE DE BIOLOGIE MEDICALE,8690B,,TECHNICIEN SUPERIEUR EN LABORATOIRE DE BIOLOGI...,
...,...,...,...,...,...,...
99995,ROYAL PIZZ ET AIRE SERVICE,RESTAURATION ET MENAGE CHEZ DES GENT,5610C,,RESTAURATION,
99996,EIRL PACTE QUALITE,RENOVATION DE L'HABITAT,4331Z,PLAQUISTE CARELLAGE,,
99997,APA,AIDE A DOMICILE,8810A,,AIDE A LA PERSONNE,
99998,ENERSYS SARL,FABRICANT DE BATTERIES,2720Z,,RESPONSABLE METHODES ET INDUSTRIALISATION,


In [245]:
# Fetch naf data
from nomenclature import Nomenclature
naf = Nomenclature(bdd, 'NAF2_1')
# naf.display()

In [246]:
# enrich BI data

actet_c_libelle = []
prof_libelle = []
for _, line in act_data[['actet_c', 'profs_x', 'profi_x', 'profa_x']].iterrows():
    try:
        actet_c_libelle.append(naf.get_node(line.actet_c).desc)
    except KeyError:
        actet_c_libelle.append(None)
    
    if line.profs_x:
        prof_libelle.append(line.profs_x)
    elif line.profi_x:
        prof_libelle.append(line.profi_x)
    else:
        prof_libelle.append(line.profa_x)

act_data['actet_c_libelle'] = actet_c_libelle
act_data['prof_x'] = prof_libelle
act_data

Unnamed: 0,rs_x,actet_x,actet_c,profi_x,profs_x,profa_x,actet_c_libelle,prof_x
0,ANETT AUVERGNE,EMPLOYEE DE BLANCHISSERIE,7729Z,,EMPLOYEE DE BLANCHISSERIE,,Location et location-bail d''autres biens pers...,EMPLOYEE DE BLANCHISSERIE
1,PLUSIEURS FAMILLES,ASSISTANTE MATERNELLE,8891A,,ASSISTANTE MATERNELLE,,Accueil de jeunes enfants,ASSISTANTE MATERNELLE
2,LISA ANGOULEME,EDUCATION NATIONNALE,8520Z,,AVS,,Enseignement primaire,AVS
3,LA CHARENTE LIBRE,PRESSE,5813Z,,EMPLOYE DE PRESSE,,Édition de journaux,EMPLOYE DE PRESSE
4,LABOFFICE,LABORATOIRE DE BIOLOGIE MEDICALE,8690B,,TECHNICIEN SUPERIEUR EN LABORATOIRE DE BIOLOGI...,,Laboratoires d''analyses médicales,TECHNICIEN SUPERIEUR EN LABORATOIRE DE BIOLOGI...
...,...,...,...,...,...,...,...,...
99995,ROYAL PIZZ ET AIRE SERVICE,RESTAURATION ET MENAGE CHEZ DES GENT,5610C,,RESTAURATION,,Restauration de type rapide,RESTAURATION
99996,EIRL PACTE QUALITE,RENOVATION DE L'HABITAT,4331Z,PLAQUISTE CARELLAGE,,,Travaux de plâtrerie,PLAQUISTE CARELLAGE
99997,APA,AIDE A DOMICILE,8810A,,AIDE A LA PERSONNE,,Aide à domicile,AIDE A LA PERSONNE
99998,ENERSYS SARL,FABRICANT DE BATTERIES,2720Z,,RESPONSABLE METHODES ET INDUSTRIALISATION,,Fabrication de piles et d''accumulateurs élect...,RESPONSABLE METHODES ET INDUSTRIALISATION


In [247]:
# Filter by non null value in actet
act_data_no_empty = act_data[act_data['actet_x'] != '' ]
act_data_no_empty = act_data_no_empty[act_data_no_empty['actet_x'].notna()]
act_data_no_empty = act_data_no_empty[act_data_no_empty['actet_c_libelle'].notna()]
act_data_no_empty

Unnamed: 0,rs_x,actet_x,actet_c,profi_x,profs_x,profa_x,actet_c_libelle,prof_x
0,ANETT AUVERGNE,EMPLOYEE DE BLANCHISSERIE,7729Z,,EMPLOYEE DE BLANCHISSERIE,,Location et location-bail d''autres biens pers...,EMPLOYEE DE BLANCHISSERIE
1,PLUSIEURS FAMILLES,ASSISTANTE MATERNELLE,8891A,,ASSISTANTE MATERNELLE,,Accueil de jeunes enfants,ASSISTANTE MATERNELLE
2,LISA ANGOULEME,EDUCATION NATIONNALE,8520Z,,AVS,,Enseignement primaire,AVS
3,LA CHARENTE LIBRE,PRESSE,5813Z,,EMPLOYE DE PRESSE,,Édition de journaux,EMPLOYE DE PRESSE
4,LABOFFICE,LABORATOIRE DE BIOLOGIE MEDICALE,8690B,,TECHNICIEN SUPERIEUR EN LABORATOIRE DE BIOLOGI...,,Laboratoires d''analyses médicales,TECHNICIEN SUPERIEUR EN LABORATOIRE DE BIOLOGI...
...,...,...,...,...,...,...,...,...
99995,ROYAL PIZZ ET AIRE SERVICE,RESTAURATION ET MENAGE CHEZ DES GENT,5610C,,RESTAURATION,,Restauration de type rapide,RESTAURATION
99996,EIRL PACTE QUALITE,RENOVATION DE L'HABITAT,4331Z,PLAQUISTE CARELLAGE,,,Travaux de plâtrerie,PLAQUISTE CARELLAGE
99997,APA,AIDE A DOMICILE,8810A,,AIDE A LA PERSONNE,,Aide à domicile,AIDE A LA PERSONNE
99998,ENERSYS SARL,FABRICANT DE BATTERIES,2720Z,,RESPONSABLE METHODES ET INDUSTRIALISATION,,Fabrication de piles et d''accumulateurs élect...,RESPONSABLE METHODES ET INDUSTRIALISATION


In [248]:
from pipeline.preprocessing import preprocessing_no_stemmer
## Create vocabulary dictionary

dico_vocab = {}
nb_tokens = 0

train_corpus = act_data_no_empty['actet_x'].to_numpy().tolist() \
                + act_data_no_empty['rs_x'].to_numpy().tolist() \
                + act_data_no_empty['prof_x'].to_numpy().tolist() \
                +  [n.desc for n in naf.nodes.values()]

for index_doc, doc in enumerate(train_corpus):
    if doc is None:
        continue
    tokens = preprocessing_no_stemmer(doc).split(" ")
    nb_tokens += len(tokens)
    for t in tokens:
        if t not in dico_vocab:
            dico_vocab[t] = len(dico_vocab) + 1
            
print(nb_tokens, len(dico_vocab))
# dico_vocab

pad_token = 0
null_token = len(dico_vocab) + 1

def convert_voc(doc):
    if doc is None or not len(doc):
        return [null_token]
    tokens = preprocessing_no_stemmer(doc).split(" ")
    return [dico_vocab[t] for t in tokens]

act_data_no_empty['actet_repr'] = [convert_voc(v) for v in act_data_no_empty['actet_x']]
act_data_no_empty['rs_repr'] = [convert_voc(v) for v in act_data_no_empty['rs_x']]
act_data_no_empty['prof_repr'] = [convert_voc(v) for v in act_data_no_empty['prof_x']]
act_data_no_empty

611954 47734


Unnamed: 0,rs_x,actet_x,actet_c,profi_x,profs_x,profa_x,actet_c_libelle,prof_x,actet_repr,rs_repr,prof_repr
0,ANETT AUVERGNE,EMPLOYEE DE BLANCHISSERIE,7729Z,,EMPLOYEE DE BLANCHISSERIE,,Location et location-bail d''autres biens pers...,EMPLOYEE DE BLANCHISSERIE,"[1, 2]","[10611, 10612]","[1, 2]"
1,PLUSIEURS FAMILLES,ASSISTANTE MATERNELLE,8891A,,ASSISTANTE MATERNELLE,,Accueil de jeunes enfants,ASSISTANTE MATERNELLE,"[3, 4]","[3808, 1234]","[3, 4]"
2,LISA ANGOULEME,EDUCATION NATIONNALE,8520Z,,AVS,,Enseignement primaire,AVS,"[5, 6]","[4643, 10613]",[1868]
3,LA CHARENTE LIBRE,PRESSE,5813Z,,EMPLOYE DE PRESSE,,Édition de journaux,EMPLOYE DE PRESSE,[7],"[10614, 2649]","[481, 7]"
4,LABOFFICE,LABORATOIRE DE BIOLOGIE MEDICALE,8690B,,TECHNICIEN SUPERIEUR EN LABORATOIRE DE BIOLOGI...,,Laboratoires d''analyses médicales,TECHNICIEN SUPERIEUR EN LABORATOIRE DE BIOLOGI...,"[8, 9, 10]",[10615],"[68, 589, 8, 9, 10]"
...,...,...,...,...,...,...,...,...,...,...,...
99995,ROYAL PIZZ ET AIRE SERVICE,RESTAURATION ET MENAGE CHEZ DES GENT,5610C,,RESTAURATION,,Restauration de type rapide,RESTAURATION,"[109, 204, 1076, 10610]","[10776, 11150, 4472, 105]",[109]
99996,EIRL PACTE QUALITE,RENOVATION DE L'HABITAT,4331Z,PLAQUISTE CARELLAGE,,,Travaux de plâtrerie,PLAQUISTE CARELLAGE,"[521, 1465]","[11537, 43880, 1434]","[15, 47602]"
99997,APA,AIDE A DOMICILE,8810A,,AIDE A LA PERSONNE,,Aide à domicile,AIDE A LA PERSONNE,"[119, 71, 121]",[7944],"[119, 71, 219]"
99998,ENERSYS SARL,FABRICANT DE BATTERIES,2720Z,,RESPONSABLE METHODES ET INDUSTRIALISATION,,Fabrication de piles et d''accumulateurs élect...,RESPONSABLE METHODES ET INDUSTRIALISATION,"[158, 9915]","[43881, 3704]","[41, 2247, 3985]"


## Prepare data injection

In [250]:
# calculate each node dictionary_projection
class_repr = {k: convert_voc(v.desc) for k,v in naf.nodes.items()}

# calculate node distances

class NomenclatureDistance:
    def __init__(self, nomenclature, decrease_rate=0.5):
        self.nomenclature = nomenclature
        self._distance_mat = None
        self._distance_mat_tf = None
        self.nodes_index = None
        self.decrease_rate = decrease_rate
        self._build_distance()
        
    def _build_distance(self):
        self.nodes_index = {n: i for i, n in enumerate(self.nomenclature.nodes.keys())}
        self._distance_mat = np.zeros([len(self.nodes_index), len(self.nodes_index)])
        
        def _get_children_at_k(node_name, k):
            if k == 0 : return []
            if k == 1 : return [n.name for n in self.nomenclature.get_node(node_name).children]
            return sum([_get_children_at_k(n.name, k-1) for n in self.nomenclature.get_node(node_name).children], [])
            
        def _setup_node_distances(node_name, list_of_parents_from_furthest):
            current_dist = 1
            node_idx = self.nodes_index[node_name]
            for p in reversed(list_of_parents_from_furthest):
                current_dist *= self.decrease_rate
#                 print(node_idx, self._nodes_index[p], current_dist)
                self._distance_mat[node_idx, self.nodes_index[p]] = current_dist
            
        def _propagate_node_distances(node_label, list_of_parents_from_furthest):
            children = [n.name for n in self.nomenclature.get_node(node_label).children ]
            if not len(children):
                _setup_node_distances(node_label, list_of_parents_from_furthest)
            else:
                _setup_node_distances(node_label, list_of_parents_from_furthest)
                [_propagate_node_distances(child, list_of_parents_from_furthest + [node_label]) for child in children]
        
        start_nodes = _get_children_at_k(self.nomenclature.name, 2)
        for node_name in start_nodes:
            _propagate_node_distances(node_name, [])
                
        self._distance_mat += self._distance_mat.T
        self._distance_mat += np.eye(len(self.nodes_index))
        self._distance_mat_tf = tf.convert_to_tensor(self._distance_mat)
    
    def get_distance_mat(self, list_of_nodes):
        nodes_ind = [self.nodes_index[n] for n in list_of_nodes]
        return self._distance_mat[nodes_ind, :][:,nodes_ind]
    
    def get_distance_mat_from_tf_indices(self, list_of_node_indices):
        return tf.gather(tf.gather(self._distance_mat_tf, list_of_node_indices, axis=0), list_of_node_indices, axis=1)
    


naf_distances = NomenclatureDistance(naf)
naf_distances.get_distance_mat(["01", "77", "771", "7712", "7712Z"])
naf_distances.get_distance_mat_from_tf_indices(tf.convert_to_tensor([naf_distances.nodes_index[n] for n in ["01", "77", "771", "7712", "7712Z"]]))


<tf.Tensor: shape=(5, 5), dtype=float64, numpy=
array([[1.   , 0.   , 0.   , 0.   , 0.   ],
       [0.   , 1.   , 0.5  , 0.25 , 0.125],
       [0.   , 0.5  , 1.   , 0.5  , 0.25 ],
       [0.   , 0.25 , 0.5  , 1.   , 0.5  ],
       [0.   , 0.125, 0.25 , 0.5  , 1.   ]])>

In [251]:
batch_size = 32
voc_size = len(dico_vocab) + 2
seq_len = 40

def pad_seq(seq):
    return seq + [pad_token for _ in range(seq_len - len(seq))]


class AnchorPositivePairs(keras.utils.Sequence):
    def __init__(self, num_batchs, input_data, nom_distance):
        self.num_batchs = num_batchs
        self.input_data = input_data
        self.current_pos = 0
        self.nom_distance = nom_distance

    def __len__(self):
        return self.num_batchs
    
    @staticmethod
    def get_fields(actet_repr, rs_repr=None, prof_repr=None):
        fields = [1 for _ in actet_repr]
        if rs_repr: 
            fields += [2 for _ in rs_repr]
        if prof_repr:
            fields += [3 for _ in prof_repr]
        return fields
    
    @staticmethod
    def get_positions(fields):
        p = 1
        cf = fields[0]
        positions = []
        for f in fields:
            if f != cf:
                cf = f
                p = 1
            positions.append(p)
            p += 1
        return positions

    def __getitem__(self, _idx):
        x = np.empty((7, batch_size, seq_len), dtype=np.float32)
        for i in range(batch_size):
            class_id, actet_repr, rs_repr, prof_repr = self.input_data[self.current_pos].tolist()
            positive_data = class_repr[class_id]
            x[0, i] = pad_seq(actet_repr+rs_repr+prof_repr)
            fields = self.get_fields(actet_repr, rs_repr, prof_repr)
            x[1, i] = pad_seq(fields)
            x[2, i] = pad_seq(self.get_positions(fields))
            x[3, i] = pad_seq(positive_data)
            positive_fields = self.get_fields(positive_data)
            x[4, i] = pad_seq(positive_fields)
            x[5, i] = pad_seq(self.get_positions(positive_fields))
            x[6, i] = pad_seq([self.nom_distance.nodes_index[class_id]])
            self.current_pos += 1
            self.current_pos %= len(self.input_data)
        return x
    
examples = next(iter(AnchorPositivePairs(num_batchs=1, 
                                         input_data=act_data_no_empty[['actet_c', 'actet_repr', 'rs_repr', 'prof_repr']].to_numpy(),
                                         nom_distance=naf_distances)))
examples[6][:, 0]

array([1608., 1685., 1650., 1525., 1668., 1638., 1437., 1128., 1356.,
       1685., 1349., 1557., 1630., 1493., 1366., 1670.,  998., 1502.,
       1673., 1334., 1685., 1448., 1661., 1300., 1295., 1638., 1477.,
       1385., 1437., 1651., 1657., 1657.], dtype=float32)

## Prepare model

In [252]:
class SimilarityModel(keras.Model):
    def __init__(self, *args, nomenclature_distance, **kwargs):
        super().__init__(*args, **kwargs)
        self.nomenclature_distance = nomenclature_distance
        
    def train_step(self, data):
        # Note: Workaround for open issue, to be removed.
        anchors, anchors_fields_embeddings, anchors_positions_embeddings = data[0], data[1], data[2]
        positives, positives_fields_embeddings, positives_positions_embeddings = data[3], data[4], data[5]
        class_ids = tf.cast(data[6][:, 0], dtype=tf.int32)

        with tf.GradientTape() as tape:
            # Run both anchors and positives through model.
            anchor_projection = self([anchors, anchors_fields_embeddings, anchors_positions_embeddings], 
                                     training=True)
            positive_projection = self([positives, positives_fields_embeddings, positives_positions_embeddings], 
                                       training=True)
#             tf.keras.backend.print_tensor(tf.keras.backend.shape(anchor_projection))
#             tf.keras.backend.print_tensor(tf.keras.backend.shape(positive_projection))

            # Calculate cosine similarity between anchors and positives. As they have
            # been normalised this is just the pair wise dot products.
            similarities = tf.einsum(
                "ae,pe->ap", anchor_projection, positive_projection
            )
            # Since we intend to use these as logits we scale them by a temperature.
            # This value would normally be chosen as a hyper parameter.
            temperature = 0.2
            similarities /= temperature

            # We use these similarities as logits for a softmax. The labels for
            # this call are just the sequence [0, 1, 2, ..., num_classes] since we
            # want the main diagonal values, which correspond to the anchor/positive
            # pairs, to be high. This loss will move embeddings for the
            # anchor/positive pairs together and move all other pairsapart.
#             tf.keras.backend.print_tensor(similarities)
#             tf.keras.backend.print_tensor(tf.keras.backend.shape(similarities))
            label_distances = self.nomenclature_distance.get_distance_mat_from_tf_indices(class_ids)
#             tf.keras.backend.print_tensor(tf.keras.backend.shape(label_distances))
            loss = self.compiled_loss(label_distances, similarities)

        # Calculate gradients and apply via optimizer.
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        # Update and return metrics (specifically the one for the loss value).
        self.compiled_metrics.update_state(label_distances, similarities)
        return {m.name: m.result() for m in self.metrics}

In [253]:
class TokenFieldAndPositionEmbedding(layers.Layer):
    def __init__(self, maxseqlen, nbfields, vocab_size, embed_dim, **kwargs):
        super(TokenFieldAndPositionEmbedding, self).__init__(**kwargs)
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.field_emb = layers.Embedding(input_dim=nbfields, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxseqlen, output_dim=embed_dim)
        
    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'token_emb': self.token_emb,
            'field_emb': self.field_emb,
            'pos_emb': self.pos_emb
        })
        return config

    def call(self, x, x_fields, x_positions):
        x = self.token_emb(x)
        fields = self.field_emb(x_fields)
        positions = self.pos_emb(x_positions)
        return x + fields + positions


In [254]:
##To Change
embedding_size = 256

input_word_embeddings = layers.Input(shape=(seq_len))
input_fields_embeddings = layers.Input(shape=(seq_len))
input_positions_embeddings = layers.Input(shape=(seq_len))

embeddings = TokenFieldAndPositionEmbedding(maxseqlen=seq_len,
                                            nbfields=3+1,
                                            vocab_size=voc_size, 
                                            embed_dim=embedding_size, 
                                            name="embedding")
x_with_pos = embeddings(input_word_embeddings, input_fields_embeddings, input_positions_embeddings)
x_with_pos = layers.Bidirectional(layers.LSTM(int(seq_len), return_sequences=True))(x_with_pos)
x_with_pos = layers.Dropout(0.1)(x_with_pos)
x_with_pos = layers.Bidirectional(layers.LSTM(int(seq_len), return_sequences=True))(x_with_pos)
x_with_pos = layers.Dropout(0.1)(x_with_pos)
x_with_pos = layers.Dense(256, activation='relu')(x_with_pos)
x_with_pos = layers.Dropout(0.1)(x_with_pos)
x_with_pos = layers.GlobalAveragePooling1D()(x_with_pos)
x_with_pos = layers.Dense(128, activation='relu')(x_with_pos)
x_with_pos = layers.Dropout(0.1)(x_with_pos)
output = layers.Dense(64, activation=None)(x_with_pos)

model = SimilarityModel(inputs=[input_word_embeddings, 
                                input_fields_embeddings,
                                input_positions_embeddings], outputs=output, nomenclature_distance=naf_distances)
model.summary()

Model: "similarity_model_29"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_94 (InputLayer)           [(None, 40)]         0                                            
__________________________________________________________________________________________________
input_95 (InputLayer)           [(None, 40)]         0                                            
__________________________________________________________________________________________________
input_96 (InputLayer)           [(None, 40)]         0                                            
__________________________________________________________________________________________________
embedding (TokenFieldAndPositio (None, 40, 256)      12231680    input_94[0][0]                   
                                                                 input_95[0][0] 

## Launch training

In [255]:
## CHANGE LOSS FUNCTION

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=keras.losses.MeanSquaredError(),  
)

history = model.fit(AnchorPositivePairs(num_batchs=10000, 
                                        input_data=act_data_no_empty[['actet_c', 'actet_repr', 'rs_repr', 'prof_repr']].to_numpy(),
                                        nom_distance=naf_distances), 
                    epochs=20)

plt.plot(history.history["loss"])
plt.show()

Epoch 1/20
Epoch 2/20
   60/10000 [..............................] - ETA: 32:05 - loss: 0.0271

KeyboardInterrupt: 

## Testing

In [203]:
def create_nomenclature_projections(nomenclature):
    nomenclature_projections = {"codes":[], "projections": []}
    for k, v in nomenclature.nodes.items():
        nomenclature_projections['codes'].append(k)
        input_tokens = convert_voc(v.desc)
        input_fields = AnchorPositivePairs.get_fields(input_tokens)
        input_positions = AnchorPositivePairs.get_positions(input_fields)
        prediction = model.predict([np.array([pad_seq(input_tokens)], dtype=np.float32), 
                                    np.array([pad_seq(input_fields)], dtype=np.float32), 
                                    np.array([pad_seq(input_positions)], dtype=np.float32)])
        nomenclature_projections['projections'].append(prediction[0])
    nomenclature_projections['projections'] = np.stack(nomenclature_projections['projections'], axis=0)
    return nomenclature_projections['codes'], nomenclature_projections['projections']

codes, projections = create_nomenclature_projections(naf)



In [204]:
projections.shape

(1730, 64)

In [240]:
def project_activite(actet_vals, rs_vals, prof_vals, nb_top_values=5):
    single_mode = False
    if not isinstance(actet_vals, list):
        single_mode = True
        actet_vals = [actet_vals]
        rs_vals = [rs_vals]
        prof_vals = prof_vals
    batch_tokens = []
    batch_fields = []
    batch_positions = []
    for actet_val, rs_val, prof_val in zip(actet_vals, rs_vals, prof_vals):
        actet_proj = convert_voc(actet_val)
        rs_proj = convert_voc(rs_val)
        prof_proj = convert_voc(prof_val)
        input_tokens = actet_proj + rs_proj + prof_proj
        input_fields = AnchorPositivePairs.get_fields(input_tokens)
        input_positions = AnchorPositivePairs.get_positions(input_fields)
        batch_tokens.append(pad_seq(input_tokens))
        batch_fields.append(pad_seq(input_fields))
        batch_positions.append(pad_seq(input_positions))
        
    text_projections = model.predict([np.array(batch_tokens, dtype=np.float32), 
                                      np.array(batch_fields, dtype=np.float32), 
                                      np.array(batch_positions, dtype=np.float32)])
    results = []
    for text_projection in text_projections:
        similarities = np.einsum("ae,pe->ap", np.expand_dims(text_projection, axis=0), projections)[0]
        top_idx = (-similarities).argsort()[:nb_top_values]
        top_codes = [codes[i] for i in top_idx]
        top_similarities = [similarities[i] for i in top_idx]
        results.append([top_codes, top_similarities])
    if single_mode:
        return results[0]
    return results

top_codes_and_similarities = project_activite('COUVREUR', 'CALIPNO', 'COUVREUR ZINGUEUR')
top_codes_and_similarities

[['0610', '0610Z', '061', '732', '7320Z'],
 [0.99995375, 0.99995375, 0.99995375, 0.9999411, 0.9999411]]

In [242]:
def show_results_for_example(top_codes_and_similarities):
    return pd.DataFrame(np.array([top_codes_and_similarities[1],
                                  top_codes_and_similarities[0], 
                                  [naf.get_node(c).desc for c in top_codes_and_similarities[0]]]).T,
                        columns=['scores', 'codes', 'description'])

# show_results(*project_activite('ELECTRICIEN PLAQUISTE'))
show_results_for_example(project_activite('COUVREUR', 'CALIPNO', 'COUVREUR ZINGUEUR', nb_top_values=25))

Unnamed: 0,scores,codes,description
0,0.99995375,0610,Extraction de pétrole brut
1,0.99995375,0610Z,Extraction de pétrole brut
2,0.99995375,061,EXTRACTION DE PÉTROLE BRUT
3,0.9999411,732,ÉTUDES DE MARCHÉ ET SONDAGES
4,0.9999411,7320Z,Études de marché et sondages
5,0.9999411,7320,Études de marché et sondages
6,0.99990857,53,ACTIVITÉS DE POSTE ET DE COURRIER
7,0.99989605,NAF2_1,Nomenclature d''activité (1 poste)
8,0.9998911,05,EXTRACTION DE HOUILLE ET DE LIGNITE
9,0.9998745,6411,Activités de banque centrale


In [58]:
projections

array([[-0.05173798, -0.2651816 , -0.02404017, ..., -0.08760474,
        -0.05846792, -0.08176856],
       [-0.05175103, -0.26518068, -0.024034  , ..., -0.08759565,
        -0.05843299, -0.08174124],
       [-0.05175215, -0.26509583, -0.02403919, ..., -0.087602  ,
        -0.05840887, -0.08170411],
       ...,
       [-0.05175327, -0.26498672, -0.02404428, ..., -0.08760923,
        -0.0583787 , -0.08165516],
       [-0.05175327, -0.26498735, -0.0240436 , ..., -0.08760919,
        -0.05837889, -0.08165493],
       [-0.05175273, -0.26498187, -0.02404664, ..., -0.08761006,
        -0.05837866, -0.08165556]], dtype=float32)