In [1]:
import numpy as np
import pandas as pd
import scipy
from sklearn.manifold import TSNE
import warnings
warnings.filterwarnings('ignore')

import torch
from sentence_transformers import SentenceTransformer, models

In [2]:
data=np.load('../../data/cleaned3.npy', allow_pickle=True)

In [2]:
id_table_new=list(np.load('../../data/id_table_new.npy', allow_pickle=True))

In [3]:
model = SentenceTransformer('distiluse-base-multilingual-cased')

In [4]:
embeddings= model.encode(data, batch_size=8, show_progress_bar=True)

HBox(children=(FloatProgress(value=0.0, description='Batches', max=5231.0, style=ProgressStyle(description_wid…




In [4]:
embeddings=np.load('../../data/multilingual_embeddings_v2.npy')

In [5]:
embeddings[25]

array([ 7.18740895e-02, -3.64777143e-03, -2.38013081e-02,  3.76347499e-03,
        2.22953968e-02,  5.91440760e-02, -6.98857522e-03, -6.59290701e-02,
        2.22539213e-02, -1.65397599e-02,  2.85662897e-03,  1.69542735e-03,
        2.45286673e-02,  5.55649260e-03, -9.16428631e-04, -1.20221768e-02,
       -2.35343091e-02,  6.54302239e-02, -1.46853738e-02,  7.02569075e-03,
       -8.40294641e-03,  4.29729074e-02, -9.21237543e-02, -3.00524980e-02,
       -9.63636711e-02, -1.59416497e-02,  2.19537597e-02,  9.82210692e-03,
        1.18117169e-01,  4.98081259e-02,  2.34393477e-02, -4.16641645e-02,
       -1.46070831e-02,  1.39948679e-02,  3.29233669e-02, -1.86285321e-02,
       -2.60580350e-02,  3.19848210e-02, -2.47378014e-02,  1.64260305e-02,
        1.56141976e-02,  2.49648411e-02, -9.60115716e-02, -4.79893237e-02,
        5.88324182e-02, -8.17859371e-04, -2.90176068e-02,  8.46660230e-03,
        2.66874135e-02, -4.36963327e-03, -3.88440825e-02, -3.45157348e-02,
        2.63432562e-02, -

In [6]:
np.save('../../data/multilingual_embeddings_v2.npy', embeddings, allow_pickle=True)

In [6]:
def query2vec(search_query):
    return(model.encode([search_query]))

def get_id(idx):
    dataset_id=id_table_new[idx]
    return(dataset_id)

def get_idx(ids):
    dataset_idx=id_table_new.index(ids)
    return(dataset_idx)

def id2vec(ids):
    return(list(embeddings[get_idx(ids)]))

def neighbours(vector, n):
    n_ids=[]
    score=[]
    distances = scipy.spatial.distance.cdist(vector, embeddings, "cosine")[0]
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])
    for idx, distance in results[0:n]:
        n_ids.append(get_id(idx))
        score.append(1-distance)
    return(n_ids, score, vector)

def find_vectors(vector, n_ids):
    vectors=[]
    for ids in n_ids:
        vectors.append(id2vec(ids))
    vectors.append(vector[0])
    tsne_vec = TSNE(n_components=2).fit_transform(vectors)
    return(tsne_vec)

def Search(search_query, n):
    n_ids, score, vector=neighbours(query2vec(search_query), n)
    tsne_vec=find_vectors(vector, n_ids)
    #print(n_ids, score)
    return(n_ids, score, tsne_vec)

def Similarity(ids, n):
    n_ids, score=neighbours(id2vec(ids), n)
    return(n_ids, score)

In [7]:
Search('bouteilles plastiques', 5)

(['5380738aa3a7297e4d35d6dd',
  '53699ec5a3a729239d205f76',
  '5beafdff06e3e77f2077c70c',
  '5cf8582106e3e710bcc3efeb',
  '56b0c2fcb595086d5669cb98'],
 [0.2564903506018905,
  0.242749568375619,
  0.24093073776975715,
  0.23368579172407267,
  0.22488530056579192],
 array([[ 205.6003    , -157.30472   ],
        [ -64.05196   , -261.0487    ],
        [ 190.27621   ,  131.20094   ],
        [  -0.63035893,  -23.595789  ],
        [ -88.85616   ,  205.79959   ],
        [-246.05716   ,  -36.636738  ]], dtype=float32))

## Benchmark queries try

In [1]:
import pandas as pd
df=pd.read_csv('../../data/querys.csv', sep=',',error_bad_lines=False, encoding='latin-1') #le fichier original

In [4]:
ids=df['expected']
queries=np.array(df['query'], dtype=str)

In [20]:
queries

array(['siren', 'sirene', 'entreprise', 'entreprises', 'siret',
       'open damir', 'opendamir', 'damir', 'contours départements',
       'emissions polluantes', 'géofla départements',
       'effectifs police municipale', 'marchés public bourgogne',
       'Liste gares SNCF', 'contours départements français',
       'loi de finance 2016', 'lolf 2016', 'formations pas de calais',
       'accidents de la circulation', 'accidents de la route',
       'risque de décès un an après accident', 'COG',
       'code officiel géographique', 'contour commune',
       'contours communes', 'contour communes', 'code postal',
       'codes postaux', 'prénoms', 'association', 'associations', 'RNA',
       'nan', 'répertoire des associations',
       'répertoire national des associations', 'waldec',
       'organismes de formation', 'organisme de formation',
       'bibliothèques', "annuaire de l'éducation", 'grand débat',
       'vie-publique répertoire'], dtype='<U36')

In [31]:
positions=['none']*len(queries)
for i in range(len(queries)):#_pca):
    results=Search(queries[i],100)[0]
    k=0
    for r in results:
        k=k+1
        if r==ids[i]:
            positions[i]=k

In [73]:
acceptable=[]
rejected=[]
bof=[]
for q in range(len(queries)):
    if positions[q]=='none':
        rejected.append((queries[q], positions[q]))
    else:
        if positions[q]<=5:
             acceptable.append((queries[q], positions[q]))
        else:
            bof.append((queries[q], positions[q]))

In [74]:
acceptable

[('siren', 1),
 ('sirene', 2),
 ('contours départements', 5),
 ('géofla départements', 1),
 ('effectifs police municipale', 2),
 ('marchés public bourgogne', 4),
 ('Liste gares SNCF', 1),
 ('contours départements français', 1),
 ('accidents de la circulation', 5),
 ('risque de décès un an après accident', 2),
 ('code officiel géographique', 5),
 ('code postal', 2),
 ('codes postaux', 3),
 ('répertoire national des associations', 5)]

In [75]:
rejected

[('entreprise', 'none'),
 ('entreprises', 'none'),
 ('damir', 'none'),
 ('loi de finance 2016', 'none'),
 ('lolf 2016', 'none'),
 ('formations pas de calais', 'none'),
 ('prénoms', 'none'),
 ('associations', 'none'),
 ('nan', 'none'),
 ('waldec', 'none'),
 ('bibliothèques', 'none'),
 ('vie-publique répertoire', 'none')]

In [76]:
bof

[('siret', 11),
 ('open damir', 18),
 ('opendamir', 37),
 ('emissions polluantes', 30),
 ('accidents de la route', 7),
 ('COG', 11),
 ('contour commune', 56),
 ('contours communes', 95),
 ('contour communes', 57),
 ('association', 69),
 ('RNA', 21),
 ('répertoire des associations', 24),
 ('organismes de formation', 72),
 ('organisme de formation', 68),
 ("annuaire de l'éducation", 19),
 ('grand débat', 10)]

In [47]:
positions

[1,
 2,
 'none',
 'none',
 11,
 18,
 37,
 'none',
 5,
 30,
 1,
 2,
 4,
 1,
 1,
 'none',
 'none',
 'none',
 5,
 7,
 2,
 11,
 5,
 56,
 95,
 57,
 2,
 3,
 'none',
 69,
 'none',
 21,
 'none',
 24,
 5,
 'none',
 72,
 68,
 'none',
 19,
 10,
 'none']

## Debugging the scripts

In [1]:
embed=np.load('../../data/test/embeddings.npy')

In [4]:
embeddings=np.load('../../data/multilingual_embeddings_v2.npy')