### Load Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
#Sklearn
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE

#other
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
warnings.filterwarnings('ignore')

import scipy

import torch
from sentence_transformers import SentenceTransformer, models

In [None]:
data_np_nostop=np.load('../../data/data_np_nostop.npy', allow_pickle=True)

In [2]:
data=np.load('../../data/cleaned2.npy', allow_pickle=True)

### Load the model

#### CamemBERT

In [None]:
word_embedding_model = models.Transformer('flaubert/flaubert_large_cased')
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [None]:
import time
t = time.time()
embeddings= model.encode(data[:1000])
elapsed= time.time()-t
print(elapsed)

#### Multilingual

In [3]:
model = SentenceTransformer('distiluse-base-multilingual-cased')

In [8]:
embeddings= model.encode(data, batch_size=8, show_progress_bar=True)

HBox(children=(FloatProgress(value=0.0, description='Batches', max=5355.0, style=ProgressStyle(description_wid…




In [9]:
np.save('../../data/multilingual_embeddings.npy', embeddings, allow_pickle=True)

In [4]:
embeddings=np.load('../../data/multilingual_embeddings_v2.npy')

In [5]:
data[245]

"11 station météo toulouse soupetard. Toulouse métropole. ce jeu de données est issu du capteur n° 11 situé sur le site de l'école maternelle des acacias quartier soupetard toulouse qui fait parti d'un réseau de stations météo situées sur la métropole toulousaine en vue d'étudier le phénomène d'ilot de chaleur urbain icu . environnement ilot-de-chaleur meteo precipitations station-meteo temperature"

#### PCA

In [None]:
from sklearn import decomposition
pca = decomposition.PCA(n_components=30)
pca.fit(embeddings)
embeddings_pca=pca.transform(embeddings)

In [None]:
embeddings_pca.shape

In [None]:
data[7229]

### Queries based on cosine 

In [6]:
import pandas as pd
df=pd.read_csv('../../data/querys.csv', sep=',',error_bad_lines=False, encoding='latin-1') #le fichier original

In [11]:
ids=df['expected']

In [7]:
queries=np.array(df['query'], dtype=str)

In [8]:
queries

array(['siren', 'sirene', 'entreprise', 'entreprises', 'siret',
       'open damir', 'opendamir', 'damir', 'contours départements',
       'emissions polluantes', 'géofla départements',
       'effectifs police municipale', 'marchés public bourgogne',
       'Liste gares SNCF', 'contours départements français',
       'loi de finance 2016', 'lolf 2016', 'formations pas de calais',
       'accidents de la circulation', 'accidents de la route',
       'risque de décès un an après accident', 'COG',
       'code officiel géographique', 'contour commune',
       'contours communes', 'contour communes', 'code postal',
       'codes postaux', 'prénoms', 'association', 'associations', 'RNA',
       'nan', 'répertoire des associations',
       'répertoire national des associations', 'waldec',
       'organismes de formation', 'organisme de formation',
       'bibliothèques', "annuaire de l'éducation", 'grand débat',
       'vie-publique répertoire'], dtype='<U36')

In [27]:
#queries = ['répertoire association', 'base adresses nationale', 'bouteilles plastiques']
query_embeddings = model.encode(queries)
#query_embeddings_pca=pca.transform(query_embeddings)
# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
closest_n = 10
for query, query_embedding in zip(queries, query_embeddings):#_pca):
    distances = scipy.spatial.distance.cdist([query_embedding], embeddings, "cosine")[0]

    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])
    
    print(results)
    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    for idx, distance in results[0:closest_n]:
        print(idx)
        print(data [idx], "(Score: %.4f)" % (1-distance))


[(33366, 0.4982598954143679), (37884, 0.5168296584078957), (7295, 0.5613298158858926), (42600, 0.567590104014055), (41017, 0.5675901059303239), (33963, 0.5712303657930369), (11514, 0.58866666856042), (402, 0.5907884823104481), (21340, 0.5988187476029985), (3996, 0.6100565602373029), (1883, 0.6108306040935534), (36120, 0.6131867553522952), (36299, 0.6131867553522952), (6225, 0.6200139134953673), (21010, 0.6218051856831174), (0, 0.622227916985939), (19344, 0.622697072351293), (36927, 0.6234437061548406), (18767, 0.624620439412283), (12349, 0.6264444559836525), (2449, 0.6296827527029549), (1958, 0.6313998278580094), (11637, 0.6325962306461922), (4138, 0.6355642827865444), (13341, 0.6364365616760489), (4121, 0.6365917009050699), (37778, 0.6383138838113709), (37775, 0.6383896153233649), (8298, 0.6398087680962723), (21348, 0.6419848356494187), (10640, 0.6457890095184893), (7503, 0.6469731172056467), (9681, 0.6492054859170684), (7760, 0.6512971828764968), (5909, 0.6531865218715753), (39912, 0

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### Exploration

In [None]:
len(embeddings[0])

In [None]:
print(data_np_nostop[:1000])

### KD Tree

In [None]:
from scipy import spatial
A = embeddings
tree = spatial.KDTree(A)

In [None]:
queries = ['historique météorologique', 'station météo', "budget régional d'assurance maladie"]
query_embeddings = model.encode(queries)
#query_embeddings_pca=pca.transform(query_embeddings)
# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
closest_n = 10
for query, query_embedding in zip(queries, query_embeddings):
    distances, results=tree.query(query_embedding, k=closest_n)

    print("\n\n======================\n\n")
    print("Query:", query)
    print(f"\nTop {closest_n} most similar sentences in corpus:")

    for idx, distance in zip(results, distances):
        print(data_np_nostop[idx], "(Score: %.4f)" % (1-distance/50))

### Official example

In [None]:
word_embedding_model = models.Transformer('flaubert/flaubert_large_cased')
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

embedder = SentenceTransformer('distiluse-base-multilingual-cased')

# Corpus with example sentences
corpus = ['Un homme mange de la nourriture',
          "Un homme mange un morceau de pain.",
          "Un homme mange des pâtes.",
          "La fille porte un bébé.",
          "Le bébé est porté par la femme",
          'Un homme est à cheval',
          "Un homme monte un cheval blanc sur un terrain clos.",
          "Un singe joue du tambour.",
          "Un gorille joue du tambour.", "Un singe joue du tambour.",
          "Un guépard court derrière sa proie.",
          "Un guépard poursuit sa proie à travers un champ."
          ]
corpus_embeddings = embedder.encode(corpus)

# Perform kmean clustering
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")

### Extract Vectors

In [None]:
vectors=[]
for i in range(1000):
    doc=nlp(data_np_nostop[i])
    vectors_doc=[]
    for token in doc:
        vectors_doc.append(token.vector)
    vectors.append(np.array(vectors_doc))

In [None]:
vectors=np.array(vectors)

In [None]:
vectors[2].shape

In [None]:
len(vectors_clean)

In [None]:
import random 
rand= random.randint(1, len(data_np_nostop)//10)
print(rand)
print(np.var(vectors[rand]))

from sklearn import decomposition
pca = decomposition.PCA(n_components=2)
pca.fit(vectors[rand])
vectors_pca=pca.transform(vectors[rand]).transpose()
x,y = np.mean(vectors_pca[0]), np.mean(vectors_pca[1])

plt.figure(0)
plt.scatter(vectors_pca[0], vectors_pca[1])


from sklearn.ensemble import IsolationForest
clf = IsolationForest(random_state=0, contamination=0.4).fit(vectors[rand])
truth=clf.predict(vectors[rand])

vectors_clean=[]
for i in range(len(vectors[rand])):
    if truth[i]==1:
               vectors_clean.append(vectors[rand][i])


from sklearn import decomposition
pca = decomposition.PCA(n_components=2)
pca.fit(vectors_clean)
vectors_pca_clean=pca.transform(vectors_clean).transpose()
xc,yc = np.mean(vectors_pca_clean[0]), np.mean(vectors_pca_clean[1])
#plt.figure(1)
#plt.scatter(vectors_pca_clean[0], vectors_pca_clean[1])
#plt.scatter(x, y, marker='D', color='green')
plt.scatter(xc, yc, marker='*', color='red')

