In [1]:
from annoy import AnnoyIndex
import faiss                   # make faiss available
import numpy as np
from tqdm.notebook import tqdm

datasets = ['tcu', 'stj']
embedders = {
    'tfidf':{'tcu': 34081, 'stj': 25696},
    'lda':{'tcu': 44, 'stj': 1458},
    'word2vec': 300,
    'weighted_word2vec': 300,    
    'fasttext': 300,
    'weighted_fasttext': 300,
    'doc2vec': 100,
    'sentence_transformer': 768,
    'bert': 3072,
    'itd_bert': 3072,
    'longformer':3072,
    'itd_longformer': 3072,
    'elmo': 1024, 
}

In [2]:
for data_name in datasets:
    for model_name, model_data in embedders.items():
        print(model_name)
        indexer_path = '../results/'+data_name+'/'+model_name+'.ann'
        vector_size = None
        if model_name in ['tfidf','lda']:
            vector_size = model_data[data_name]
        else:
            vector_size = model_data
            
        indexer = AnnoyIndex(vector_size, 'angular')
        indexer.load(indexer_path)
        
        
#         nlist = 100 #number of cluster centers 
#         quantizer = faiss.IndexFlat(vector_size)  # the other index
        if model_name == 'lda':
            metric = faiss.METRIC_JensenShannon
        else:
            metric = faiss.METRIC_INNER_PRODUCT

        faiss_indexer = faiss.IndexFlat(vector_size, metric)

        embeddings = []
        ids = []
        for i in tqdm(range(indexer.get_n_items())):
            vector = indexer.get_item_vector(i)
            norm_vector = np.array(vector).astype('float32')
            embeddings.append(norm_vector)
            ids.append(i)
        embeddings = np.array(embeddings).astype('float32')
        
        if model_name == 'lda':
            # It normalizes the matrix in place.
            faiss.normalize_L2(embeddings)
        
        faiss_indexer.train(embeddings)
        faiss_indexer.add(embeddings)
        faiss.write_index(faiss_indexer, '../results/faiss/'+data_name+'/'+model_name+'.faiss')

tfidf


  0%|          | 0/371 [00:00<?, ?it/s]

lda


  0%|          | 0/371 [00:00<?, ?it/s]

word2vec


  0%|          | 0/371 [00:00<?, ?it/s]

weighted_word2vec


  0%|          | 0/371 [00:00<?, ?it/s]

fasttext


  0%|          | 0/371 [00:00<?, ?it/s]

weighted_fasttext


  0%|          | 0/371 [00:00<?, ?it/s]

doc2vec


  0%|          | 0/371 [00:00<?, ?it/s]

sentence_transformer


  0%|          | 0/371 [00:00<?, ?it/s]

bert


  0%|          | 0/371 [00:00<?, ?it/s]

itd_bert


  0%|          | 0/371 [00:00<?, ?it/s]

longformer


  0%|          | 0/371 [00:00<?, ?it/s]

itd_longformer


  0%|          | 0/371 [00:00<?, ?it/s]

elmo


  0%|          | 0/371 [00:00<?, ?it/s]

tfidf


  0%|          | 0/7403 [00:00<?, ?it/s]

lda


  0%|          | 0/7403 [00:00<?, ?it/s]

word2vec


  0%|          | 0/7403 [00:00<?, ?it/s]

weighted_word2vec


  0%|          | 0/7403 [00:00<?, ?it/s]

fasttext


  0%|          | 0/7403 [00:00<?, ?it/s]

weighted_fasttext


  0%|          | 0/7403 [00:00<?, ?it/s]

doc2vec


  0%|          | 0/7403 [00:00<?, ?it/s]

sentence_transformer


  0%|          | 0/7403 [00:00<?, ?it/s]

bert


  0%|          | 0/7403 [00:00<?, ?it/s]

itd_bert


  0%|          | 0/7403 [00:00<?, ?it/s]

longformer


  0%|          | 0/7403 [00:00<?, ?it/s]

itd_longformer


  0%|          | 0/7403 [00:00<?, ?it/s]

elmo


  0%|          | 0/7403 [00:00<?, ?it/s]

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import distance
import pandas as pd

datasets_len = {'tcu':371,'stj':7403}
k = 6

for data_name in datasets:
    results = []
    for model_name, model_data in embedders.items():
        print(model_name)
        indexer = faiss.read_index('../results/faiss/'+data_name+'/'+model_name+'.faiss')
        indexer.nprobe = 100
        for source_index in tqdm(range(datasets_len[data_name])):
            source_vector = indexer.reconstruct(source_index)
            D, I  = indexer.search(np.array([source_vector]).astype('float32'), k)
            for similar_index in I[0]:
                if source_index != similar_index:
                    similar_vector = indexer.reconstruct(int(similar_index))
                    if model_name != 'lda':
                        similarity = cosine_similarity([source_vector], [similar_vector])[0][0]
                    else:
                        similarity = distance.jensenshannon(source_vector, similar_vector)

                    results.append([source_index, similar_index, similarity, model_name])
    data = pd.DataFrame(results, 
                            columns=['SOURCE_INDEX','SIMILAR_INDEX','SIMILARITY','MODEL_NAME'])
    data.to_csv('../results/faiss/'+data_name+'/similarities.csv')

tfidf


  0%|          | 0/371 [00:00<?, ?it/s]

lda


  0%|          | 0/371 [00:00<?, ?it/s]

word2vec


  0%|          | 0/371 [00:00<?, ?it/s]

weighted_word2vec


  0%|          | 0/371 [00:00<?, ?it/s]

fasttext


  0%|          | 0/371 [00:00<?, ?it/s]

weighted_fasttext


  0%|          | 0/371 [00:00<?, ?it/s]

doc2vec


  0%|          | 0/371 [00:00<?, ?it/s]

sentence_transformer


  0%|          | 0/371 [00:00<?, ?it/s]

bert


  0%|          | 0/371 [00:00<?, ?it/s]

itd_bert


  0%|          | 0/371 [00:00<?, ?it/s]

longformer


  0%|          | 0/371 [00:00<?, ?it/s]

itd_longformer


  0%|          | 0/371 [00:00<?, ?it/s]

elmo


  0%|          | 0/371 [00:00<?, ?it/s]

tfidf


  0%|          | 0/7403 [00:00<?, ?it/s]

lda


  0%|          | 0/7403 [00:00<?, ?it/s]

  return np.sqrt(js / 2.0)


word2vec


  0%|          | 0/7403 [00:00<?, ?it/s]

weighted_word2vec


  0%|          | 0/7403 [00:00<?, ?it/s]

fasttext


  0%|          | 0/7403 [00:00<?, ?it/s]

weighted_fasttext


  0%|          | 0/7403 [00:00<?, ?it/s]

doc2vec


  0%|          | 0/7403 [00:00<?, ?it/s]

sentence_transformer


  0%|          | 0/7403 [00:00<?, ?it/s]

bert


  0%|          | 0/7403 [00:00<?, ?it/s]

itd_bert


  0%|          | 0/7403 [00:00<?, ?it/s]

longformer


  0%|          | 0/7403 [00:00<?, ?it/s]

itd_longformer


  0%|          | 0/7403 [00:00<?, ?it/s]

elmo


  0%|          | 0/7403 [00:00<?, ?it/s]

In [4]:
import os, inspect, sys
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)

from ..bert_embedder import BertEmbedder

def setup_indexer(vectors_size=3072):
    return AnnoyIndex(vectors_size, 'angular')

bert = BertEmbedder('models/bert-base-cased-pt-br', setup_indexer())

ValueError: attempted relative import beyond top-level package