<a href="https://colab.research.google.com/github/finardi/tutos/blob/master/EmbsCosDis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import uuid 
import pandas as pd
import numpy as np
from itertools import combinations
from scipy.spatial import distance # 1 - cos_similarity

In [2]:
# gerando IDS alpha numéricos
K = 6_000
ids_ = [uuid.uuid4().hex[:6].upper() for i in range(K)]

#garantindo que os IDS gerados são únicos
ids_ = list(set(ids_)); print(len(ids_))
df = pd.DataFrame({'ID':ids_})

# fake embeddings
embs = []
for i in range(len(ids_)):
    embs.append(np.random.random(768))
embs = np.array(embs)

df

5999


Unnamed: 0,ID
0,F8EE83
1,342965
2,D04146
3,2CBD8C
4,3258CF
...,...
5994,2DEDCD
5995,2A480E
5996,350BF3
5997,9059C8


In [3]:
class EmbsCosDist:
    """
    Entrada:
        -   dataframe com os IDs dos documentos
        -   embeddings de algum método: BERT, TF-IDF, W2V
    
    Executa: 
        -   a distãncia do cosseno: 1 - similaridade

    Saída:
        -   dataframe com as cols.: 
                ID1: id do 1o documento
                ID2: id do 2o documento
                COS_DIST: distância do cossendo entre (1o e 2o) documento. 
    """
    
    def __init__(self, df, embs):
        self.embs = embs
        self.df = df.copy()
    
    def tuple_array_cos(self):
        
        #combinatória dos pares i,j para todo (i !=k)
        comb = np.fromiter(combinations(range(self.embs.shape[0]), 2), dtype='i,i')
        
        #ids dos documentos
        ids = self.df.ID.values
        
        #ids do 1o documento
        ids1 = ids[comb['f0']]

        #ids do 2o documento
        ids2 = ids[comb['f1']]
        
        #ids em forma de tupla (1o documento, 2o documento)
        tuple_ids = [(ids1[i], ids2[i]) for i in range(ids1.shape[0])]

        #para guardar as distâncias do cosseno
        cos_d = []
        #------------------------------------------------------------------
        #para todos os pares (i,j)
        for j, id in enumerate(comb):

            # calcula a distância com o scipy
            cos_dist = distance.cosine(self.embs[id[0]], self.embs[id[1]])
            
            #gurada as distâncias do cosseno
            cos_d.append(cos_dist)
            
            #if que acompanha a evoluçao
            if j%1_000_000==0:
                print(f'{j} of {comb.shape[0]}')
        #------------------------------------------------------------------
        return cos_d, tuple_ids

    def get_dataframe(self):
        
        #instância o método tuple_array
        cos_d, ids = self.tuple_array_cos()
        
        #cria um dataframe com as colunas de interesse
        df_cos = pd.DataFrame({'ID':ids, 'COS_DIST': cos_d})
        
        #quebra a tupla em duas colunas
        df_cos[['ID1', 'ID2']] = pd.DataFrame(df_cos['ID'].tolist(), index=df_cos.index)
        
        #reordena o dataframe
        df_cos = df_cos[['ID1', 'ID2', 'COS_DIST']]

        return df_cos

In [4]:
%%time
ECD = EmbsCosDist(df, embs)
df_class = ECD.get_dataframe()

0 of 17991001
1000000 of 17991001
2000000 of 17991001
3000000 of 17991001
4000000 of 17991001
5000000 of 17991001
6000000 of 17991001
7000000 of 17991001
8000000 of 17991001
9000000 of 17991001
10000000 of 17991001
11000000 of 17991001
12000000 of 17991001
13000000 of 17991001
14000000 of 17991001
15000000 of 17991001
16000000 of 17991001
17000000 of 17991001
CPU times: user 11min 6s, sys: 3.54 s, total: 11min 10s
Wall time: 11min 10s


In [5]:
df_class

Unnamed: 0,ID1,ID2,COS_DIST
0,F8EE83,342965,0.236677
1,F8EE83,D04146,0.248018
2,F8EE83,2CBD8C,0.257660
3,F8EE83,3258CF,0.238607
4,F8EE83,B483BD,0.271757
...,...,...,...
17990996,2A480E,9059C8,0.258392
17990997,2A480E,BEA503,0.237146
17990998,350BF3,9059C8,0.271053
17990999,350BF3,BEA503,0.247198


# Fim

