<a href="https://colab.research.google.com/github/finardi/tutos/blob/master/Similarity_String_to_String_WITH_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install -q transformers

[K     |████████████████████████████████| 2.2MB 4.5MB/s 
[K     |████████████████████████████████| 3.3MB 18.1MB/s 
[K     |████████████████████████████████| 870kB 51.2MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [14]:
import torch
from torch import Tensor

import pandas as pd
import numpy as np

from itertools import combinations
from transformers import BertModel, BertTokenizer

pd.set_option('display.max_rows', 1500)

In [72]:
text_list = [
        'arroz com feijão e batata frita|1001',
        'arroz salada e mandioca|1010',
        'arroz com feijão e abobrinha|1101',
        'arroz com feijão batata frit|1001',
        'feijão arroz e cenoura|1011',
        'beterraba, chuchu e arroz com feijão|1111',
        'arroz com milho e feijão com couve|2001',
        'batata frita com legumes e soja|2010',
        'arroz com bata frita|2010',
        'madioquinha com sopa de feijão|2010',
        'arroz feijo e batata fita|1001',
        'mesa, cadeira e escrivaninha|2711',
        'cadeira, mesa e escrivaninha|2711',
        'escrivaninha, mesa e cadeira|2711',
        'geladeira e fogão|1711',
        'microondas e geraleira|1711',
        'geladeira e mesa azul|1711',
        'roupeiro, criado-mudo e cama|2804',
        'criado-mudo e cama|2804',
        'roupeiro e criado-mudo|2804',
        'cama e roupeiro|2804',
        'microondas e cama|2805',
        'mochila e roupeiro|2805',
        'ps1, ps2, ps3, ps4 e ps5|1311',
        'n64, snes, ps3, ps4 e ps5|1211',
        'ps1, ps2, n64, ps4 e ps5|1341',
        'ps5, ps4, ps3, ps2 e ps1|1311',
        'ps1, n64, snes, switch|2311',
        'ps6, ps7, ps8, ps9 e ps10|3211',
        'ps10, ps2, ps3, ps4 e ps5|3211',
        'mega-drive, nes, snes|1301',
        'mega drive, n64, snes|1301',
        'megadrive, nes, n64|1301',
        'mega-drive, snes, nes|1311',
        'mega drive, nes, n64|1311',
        'tom, planck, ozzy, mel, nina, pingado e gisele|0000',
        'tom, planck, ozzy, mel, nina, pingado gisele|0000',
        'paulo, tom, planck, ozzy, mel, nina, pingado e gisele|0001',
        'paulo, eli, tom, planck, ozzy, mel, nina, pingado e gisele|0001',
        'jan, paulo, eli, tom, planck, ozzy, mel, nina, pingado e gisele|0001',
]

mydict = {}
for i, item in enumerate(text_list):
    mydict[i] = item.split('|')[0], item.split('|')[1]

df = pd.DataFrame(mydict).T
df = df.rename(columns={0:'text', 1:'doc'})
df

Unnamed: 0,text,doc
0,arroz com feijão e batata frita,1001
1,arroz salada e mandioca,1010
2,arroz com feijão e abobrinha,1101
3,arroz com feijão batata frit,1001
4,feijão arroz e cenoura,1011
5,"beterraba, chuchu e arroz com feijão",1111
6,arroz com milho e feijão com couve,2001
7,batata frita com legumes e soja,2010
8,arroz com bata frita,2010
9,madioquinha com sopa de feijão,2010


# BERTaú-Embeddings

In [73]:
tokenizer = BertTokenizer.from_pretrained(path_model)

sample = df.text.to_list()[0]
print(f'text:{sample}')

tokens = tokenizer(sample, return_tensors='pt')
tokens

text:arroz com feijão e batata frita


{'input_ids': tensor([[    2, 24750,   259, 12608, 26537,    37, 27424,   277, 21685,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [74]:
model = BertModel.from_pretrained(path_model)
outs = model(**tokens, return_dict=True)
last_hidden_state = outs['last_hidden_state']
# cls = last_hidden_state[:,0, :].squeeze(0).detach().numpy()


In [75]:
print(last_hidden_state[:,1:-1,].shape)

torch.einsum("ijk -> k", last_hidden_state).detach().numpy().shape

torch.Size([1, 8, 768])


(768,)

In [76]:
%%time

def get_embs(text_list:[], path_model:str):
    tokenizer = BertTokenizer.from_pretrained(path_model)
    model = BertModel.from_pretrained(path_model)
    hidden_size = model.config.hidden_size
    embs = torch.zeros(len(df)*hidden_size).reshape(len(df), hidden_size)

    for i, text in enumerate(text_list):
        tokens = tokenizer(text, return_tensors='pt')
        outs = model(**tokens, return_dict=True)
        last_hidden_state = outs['last_hidden_state']
        # cls = last_hidden_state[:,0, :].squeeze(0).detach().numpy()
        
        # without CLS and SEP
        embs_mean = torch.einsum("ijk -> k", last_hidden_state[:,1:-1,])#.detach().numpy()
        embs[i] = embs_mean

    return embs
# --------------------------------------------------
path_model = 'Itau-Unibanco/BERTau'
embeddings = get_embs(df.text.to_list(), path_model)

CPU times: user 6.65 s, sys: 858 ms, total: 7.5 s
Wall time: 7.81 s


# Similaridade

In [77]:
def pytorch_cos_sim(a: Tensor, b: Tensor):
    if len(a.shape) == 1:
        a = a.unsqueeze(0)

    if len(b.shape) == 1:
        b = b.unsqueeze(0)

    a_norm = a / a.norm(dim=1)[:, None]
    b_norm = b / b.norm(dim=1)[:, None]
    return torch.einsum('ij, kj -> ik', a_norm, b_norm).item()

def similarities(df:pd.DataFrame, embs:np.array, verbose=True):
    comb = np.fromiter(combinations(range(embs.shape[0]), 2), dtype='i,i')
    ids = df.text.values
    ids1 = ids[comb['f0']]
    ids2 = ids[comb['f1']]
    l = [(ids1[i], ids2[i]) for i in range(ids1.shape[0])]
    
    ID0, ID1, sim = [],[],[]
    #------------------------------------------------------------
    for j, (id0, id1) in enumerate(comb):
        sim.append(pytorch_cos_sim(embs[id0], embs[id1]))
        ID0.append(l[j][0])
        ID1.append(l[j][1])
        
        if verbose:
            if j%500==0:
                print(f'Processed:  {j} of {comb.shape[0]}')

    df_new = pd.DataFrame(
        {
            'ID0': ID0, 
            'ID1': ID1, 
            'SIMILARITY': sim,
         }, )

    return df_new
#---------------------------------------------------------------------------------
df_new = similarities(df, embeddings, verbose=True)
df_new = df_new.sort_values(by='SIMILARITY', ascending=False).reset_index(drop=True)
df_new

Processed:  0 of 780
Processed:  500 of 780


Unnamed: 0,ID0,ID1,SIMILARITY
0,"ps1, ps2, ps3, ps4 e ps5","ps5, ps4, ps3, ps2 e ps1",1.0
1,"ps1, ps2, ps3, ps4 e ps5","ps10, ps2, ps3, ps4 e ps5",1.0
2,"ps1, ps2, ps3, ps4 e ps5","ps6, ps7, ps8, ps9 e ps10",1.0
3,"ps5, ps4, ps3, ps2 e ps1","ps6, ps7, ps8, ps9 e ps10",1.0
4,"ps1, ps2, n64, ps4 e ps5","ps5, ps4, ps3, ps2 e ps1",1.0
5,"ps1, ps2, n64, ps4 e ps5","ps6, ps7, ps8, ps9 e ps10",1.0
6,"ps1, ps2, n64, ps4 e ps5","ps10, ps2, ps3, ps4 e ps5",1.0
7,"ps6, ps7, ps8, ps9 e ps10","ps10, ps2, ps3, ps4 e ps5",1.0
8,"ps1, ps2, ps3, ps4 e ps5","ps1, ps2, n64, ps4 e ps5",1.0
9,"ps5, ps4, ps3, ps2 e ps1","ps10, ps2, ps3, ps4 e ps5",1.0


# Group Targets

In [78]:
THRS = 0.92
df_thrs = df_new[df_new.SIMILARITY >= THRS]
df_thrs

Unnamed: 0,ID0,ID1,SIMILARITY
0,"ps1, ps2, ps3, ps4 e ps5","ps5, ps4, ps3, ps2 e ps1",1.0
1,"ps1, ps2, ps3, ps4 e ps5","ps10, ps2, ps3, ps4 e ps5",1.0
2,"ps1, ps2, ps3, ps4 e ps5","ps6, ps7, ps8, ps9 e ps10",1.0
3,"ps5, ps4, ps3, ps2 e ps1","ps6, ps7, ps8, ps9 e ps10",1.0
4,"ps1, ps2, n64, ps4 e ps5","ps5, ps4, ps3, ps2 e ps1",1.0
5,"ps1, ps2, n64, ps4 e ps5","ps6, ps7, ps8, ps9 e ps10",1.0
6,"ps1, ps2, n64, ps4 e ps5","ps10, ps2, ps3, ps4 e ps5",1.0
7,"ps6, ps7, ps8, ps9 e ps10","ps10, ps2, ps3, ps4 e ps5",1.0
8,"ps1, ps2, ps3, ps4 e ps5","ps1, ps2, n64, ps4 e ps5",1.0
9,"ps5, ps4, ps3, ps2 e ps1","ps10, ps2, ps3, ps4 e ps5",1.0


In [86]:
targets = []
for id0, id1 in zip(df_thrs.ID0.to_list(), df_thrs.ID1.to_list()):
    targets.append(list(set([
                    list(set(df[df.text==id0].doc.to_list()))[0], 
                    list(set(df[df.text==id1].doc.to_list()))[0]
    ])))

In [87]:
df_thrs['TARGETS'] = targets
df_thrs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,ID0,ID1,SIMILARITY,TARGETS
0,"ps1, ps2, ps3, ps4 e ps5","ps5, ps4, ps3, ps2 e ps1",1.0,[1311]
1,"ps1, ps2, ps3, ps4 e ps5","ps10, ps2, ps3, ps4 e ps5",1.0,"[1311, 3211]"
2,"ps1, ps2, ps3, ps4 e ps5","ps6, ps7, ps8, ps9 e ps10",1.0,"[1311, 3211]"
3,"ps5, ps4, ps3, ps2 e ps1","ps6, ps7, ps8, ps9 e ps10",1.0,"[1311, 3211]"
4,"ps1, ps2, n64, ps4 e ps5","ps5, ps4, ps3, ps2 e ps1",1.0,"[1311, 1341]"
5,"ps1, ps2, n64, ps4 e ps5","ps6, ps7, ps8, ps9 e ps10",1.0,"[3211, 1341]"
6,"ps1, ps2, n64, ps4 e ps5","ps10, ps2, ps3, ps4 e ps5",1.0,"[3211, 1341]"
7,"ps6, ps7, ps8, ps9 e ps10","ps10, ps2, ps3, ps4 e ps5",1.0,[3211]
8,"ps1, ps2, ps3, ps4 e ps5","ps1, ps2, n64, ps4 e ps5",1.0,"[1311, 1341]"
9,"ps5, ps4, ps3, ps2 e ps1","ps10, ps2, ps3, ps4 e ps5",1.0,"[1311, 3211]"
