# **Comparing sentences by meaning**

In [None]:
!pip3 install transformers

In [2]:
import numpy as np 
import pandas as pd 
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

import os

In [None]:
# Hugging Face checkpoint shared by the tokenizer and the model.
# "EMBEDDIA/sloberta" is an alternative checkpoint that was left disabled in
# this cell; it is loaded with exactly the same two calls.
MODEL_NAME = "EMBEDDIA/crosloengual-bert"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# output_hidden_states=True makes the model output expose `.hidden_states`,
# which get_embeddings reads.
model = AutoModel.from_pretrained(MODEL_NAME, output_hidden_states=True)

In [6]:
def get_embeddings(text,token_length):
    """Return a mean-pooled embedding of ``text`` from the last hidden layer.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: The sentence to embed (a single string; a batch dimension of
            size 1 is added here).
        token_length: Exact number of tokens fed to the model. Shorter inputs
            are padded up to this length, longer ones are truncated.

    Returns:
        A 2-D NumPy array with a single row holding the embedding.

    NOTE(review): the mean is taken over all ``token_length`` positions, so
    padded positions contribute to the result. This is kept as-is because any
    similarity threshold tuned on these embeddings depends on it.
    """
    tokens = tokenizer(text, max_length=token_length, padding='max_length', truncation=True)

    # The tokenizer returns plain lists for a single text; add the batch axis.
    input_ids = torch.tensor(tokens.input_ids).unsqueeze(0)
    attention_mask = torch.tensor(tokens.attention_mask).unsqueeze(0)

    # Inference only: skip autograd bookkeeping so no graph is built or kept
    # alive for every sentence that gets embedded.
    with torch.no_grad():
        last_hidden = model(input_ids, attention_mask=attention_mask).hidden_states[-1]

    # Average over the token axis -> one vector per input.
    return torch.mean(last_hidden, dim=1).detach().numpy()

In [7]:
def calculate_similarity(text1,text2,token_length=40):
    """Cosine similarity between the embeddings of two texts.

    Args:
        text1: First text.
        text2: Second text.
        token_length: Token length passed on to ``get_embeddings`` for both
            texts (default 40).

    Returns:
        The single similarity score, taken from position ``[0][0]`` of the
        matrix returned by ``cosine_similarity``.
    """
    # Embed both texts with the same token length, first text first.
    first, second = [
        get_embeddings(sentence, token_length=token_length)
        for sentence in (text1, text2)
    ]

    # Each embedding is a one-row matrix, so the result has a single entry.
    return cosine_similarity(first, second)[0][0]

*We assume that the words and use-case sentences are stored in your Google Drive account.*

In [None]:
# Colab / Google helpers needed by this cell, gathered up front.
from google.colab import auth, drive
from google.auth import default
import gspread

# Mount the user's Google Drive under /content/drive.
drive.mount('/content/drive')

# Sign the Colab user in, then fetch the default credentials
# (same order as before: authenticate first, then default()).
auth.authenticate_user()
creds, _ = default()

# gspread client built from those credentials; `gc.open(...)` is used to
# open spreadsheets by title.
gc = gspread.authorize(creds)

The file with the words and the manually annotated phrases is available in our repository and must be saved in the root folder of your Google Drive. The next cell opens it by title, as a spreadsheet named `Stavki anotirani rocno`.
An example of such a file is `corpus-annotated.csv` in the `data` folder of our repository.

In [9]:
# Title of the spreadsheet that holds the words and annotated sentences.
sheet = "Stavki anotirani rocno"

# Open it by title and read every value from its first worksheet.
wb = gc.open(sheet)
annotated_worksheet = wb.sheet1
rows = annotated_worksheet.get_all_values()

The results of the comparison are written to a spreadsheet, which must be created beforehand in the root folder of your Google Drive and named `output` (the title the code below opens).

In [11]:
# First worksheet of the 'output' spreadsheet; receives one result row per input row.
wb2 = gc.open('output').sheet1

# Next free row (1-based) in the output sheet; advanced after the write below.
count = 1

# A similarity strictly above this value is labelled 1, anything else 0.
threshold = 0.53

# Compute all result rows locally first. Calling the Sheets API inside the
# loop costs two requests per row, which is slow and can exhaust the API's
# request quota on larger inputs.
# Column layout: token, similarity, predicted label, manual label, sentence 1, sentence 2.
output_rows = []
for row in rows:
  text1 = row[2]
  text2 = row[3]
  similarity = calculate_similarity(text1, text2)
  predicted = str(1) if similarity > threshold else str(0)
  output_rows.append([row[0], str(similarity), predicted, row[1], text1, text2])

# Write everything with a single range fetch and a single update.
# Skipped when there is nothing to write, since an empty range is not valid.
# Trade-off: results are no longer written incrementally, so nothing is
# written if computing a row fails part-way through.
if output_rows:
  last_row = count + len(output_rows) - 1
  cells = wb2.range('A' + str(count) + ':F' + str(last_row))

  # Place each value by the cell's own coordinates rather than relying on
  # the order in which the cells are returned.
  for cell in cells:
    cell.value = output_rows[cell.row - count][cell.col - 1]

  wb2.update_cells(cells)
  count = last_row + 1

The results are written in the form:

`<token> <similarity> <predicted_similarity> <hand_annotated_similarity> <sentence1> <sentence2>`

A manually annotated similarity of 1 means the word is used with the same meaning in both sentences, while 0 means it is used with different meanings.

If the calculated similarity is higher than the provided threshold, `predicted_similarity` is set to 1; otherwise it is set to 0.