<a href="https://colab.research.google.com/github/gpandu/BERT-Pretraining/blob/main/Copy_of_Sentense_similarity_2C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked-out positions.

    Args:
        model_output: Indexable model result whose first element holds the
            per-token embeddings (presumably shaped (batch, seq_len, hidden) --
            confirm against the model in use).
        attention_mask: Mask with one entry per token; it is broadcast over the
            last dimension of the token embeddings and used as the averaging
            weight, so zero entries do not contribute.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total for
        that sequence.
    """
    hidden_states = model_output[0]
    # Give the mask a trailing axis and stretch it to the embeddings' shape so
    # it can weight every feature of every token.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_total = (hidden_states * mask).sum(dim=1)
    # Clamp the divisor so a fully masked sequence does not divide by zero.
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_total / mask_total


In [None]:
import pandas as pd
import csv

# Read only the ID and Description columns; row 0 of the file is the header.
csv_path = '/content/train/data.csv'
wanted_columns = ["ID", "Description"]
df = pd.read_csv(csv_path, header=0, usecols=wanted_columns)

In [None]:
# Only the last expression of a cell is rendered, so the first row has to be
# displayed explicitly; a bare `df.loc[0]` here would be evaluated and dropped.
display(df.loc[0])

# Length in characters of the shortest description.
df["Description"].apply(len).min()

19

In [None]:
# Load the tokenizer and the encoder for the same checkpoint from the HuggingFace Hub.
checkpoint = 'sentence-transformers/all-distilroberta-v1'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)



tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Kept as the cell's last expression so the notebook renders the moved model.
model.to(device)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-5): 6 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout)

In [None]:
# Encode every description in fixed-size batches and collect one NumPy array
# of L2-normalised sentence embeddings per batch in `embeddings_list`.
batch_len = 100
sentenses = df['Description'].tolist()
embeddings_list = []
for start_idx in range(0, len(sentenses), batch_len):
  # Slicing stops at the end of the list, so the final (shorter) batch needs
  # no special handling and batch_len can stay constant for the whole loop.
  samples = sentenses[start_idx : start_idx + batch_len]

  # Tokenize sentences
  encoded_input = tokenizer(samples, padding=True, truncation=True, return_tensors='pt').to(device)

  # Compute token embeddings
  with torch.no_grad():
    model_output = model(**encoded_input)
  # Perform pooling
  sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

  # Normalize embeddings
  sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
  # Move the batch off the device so results accumulate in host memory.
  embeddings_list.append(sentence_embeddings.detach().cpu().numpy())




In [None]:
import numpy as np
# Stack all per-batch arrays row-wise in one call; stacking pairwise inside a
# loop would re-copy the growing result on every iteration.
embeddings_arr = np.vstack(embeddings_list)

embeddings_arr.shape

(23224, 768)

In [None]:
# For every text, record the IDs of the `top_no_of_records` other texts with
# the highest inner-product score, best match first.
similar_text_map = {}
id_list = df['ID'].tolist()
top_no_of_records = 5
row_chunk = 256  # rows scored per matrix product; keeps the score matrix at row_chunk x n

num_texts = len(embeddings_arr)
# A text can have at most num_texts - 1 neighbours once itself is excluded.
top_k = max(min(top_no_of_records, num_texts - 1), 0)

for start in range(0, num_texts, row_chunk):
  stop = min(start + row_chunk, num_texts)

  # Inner product of each row in the chunk with every embedding.
  scores = embeddings_arr[start:stop] @ embeddings_arr.T

  # Exclude each text from its own neighbour list by pushing the self-score
  # to the very end of the ranking.
  local_rows = np.arange(stop - start)
  scores[local_rows, start + local_rows] = -np.inf

  # Stable sort on the negated scores ranks from highest to lowest and keeps
  # equal scores in their original row order.
  ranked = np.argsort(-scores, axis=1, kind="stable")[:, :top_k]

  # NOTE(review): this assumes the values in the ID column are unique; a
  # dict keyed by ID would have collapsed duplicated IDs -- confirm.
  for offset, neighbours in enumerate(ranked):
    similar_text_map[id_list[start + offset]] = [id_list[j] for j in neighbours]



In [None]:
# One row per source ID paired with its list of recommended IDs, saved
# without the DataFrame index column.
recommendation_rows = list(similar_text_map.items())
result_df = pd.DataFrame(recommendation_rows, columns=['ID', 'Recommendations'])
result_df.to_csv("result.csv", index=False)