**Importing Necessary Packages**

In [None]:
import pandas as pd
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.corpus import stopwords
import string

**Downloading wordnet, stopwords**

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

**Loading Dataset**

In [None]:
df = pd.read_csv("/content/dataCorpus.csv")

**Preprocessing Data**

In [None]:
stop_words=set(stopwords.words("english"))
def pre_process(text):
  tokenized_word=word_tokenize(text)
  lem = WordNetLemmatizer()
  clear =list()
  for word in tokenized_word:
    if word not in stop_words:
      clear.append(lem.lemmatize(word))
  
  return TreebankWordDetokenizer().detokenize(clear)

**Creating Context**

In [None]:
def create_window(inp, wrd):    
     res = ""
     content = pre_process(inp)
     content_list = content.split(" ")
     ind = -1
     length = len(content_list)
     for i in range(len(content_list)):
          if content_list[i] == pre_process(wrd):
              ind = i
     cnt = 1
     if(ind - 2 >= 0):
        res += (content_list[ind - 2] + " ")
        cnt += 1
     if(ind - 1 >= 0):
        res += (content_list[ind - 1] + " ")
        cnt += 1
     ind += 1
     while(ind < len(content_list) and cnt<=5):
        res += (content_list[ind] + " ")
        ind += 1
        cnt += 1
     return res

In [None]:
new_context = list()
for row in df.itertuples(index=True, name='Pandas'):
     content = create_window(row.context,row.target_word)
     new_context.append(content)
df['new_context'] = new_context


**Downloading Extra Packages**

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 17.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 49.8 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 48.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 55.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    F

**Generating Embedding and doing mean pooling**

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model = model.to(device)

def get_bert_embedding(sentence):
  encoded_input = tokenizer(sentence, padding=True, truncation=True, max_length=512, return_tensors='pt').to(device)
  with torch.no_grad():
    model_output = model(**encoded_input)
  sentence_embedding = mean_pooling(model_output, encoded_input['attention_mask'])
  return sentence_embedding[0]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

In [None]:
embed = []
X = df['new_context']
for i in range(len(X)):
  val = get_bert_embedding(X[i]).cpu().numpy()
  embed.append(val)

In [None]:
df['embed'] = embed
df.head()

Unnamed: 0.1,Unnamed: 0,file,context,target_word,gloss,is_proper_gloss,new_context,embed
0,0,br-a01,The Fulton_County_Grand_Jury said Friday an in...,Fulton_County_Grand_Jury,any number of entities (members) considered as...,True,The said Friday investigation Atlanta's,"[-0.18552732, -0.7882837, 1.4862454, 0.3077651..."
1,1,br-a01,The Fulton_County_Grand_Jury said Friday an in...,Fulton_County_Grand_Jury,(chemistry) two or more atoms bound together a...,False,The said Friday investigation Atlanta's,"[-0.18552732, -0.7882837, 1.4862454, 0.3077651..."
2,2,br-a01,The Fulton_County_Grand_Jury said Friday an in...,Fulton_County_Grand_Jury,"a set that is closed, associative, has an iden...",False,The said Friday investigation Atlanta's,"[-0.18552732, -0.7882837, 1.4862454, 0.3077651..."
3,3,br-a01,The Fulton_County_Grand_Jury said Friday an in...,Fulton_County_Grand_Jury,form a group or group together,False,The said Friday investigation Atlanta's,"[-0.18552732, -0.7882837, 1.4862454, 0.3077651..."
4,4,br-a01,The Fulton_County_Grand_Jury said Friday an in...,said,the chance to speak,True,The Fulton_County_Grand_Jury Friday investigat...,"[-0.062361002, -0.32859203, 1.2687191, 0.36365..."


**Saving the embedding**

In [None]:
import numpy as np
np.save('/content/knn_bert.npy', embed)

**Test Train Split**

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(df, test_size=0.25, random_state=42)


**Distance Measure**

In [None]:
from numpy.linalg import norm
def cosine_similarity(num1, num2):
    return np.dot(num1,num2)/(norm(num1)*norm(num2))

**Calculating KNN for given context and target word**

In [None]:
test_embed = X_test['embed']
train_embed = X_train['embed']
correct = 0
k = 5
for test in X_test.itertuples(index=True, name='Pandas'):
    sim = list()
    for train in X_train.itertuples(index=True, name='Pandas'):
        if train.target_word == test.target_word:
            sim.append(cosine_similarity(test.embed,train.embed))
        else:
            sim.append(-1)
    X_train['cosine'] = sim
    X_train.sort_values(by=['cosine'], ascending=False)
    cnt = 0
    for train in X_train.itertuples(index=True, name='Pandas'):
        if(train.cosine != -1 and train.is_proper_gloss == True and cnt<k):
            correct += 1
            cnt += 1
            break
    X_train.drop(['cosine'],axis = 1)
    

**Results**

In [None]:
accuracy = correct / len(X_test) * 100
print(accuracy)

81.39999999999999
