In [None]:
!pip install nltk
!pip install sentence_transformers
!pip install funcy

Collecting funcy
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy
Successfully installed funcy-2.0


### **Using the NLTK `words` vocabulary**

In [None]:
import nltk
# Download the NLTK "words" corpus; it is the candidate vocabulary read via nltk.corpus.words below.
nltk.download('words')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [None]:
from sentence_transformers import SentenceTransformer, util
import nltk
from nltk.corpus import words

# Load the pretrained sentence-embedding checkpoint used to encode both the query phrase
# and the candidate words in the cells that follow.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
import random

phrase = "shared border with"

# Size of the candidate pool; kept small so that encoding stays fast.
N_VOCAB_CANDIDATES = 1000
# Fixed seed so the sampled pool (and therefore the result) is the same on every run.
SAMPLE_SEED = 42

# The corpus word list appears to be in dictionary order, so taking its first N entries
# only ever offers candidates from the very beginning of the alphabet. Draw a seeded
# random sample of the same size instead so the pool spans the whole vocabulary.
all_words = words.words()
word_list = random.Random(SAMPLE_SEED).sample(
    all_words, min(N_VOCAB_CANDIDATES, len(all_words))
)

phrase_embedding = model.encode(phrase, convert_to_tensor=True)
# `word_list` now holds exactly the encoded candidates, so positions in
# `similarities` line up one-to-one with positions in `word_list`.
word_embeddings = model.encode(word_list, convert_to_tensor=True)

similarities = util.pytorch_cos_sim(phrase_embedding, word_embeddings)[0]

In [None]:
# `similarities` holds one score per encoded candidate, in the same order as `word_list`,
# so the position of the highest score can index `word_list` directly.
closest_word = word_list[similarities.argmax()]
print(f"The closest matching word is: {closest_word}")

The closest matching word is: aboveboard


### **Using WordNet**

---



In [None]:
# Download the WordNet corpus plus the "omw-1.4" (Open Multilingual Wordnet) package
# so that nltk.corpus.wordnet can be read in the cells below.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
from sentence_transformers import SentenceTransformer, util
from nltk.corpus import wordnet


# Rebinds `model` to the same checkpoint name that the earlier vocabulary section loads.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from funcy import print_durations

@print_durations
def get_embeddings(labels, sent_tran_model):
    """Encode ``labels`` with ``sent_tran_model`` and return the resulting embeddings.

    The encoder's progress bar is switched off; the ``print_durations`` decorator
    wraps the call so that its duration is reported.
    """
    return sent_tran_model.encode(labels, show_progress_bar=False)

In [None]:
def write_embeddings_to_file(embeddings, labels, filename):
    """Write labelled vectors to ``filename`` in the text word2vec layout.

    The first line is ``"<count> <dimension>"``; each following line is a label
    followed by the space-separated components of its vector.

    Args:
        embeddings: Sequence of vectors (each an iterable of numbers), one per label.
        labels: Sequence of strings, parallel to ``embeddings``.
        filename: Path of the file to create or overwrite (UTF-8).

    Raises:
        ValueError: If ``labels`` and ``embeddings`` differ in length, or are empty.
    """
    import re

    # Validate before opening the file so a bad call never leaves a truncated or
    # inconsistent file behind (zip() would silently drop the surplus entries while
    # the header still advertised len(labels)).
    if len(labels) != len(embeddings):
        raise ValueError(
            f"labels and embeddings must have the same length "
            f"(got {len(labels)} labels and {len(embeddings)} embeddings)"
        )
    if len(labels) == 0:
        raise ValueError("Cannot write an embeddings file with no entries")

    dimension = len(embeddings[0])
    with open(filename, 'w', encoding='utf-8') as f_out:
        f_out.write(f"{len(labels)} {dimension}\n")
        for label, embedding in zip(labels, embeddings):
            # The format is whitespace-delimited and line-oriented, so every whitespace
            # character in a label (not just ' ') is replaced by an underscore.
            token = re.sub(r'\s', '_', label)
            f_out.write(f"{token} {' '.join(map(str, embedding))}\n")
    print("Embeddings written to file successfully")

In [None]:
import random

# Number of WordNet entries to embed; kept small so that encoding stays fast.
N_WORDNET_LABELS = 3000
# Fixed seed so the same subset is embedded on every run.
SAMPLE_SEED = 42

# set() removes duplicate names, but its iteration order for strings changes from one
# kernel start to the next, so slicing it directly embedded a different subset on each
# run. Sorting gives a stable order, and a seeded sample then picks a reproducible
# subset that still spans the whole vocabulary rather than its alphabetical head.
unique_words = sorted(set(wordnet.words()))
word_list = random.Random(SAMPLE_SEED).sample(
    unique_words, min(N_WORDNET_LABELS, len(unique_words))
)

embeddings = get_embeddings(word_list, model)
write_embeddings_to_file(embeddings, word_list, "embeddings.vectors")

   11.48 s in get_embeddings(['cross-section', 'tak..., SentenceTransformer( (...)
Embeddings written to file successfully


In [None]:
from gensim.models import KeyedVectors

def load_gensim_model_from_file(filepath):
    """Load the text-format (non-binary) word2vec vectors stored at ``filepath``.

    Returns whatever ``KeyedVectors.load_word2vec_format`` produces for that file.
    """
    return KeyedVectors.load_word2vec_format(filepath, binary=False)

### **Testing with the gensim model similarity**

In [None]:
# Read the "embeddings.vectors" file written by the earlier cell back in through gensim.
gensim_model = load_gensim_model_from_file("embeddings.vectors")

In [None]:
import pandas as pd

def sim_search(term, gensim_model, sent_tran_model, topn=1):
    """Find the stored labels whose vectors are most similar to ``term``.

    ``term`` is encoded with ``sent_tran_model`` and the resulting vector(s) are
    passed as the ``positive`` argument of ``gensim_model.most_similar``.

    Args:
        term: Query text to encode.
        gensim_model: Object exposing ``most_similar(positive=..., topn=...)`` that
            yields ``(label, score)`` pairs.
        sent_tran_model: Encoder exposing ``encode(list_of_texts, show_progress_bar=...)``.
        topn: Number of hits to request. Defaults to 1, the previously hard-coded value.

    Returns:
        A DataFrame with columns ``label`` and ``score``, one row per hit. Every
        underscore in a label is shown as a space, including underscores that were
        part of the stored label itself.
    """
    query_vectors = sent_tran_model.encode([term], show_progress_bar=False)
    result = gensim_model.most_similar(positive=query_vectors, topn=topn)
    rows = [
        {'label': label.replace('_', ' '), 'score': score}
        for label, score in result
    ]
    # Naming the columns keeps the frame's shape stable even when there are no hits.
    return pd.DataFrame(rows, columns=['label', 'score'])

In [None]:
phrase = "shared border with"

phrase_sug = sim_search(phrase, gensim_model, model)
# Leave the DataFrame as the cell's last expression so Jupyter renders it as a table
# instead of printing its plain-text repr.
phrase_sug

       label     score
0  uncrossed  0.471029


### **Without the gensim model: using the sentence-transformers `util` cosine similarity**

In [None]:
import random

phrase = "shared border with"

# Size of the candidate pool; kept small so that encoding stays fast.
N_WORDNET_CANDIDATES = 2000
# Fixed seed so the same candidates are scored on every run.
SAMPLE_SEED = 42

# Iteration order of a set of strings changes from one kernel start to the next, so
# slicing list(set(...)) scored a different pool on each run. Sort for a stable order,
# then take a seeded sample so the pool is reproducible and spans the whole vocabulary.
unique_words = sorted(set(wordnet.words()))
word_list = random.Random(SAMPLE_SEED).sample(
    unique_words, min(N_WORDNET_CANDIDATES, len(unique_words))
)

phrase_embedding = model.encode(phrase, convert_to_tensor=True)

# `word_list` holds exactly the encoded candidates, so positions in `similarities`
# line up one-to-one with positions in `word_list`.
word_embeddings = model.encode(word_list, convert_to_tensor=True)
similarities = util.pytorch_cos_sim(phrase_embedding, word_embeddings)[0]

We still need better models to generate the embeddings for better accuracy. Models like "all-mpnet-base-v2" seem better but take longer to generate embeddings (> 130 sec).

In [None]:
# Scores in `similarities` follow the order of the encoded candidates taken from
# `word_list`, so the argmax position can index `word_list` directly.
closest_word = word_list[similarities.argmax()]
print(f"Suggested predicate - dbp:{closest_word}")

Suggested predicate - dbp:tangle_with
