In [29]:
import spacy 
import nltk

In [2]:
nlp = spacy.load("en_core_web_sm")
text = "Natural Language Processing is fascinating"

In [35]:
doc = nlp(text)

for token in doc: 
    print(token.text, token.vector.shape)

Natural (96,)
Language (96,)
Processing (96,)
is (96,)
fascinating (96,)


In [25]:
from gensim.models import Word2Vec

corpus = "Machine Learning is fun. natural language processing. deep learning models"
sents = [[w.lower() for w in nltk.word_tokenize(s)] for s in nltk.sent_tokenize(corpus)]
sents

[['machine', 'learning', 'is', 'fun', '.'],
 ['natural', 'language', 'processing', '.'],
 ['deep', 'learning', 'models']]

In [26]:
# convert word to vec
model = Word2Vec(sents, vector_size=50, window=3, min_count=1)
vector = model.wv['natural']
print("Vector for 'natural': ", vector)

Vector for 'natural':  [-0.01648536  0.01859871 -0.00039532 -0.00393455  0.00920726 -0.00819063
  0.00548623  0.01387993  0.01213085 -0.01502159  0.0187647   0.00934362
  0.00793224 -0.01248701  0.01691996 -0.00430033  0.01765038 -0.01072401
 -0.01625884  0.01364912  0.00334239 -0.00439702  0.0190272   0.01898771
 -0.01954809  0.00501046  0.01231338  0.00774491  0.00404557  0.000861
  0.00134726 -0.00764127 -0.0142805  -0.00417774  0.0078478   0.01763737
  0.0185183  -0.01195187 -0.01880534  0.01952875  0.00685957  0.01033223
  0.01256469 -0.00560853  0.01464541  0.00566054  0.00574201 -0.00476074
 -0.0062565  -0.00474028]


In [28]:
# Word Similarity using word2vec
similarity = model.wv.similarity("machine", "language")
print("Similarity: ", similarity)

Similarity:  0.22442302


In [34]:
# GloVe Embeddings (Pre-trained)
import gensim.downloader as api
glove = api.load("glove-wiki-gigaword-50")

print(glove['computer'])
print(glove.similarity("computer", "laptop"))

[ 0.079084 -0.81504   1.7901    0.91653   0.10797  -0.55628  -0.84427
 -1.4951    0.13418   0.63627   0.35146   0.25813  -0.55029   0.51056
  0.37409   0.12092  -1.6166    0.83653   0.14202  -0.52348   0.73453
  0.12207  -0.49079   0.32533   0.45306  -1.585    -0.63848  -1.0053
  0.10454  -0.42984   3.181    -0.62187   0.16819  -1.0139    0.064058
  0.57844  -0.4556    0.73783   0.37203  -0.57722   0.66441   0.055129
  0.037891  1.3275    0.30991   0.50697   1.2357    0.1274   -0.11434
  0.20709 ]
0.77411586


In [36]:
# contextual Embedding using BERT
from transformers import BertTokenizer, BertModel
import torch

In [39]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

sentence = "The bank is near the river"
inputs = tokenizer(sentence, return_tensors='pt')

outputs = model(**inputs)
embeddings = outputs.last_hidden_state

print(embeddings.shape)

torch.Size([1, 8, 768])


In [45]:
# context change Example
sent1 = "I deposited money in the bank"
sent2 = "The river bank is beautiful"

inputs1 = tokenizer(sent1, return_tensors='pt')
inputs2 = tokenizer(sent2, return_tensors='pt')

emb1 = model(**inputs).last_hidden_state
emb2 = model(**inputs).last_hidden_state

print(emb1, emb2)

tensor([[[-0.0362,  0.0981, -0.0198,  ..., -0.0685, -0.0597, -0.0195],
         [ 0.1371,  0.0440, -0.0555,  ..., -0.2686,  0.0399,  0.1294],
         [ 0.0159,  0.0557,  0.0038,  ..., -0.0188, -0.1045, -0.0836],
         [ 0.1222,  0.2120,  0.0749,  ...,  0.0946,  0.0471, -0.0361],
         [ 0.0351, -0.0155,  0.0312,  ..., -0.0737,  0.0225,  0.0391],
         [-0.0304,  0.1093, -0.0470,  ..., -0.1169, -0.0684, -0.0506]]],
       grad_fn=<NativeLayerNormBackward0>) tensor([[[-0.0362,  0.0981, -0.0198,  ..., -0.0685, -0.0597, -0.0195],
         [ 0.1371,  0.0440, -0.0555,  ..., -0.2686,  0.0399,  0.1294],
         [ 0.0159,  0.0557,  0.0038,  ..., -0.0188, -0.1045, -0.0836],
         [ 0.1222,  0.2120,  0.0749,  ...,  0.0946,  0.0471, -0.0361],
         [ 0.0351, -0.0155,  0.0312,  ..., -0.0737,  0.0225,  0.0391],
         [-0.0304,  0.1093, -0.0470,  ..., -0.1169, -0.0684, -0.0506]]],
       grad_fn=<NativeLayerNormBackward0>)


In [43]:
# RoBERTa embedding: 
from transformers import RobertaTokenizer, RobertaModel

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

inputs = tokenizer("Word representations are powerful", return_tensors='pt')
outputs = model(**inputs)

print(outputs.last_hidden_state.shape)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([1, 6, 768])
