# Introduction to word embeddings and distributed representation of words


---


## Popular word embedding models: Word2Vec and GloVe

---


## Hands-on exercise: Training a Word2Vec model and exploring pre-trained embeddings

https://www.youtube.com/watch?v=hQwFeIupNP0

---


https://towardsdatascience.com/skip-gram-neural-network-from-scratch-485f2e688238

---

https://medium.com/swlh/differences-between-word2vec-and-bert-c08a3326b5d1

---



https://www.nltk.org/howto/gensim.html

In [None]:
import numpy as np

# Hand-picked 3-dimensional toy vectors standing in for word embeddings.
vec1_beer = np.array([0.9, 0.82, 0.75])    # 'beer'
vec2_wine = np.array([0.5, 0.98, 0.92])    # 'wine'
vec3_house = np.array([0.91, 0.11, 0.25])  # 'house'


def cosine_similarity(w1, w2):
    """Return the cosine of the angle between the vectors ``w1`` and ``w2``.

    Computed as ``dot(w1, w2) / sqrt(dot(w1, w1) * dot(w2, w2))``.

    Args:
        w1: First vector (anything ``np.dot`` accepts).
        w2: Second vector, same length as ``w1``.

    Returns:
        The similarity score as computed by the formula above.
    """
    dot_product = np.dot(w1, w2)
    # Product of the two squared lengths; its square root is |w1| * |w2|.
    squared_norms = np.dot(w1, w1) * np.dot(w2, w2)
    return dot_product / squared_norms ** 0.5

# Compare every pair of toy vectors; the trailing comments show the approximate
# value each line prints.
print('Similarity between beer & wine:   ', cosine_similarity(vec1_beer,vec2_wine)) # -> output: 0.947
print('Similarity between beer & house: ', cosine_similarity(vec1_beer,vec3_house)) # -> output: 0.807
# Fixed: this line previously referenced the undefined name `vec2_winr` (NameError).
print('Similarity between wine & house:    ', cosine_similarity(vec2_wine, vec3_house)) # -> output: 0.581


# Training word2vec

In [None]:
# Import necessary libraries
import nltk
from nltk.corpus import brown
from gensim.models import Word2Vec

In [None]:
# Download NLTK's 'punkt' tokenizer data package.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Download the Brown corpus data through NLTK's downloader
nltk.download('brown')

# Load the Brown corpus sentences; `corpus` is the training input
# passed to Word2Vec further down in this notebook.
corpus = brown.sents()

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


# The Word2Vec class is instantiated with the following parameters:

corpus: The input text corpus, i.e. the sentences held in the variable `corpus` (the Brown corpus, loaded above with `brown.sents()`).

---


vector_size: It determines the dimensionality of the word vectors, set to 100 in this case.

---


window: It defines the window size for the context words, set to 5.

---


min_count: It specifies the minimum frequency threshold for words to be included in the vocabulary, set to 5.

---


workers: It determines the number of worker threads used for training the model in parallel, set to 4.

In [None]:
# Fit a Word2Vec model to the Brown corpus sentences held in `corpus`.
model = Word2Vec(
    sentences=corpus,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=5,      # minimum frequency for a word to enter the vocabulary
    workers=4,        # worker threads used for training
)

# Try the trained model: look up the words most similar to 'house'.
similar_words = model.wv.most_similar('house')
print("Similar words to 'house':", similar_words)

Similar words to 'house': [('car', 0.9466149806976318), ('room', 0.9146735668182373), ('hand', 0.8710554838180542), ('bed', 0.8634669780731201), ('door', 0.8606128096580505), ('hall', 0.8588022589683533), ('office', 0.8505901098251343), ('hands', 0.8488484621047974), ('face', 0.8458495736122131), ('close', 0.8431581854820251)]


# Exercise 1: Train Word2Vec model on a different corpus

In [None]:

# 1.1 Download a different corpus using NLTK (here: the Gutenberg corpus)
nltk.download('gutenberg')

# 1.2 Preprocess the corpus: iterate over its sentences and lowercase every token
gutenberg_corpus = nltk.corpus.gutenberg.sents()
preprocessed_corpus = [[token.lower() for token in sentence] for sentence in gutenberg_corpus]

# 1.3 Train a Word2Vec model on the preprocessed corpus
new_model = Word2Vec(preprocessed_corpus, vector_size=100, window=5, min_count=5, workers=4)

# 1.4 Test the trained Word2Vec model with a different word.
# The query word is kept in a single variable so the printed label always
# matches the word that was actually looked up (the label used to say 'love'
# while the lookup was for 'extraordinary').
query_word = 'extraordinary'
similar_words = new_model.wv.most_similar(query_word)
print(f"Similar words to '{query_word}':", similar_words)

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


Similar words to 'love': [('awful', 0.8148207664489746), ('exquisite', 0.8071579337120056), ('singular', 0.8032702207565308), ('remarkable', 0.8004785180091858), ('active', 0.7999264001846313), ('improper', 0.7980139255523682), ('uncommon', 0.7917614579200745), ('ingenious', 0.7865627408027649), ('artist', 0.784122884273529), ('admirable', 0.77900630235672)]


# Exercise 2: Explore other pre-trained Word2Vec embeddings

In [None]:

# 2.1 Find and download other pre-trained Word2Vec models
# - You can explore other pre-trained models from websites like GloVe, FastText, etc.

# 2.2 Load the pre-trained model using `KeyedVectors.load_word2vec_format()`
from gensim.models import KeyedVectors

# Download pre-trained Word2Vec embeddings (GloVe)
# Note: This file is large (about 1.4GB) and may take some time to download.
# You can comment out this line if the file is already downloaded.
!wget -c "https://nlp.stanford.edu/data/glove.840B.300d.zip"
!unzip glove.840B.300d.zip

--2023-06-07 10:27:28--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2023-06-07 10:27:28--  https://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip]
Saving to: ‘glove.840B.300d.zip’


2023-06-07 10:34:19 (5.06 MB/s) - ‘glove.840B.300d.zip’ saved [2176768927/2176768927]

Archive:  glove.840B.300d.zip
  inflating: glove.840B.300d.txt     


In [None]:
# 2.2 Load the pre-trained vectors with `KeyedVectors.load_word2vec_format()`
from gensim.models import KeyedVectors

# The extracted GloVe file is read as text (binary=False); no_header=True
# tells gensim not to expect a leading header line in the file.
pretrained_model = KeyedVectors.load_word2vec_format(
    'glove.840B.300d.txt',
    binary=False,
    no_header=True,
)

# 2.3 Query the pre-trained vectors for the words most similar to a given word
similar_words = pretrained_model.most_similar('technology')
print("Similar words to 'technology':", similar_words)




Similar words to 'technology': [('technologies', 0.8787461519241333), ('technological', 0.7222408652305603), ('innovation', 0.7079066634178162), ('innovations', 0.6894813776016235), ('innovative', 0.6559900045394897), ('capabilities', 0.6543591022491455), ('high-tech', 0.6419724225997925), ('systems', 0.6346928477287292), ('advancements', 0.6320609450340271), ('Technology', 0.6287643909454346)]


  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]
