In [1]:
!pip install flair
import pandas as pd
import gensim
from gensim.models import Word2Vec
from flair.data import Sentence
from flair.embeddings import WordEmbeddings
from scipy import spatial


Collecting flair
  Downloading flair-0.13.1-py3-none-any.whl (388 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.3/388.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3>=1.20.27 (from flair)
  Downloading boto3-1.34.96-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bpemb>=0.3.2 (from flair)
  Downloading bpemb-0.3.5-py3-none-any.whl (19 kB)
Collecting conllu>=4.0 (from flair)
  Downloading conllu-4.5.3-py2.py3-none-any.whl (16 kB)
Collecting deprecated>=1.2.13 (from flair)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting ftfy>=6.1.0 (from flair)
  Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting janome>=0.4.2 (from flair)
  Downloading Janome-0.5.0-py2.py3-none-any.whl (19.

In [2]:
import nltk
from nltk.corpus import reuters

# Fetch the Reuters corpus into the local NLTK data directory so that
# `reuters.fileids()` / `reuters.words()` below can read it.
nltk.download('reuters')

def generate_corpus():
    """Build the training corpus from the NLTK Reuters collection.

    Returns:
        list[list[str]]: one entry per Reuters file id, in the order given by
        ``reuters.fileids()``; each entry is that document's tokens, lower-cased.
    """
    return [
        [token.lower() for token in reuters.words(doc_id)]
        for doc_id in reuters.fileids()
    ]

# Build the tokenised corpus once, then print one document (index 3) as a
# quick look at what the token lists contain.
corpus = generate_corpus()
print(corpus[3])

[nltk_data] Downloading package reuters to /root/nltk_data...


['thai', 'trade', 'deficit', 'widens', 'in', 'first', 'quarter', 'thailand', "'", 's', 'trade', 'deficit', 'widened', 'to', '4', '.', '5', 'billion', 'baht', 'in', 'the', 'first', 'quarter', 'of', '1987', 'from', '2', '.', '1', 'billion', 'a', 'year', 'ago', ',', 'the', 'business', 'economics', 'department', 'said', '.', 'it', 'said', 'janunary', '/', 'march', 'imports', 'rose', 'to', '65', '.', '1', 'billion', 'baht', 'from', '58', '.', '7', 'billion', '.', 'thailand', "'", 's', 'improved', 'business', 'climate', 'this', 'year', 'resulted', 'in', 'a', '27', 'pct', 'increase', 'in', 'imports', 'of', 'raw', 'materials', 'and', 'semi', '-', 'finished', 'products', '.', 'the', 'country', "'", 's', 'oil', 'import', 'bill', ',', 'however', ',', 'fell', '23', 'pct', 'in', 'the', 'first', 'quarter', 'due', 'to', 'lower', 'oil', 'prices', '.', 'the', 'department', 'said', 'first', 'quarter', 'exports', 'expanded', 'to', '60', '.', '6', 'billion', 'baht', 'from', '56', '.', '6', 'billion', '.',

In [3]:
# Hyperparameters shared by both architectures. Defining them once guarantees
# that the Skip-gram vs. CBOW comparison below differs only in `sg`.
w2v_params = dict(
    vector_size=100,   # each word is represented as a 100-dimensional vector
    window=2,          # maximum distance between the target word and a context word
    min_count=1,       # keep every word in the corpus, however rare
    workers=4,         # number of worker threads used for training
    epochs=20,         # number of passes over the corpus
    seed=1,            # gensim's default seed, made explicit
)
# NOTE: with workers > 1 the trained vectors still vary slightly between runs
# because of thread scheduling; gensim documents workers=1 (plus a fixed
# PYTHONHASHSEED) as the requirement for fully reproducible training.

# Skip-gram model (sg=1)
skipgram_model = Word2Vec(sentences=corpus, sg=1, **w2v_params)

# CBOW model (sg=0)
cbow_model = Word2Vec(sentences=corpus, sg=0, **w2v_params)

# Keep only the trained word vectors (KeyedVectors) for lookups and similarity queries.
skipgram_embeddings = skipgram_model.wv
cbow_embeddings = cbow_model.wv

In [4]:
# Show the Skip-gram vector learned for "thailand".
print(
    "The Skip-gram_embedding of 'thailand' : \n",
    skipgram_embeddings["thailand"],
)

The Skip-gram_embedding of 'thailand' : 
 [-0.15603839 -0.00603326 -0.11105411  0.21903665 -0.26254663  0.40396506
  0.01396661  0.7773606  -0.41795126 -0.31858456  0.19979882 -0.74580723
  0.27982032  0.36949688 -0.41706443  0.00497431  0.18380193 -0.13701607
  0.13044533 -0.7735183   0.25465208 -0.04976621  0.20222531 -0.23979989
 -0.9917326   0.29935202 -0.22822136 -0.39437908 -0.5242264  -0.10090041
 -0.55606407 -0.1967668   0.57211643  0.09566039 -0.04791538  1.2255208
  0.05564652 -0.31810227 -0.05274864 -0.76432794 -0.18647344  0.38693848
 -0.28949514  0.41967157  0.6661168   0.50532615  0.07026751 -0.43118724
 -0.68639207 -0.47972402  0.01155642 -0.36420572  0.13330029 -0.11963738
  0.24306135 -0.1028681  -0.02672275 -0.36016703 -0.30766487  0.26538062
 -0.25173545 -0.15910563 -0.47057882  0.41143814  0.23199537  0.6190805
  0.12625107  0.6568882  -0.67068374 -0.02607581 -0.09061975  0.8411479
 -0.21854362  0.45735744  0.93285877 -0.09851194  0.04778669 -0.2911984
  0.46645468 

In [5]:
# Show the CBOW vector learned for "thailand".
print(
    "The CBOW_embedding of 'thailand' : \n",
    cbow_embeddings["thailand"],
)

The CBOW_embedding of 'thailand' : 
 [ 0.88566273 -0.3229559   0.382801   -0.8844002  -0.13663106  0.2765259
  0.45512387  0.14587875 -0.7976553  -0.8645363  -0.37102258 -0.4657008
 -0.23158105 -0.67073125 -0.90604377  0.23075257 -0.05584046 -0.58842105
 -0.44956765 -0.41617063  0.8544379  -0.46696728 -0.23071349  0.2575898
 -0.1910855   1.2417698   0.69984037  1.5674213   0.9750787  -1.114649
 -0.4815869  -0.14850593  0.6902104   0.4159567   0.17609115  1.2805328
 -0.33867988 -0.9177701   0.50461185 -0.36608216 -0.7650224   0.53250265
  0.07033613  0.3546938   1.3543811  -0.33472905  0.5947876  -0.7403004
  0.00729801 -0.6773257   0.32179683  0.21093307 -0.01939125 -0.09739932
  0.3232425   0.24686888 -0.27116483 -0.3548509  -0.00271616 -0.7806319
 -0.69774526 -0.7225863  -1.2793962   0.23244305 -0.21440572 -0.87688226
 -1.2989095   0.7599836  -0.34391677  0.74984807 -0.32563064 -0.11574651
  0.17568281  0.43642554  0.9563689  -0.15040624  0.67378914 -0.9424629
  0.20203236  1.3750023

In [6]:
# Show the Skip-gram vector learned for "trade".
print(
    "The Skip-gram_embedding of 'trade' : \n",
    skipgram_embeddings["trade"],
)

The Skip-gram_embedding of 'trade' : 
 [-0.76614124  0.36359707  0.15471853  0.07219135 -0.18135384  0.11271082
  1.1122818   0.94440234 -0.27762777  0.5499714   0.38398096 -0.01357417
 -0.31791762 -0.02455545 -1.0390977  -0.51165813  0.2830303  -0.7046863
 -0.00280256 -0.25666225  0.29564464 -0.5033032  -0.05743292 -0.22197866
  0.53501856  0.581971    0.7139737  -0.46013144 -0.22374852 -0.9955071
 -0.4711841  -0.45177153  0.15057275 -0.33895776 -0.07318382  0.8589752
  0.62208384 -0.162363   -0.5840605   0.17780139  0.07480626  0.09285251
  0.29921672 -0.22097622  0.7137623   0.4171728  -0.1153111   0.36560863
  0.04347743  0.00515418  0.09267402 -0.14990652  0.3693315  -0.34644446
 -0.6330659  -0.43300435 -0.38794887 -0.524245    0.09678399 -0.43594342
 -0.02315399  0.13709652 -0.04015827 -0.7095326  -0.19035205  1.1814023
 -0.07929632  0.19120938 -0.6649558  -0.02460628 -0.17532267  0.24799515
 -0.24816523  0.5379412   0.17348477  1.4490371   0.20449075 -0.12922789
 -0.66561     0.

In [7]:
# Show the CBOW vector learned for "trade". The label names the CBOW model
# because the vector printed here is looked up in `cbow_embeddings`.
print(
    "The CBOW_embedding of 'trade' : \n",
    cbow_embeddings["trade"],
)

The Skip-gram_embedding of 'trade' : 
 [-1.3626528e+00 -4.5252475e-01  1.1194279e+00 -4.5388675e-01
 -9.6635133e-01  3.6116259e+00  4.2640676e+00 -1.4659275e+00
  1.4106661e+00 -6.9094849e-01  2.0567904e+00  2.0821903e+00
 -4.1894823e-02 -4.6844751e-01 -1.6428123e+00 -6.4824086e-01
 -1.6044396e-01 -2.8221800e+00  2.1379442e+00  7.3478192e-02
  6.4263707e-01 -1.4374151e+00 -2.6956990e-01  9.5279090e-04
  2.2756503e+00  5.9665757e-01  2.6338606e+00 -2.0194330e+00
 -9.0620065e-01 -2.4156487e+00  4.0865967e-01  1.5372337e+00
  7.7637541e-01  2.4986520e+00  4.2684546e-01  5.5969012e-01
  2.6449440e+00 -1.3981080e-01 -6.7595083e-01  1.0147005e+00
 -1.0036540e+00  3.9079672e-01  8.2990718e-01 -8.6784118e-01
  1.3738923e+00  1.2237054e+00 -4.5756963e-01 -1.8417501e-01
  2.3137980e+00 -2.1492419e+00 -4.0799350e-01  8.9149451e-01
 -1.1261982e+00 -7.2054648e-01 -6.9329494e-01 -1.4682560e+00
 -1.9090275e-01  1.4489138e+00  3.1846967e-01  7.8452963e-01
 -1.8070840e+00  6.4065540e-01 -9.7276455e-01 

In [8]:
# Sanity check: the similarity of a word with itself.
print(skipgram_embeddings.similarity("trade", "trade"))

1.0


In [11]:
# Similarity between two tokens under the Skip-gram vectors.
# "janunary" is spelled exactly as the token occurs in the corpus document
# printed earlier (a misspelling of "january" in the source text).
compare_similarity = skipgram_embeddings.similarity("janunary", "march")
print("Similarity between janunary and march:", compare_similarity)

Similarity between janunary and march: 0.48615858


In [12]:
# Similarity between two country names under the CBOW vectors.
compare_similarityCBOw = cbow_embeddings.similarity("thailand", "india")
print("Similarity between thailand and india:", compare_similarityCBOw)

Similarity between thailand and india: 0.5744101


In [13]:
# Nearest neighbours of "thailand" in the Skip-gram vector space.
print(skipgram_embeddings.most_similar("thailand"))

[('chile', 0.6603491902351379), ('copra', 0.6094616651535034), ('indonesia', 0.6065950393676758), ('china', 0.5885546803474426), ('madagascar', 0.5863330960273743), ('australia', 0.5851213335990906), ('spain', 0.581716775894165), ('kaohsiung', 0.5800643563270569), ('nigeria', 0.5740512013435364), ('heilongjiang', 0.5737406611442566)]


In [14]:
# Nearest neighbours of "thailand" in the CBOW vector space.
print(cbow_embeddings.most_similar("thailand"))

[('spain', 0.6965895891189575), ('indonesia', 0.6937969923019409), ('australia', 0.664749801158905), ('italy', 0.6195459961891174), ('malaysia', 0.600208044052124), ('kenya', 0.6001816987991333), ('france', 0.5938326716423035), ('china', 0.592526376247406), ('peru', 0.5915660262107849), ('cuba', 0.5862504839897156)]


In [15]:
# Rank of "india" relative to "thailand" in each model, as reported by
# KeyedVectors.rank (Skip-gram first, then CBOW).
print(skipgram_embeddings.rank("thailand", "india"))
print(cbow_embeddings.rank("thailand", "india"))

29
14


In [36]:
# Cosine similarity (1 - cosine distance) between the Skip-gram and the CBOW
# vector for the same word.
# NOTE(review): the two vectors come from separately trained models, so this
# cross-model number may not be meaningful as a comparison; confirm intent.
similarity = 1 - spatial.distance.cosine(
    skipgram_embeddings["trade"],
    cbow_embeddings["trade"],
)
similarity

0.5700680017471313

In [16]:
# Load flair's pre-trained GloVe word embeddings (the files are downloaded and
# cached when they are not already in the local flair cache).
glove_embedding = WordEmbeddings("glove")

2024-05-02 13:48:26,601 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmpyscaiquv


100%|██████████| 153M/153M [00:08<00:00, 17.9MB/s]

2024-05-02 13:48:36,058 copying /tmp/tmpyscaiquv to cache at /root/.flair/embeddings/glove.gensim.vectors.npy





2024-05-02 13:48:36,733 removing temp file /tmp/tmpyscaiquv
2024-05-02 13:48:37,465 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim not found in cache, downloading to /tmp/tmpcym8fuob


100%|██████████| 20.5M/20.5M [00:01<00:00, 11.1MB/s]

2024-05-02 13:48:39,965 copying /tmp/tmpcym8fuob to cache at /root/.flair/embeddings/glove.gensim
2024-05-02 13:48:39,993 removing temp file /tmp/tmpcym8fuob





In [18]:
# Two sentences that both contain the word "tie", used with different meanings
# (verb: to fasten; noun: a necktie), wrapped as flair Sentence objects.
glove_sentence1 = Sentence('She used a colorful scarf to tie her hair into a stylish bun')
glove_sentence2 = Sentence('His job interview went well, but he did not have a tie to wear with his suit')

In [19]:
# Inspect the first sentence: the Sentence itself, its token list, and its first token.
print(
    glove_sentence1,
    glove_sentence1.tokens,
    glove_sentence1[0],
    sep="\n",
)

Sentence[13]: "She used a colorful scarf to tie her hair into a stylish bun"
[Token[0]: "She", Token[1]: "used", Token[2]: "a", Token[3]: "colorful", Token[4]: "scarf", Token[5]: "to", Token[6]: "tie", Token[7]: "her", Token[8]: "hair", Token[9]: "into", Token[10]: "a", Token[11]: "stylish", Token[12]: "bun"]
Token[0]: "She"


In [20]:
# Inspect the second sentence: the Sentence itself, its token list, and its first token.
print(
    glove_sentence2,
    glove_sentence2.tokens,
    glove_sentence2[0],
    sep="\n",
)

Sentence[18]: "His job interview went well, but he did not have a tie to wear with his suit"
[Token[0]: "His", Token[1]: "job", Token[2]: "interview", Token[3]: "went", Token[4]: "well", Token[5]: ",", Token[6]: "but", Token[7]: "he", Token[8]: "did", Token[9]: "not", Token[10]: "have", Token[11]: "a", Token[12]: "tie", Token[13]: "to", Token[14]: "wear", Token[15]: "with", Token[16]: "his", Token[17]: "suit"]
Token[0]: "His"


In [23]:
# Attach a GloVe vector to every token of the first sentence.
glove_embedding.embed(glove_sentence1)

# Print each token followed by its embedding tensor.
for token in glove_sentence1:
    print(token, token.embedding, sep="\n")

Token[0]: "She"
tensor([ 0.3144,  0.1531,  0.1826, -0.0959,  0.1320,  0.4428, -0.1401,  0.8488,
         0.5705,  0.2021,  0.3575,  0.2790,  0.2424,  0.5929,  0.0709, -0.2236,
         0.2905,  0.2584, -0.5239,  0.2074, -0.1903,  0.0785,  0.3739,  0.1341,
         0.6007,  0.7033, -0.5381, -1.5178,  0.4962, -0.3139, -0.4069,  1.1136,
         0.4728,  0.3203, -0.0774,  0.2923, -0.2867,  0.1330,  0.1037, -0.2659,
        -1.0749, -0.1974,  0.1176, -0.5536, -0.2697, -0.0694, -0.0873, -0.6109,
         0.7917, -0.4573, -0.2104, -0.4465,  1.0217,  1.4455, -0.1559, -2.9029,
        -0.1445, -0.2267,  0.8572,  1.1399,  0.1820,  1.0707, -0.3531,  0.1442,
         0.6392, -0.3528,  0.7269,  0.3061,  0.2623,  0.4319,  0.1371, -0.1222,
         0.1147,  0.2121,  0.2266,  0.8510, -0.1074, -0.4954, -0.8858, -0.5439,
         0.1720, -0.0254, -0.0805,  0.3384, -1.9701, -1.4076, -0.0744, -0.2123,
        -0.6591, -0.6036,  0.0125, -0.7423,  0.9570,  0.4106, -0.5335,  0.6575,
        -0.5649, -0.0427

In [25]:
# Attach a GloVe vector to every token of the second sentence.
glove_embedding.embed(glove_sentence2)

# Print each token followed by its embedding tensor.
for token in glove_sentence2:
    print(token, token.embedding, sep="\n")

Token[0]: "His"
tensor([ 1.2883e-01, -8.2209e-01,  2.7438e-01, -6.9014e-02,  1.7989e-01,
         7.2605e-01, -1.5112e-01,  8.5541e-03, -9.5122e-01,  7.7243e-01,
        -2.8375e-01,  2.8329e-01,  1.4825e-01, -1.2230e-02, -1.9267e-02,
        -3.4460e-02,  3.1506e-01, -1.6639e-01, -1.3435e-02, -2.0459e-03,
         6.4905e-02, -2.0989e-01,  1.2524e-01,  3.5230e-01,  6.4040e-01,
         5.9570e-02, -8.0302e-01, -8.1648e-01,  6.6134e-01,  5.9970e-02,
        -6.1521e-02,  8.4922e-01, -2.8733e-02,  2.7670e-01, -1.0068e+00,
         7.1758e-01, -3.7257e-01,  4.3064e-01, -4.9244e-01,  3.8683e-01,
        -3.6828e-01,  2.7982e-02,  1.5346e+00, -6.0533e-01, -3.4449e-01,
        -1.7069e-01,  2.9288e-01, -5.3581e-01,  5.6035e-01, -6.3013e-01,
        -1.2308e-01,  9.3633e-02,  5.9336e-01,  1.5214e+00, -9.2629e-02,
        -3.1408e+00,  1.3931e-01, -5.3820e-01,  1.1736e+00,  6.2318e-01,
         4.3621e-01,  1.2856e+00,  1.2121e-01,  4.6206e-01,  5.6142e-01,
        -4.1439e-01,  9.4360e-01,  

In [30]:
# Token 6 of the first sentence is "tie"; report the length of its GloVe vector.
print("The size of the embedding vector of the word 'tie': ")
len(glove_sentence1[6].embedding)

The size of the embedding vector of the word 'tie': 


100

In [31]:
# Token 12 of the second sentence is "tie"; report the length of its GloVe vector.
print("The size of the embedding vector of the word 'tie': ")
len(glove_sentence2[12].embedding)

The size of the embedding vector of the word 'tie': 


100

In [34]:
# Cosine similarity between the GloVe vectors of "tie" in the two sentences
# (token 6 of sentence 1, token 12 of sentence 2). GloVe word embeddings are
# static, so the same word gets the same vector regardless of context and the
# similarity is expected to be 1 even though the word senses differ.
#
# The embeddings are torch tensors; convert them explicitly to NumPy arrays on
# the CPU before handing them to SciPy, instead of relying on implicit
# conversion (which fails when the tensors live on a GPU).
similarity = 1 - spatial.distance.cosine(
    glove_sentence1[6].embedding.detach().cpu().numpy(),
    glove_sentence2[12].embedding.detach().cpu().numpy(),
)
similarity

1