# Assignment 1 : Search Engine

## Task 1. Preparation and Training

In [4]:
!python3 skipgram.py

corpus_tokenized sample : 4623
voc_size : 812
 19%|███████▌                               | 962/5000 [00:02<00:10, 401.43it/s]Epoch   1000 | Loss: 8.695992
 40%|███████████████▏                      | 1999/5000 [00:06<00:08, 367.12it/s]Epoch   2000 | Loss: 6.969626
 60%|██████████████████████▋               | 2987/5000 [00:10<00:05, 350.37it/s]Epoch   3000 | Loss: 6.696272
 80%|██████████████████████████████▏       | 3977/5000 [00:13<00:02, 378.44it/s]Epoch   4000 | Loss: 6.260726
 99%|█████████████████████████████████████▋| 4957/5000 [00:16<00:00, 340.94it/s]Epoch   5000 | Loss: 6.617470
100%|██████████████████████████████████████| 5000/5000 [00:16<00:00, 300.58it/s]


In [5]:
!python3 neg.py

corpus_tokenized sample : 4623
voc_size : 812
 19%|███████▍                               | 959/5000 [00:03<00:21, 189.09it/s]Epoch   1000 | Loss: 5.292124
 39%|██████████████▊                       | 1952/5000 [00:06<00:08, 357.65it/s]Epoch   2000 | Loss: 1.846857
 60%|██████████████████████▋               | 2977/5000 [00:09<00:05, 381.16it/s]Epoch   3000 | Loss: 1.503825
 80%|██████████████████████████████▎       | 3994/5000 [00:12<00:02, 346.80it/s]Epoch   4000 | Loss: 2.561615
 99%|█████████████████████████████████████▊| 4969/5000 [00:14<00:00, 309.03it/s]Epoch   5000 | Loss: 5.038004
100%|██████████████████████████████████████| 5000/5000 [00:14<00:00, 334.07it/s]


In [1]:
!python3 glove.py

corpus_tokenized sample : 4623
330078it [00:00, 609404.45it/s]
voc_size : 812
 20%|███████▋                               | 989/5000 [00:02<00:09, 402.90it/s]Epoch: 1000 | cost: 7.054244 | time: 0m 2s
 40%|███████████████▏                      | 1993/5000 [00:04<00:06, 478.11it/s]Epoch: 2000 | cost: 1.901738 | time: 0m 4s
 60%|██████████████████████▊               | 2997/5000 [00:07<00:04, 462.23it/s]Epoch: 3000 | cost: 1.622313 | time: 0m 7s
 80%|██████████████████████████████▎       | 3987/5000 [00:09<00:02, 447.01it/s]Epoch: 4000 | cost: 1.335360 | time: 0m 9s
100%|█████████████████████████████████████▉| 4990/5000 [00:12<00:00, 434.34it/s]Epoch: 5000 | cost: 0.726250 | time: 0m 12s
100%|██████████████████████████████████████| 5000/5000 [00:12<00:00, 412.94it/s]


## Task 2. Model Comparison and Analysis

In [3]:
# Use corpus from nltk
import nltk
# nltk.download('brown')
from nltk.corpus import brown

corpus_tokenized = nltk.corpus.brown.sents(categories='news')
print('corpus_tokenized sample :',len(corpus_tokenized))

#1. tokenization
corpus = [[word.lower() for word in sent] for sent in corpus_tokenized]
corpus = corpus[:100]

#2. numeralization
#find unique words
flatten = lambda l: [item for sublist in l for item in sublist]
#assign unique integer
vocabs = list(set(flatten(corpus))) #all the words we have in the system - <UNK>

#create handy mapping between integer and word
word2index = {v:idx for idx, v in enumerate(vocabs)}

#append UNK
vocabs.append('<UNK>')
word2index['<UNK>'] = len(vocabs) - 1

#vocab size
voc_size = len(vocabs)
print('voc_size :',voc_size)

corpus_tokenized sample : 4623
voc_size : 812


### 2.1 Compare Skip-gram, Skip-gram with negative sampling, GloVe models on training loss, training time, syntactic and semantic accuracy, similar to the methods in the Word2Vec and GloVe paper. 

In [4]:
import torch
import os
from all_models import Skipgram, SkipgramNeg, GloVe

list_weight = os.listdir('./models')
# list_weight.remove('.ipynb_checkpoints')
print(list_weight)

embedding_size  = 2
model_skipgram  = Skipgram(voc_size, embedding_size)
model_neg       = SkipgramNeg(voc_size, embedding_size)
model_glove     = GloVe(voc_size, embedding_size)

list_model      = [model_skipgram, model_glove, model_neg]

for idx, each_weight in enumerate(list_weight):
    print(list_model[idx].__class__.__name__)
    pretrained_state_dict = torch.load(os.path.join('./models/', each_weight))
    # # Load the state dictionary into the new model
    list_model[idx].load_state_dict(pretrained_state_dict)

print(list_model)

['Skipgram.pt', 'GloVe.pt', 'SkipgramNeg.pt']
Skipgram
GloVe
SkipgramNeg
[Skipgram(
  (embedding_center): Embedding(812, 2)
  (embedding_outside): Embedding(812, 2)
), GloVe(
  (embedding_center): Embedding(812, 2)
  (embedding_outside): Embedding(812, 2)
  (v_bias): Embedding(812, 1)
  (u_bias): Embedding(812, 1)
), SkipgramNeg(
  (embedding_center): Embedding(812, 2)
  (embedding_outside): Embedding(812, 2)
  (logsigmoid): LogSigmoid()
)]


In [8]:
# !python3 -m spacy download en_core_web_sm

In [7]:
#Testing Set
import spacy
import numpy as np
import re
import pandas as pd

def clean_data(df_col):
    corpus = []
    for item in df_col:
        item = re.sub('[^A-Za-z0-9]+', ' ', str(item)) # remove special characters
        item = item.lower() # lower all characters
        item = item.split() # split data
        corpus.append(' '.join(str(x) for x in item))
    return corpus
    
nlp = spacy.load('en_core_web_sm')
text = open('./data/word-test.txt',mode='r')
df = pd.DataFrame(text.readlines())
df.head()

Unnamed: 0,0
0,// Copyright 2013 Google Inc. All Rights Reser...
1,: capital-common-countries\n
2,Athens Greece Baghdad Iraq\n
3,Athens Greece Bangkok Thailand\n
4,Athens Greece Beijing China\n


In [9]:
#Check Header 
header = df[0].str.startswith(':')
index_list = np.where(header)[0].tolist()
print(index_list)

[1, 508, 5033, 5900, 8368, 8875, 9868, 10681, 12014, 13137, 14194, 15794, 17355, 18688]


#### Semantic

In [10]:
#Semantic
#capital-common-countries 
df_capital_common_countries = df[1:508]
df_capital_common_countries.head()
#i pick only captial-common-countries to test

Unnamed: 0,0
1,: capital-common-countries\n
2,Athens Greece Baghdad Iraq\n
3,Athens Greece Bangkok Thailand\n
4,Athens Greece Beijing China\n
5,Athens Greece Berlin Germany\n


In [11]:
#1. tokenize
#data cleaned
corpus_test = clean_data(df_capital_common_countries[0])
#data tokenized
semantic_corpus_tokenized_test = [sent.split(" ") for sent in corpus_test]
semantic_corpus_tokenized_test.pop(0)
semantic_corpus_tokenized_test[:5]

[['athens', 'greece', 'baghdad', 'iraq'],
 ['athens', 'greece', 'bangkok', 'thailand'],
 ['athens', 'greece', 'beijing', 'china'],
 ['athens', 'greece', 'berlin', 'germany'],
 ['athens', 'greece', 'bern', 'switzerland']]

In [12]:
len(semantic_corpus_tokenized_test)

506

#### Syntactic

In [13]:
#Syntactic
#: gram7-past-tense
df_past_tense = df[15794:17354]
df_past_tense.head()
#i pick only past-tense to test

Unnamed: 0,0
15794,: gram7-past-tense\n
15795,dancing danced decreasing decreased\n
15796,dancing danced describing described\n
15797,dancing danced enhancing enhanced\n
15798,dancing danced falling fell\n


#### Cleaning

In [14]:
#1. tokenize
#data cleaned
corpus_test = clean_data(df_past_tense[0])
#data tokenized
syntactic_corpus_tokenized_test = [sent.split(" ") for sent in corpus_test]
syntactic_corpus_tokenized_test.pop(0)
syntactic_corpus_tokenized_test[:5]

[['dancing', 'danced', 'decreasing', 'decreased'],
 ['dancing', 'danced', 'describing', 'described'],
 ['dancing', 'danced', 'enhancing', 'enhanced'],
 ['dancing', 'danced', 'falling', 'fell'],
 ['dancing', 'danced', 'feeding', 'fed']]

In [15]:
len(syntactic_corpus_tokenized_test)

1559

In [16]:
corpus_tokenized_test = syntactic_corpus_tokenized_test + semantic_corpus_tokenized_test

In [17]:
#Put to pandas which easier to select column
import pandas as pd
data = pd.DataFrame(corpus_tokenized_test, columns=["A", "B", "C", "D"])

#2. numericalize (vocab)
#2.1 get all the unique words
#we want to flatten unit (basically merge all list)
flatten = lambda l: [item for sublist in l for item in sublist]
vocabs_test = list(set(flatten(corpus_tokenized_test)))

#2.2 assign id to all these vocabs
word2index = {v: idx for idx, v in enumerate(vocabs_test)}

#adding unknown word
vocabs_test.append('<UNK>')
word2index['<UNK>'] = len(vocabs_test) - 1

voc_size_test = len(vocabs_test)
voc_size_test

127

In [18]:
#testing draft
from tqdm.auto import tqdm
from numpy import dot
from numpy.linalg import norm

def cos_sim(a, b):
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    return cos_sim
    
#find embedding of fruit, cat
def get_embed(word, model, word2index):
    try:
        index = word2index[word]
    except :
        index = word2index['<UNK>'] #unknown
    word = torch.LongTensor([index])
    
    embed = (model.embedding_center(word)+model.embedding_outside(word))/2
    return np.array(embed[0].detach().numpy())
    # return embed[0][0].item(),embed[0][1].item()

def find_analogy(word_list, model, vocabs,word2index):
    word1, word2, word3, word4 = word_list
    emb_a, emb_b, emb_c = get_embed(word1, model, word2index),get_embed(word2, model,word2index),get_embed(word3, model,word2index)
    vector = emb_b - emb_a + emb_c
    similarity = -1 
    
    accuracy = 0
    for vocab in vocabs:
        if vocab not in [word1, word2, word3]: #ignore input words itself
            current_sim = cos_sim(vector,get_embed(vocab,model,word2index))
            if current_sim > similarity:
                similarity = current_sim #update better one
                d = (vocab, similarity)
                if d == word4:
                    accuracy = 1
                else:
                    accuracy = 0
                    
    return model.__class__.__name__, d, accuracy

#Test each model
for each_model in list_model:
    print(find_analogy(['dancing', 'danced', 'decreasing', 'decreased'], each_model, vocabs, word2index))

('Skipgram', ('saw', 0.47231573), 0)
('GloVe', ('saw', 0.9661224), 0)
('SkipgramNeg', ('saw', 0.9999987), 0)


  from .autonotebook import tqdm as notebook_tqdm


#### Find Analogy

In [160]:
def find_analogy_sets(analogy_sets, model, word2index, vocab):
    total_accuracy = 0
    num_sets = len(analogy_sets)

    for word_list in tqdm(analogy_sets):
        word1, word2, word3, word4 = word_list
        emb_a, emb_b, emb_c = get_embed(word1, model, word2index), get_embed(word2, model, word2index), get_embed(word3, model, word2index)
        vector = emb_b - emb_a + emb_c
        similarity = -1

        accuracy = 0
        for vocab_word in vocab:
            if vocab_word not in [word1, word2, word3]:
                current_sim = cos_sim(vector, get_embed(vocab_word, model, word2index))
                if current_sim > similarity:
                    similarity = current_sim
                    predicted_word = vocab_word
                    if predicted_word == word4:
                        accuracy = 1
                    else:
                        accuracy = 0

        total_accuracy += accuracy

    average_accuracy = total_accuracy / num_sets
    return model.__class__.__name__, average_accuracy

#### Semantic Result from Skipgram, SkipgramNeg, Glove

In [163]:
# Semantic Corpus Result
for each_model in list_model:
    result = find_analogy_sets(semantic_corpus_tokenized_test, each_model, word2index, vocab)
    print(result)

100%|██████████| 506/506 [01:56<00:00,  4.36it/s]


('Glove', 0.0)


100%|██████████| 506/506 [01:56<00:00,  4.35it/s]


('Skipgram', 0.0)


100%|██████████| 506/506 [01:56<00:00,  4.35it/s]

('SkipgramNeg', 0.0)





#### Syntactic Result from Skipgram, SkipgramNeg, Glove

In [164]:
# Syntactic Corpus Result
for each_model in list_model:
    result = find_analogy_sets(syntactic_corpus_tokenized_test, each_model, word2index, vocab)
    print(result)

100%|██████████| 1559/1559 [05:58<00:00,  4.34it/s]


('Glove', 0.011545862732520847)


100%|██████████| 1559/1559 [05:59<00:00,  4.34it/s]


('Skipgram', 0.015394483643361129)


100%|██████████| 1559/1559 [06:01<00:00,  4.31it/s]

('SkipgramNeg', 0.01603592046183451)





#### Glove gensim

In [17]:
# !pip3 install gensim

In [20]:
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

#you have to put this file in some python/gensim directory; just run it and it will inform where to put....
# glove_file = datapath('glove.6B.100d.txt')  #search on the google
glove_file = './data/glove.6B.50d.txt'
model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

In [21]:
def analogy_gensim(analogy_sets, model):
    total_accuracy = 0
    num_sets = len(analogy_sets)
    
    for word_list in tqdm(analogy_sets):
        word1, word2, word3, word4 = word_list
        result = model.most_similar(positive=[word3, word2], negative=[word1])
        if result[0][0] == word4:
            accuracy = 1
        else:
            accuracy = 0
        total_accuracy += accuracy
    average_accuracy = total_accuracy / num_sets
    return average_accuracy

#### Semantic/Syntactic Result from Gensim

In [23]:
semantic_result = analogy_gensim(semantic_corpus_tokenized_test, model)
print(f'semantic_result :{semantic_result:.3f}')

syntactic_result = analogy_gensim(syntactic_corpus_tokenized_test, model)
print(f'syntactic_result :{syntactic_result:.3f}')

100%|██████████| 506/506 [00:05<00:00, 93.58it/s] 


semantic_result :0.792


100%|██████████| 1559/1559 [00:19<00:00, 81.86it/s] 

syntactic_result :0.375





### 2.2 Use the similarity dataset to find the correlation between your models' dot product and the provided similarity metrics. Assess if your embeddings correlate with human judgment.

#### Gold standard

In [30]:
import pandas as pd

wordsim353 = open('./data/wordsim353_sim_rel/wordsim_relatedness_goldstandard.txt',mode='r')
df_wordsim353 = pd.DataFrame(wordsim353.readlines())

def clean_data_num(df_col):
    corpus = []
    for item in df_col:
        # item = re.sub('[^A-Za-z0-9]+', ' ', str(item)) # remove special characters
        item = item.lower() # lower all characters
        item = item.split() # split data
        corpus.append(' '.join(str(x) for x in item))
    return corpus

#1. tokenize
#data cleaned
corpus_wordsim353 = clean_data_num(df_wordsim353[0])

#data tokenized
corpus_tokenized_wordsim353 = [sent.split(" ") for sent in corpus_wordsim353]
# corpus_tokenized_wordsim353[:5]
wordsim_353 = pd.DataFrame(corpus_tokenized_wordsim353, columns=['x1','x2','correlation'])
wordsim_353.head()

Unnamed: 0,x1,x2,correlation
0,computer,keyboard,7.62
1,jerusalem,israel,8.46
2,planet,galaxy,8.11
3,canyon,landscape,7.53
4,opec,country,5.63


In [None]:
def get_embed(word, model, word2index):
    try:
        index = word2index[word]
    except :
        index = word2index['<UNK>'] #unknown
    word = torch.LongTensor([index])
    
    embed = (model.embedding_center(word)+model.embedding_outside(word))/2
    return np.array(embed[0].detach().numpy())

In [36]:
list_model

[Skipgram(
   (embedding_center): Embedding(812, 2)
   (embedding_outside): Embedding(812, 2)
 ),
 GloVe(
   (embedding_center): Embedding(812, 2)
   (embedding_outside): Embedding(812, 2)
   (v_bias): Embedding(812, 1)
   (u_bias): Embedding(812, 1)
 ),
 SkipgramNeg(
   (embedding_center): Embedding(812, 2)
   (embedding_outside): Embedding(812, 2)
   (logsigmoid): LogSigmoid()
 )]

In [37]:
wordsim = {}
for idx, model in enumerate(list_model):
    wordsim[model.__class__.__name__] = wordsim_353.apply(
        lambda row: np.dot(
            get_embed(row['x1'], model, word2index),
            get_embed(row['x2'], model, word2index)
        ), axis=1)

#### Correlation  
- Calculate a Spearman correlation coefficient with associated p-value.

In [53]:
from scipy.stats import spearmanr

# Example data
for idx, ws in enumerate(wordsim.keys()):
    # Calculate Spearman correlation coefficient
    corr_coef, p_value = spearmanr(wordsim_353['correlation'], wordsim[ws])
    # Display the result
    print(f"{list_model[idx].__class__.__name__}:")
    print(f"Spearman correlation: {corr_coef}")
    print(f"P-value: {p_value}")

Skipgram:
Spearman correlation: nan
P-value: nan
GloVe:
Spearman correlation: nan
P-value: nan
SkipgramNeg:
Spearman correlation: nan
P-value: nan


## Conclusion
**Section 1** : I have tried to train the model (skipgram, NEG, GloVe) using only 5000 epoch 

| Model          | Window Size | Dimension | Training Loss | Syntactic Accuracy | Semantic accuracy |
|----------------|-------------|-----------|---------------|--------------------|-------------------|
| Skipgram       |      2      |     2     |  9.523348     |         0          |         0         |
| Skipgram (NEG) |      2      |     2     |  1.891104     |         0          |         0         |
| Glove          |      2      |     2     |  93.934296    |         0          |         0         |
| Glove (Gensim) |      10     |    100    |       -       |       0.792        |       0.375       |

**Section 2** : using spearman correlation to find similarity

| Model           | Skipgram | NEG    | GloVe  | GloVe (gensim) | Y_true |
|-----------------|----------|--------|--------|----------------|--------|
| MSE             | 26.726   | 26.726 | 26.726 | 5.29           | 5.03   |
