# Training the Model

In [3]:
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
# from src.dataset import Word2VecDataset
from torch.autograd import Variable

In [6]:
DATA_DIR = "data/wikiTest"

In [10]:
cbow_dataset = Word2VecDataset( DATA_DIR, 2, "cbow")
sgram_dataset = Word2VecDataset( DATA_DIR, 2, "skipgram")

== Loading Data ===
Added contents of data/wikiTest/article1.txt to corpus


100%|█████████████████████████████████████████| 80/80 [00:00<00:00, 5671.71it/s]


lemmatizing and removing stopwords...


100%|██████████████████████████████████████| 303/303 [00:00<00:00, 17029.45it/s]


== Loading Data ===
Added contents of data/wikiTest/article1.txt to corpus


100%|████████████████████████████████████████| 80/80 [00:00<00:00, 11905.07it/s]


lemmatizing and removing stopwords...


100%|██████████████████████████████████████| 303/303 [00:00<00:00, 20668.99it/s]


In [11]:
print(f"CBOW Vocab size - {len(cbow_dataset.vocab)},\nSkipgram Vocab size - {len(sgram_dataset.vocab)}")

CBOW Vocab size - 1642,
Skipgram Vocab size - 1642


**Accessing elements of Dataset**

In [15]:
# cbow
print("CBOW ", cbow_dataset[0])

# skipgram
print("Skipgram ", sgram_dataset[0])

CBOW  (tensor([1, 2, 5, 6]), tensor(3))
Skipgram  (tensor(2), tensor(3))


In [16]:
print("CBOW word pairs - ", len(cbow_dataset))

CBOW word pairs -  1797


In [17]:
print("Skipgram word pairs - ", len(sgram_dataset))

Skipgram word pairs -  7970


## Model Building

In [64]:
#cbow model
train_loader = DataLoader(cbow_dataset, batch_size = 32, shuffle = not True)

vocab_size = len(cbow_dataset.vocab)
embedding_size = 10
embeddings_input = nn.Embedding(vocab_size, embedding_size) # embedding layer
linear = nn.Linear(embedding_size, vocab_size) # linear layer


for context, target in train_loader:
    embed = embeddings_input(context)
    out = torch.sum(linear(embed), dim = 1)
    print(out.size())
    break

torch.Size([32, 1642])


In [65]:
#skipgram model
train_loader = DataLoader(sgram_dataset, batch_size = 32, shuffle = not True)

vocab_size = len(sgram_dataset.vocab)
embedding_size = 10
embeddings_input = nn.Embedding(vocab_size, embedding_size) # embedding layer
linear = nn.Linear(embedding_size, vocab_size) # linear layer

for context, target in train_loader:
    embed = embeddings_input(context)
    out = linear(embed)
    print(out.size())
    break

torch.Size([32, 1642])


---

In [1]:
MODEL_DIR = "../model"

In [4]:
saved_model = torch.load(f"{MODEL_DIR}/model_glove_neg10.pth")

In [5]:
glove_model = torch.

{'model_state_dict': OrderedDict([('_focal_embeddings.weight',
               tensor([[ 0.9362,  0.2351,  0.4671,  ..., -0.7640, -0.5844,  0.3263],
                       [-1.2083, -0.1838,  0.6645,  ...,  0.0386,  0.4747, -1.8250],
                       [ 1.2263,  1.6646,  1.6518,  ...,  0.4046, -0.5968,  0.2918],
                       ...,
                       [-0.7047,  0.9246, -0.5855,  ..., -0.9109,  0.4876, -0.1069],
                       [ 0.3169, -0.8631, -0.0330,  ...,  0.5430,  0.0518, -0.5775],
                       [-0.5016,  0.2833,  0.3451,  ...,  1.8717, -2.0402,  0.7413]],
                      device='cuda:0')),
              ('_context_embeddings.weight',
               tensor([[-1.1331,  1.3374,  1.2159,  ...,  0.2099,  0.8879, -0.5372],
                       [ 0.4775, -0.3400, -0.5634,  ..., -0.4830, -0.6062,  0.0761],
                       [ 0.5611,  0.1507, -0.5462,  ..., -0.4529, -1.6863, -1.3211],
                       ...,
                       [-1.27

---

# Testing the Model

In [3]:
import pandas as pd

In [4]:
TEST_DIR = "data/SimLex-999"

In [5]:
test_dataset = pd.read_csv(f"{TEST_DIR}/SimLex-999.txt", sep ="\t")

In [6]:
test_dataset

Unnamed: 0,word1,word2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex)
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41
1,smart,intelligent,A,9.20,1.75,2.46,1,7.11,1,0.67
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93
...,...,...,...,...,...,...,...,...,...,...
994,join,acquire,V,2.85,2.86,2.93,2,0.00,0,0.99
995,send,attend,V,1.67,2.70,3.17,2,0.00,0,1.44
996,gather,attend,V,4.80,2.75,3.17,2,0.00,0,1.97
997,absorb,withdraw,V,2.97,3.11,3.04,2,0.00,0,1.75
