In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import string
import json

## 1. Corpus Loading and Preprocessing


In [2]:
# Load the Reuters corpus categorized under "livestock"
import nltk
nltk.download('reuters')
from nltk.corpus import reuters 
category_livestock = reuters.fileids("livestock")
corpus = [reuters.words(fid) for fid in category_livestock]
print(corpus)


[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\A443696\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


[['JAPAN', 'MINISTRY', 'SAYS', 'OPEN', 'FARM', 'TRADE', ...], ['USDA', 'TO', 'PROPOSE', 'FOREIGN', 'MEAT', ...], ['U', '.', 'S', '.', 'MEAT', ',', 'POULTRY', ...], ['MEXICAN', 'CATTLE', 'IMPORTS', 'TO', 'BE', 'BRANDED', ...], ['ARGENTINE', 'CATTLE', 'MARKET', 'REPORT', 'ABOUT', ...], ['HOG', 'AND', 'CATTLE', 'SLAUGHTER', 'GUESSTIMATES', ...], ['EASTERN', 'DISTRIBUTIVE', 'BEEF', 'TRADE', 'WEEKLY', ...], ['DISEASE', 'PUTS', 'ZIMBABWE', 'BEEF', 'EXPORTS', ...], ['CCC', 'GUARANTEES', 'TO', 'IRAQ', 'SWITCHED', '--', ...], ['U', '.', 'S', '.', 'DOLLAR', 'LOSSES', 'PROPEL', ...], ['U', '.', 'S', '.', 'FEEDER', 'STEER', 'PRICE', 'The', ...], ['HOG', 'AND', 'CATTLE', 'SLAUGHTER', 'GUESSTIMATES', ...], ['CCC', 'ACCEPTS', 'BONUS', 'ON', 'CATTLE', 'TO', ...], ['SWIFT', 'TO', 'SELL', 'SOUTH', 'DAKOTA', 'PORK', ...], ['SWIFT', '&', 'lt', ';', 'SFTPF', '>', 'TO', 'SELL', ...], ['EC', 'MEAT', 'DIRECTIVE', 'DEADLINE', 'SEEN', ...], ['JAPAN', '1986', '/', '87', 'COMPOUND', 'FEED', ...], ['INDONESIA', '/

In [3]:
# Preprocess the corpus, filter out non-alpha tokens, convert to lowercase, and remove punctuation and digits
def preprocess_corpus(corpus):
    table = str.maketrans('', '', string.punctuation + string.digits)
    return [[word.lower().translate(table) for word in doc if word.isalpha()] for doc in corpus]

corpus = preprocess_corpus(corpus)
corpus

[['japan',
  'ministry',
  'says',
  'open',
  'farm',
  'trade',
  'would',
  'hit',
  'u',
  's',
  'japan',
  's',
  'agriculture',
  'ministry',
  'angered',
  'by',
  'u',
  's',
  'demands',
  'that',
  'japan',
  'open',
  'its',
  'farm',
  'products',
  'market',
  'will',
  'tell',
  'u',
  's',
  'officials',
  'at',
  'talks',
  'later',
  'this',
  'month',
  'that',
  'liberalisation',
  'would',
  'harm',
  'existing',
  'u',
  's',
  'farm',
  'exports',
  'to',
  'japan',
  'a',
  'senior',
  'ministry',
  'official',
  'said',
  'imports',
  'from',
  'the',
  'u',
  's',
  'would',
  'drop',
  'due',
  'to',
  'active',
  'sales',
  'drives',
  'by',
  'other',
  'suppliers',
  'the',
  'official',
  'who',
  'declined',
  'to',
  'be',
  'named',
  'said',
  'japan',
  'is',
  'the',
  'largest',
  'customer',
  'for',
  'u',
  's',
  'farm',
  'products',
  'and',
  'it',
  'is',
  'not',
  'reasonable',
  'for',
  'the',
  'u',
  's',
  'to',
  'demand',
  'japan'

## 2. Vocabulary Creation

In [4]:
# Flatten the corpus into a single list of words
flatten_corpus = [word for doc in corpus for word in doc]
# Create a set of unique words in the corpus
vocabs = list(set(flatten_corpus))
vocabs

['respective',
 'conducting',
 'canada',
 'foreign',
 'sees',
 'probably',
 'making',
 'ground',
 'commitment',
 'administrative',
 'maximun',
 'blaylock',
 'feedlots',
 'relation',
 'operations',
 'imposed',
 'max',
 'southwestern',
 'assurance',
 'special',
 'cooked',
 'erecting',
 'knuckle',
 'dense',
 'substitutes',
 'coming',
 'unloaded',
 'manly',
 'directive',
 'instructs',
 'include',
 'detail',
 'before',
 'butter',
 'shearson',
 'exists',
 'expand',
 'taxes',
 'syndicats',
 'abattoirs',
 'processing',
 'volume',
 'propel',
 'favour',
 'protesting',
 'grassley',
 'marketings',
 'whole',
 'nebr',
 'congressmen',
 'along',
 'opposed',
 'moisture',
 'restarts',
 'unacceptable',
 'switched',
 'projections',
 'agronomist',
 'insert',
 'administrator',
 'over',
 'anticipated',
 'into',
 'farrowing',
 'reaffirmed',
 'israel',
 'conducts',
 'consumers',
 'eradication',
 'peru',
 'supply',
 'compliance',
 'baucus',
 'puerto',
 'barrier',
 'recommends',
 'ambitous',
 'identify',
 'fence

In [5]:
# Add a special token for unknown words
vocabs.append('<UNK>')

In [6]:
# Create a mapping from words to their indices
word2index = {v: idx for idx, v in enumerate(vocabs)}
print(word2index['<UNK>'])  # Print the index of the <UNK> token

2906


In [7]:
index2word = {v:k for k, v in word2index.items()}

## 3. Batch Preparation for Training


In [8]:
def random_batch(batch_size, corpus):

    skipgrams = []

    #loop each corpus
    for doc in corpus:
        # since we assign window size = 2
        # look from the third word until third last word
        for i in range(2, len(doc)-2):
            #center word
            center = word2index[doc[i]]
            #outside words = 4 words (2 words from left and 2 words from right)
            outside = (word2index[doc[i-2]],word2index[doc[i-1]], word2index[doc[i+1]],word2index[doc[i+2]])
            #for each of these 4 outside words, we need to append to a list
            for each_out in outside:
                skipgrams.append([center, each_out])
                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], [] #inputs = center word, labels = outside word 
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)


## 4. Preparing Vocabulary for Embedding Layer

In [9]:
# Model and Training Parameters
batch_size = 32
voc_size = len(vocabs)
emb_size = 10
k = 5


In [10]:
# Function to convert a sequence of words to their corresponding indices
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, voc_size)
all_vocabs

tensor([[   0,    1,    2,  ..., 2904, 2905, 2906],
        [   0,    1,    2,  ..., 2904, 2905, 2906],
        [   0,    1,    2,  ..., 2904, 2905, 2906],
        ...,
        [   0,    1,    2,  ..., 2904, 2905, 2906],
        [   0,    1,    2,  ..., 2904, 2905, 2906],
        [   0,    1,    2,  ..., 2904, 2905, 2906]])

## 5. Skipgram Model Definition

In [11]:
#count
from collections import Counter

word_count = Counter(flatten_corpus)
# word_count

#get the total number of words
num_total_words = sum(word_count.values())
# num_total_words

In [12]:
z = 0.001
unigram_table = []

for v in vocabs:
    uw = word_count[v] / num_total_words
    uw_alpha = int((uw ** 0.75) / z)
    unigram_table.extend([v] * uw_alpha)
    
Counter(unigram_table)

Counter({'the': 127,
         'to': 78,
         'of': 66,
         'and': 65,
         'in': 52,
         'a': 50,
         'said': 48,
         's': 41,
         'for': 35,
         'u': 30,
         'on': 28,
         'beef': 26,
         'at': 25,
         'that': 24,
         'is': 23,
         'from': 23,
         'will': 20,
         'meat': 20,
         'cattle': 20,
         'mln': 20,
         'by': 20,
         'be': 19,
         'it': 19,
         'are': 18,
         'ec': 18,
         'year': 18,
         'was': 16,
         'with': 16,
         'would': 15,
         'an': 15,
         'trade': 15,
         'tonnes': 15,
         'pork': 14,
         'agriculture': 14,
         'as': 14,
         'farm': 13,
         'have': 13,
         'japan': 13,
         'has': 13,
         'ago': 12,
         'head': 12,
         'pct': 12,
         'he': 12,
         'department': 12,
         'been': 12,
         'dlrs': 11,
         'they': 11,
         'its': 11,
         'were':

In [13]:
import random

def negative_sampling(targets, unigram_table, k):
    batch_size = targets.shape[0]
    neg_samples = []
    for i in range(batch_size):  #(1, k)
        target_index = targets[i].item()
        nsample      = []
        while (len(nsample) < k):
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1, -1))
        
    return torch.cat(neg_samples) #batch_size, k

## 6. Model Initialization and Loss Calculation

In [14]:
x, y = random_batch(batch_size, corpus)
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)

In [15]:
neg_samples = negative_sampling(y_tensor, unigram_table, k)


In [22]:
y_tensor[1]

tensor([1946])

In [23]:
neg_samples[1]

tensor([2286,  792,  447, 1576,  150])

In [16]:
# create skipgram negative sampling
class SkipgramNeg(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(SkipgramNeg, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()
    
    def forward(self, center, outside, negative):
        #center, outside:  (bs, 1)
        #negative       :  (bs, k)
        
        center_embed   = self.embedding_center(center) #(bs, 1, emb_size)
        outside_embed  = self.embedding_outside(outside) #(bs, 1, emb_size)
        negative_embed = self.embedding_outside(negative) #(bs, k, emb_size)
        
        uovc           = outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, 1)
        ukvc           = -negative_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, k)
        ukvc_sum       = torch.sum(ukvc, 1).reshape(-1, 1) #(bs, 1)
        
        loss           = self.logsigmoid(uovc) + self.logsigmoid(ukvc_sum)
        
        return -torch.mean(loss)

In [17]:
#test your model
model = SkipgramNeg(voc_size, emb_size)

In [18]:
loss = model(x_tensor, y_tensor, neg_samples)

In [27]:
loss

tensor(3.5441, grad_fn=<NegBackward0>)

## 7. Training Epoch

In [28]:
import time

# Function to calculate time taken per epoch
def epoch_time(start_time, end_time):
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time / 60)
  elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
  return elapsed_mins, elapsed_secs


In [29]:
# Initialize model and optimizer
model      = SkipgramNeg(voc_size, emb_size)
optimizer  = optim.Adam(model.parameters(), lr=0.001)

In [31]:
num_epochs = 10000
start_time = time.time()

for epoch in range(num_epochs):
    
    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus)
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)
    
    #predict
    neg_samples = negative_sampling(label_tensor, unigram_table, k)
    loss = model(input_tensor, label_tensor, neg_samples)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()
    
    #update alpha
    optimizer.step()
    
    #print the loss
    if (epoch + 1) % 1000 == 0:
        print(f"Epoch {epoch+1:6.0f} | Loss: {loss:2.6f}")

end_time = time.time()
min, sec = epoch_time(start_time, end_time)
print(f"Epoch time: {min}:{sec}")

Epoch   1000 | Loss: 3.561222
Epoch   2000 | Loss: 3.527747
Epoch   3000 | Loss: 2.645572
Epoch   4000 | Loss: 2.763034
Epoch   5000 | Loss: 1.941732
Epoch   6000 | Loss: 2.529409
Epoch   7000 | Loss: 1.757275
Epoch   8000 | Loss: 1.613858
Epoch   9000 | Loss: 1.494330
Epoch  10000 | Loss: 2.683821
Epoch time: 13:34


In [32]:
# Save the trained model
torch.save(model, 'skipgram_neg_model.pth')

## 8. Model Loading and Evaluation on Analogy Tasks

In [38]:
# Load the trained model
model = torch.load('skipgram_neg_model.pth')
model.eval()

SkipgramNeg(
  (embedding_center): Embedding(2907, 10)
  (embedding_outside): Embedding(2907, 10)
  (logsigmoid): LogSigmoid()
)

In [39]:
# Load analogy data and separate into syntactic and semantic analogies
syntactic_analogies = []
semantic_analogies = []
current_category = None

with open('word-test.txt', 'r') as file:
    for line in file:
        if line.startswith(':'):
            if 'gram' in line:  # Assumption: syntactic questions contain 'gram'
                current_category = syntactic_analogies
            else:
                current_category = semantic_analogies
        elif current_category is not None:
            current_category.append(line.strip().split())

In [40]:
# Function to solve an analogy given words a, b, c
def solve_analogy(a, b, c, word_to_index, index_to_word, model):
    # Convert words to embeddings
    a_emb = model.embedding_center(torch.tensor([word_to_index.get(a, word_to_index["<UNK>"])]))
    b_emb = model.embedding_center(torch.tensor([word_to_index.get(b, word_to_index["<UNK>"])]))
    c_emb = model.embedding_center(torch.tensor([word_to_index.get(c, word_to_index["<UNK>"])]))

    # Calculate the expected embedding for d
    expected_d_emb = b_emb - a_emb + c_emb

    # Find the closest word to the expected embedding
    closest_word, closest_dist = None, float('inf')
    for word, idx in word_to_index.items():
        if word not in [a, b, c]:
            word_emb = model.embedding_center(torch.tensor([idx]))
            dist = torch.norm(expected_d_emb - word_emb)
            if dist < closest_dist:
                closest_word, closest_dist = word, dist
    return closest_word

In [41]:
# Function to calculate accuracy of the model on analogy tasks
def calculate_accuracy(analogies, word_to_index, index_to_word, model):
    correct = 0
    for analogy in analogies:
        a, b, c, expected_d = analogy
        predicted_d = solve_analogy(a, b, c, word_to_index, index_to_word, model)
        if predicted_d == expected_d:
            correct += 1
    return correct / len(analogies) if analogies else 0

In [42]:
# Calculate and print syntactic and semantic accuracies
syntactic_accuracy = calculate_accuracy(syntactic_analogies, word2index, index2word, model)
semantic_accuracy = calculate_accuracy(semantic_analogies, word2index, index2word, model)

print(f"Syntactic Accuracy: {syntactic_accuracy}")
print(f"Semantic Accuracy: {semantic_accuracy}")

Syntactic Accuracy: 0.0
Semantic Accuracy: 0.0


In [19]:
import scipy.stats
from scipy.stats import spearmanr

In [20]:
test_data = []
with open('Data/wordsim353_agreed.txt') as input_file:
    for line in input_file:
        words = line.strip().split()
        if len(words) == 3:  # Ensure there are exactly three elements (two words and a score)
            test_data.append(words)

In [21]:
import torch

# Step 2: Load the trained model
model = torch.load('skipgram_neg_model.pth')
model.eval()  # Set the model to evaluation mode

SkipgramNeg(
  (embedding_center): Embedding(2907, 10)
  (embedding_outside): Embedding(2907, 10)
  (logsigmoid): LogSigmoid()
)

In [22]:
# Step 3: Compute similarity scores using the model
model_scores = []
human_scores = []
for word1, word2, human_score in test_data:
    if word1 in model.wv.vocab and word2 in model.wv.vocab:
        model_score = model.wv.similarity(word1, word2)
        model_scores.append(model_score)
        human_scores.append(float(human_score))

In [23]:
# Step 4: Calculate Spearman's rank correlation
correlation, _ = spearmanr(human_scores, model_scores)
print(f"Spearman's rank correlation: {correlation}")

Spearman's rank correlation: nan
