In [4]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import string
import json

## 1. Corpus Loading and Preprocessing


In [5]:
# Load the Reuters corpus categorized under "livestock"
import nltk
nltk.download('reuters')
from nltk.corpus import reuters 
category_livestock = reuters.fileids("livestock")
corpus = [reuters.words(fid) for fid in category_livestock]
print(corpus)


[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\A443696\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


[['JAPAN', 'MINISTRY', 'SAYS', 'OPEN', 'FARM', 'TRADE', ...], ['USDA', 'TO', 'PROPOSE', 'FOREIGN', 'MEAT', ...], ['U', '.', 'S', '.', 'MEAT', ',', 'POULTRY', ...], ['MEXICAN', 'CATTLE', 'IMPORTS', 'TO', 'BE', 'BRANDED', ...], ['ARGENTINE', 'CATTLE', 'MARKET', 'REPORT', 'ABOUT', ...], ['HOG', 'AND', 'CATTLE', 'SLAUGHTER', 'GUESSTIMATES', ...], ['EASTERN', 'DISTRIBUTIVE', 'BEEF', 'TRADE', 'WEEKLY', ...], ['DISEASE', 'PUTS', 'ZIMBABWE', 'BEEF', 'EXPORTS', ...], ['CCC', 'GUARANTEES', 'TO', 'IRAQ', 'SWITCHED', '--', ...], ['U', '.', 'S', '.', 'DOLLAR', 'LOSSES', 'PROPEL', ...], ['U', '.', 'S', '.', 'FEEDER', 'STEER', 'PRICE', 'The', ...], ['HOG', 'AND', 'CATTLE', 'SLAUGHTER', 'GUESSTIMATES', ...], ['CCC', 'ACCEPTS', 'BONUS', 'ON', 'CATTLE', 'TO', ...], ['SWIFT', 'TO', 'SELL', 'SOUTH', 'DAKOTA', 'PORK', ...], ['SWIFT', '&', 'lt', ';', 'SFTPF', '>', 'TO', 'SELL', ...], ['EC', 'MEAT', 'DIRECTIVE', 'DEADLINE', 'SEEN', ...], ['JAPAN', '1986', '/', '87', 'COMPOUND', 'FEED', ...], ['INDONESIA', '/

In [6]:
# Preprocess the corpus, filter out non-alpha tokens, convert to lowercase, and remove punctuation and digits
def preprocess_corpus(corpus):
    table = str.maketrans('', '', string.punctuation + string.digits)
    return [[word.lower().translate(table) for word in doc if word.isalpha()] for doc in corpus]

corpus = preprocess_corpus(corpus)
corpus

[['japan',
  'ministry',
  'says',
  'open',
  'farm',
  'trade',
  'would',
  'hit',
  'u',
  's',
  'japan',
  's',
  'agriculture',
  'ministry',
  'angered',
  'by',
  'u',
  's',
  'demands',
  'that',
  'japan',
  'open',
  'its',
  'farm',
  'products',
  'market',
  'will',
  'tell',
  'u',
  's',
  'officials',
  'at',
  'talks',
  'later',
  'this',
  'month',
  'that',
  'liberalisation',
  'would',
  'harm',
  'existing',
  'u',
  's',
  'farm',
  'exports',
  'to',
  'japan',
  'a',
  'senior',
  'ministry',
  'official',
  'said',
  'imports',
  'from',
  'the',
  'u',
  's',
  'would',
  'drop',
  'due',
  'to',
  'active',
  'sales',
  'drives',
  'by',
  'other',
  'suppliers',
  'the',
  'official',
  'who',
  'declined',
  'to',
  'be',
  'named',
  'said',
  'japan',
  'is',
  'the',
  'largest',
  'customer',
  'for',
  'u',
  's',
  'farm',
  'products',
  'and',
  'it',
  'is',
  'not',
  'reasonable',
  'for',
  'the',
  'u',
  's',
  'to',
  'demand',
  'japan'

## 2. Vocabulary Creation

In [7]:
# Flatten the corpus into a single list of words
flatten_corpus = [word for doc in corpus for word in doc]
# Create a set of unique words in the corpus
vocabs = list(set(flatten_corpus))
vocabs

['lower',
 'try',
 'package',
 'compound',
 'agreements',
 'favoured',
 'be',
 'difference',
 'feeder',
 'deteriorates',
 'guillaume',
 'voice',
 'calm',
 'participants',
 'too',
 'conjunction',
 'shipping',
 'after',
 'rice',
 'size',
 'vt',
 'exchange',
 'tobacco',
 'stems',
 'feed',
 'points',
 'buyers',
 'illegal',
 'proportion',
 'appropriate',
 'and',
 'credit',
 'am',
 'thought',
 'congressman',
 'md',
 'administrative',
 'three',
 'continues',
 'has',
 'began',
 'long',
 'group',
 'one',
 'concessions',
 'big',
 'houses',
 'elected',
 'compared',
 'instead',
 'crops',
 'catholic',
 'auctioned',
 'programs',
 'kicks',
 'ham',
 'normally',
 'baltic',
 'distributive',
 'causing',
 'approval',
 'asked',
 'ruled',
 'lead',
 'spark',
 'distortion',
 'offset',
 'season',
 'have',
 'bring',
 'lines',
 'underpinned',
 'imported',
 'cold',
 'rebuilding',
 'soviet',
 'reasonable',
 'actually',
 'reform',
 'licensing',
 'conditioning',
 'average',
 'setting',
 'disease',
 'severe',
 'turno

In [9]:
# Add a special token for unknown words
vocabs.append('<UNK>')

In [10]:
# Create a mapping from words to their indices
word2index = {v: idx for idx, v in enumerate(vocabs)}
print(word2index['<UNK>'])  # Print the index of the <UNK> token

2907


In [11]:
index2word = {v:k for k, v in word2index.items()}

## 3. Batch Preparation for Training


In [12]:
# Count the frequency of each word in the corpus
from collections import Counter

X_i = Counter(flatten_corpus)
X_i

Counter({'the': 1145,
         'to': 598,
         'of': 478,
         'and': 470,
         'in': 354,
         'a': 330,
         'said': 314,
         's': 257,
         'for': 210,
         'u': 167,
         'on': 152,
         'beef': 140,
         'at': 133,
         'that': 124,
         'is': 119,
         'from': 117,
         'will': 103,
         'by': 101,
         'mln': 100,
         'cattle': 99,
         'meat': 97,
         'it': 93,
         'be': 92,
         'are': 89,
         'year': 88,
         'ec': 88,
         'with': 74,
         'was': 73,
         'trade': 71,
         'an': 71,
         'would': 69,
         'tonnes': 69,
         'as': 64,
         'agriculture': 62,
         'pork': 62,
         'japan': 57,
         'have': 57,
         'has': 56,
         'farm': 55,
         'been': 54,
         'head': 54,
         'pct': 52,
         'department': 51,
         'he': 50,
         'ago': 50,
         'were': 49,
         'this': 48,
         'not': 4

In [13]:
# Generate skip-grams: pairs of target and context words
skip_grams = []

for doc in corpus:
    for i in range(2, len(doc)-2):
        center = doc[i]
        outside = [doc[i-2], doc[i-1], doc[i+1], doc[i+2]]
        for each_out in outside:
            skip_grams.append((center, each_out))
skip_grams

[('says', 'japan'),
 ('says', 'ministry'),
 ('says', 'open'),
 ('says', 'farm'),
 ('open', 'ministry'),
 ('open', 'says'),
 ('open', 'farm'),
 ('open', 'trade'),
 ('farm', 'says'),
 ('farm', 'open'),
 ('farm', 'trade'),
 ('farm', 'would'),
 ('trade', 'open'),
 ('trade', 'farm'),
 ('trade', 'would'),
 ('trade', 'hit'),
 ('would', 'farm'),
 ('would', 'trade'),
 ('would', 'hit'),
 ('would', 'u'),
 ('hit', 'trade'),
 ('hit', 'would'),
 ('hit', 'u'),
 ('hit', 's'),
 ('u', 'would'),
 ('u', 'hit'),
 ('u', 's'),
 ('u', 'japan'),
 ('s', 'hit'),
 ('s', 'u'),
 ('s', 'japan'),
 ('s', 's'),
 ('japan', 'u'),
 ('japan', 's'),
 ('japan', 's'),
 ('japan', 'agriculture'),
 ('s', 's'),
 ('s', 'japan'),
 ('s', 'agriculture'),
 ('s', 'ministry'),
 ('agriculture', 'japan'),
 ('agriculture', 's'),
 ('agriculture', 'ministry'),
 ('agriculture', 'angered'),
 ('ministry', 's'),
 ('ministry', 'agriculture'),
 ('ministry', 'angered'),
 ('ministry', 'by'),
 ('angered', 'agriculture'),
 ('angered', 'ministry'),
 ('

In [14]:
# Count the co-occurrence of each skip-gram pair
X_ik_skipgrams = Counter(skip_grams)
X_ik_skipgrams

Counter({('the', 'of'): 222,
         ('of', 'the'): 220,
         ('the', 'to'): 151,
         ('to', 'the'): 150,
         ('the', 'said'): 149,
         ('s', 'u'): 145,
         ('u', 's'): 144,
         ('said', 'the'): 143,
         ('the', 'in'): 110,
         ('in', 'the'): 109,
         ('the', 's'): 87,
         ('s', 'the'): 87,
         ('and', 'the'): 85,
         ('the', 'and'): 85,
         ('the', 'u'): 68,
         ('u', 'the'): 68,
         ('a', 'of'): 52,
         ('of', 'a'): 52,
         ('the', 'ec'): 51,
         ('ec', 'the'): 51,
         ('for', 'the'): 49,
         ('the', 'for'): 49,
         ('on', 'the'): 47,
         ('the', 'on'): 47,
         ('the', 'department'): 45,
         ('to', 'a'): 44,
         ('a', 'to'): 44,
         ('to', 'be'): 43,
         ('be', 'to'): 42,
         ('a', 'year'): 42,
         ('of', 'and'): 42,
         ('and', 'of'): 42,
         ('at', 'the'): 41,
         ('the', 'at'): 41,
         ('department', 'the'): 41,
      

In [15]:
# Weighting function for co-occurrences
def weighting(w_i, w_j, X_ik):
    
    #check whether the co-occurences between w_i and w_j is available
    try:
        x_ij = X_ik[(w_i, w_j)]
        #if not exist, then set to 1 "laplace smoothing"
    except:
        x_ij = 1
        
    #set xmax
    x_max = 100
    #set alpha
    alpha = 0.75
    
    #if co-ocurrence does not exceeed xmax, then just multiply with some alpha
    if x_ij < x_max:
        result = (x_ij / x_max)**alpha
    #otherwise, set to 1
    else:
        result = 1
    
    return result

In [16]:
# Create co-occurrence dictionary and apply weighting
from itertools import combinations_with_replacement

X_ik = {} #keeping the co-occurences
weighting_dic = {} #already scale the co-occurences using the weighting function

for bigram in combinations_with_replacement(vocabs, 2):
    if X_ik_skipgrams.get(bigram):  #if the pair exists in our corpus
        co = X_ik_skipgrams[bigram]
        X_ik[bigram] = co + 1 #for stability
        X_ik[(bigram[1], bigram[0])] = co + 1
    else:
        pass
    
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

In [17]:
# Function to generate random batches for training
import math

def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    random_inputs, random_labels, random_coocs, random_weightings = [], [], [], []
    
    #convert our skipgrams to id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    #randomly choose indexes based on batch size
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False)
    
    #get the random input and labels
    for index in random_index:
        random_inputs.append([skip_grams_id[index][0]])
        random_labels.append([skip_grams_id[index][1]])
        #coocs
        pair = skip_grams[index] #e.g., ('banana', 'fruit')
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
    
        #weightings
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
        
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

## 4. Testing method

In [18]:
# Model and Training Parameters
batch_size = 32
voc_size = len(vocabs)
emb_size = 10
x, y, cooc, weighting = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)

In [49]:
x

array([[1438],
       [2533],
       [1580],
       [1632],
       [2705],
       [ 819],
       [1048],
       [  90],
       [1865],
       [ 349],
       [1232],
       [1630],
       [ 584],
       [ 982],
       [ 874],
       [ 249],
       [2112],
       [ 194],
       [1588],
       [ 342],
       [ 634],
       [1048],
       [2895],
       [ 222],
       [ 680],
       [ 350],
       [1520],
       [ 982],
       [ 811],
       [1418],
       [1539],
       [1580]])

In [50]:
y

array([[2041],
       [2592],
       [2895],
       [ 982],
       [1060],
       [1416],
       [1937],
       [ 697],
       [  16],
       [1494],
       [1354],
       [ 222],
       [2605],
       [1028],
       [ 342],
       [2753],
       [1723],
       [1609],
       [1048],
       [1632],
       [ 557],
       [2112],
       [2891],
       [ 899],
       [ 636],
       [  16],
       [2817],
       [ 325],
       [1789],
       [1587],
       [ 725],
       [ 342]])

In [51]:
cooc

array([[0.69314718],
       [1.09861229],
       [3.49650756],
       [0.69314718],
       [0.69314718],
       [1.60943791],
       [0.69314718],
       [0.69314718],
       [2.07944154],
       [1.38629436],
       [0.69314718],
       [0.69314718],
       [0.69314718],
       [3.29583687],
       [0.69314718],
       [0.69314718],
       [0.69314718],
       [0.69314718],
       [0.69314718],
       [1.38629436],
       [0.69314718],
       [3.17805383],
       [2.30258509],
       [0.69314718],
       [0.69314718],
       [1.38629436],
       [0.69314718],
       [1.60943791],
       [0.69314718],
       [1.60943791],
       [0.69314718],
       [1.79175947]])

In [52]:
weighting

array([[0.05318296],
       [0.07208434],
       [0.43539702],
       [0.05318296],
       [0.05318296],
       [0.10573713],
       [0.05318296],
       [0.05318296],
       [0.15042412],
       [0.08944272],
       [0.05318296],
       [0.05318296],
       [0.05318296],
       [0.37456123],
       [0.05318296],
       [0.05318296],
       [0.05318296],
       [0.05318296],
       [0.05318296],
       [0.08944272],
       [0.05318296],
       [0.34289285],
       [0.17782794],
       [0.05318296],
       [0.05318296],
       [0.08944272],
       [0.05318296],
       [0.10573713],
       [0.05318296],
       [0.10573713],
       [0.05318296],
       [0.12123093]])

## 5. GloVe Model Definition

In [19]:
class Glove(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.center_embedding  = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)
        
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)
    
    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.center_embedding(center) #(batch_size, 1, emb_size)
        outside_embeds = self.outside_embedding(outside) #(batch_size, 1, emb_size)
        
        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)
        
        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)
        
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

## 6. Model Initialization and Loss Calculation

In [20]:
model = Glove(voc_size, emb_size)
model

Glove(
  (center_embedding): Embedding(2908, 10)
  (outside_embedding): Embedding(2908, 10)
  (center_bias): Embedding(2908, 1)
  (outside_bias): Embedding(2908, 1)
)

In [21]:
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)
cooc_tensor = torch.FloatTensor(cooc)
weighting_tensor = torch.FloatTensor(weighting)


In [22]:
loss = model(x_tensor, y_tensor, cooc_tensor, weighting_tensor)

## 7. Training Epoch

In [57]:
import time

# Function to calculate time taken per epoch
def epoch_time(start_time, end_time):
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time / 60)
  elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
  return elapsed_mins, elapsed_secs


In [58]:
# Initialize model and optimizer
model          = Glove(voc_size, emb_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [59]:
import time
start_time = time.time()

# Training
num_epochs = 10000
for epoch in range(num_epochs):      
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch) #[batch_size, 1]
    
    optimizer.zero_grad()
    loss = model(input_batch, target_batch, cooc_batch, weighting_batch)
    
    loss.backward()
    optimizer.step()
    
    end = time.time()
    
    if (epoch + 1) % 1000 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f}")

end_time = time.time()
min, sec = epoch_time(start_time, end_time)
print(f"Epoch time: {min}:{sec}")

Epoch: 1000 | cost: 57.152378
Epoch: 2000 | cost: 40.790550
Epoch: 3000 | cost: 17.099533
Epoch: 4000 | cost: 13.615710
Epoch: 5000 | cost: 13.479882
Epoch: 6000 | cost: 8.286769
Epoch: 7000 | cost: 7.457489
Epoch: 8000 | cost: 7.251559
Epoch: 9000 | cost: 6.842691
Epoch: 10000 | cost: 3.628716
Epoch time: 3:26


In [60]:
# Save the trained model
torch.save(model, 'GloVe_model.pth')

## 8. Model Loading and Evaluation on Analogy Tasks

In [23]:
# Load the trained model
model = torch.load('GloVe_model.pth')
model.eval()

Glove(
  (center_embedding): Embedding(2907, 10)
  (outside_embedding): Embedding(2907, 10)
  (center_bias): Embedding(2907, 1)
  (outside_bias): Embedding(2907, 1)
)

In [62]:
# Load analogy data and separate into syntactic and semantic analogies
syntactic_analogies = []
semantic_analogies = []
current_category = None

with open('word-test.txt', 'r') as file:
    for line in file:
        if line.startswith(':'):
            if 'gram' in line:  # Assumption: syntactic questions contain 'gram'
                current_category = syntactic_analogies
            else:
                current_category = semantic_analogies
        elif current_category is not None:
            current_category.append(line.strip().split())

In [63]:
# Function to solve an analogy given words a, b, c
def solve_analogy(a, b, c, word_to_index, index_to_word, model):
    # Convert words to embeddings
    a_emb = model.center_embedding(torch.tensor([word_to_index.get(a, word_to_index["<UNK>"])]))
    b_emb = model.center_embedding(torch.tensor([word_to_index.get(b, word_to_index["<UNK>"])]))
    c_emb = model.center_embedding(torch.tensor([word_to_index.get(c, word_to_index["<UNK>"])]))

    # Calculate the expected embedding for d
    expected_d_emb = b_emb - a_emb + c_emb

    # Find the closest word to the expected embedding
    closest_word, closest_dist = None, float('inf')
    for word, idx in word_to_index.items():
        if word not in [a, b, c]:
            word_emb = model.center_embedding(torch.tensor([idx]))
            dist = torch.norm(expected_d_emb - word_emb)
            if dist < closest_dist:
                closest_word, closest_dist = word, dist
    return closest_word


In [64]:
# Function to calculate accuracy of the model on analogy tasks
def calculate_accuracy(analogies, word_to_index, index_to_word, model):
    correct = 0
    for analogy in analogies:
        a, b, c, expected_d = analogy
        predicted_d = solve_analogy(a, b, c, word_to_index, index_to_word, model)
        if predicted_d == expected_d:
            correct += 1
    return correct / len(analogies) if analogies else 0

In [65]:
# Calculate and print syntactic and semantic accuracies
syntactic_accuracy = calculate_accuracy(syntactic_analogies, word2index, index2word, model)
semantic_accuracy = calculate_accuracy(semantic_analogies, word2index, index2word, model)

print(f"Syntactic Accuracy: {syntactic_accuracy}")
print(f"Semantic Accuracy: {semantic_accuracy}")

Syntactic Accuracy: 0.0
Semantic Accuracy: 0.0


# 9. Search engine

In [24]:
import torch
# load saved glove model
model = torch.load('GloVe_model.pth')
model.eval()

Glove(
  (center_embedding): Embedding(2907, 10)
  (outside_embedding): Embedding(2907, 10)
  (center_bias): Embedding(2907, 1)
  (outside_bias): Embedding(2907, 1)
)

In [None]:
# Load data once outside the function
with open('Data/AIT_wiki.txt') as file:
    hp_data = file.read().split()

            