In [None]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from tqdm import tqdm

## Download Data

In [None]:
!wget https://user.phil-fak.uni-duesseldorf.de/~samih/wp-content/uploads/2022/03/input.txt

## Reading text file

In [None]:
## define a function that reads a text file and returns a list of words in the text.
# Relative path: wget saves into the current working directory, so this resolves
# on Colab (whose default working directory is /content) as well as locally.
text_path = 'input.txt'

def get_words(file_path, encoding='utf-8'):
  """Read a text file and return its whitespace-separated tokens.

  Args:
    file_path: path of the text file to read.
    encoding: text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's locale encoding.

  Returns:
    A list of strings obtained by splitting the file content on any run of
    whitespace (an empty file yields an empty list).
  """
  with open(file_path, encoding=encoding) as f:
    return f.read().split()

# Flat list of whitespace-separated tokens from the corpus; later cells build the
# vocabulary and the training pairs from it.
text = get_words(text_path)

In [None]:
# define the parameters

WINDOW_SIZE = 2
EMDEDDING_DIM = 100  # (sic) spelling kept because later cells reference this name

vocab = set(text)
vocab_size = len(vocab)
# Sort the vocabulary: iterating a set of strings follows hash order, which changes
# between kernel restarts (string hash randomisation), so an unsorted vocabulary
# would give every word a different index on each fresh run.
word_list = sorted(vocab)

# define the mapping (word -> index and index -> word), both derived from the
# same ordered list so they are guaranteed to be exact inverses
w2i = {word: idx for idx, word in enumerate(word_list)}
i2w = dict(enumerate(word_list))

len(w2i)

## Preparing the data

In [None]:
# Write a function that takes two arguments: a list of words and a window size.
# It returns a list of (context, target) tuples. The output should look like this:
#[(['It', 'was', 'end', 'of'], 'the'), ... ,(['was', 'the', 'of', 'November,'], 'end')]

def get_context(text, WINDOW_SIZE=2):
    """Build (context, target) training pairs for a CBOW-style model.

    For every position that has WINDOW_SIZE words on both sides, the context is
    the WINDOW_SIZE words before it followed by the WINDOW_SIZE words after it,
    and the target is the word at that position.

    Args:
        text: sequence of words.
        WINDOW_SIZE: number of context words taken on each side of the target
            (must be >= 1).

    Returns:
        A list of (context, target) tuples where context is a list of
        2 * WINDOW_SIZE words. Empty when text is too short to hold one full
        window.

    Raises:
        ValueError: if WINDOW_SIZE is smaller than 1.
    """
    if WINDOW_SIZE < 1:
        raise ValueError("WINDOW_SIZE must be at least 1")

    data = []
    for i in range(WINDOW_SIZE, len(text) - WINDOW_SIZE):
        # Slices cover every neighbour inside the window and never include the
        # target itself, whatever the window size is.
        left = list(text[i - WINDOW_SIZE:i])
        right = list(text[i + 1:i + WINDOW_SIZE + 1])
        data.append((left + right, text[i]))
    return data

In [None]:
## write a function that converts a list of context words into a tensor of vocabulary indices
def vectorize(context, word2idx):
    """Map each word of `context` to its index and pack the indices into a tensor.

    Args:
        context: iterable of words.
        word2idx: mapping from word to integer index; every word in `context`
            must be a key.

    Returns:
        A 1-D torch.long tensor holding one index per context word, in order.
    """
    indices = []
    for token in context:
        indices.append(word2idx[token])
    return torch.tensor(indices, dtype=torch.long)

In [None]:
# Build every (context, target) pair using get_context's default window size.
data = get_context(text)
# Sanity check: shape of the index tensor produced for the first context.
vectorize(data[0][0], w2i).shape


In [None]:
class Model(torch.nn.Module):
    """CBOW-style classifier: sum the context embeddings, then score every vocabulary word.

    Args:
        vocab_size: number of distinct words; sets the number of embedding rows
            and the size of the output layer.
        embedding_dim: dimensionality of each word vector.
        hidden_dim: width of the hidden layer. Defaults to 100, the value that
            was previously hard-coded.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim=100):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.activation_function1 = nn.ReLU()
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        """Return unnormalised scores over the vocabulary.

        Args:
            inputs: long tensor of word indices, either shape (context,) for a
                single example or (batch, context) for a batch.

        Returns:
            Tensor of shape (1, vocab_size) for a single example, or
            (batch, vocab_size) for a batch.
        """
        embeds = self.embeddings(inputs)
        if embeds.dim() == 2:
            # Single example: add a batch axis so the output keeps the (1, vocab) shape.
            embeds = embeds.unsqueeze(0)
        # Tensor-level sum over the context axis (instead of Python's built-in
        # sum, which iterates over the rows one by one).
        summed = embeds.sum(dim=1)
        out = self.linear1(summed)
        out = self.activation_function1(out)
        out = self.linear2(out)
        return out

    def get_word_emdedding(self, word, word2idx=None):
        """Return the embedding of `word` as a (1, embedding_dim) tensor.

        Args:
            word: the word to look up.
            word2idx: optional word -> index mapping. When omitted, the
                notebook-level `w2i` mapping is used.

        Raises:
            KeyError: if `word` is not present in the mapping.
        """
        if word2idx is None:
            word2idx = w2i  # fall back to the notebook's global vocabulary mapping
        index = torch.tensor([word2idx[word]], device=self.embeddings.weight.device)
        return self.embeddings(index).view(1, -1)

In [None]:
# Network sized for this corpus' vocabulary.
model = Model(vocab_size, EMDEDDING_DIM)

# The model returns raw (unnormalised) scores; CrossEntropyLoss consumes those directly.
loss_function = nn.CrossEntropyLoss()
# Plain SGD over all model parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [None]:
# Vectorise every (context, target) pair once, instead of redoing it in every epoch.
training_pairs = [
    (vectorize(context, w2i), torch.tensor([w2i[target]]))
    for context, target in data
]

epochs = tqdm(range(50))
for epoch in epochs:
    total_loss = 0.0

    # One full-batch update per epoch: gradients of all samples are accumulated
    # before the single optimizer step below.
    optimizer.zero_grad()
    for context_vector, target_index in training_pairs:
        logits = model(context_vector)
        loss = loss_function(logits, target_index)

        # Back-propagate per sample. backward() adds into .grad, so the result is
        # the gradient of the summed loss, but each sample's graph is freed
        # immediately instead of keeping the whole dataset's graph in memory.
        # This also keeps the loop valid when `data` is empty.
        loss.backward()
        total_loss += loss.item()

    optimizer.step()
    # Show the summed epoch loss on the progress bar.
    epochs.set_postfix(loss=total_loss)

In [None]:
# Scatter plot of the first two embedding dimensions of every vocabulary word.
plt.figure(figsize=(15,10))
for w in word_list:
    # Look the vector up once per word and reuse it for both coordinates.
    vector = model.get_word_emdedding(w).detach().numpy()[0]
    x, y = vector[0], vector[1]
    plt.scatter(x, y)
    plt.annotate(w, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
plt.xlabel('embedding dimension 0')
plt.ylabel('embedding dimension 1')
plt.title('First two embedding dimensions per word')
plt.show()

In [None]:

# Hand-picked context words for a manual check (presumably two on each side of the
# word to predict). Every word must be a key of w2i, otherwise vectorize raises KeyError.
context = ['They', 'deserve','attention', 'and']
context_vector = vectorize(context, w2i)
# Raw scores over the vocabulary, shape (1, vocab_size).
prediction = model(context_vector)

In [None]:
# NOTE(review): this joins and prints the entire corpus; consider printing only a short excerpt.
print(f'Text: {" ".join(text)}\n')
print(f'Context: {context}\n')
# argmax over the vocabulary scores gives the predicted index, mapped back to a word via i2w.
print(f'Prediction: {i2w[torch.argmax(prediction[0]).item()]}')