In [None]:
import json
import os

from collections import Counter
from tqdm import tqdm

In [None]:
# load utilities
# Each helper script is executed inside this notebook's namespace, so whatever
# it defines becomes available to the cells below (presumably this is where
# tokenize, sentokenize and get_context come from -- they are used later but
# not defined in this notebook).
# NOTE: exec runs the files' code verbatim; only point this at trusted,
# project-local scripts.
for utilities_path in ('./01-utilities.py', './03-utilities.py'):
    # The context manager closes each file handle as soon as it has been read.
    with open(utilities_path) as utilities_file:
        exec(utilities_file.read())

In [None]:
# read the thread json
# A context manager is used so the file handle is closed right after parsing.
with open('./data/thread.json') as thread_file:
    thread = json.load(thread_file)
# Display the number of entries in the training split.
len(thread['train'])

In [None]:
# Cap the vocabulary: keep only the max_vocab most frequent token types.
max_vocab = 20_000

# Token frequencies over the lowercased raw text of every training entry.
freq = Counter(
    token
    for split in ['train']
    for entry in thread[split].values()
    for token in tokenize(entry['raw'].lower(), space=False)
)

# The set of retained types; membership tests against it filter tokens later.
most_common = {token for token, _count in freq.most_common(max_vocab)}

In [None]:
# Spot check: is the token 'xophe' among the retained most-frequent types?
'xophe' in most_common

### Re-use from Assignment 3

These next few cells should look familiar from Assignment 3...

In [None]:
def make_co_counts(documents, space = True, k = 20, gamma = 0):
    """
    Build, or load from a JSON cache, weighted co-occurrence counts.

    Each document is lowercased, split into tokenized sentences with
    `sentokenize`, and filtered down to tokens present in the global
    `most_common` set. For every token position, `get_context` supplies the
    context tokens and a parallel list of weights; each weight is added to
    the count of the (token index, context index) pair.

    Parameters
    ----------
    documents : sequence of str
        Raw document texts. The sequence is NOT modified.
    space : bool
        Forwarded to `sentokenize`.
    k, gamma :
        Forwarded to `get_context` (presumably the window size and a
        distance-weighting exponent -- confirm in the utilities file).

    Returns
    -------
    dict
        'co_counts' maps the string "i,j" to the accumulated weight, and
        'type_index' maps each retained type to its integer index (types
        are indexed in sorted order).

    Notes
    -----
    The cache file name is derived only from the global `max_vocab` and from
    `space`, `k` and `gamma`. It does not depend on `documents`, so a cache
    written for a different corpus with the same settings is returned as-is.
    Delete the file under ./data to force a rebuild.
    """

    handle = "-".join(map(str,[max_vocab, space, k, gamma]))
    cache_path = './data/data-' + handle + '.json'
    if os.path.exists(cache_path):
        # Close the handle explicitly instead of leaving it to the garbage collector.
        with open(cache_path) as f:
            return json.load(f)

    document_frequency = Counter()
    # Filtered copies are kept in a separate list so the caller's list is left untouched.
    filtered_documents = []
    for document in tqdm(documents):
        sentences = sentokenize(document.lower(), space = space)
        filtered = [[t for t in s if t in most_common] for s in sentences]
        filtered_documents.append(filtered)
        # One count per document for each distinct type; update() only touches
        # the keys passed in, unlike `+=`, which rescans the whole counter.
        document_frequency.update({t for s in filtered for t in s})
    type_index = {t: i for i, t in enumerate(sorted(document_frequency))}

    co_counts = Counter()
    for document in tqdm(filtered_documents):
        for sentence in document:
            for i, ti in enumerate(sentence):
                context, weights = get_context(i, sentence, k = k, gamma = gamma)
                for j, tj in enumerate(context):
                    # Keys are "i,j" strings so the result can be stored as JSON.
                    ij = ",".join(map(str,[type_index[ti], type_index[tj]]))
                    co_counts[ij] += weights[j]

    data = {'co_counts': dict(co_counts), 'type_index': dict(type_index)}

    with open(cache_path, "w") as f:
        json.dump(data, f)

    return data

In [None]:
# Build (or load from cache) the co-occurrence data for the training split.
data = make_co_counts(
    [entry['raw'].lower() for entry in thread['train'].values()],
    space=False,
    k=10,
    gamma=1,
)
# Display: number of indexed types, number of distinct "i,j" pairs, and the
# sum of all accumulated co-occurrence weights.
len(data['type_index']), len(data['co_counts']), sum(data['co_counts'].values())

In [None]:
def weight_nonzero_data(data, comax = 100, alpha = 0.75):
    """
    Attach a per-pair weight table to `data` under the key 'fco_counts'.

    For every key in data['co_counts'], the weight is (count / comax) ** alpha
    when the count is below `comax`, and 1 otherwise. The dict is modified in
    place and nothing is returned.
    """
    raw_counts = data['co_counts']
    weighted = {}
    for pair in tqdm(raw_counts):
        count = raw_counts[pair]
        if count < comax:
            weighted[pair] = (count / comax) ** alpha
        else:
            # Counts at or above the cap all receive the maximum weight.
            weighted[pair] = 1
    data['fco_counts'] = weighted

In [None]:
# Adds data['fco_counts'] in place (the function returns nothing), then
# displays the number of weighted pairs and the sum of their weights.
weight_nonzero_data(data)
len(data['fco_counts']), sum(data['fco_counts'].values())

### PyTorch Deviation Here

In [None]:
import torch

In [None]:
class WCOData(torch.utils.data.Dataset):
    """
    Our weighted co-occurrence counts,
    wrapped in a PyTorch dataset!

    `data` must provide two dicts keyed by the string "i,j":
    'co_counts' (raw co-occurrence values) and 'fco_counts' (their weights).
    One dataset item corresponds to one such key.
    """

    def __init__(self, data):
        self.data = data
        # Freeze the key order once so integer positions map to stable pairs.
        self.keys = list(self.data['co_counts'].keys())

    def __len__(self):
        return len(self.keys)

    def __getitem__(self, item):
        """Return the record at position `item` as a dict with keys i, j, co, fco."""
        key = self.keys[item]

        i, j = map(int, key.split(','))

        # Read from the dict this dataset was built with, not from whatever
        # notebook-level variable happens to be named `data`.
        return {
            'i': i,
            'j': j,
            'co': self.data['co_counts'][key],
            'fco': self.data['fco_counts'][key],
        }

In [None]:
# Wrap the co-occurrence dict as a Dataset; display its size and first record.
tdata = WCOData(data)
len(tdata), tdata[0]

In [None]:
class GloVe(torch.nn.Module):

    """
    A PyTorch GloVe nn.Module

    Holds two embedding tables (U and V, each vocab_size x d) and two
    per-type scalar bias tables (a and b, each vocab_size x 1).
    """

    def __init__(self, vocab_size, d=50):
        super().__init__()
        self.U = torch.nn.Embedding(vocab_size, d)
        self.V = torch.nn.Embedding(vocab_size, d)

        self.a = torch.nn.Embedding(vocab_size, 1)
        self.b = torch.nn.Embedding(vocab_size, 1)

    def forward(self, i, j):
        """
        Score index pairs as dot(U[i], V[j]) + a[i] + b[j].

        `i` and `j` are integer index tensors of identical shape (a 1-D batch
        in this notebook). The result has that same shape, including for a
        batch holding a single pair.
        """
        # Row-wise dot product over the embedding axis only.
        uv = (self.U(i) * self.V(j)).sum(dim=-1)

        # Drop just the trailing size-1 bias axis. A bare squeeze() would also
        # remove the batch axis whenever the batch has exactly one element.
        a = self.a(i).squeeze(-1)
        b = self.b(j).squeeze(-1)

        y_pred = uv + a + b

        return y_pred

In [None]:
def to_gpu(x):
    """Move `x` to CUDA when a CUDA device is available, otherwise to the CPU."""
    target = 'cuda' if torch.cuda.is_available() else 'cpu'
    return x.to(target)

In [None]:
# Construct our GloVe model with the desired hyperparameters.

# One embedding row per indexed type.
vocab_size = len(data['type_index'])

# Embedding dimensionality. Alternative sizes left in the notebook by the
# author: 2, 4, 8, 16, 32, 64, 256. Note that the training cell below picks
# its epoch count based on d, and the export file names include d.
d = 128

glove = GloVe(vocab_size, d=d)
# Moves the model to CUDA when available, otherwise leaves it on the CPU.
glove = to_gpu(glove)
glove

In [None]:
# Create our loss function.
# Note the use of reduction='none': the loss is returned per element rather
# than averaged, so the training loop can weight each pair's contribution
# before summing.
loss_fn = torch.nn.MSELoss(reduction='none')
loss_fn

In [None]:
# Use the Adagrad optimizer with the learning rate set to 0.05, per the
# original GloVe paper. It optimizes every parameter of the model (both
# embedding tables and both bias tables).
opt = torch.optim.Adagrad(glove.parameters(), lr=0.05)
opt

In [None]:
# Our GloVe train loop!
# For each batch of (i, j) pairs: predict a score, compare it against the log
# of the raw co-occurrence value with a per-element squared error, weight each
# error by the pair's 'fco' value, sum, and take one Adagrad step.

# More epochs for the larger embedding sizes.
epochs = 100 if d > 100 else 50
batch_size = 4096 * 16

# Pairs are reshuffled every epoch.
loader = torch.utils.data.DataLoader(tdata, batch_size=batch_size, shuffle=True)

for e in range(epochs):
    print(f'Begin epoch: {e + 1}')
    # Running sum of the weighted loss over all batches in this epoch.
    epoch_loss = 0
    for batch in tqdm(loader):
        opt.zero_grad()
        
        # Regression target: base-10 log of the raw co-occurrence value.
        # NOTE(review): confirm base 10 is intended rather than the natural log.
        y_gold = to_gpu(torch.log10(batch['co'].float()))
        y_pred = glove(to_gpu(batch['i']), to_gpu(batch['j'])).float()
        
        # loss_fn was built with reduction='none', so `loss` is per pair.
        loss = loss_fn(y_pred, y_gold)
        # Weight each pair's error by its 'fco' value, then reduce to a scalar.
        wloss = (loss * to_gpu(batch['fco'])).sum()
        epoch_loss += wloss.item()
        
        wloss.backward()
        
        opt.step()
    
    # Report the summed loss and its average per dataset item.
    print(f'Epoch loss: {epoch_loss:.2f} (avg. {epoch_loss / len(tdata):.6f})')

In [None]:
# Invert type_index (type -> row index) into row index -> type for lookups.
ix2type = {v: k for k, v in data['type_index'].items()}
# Spot check one entry; raises KeyError if index 999 is not in the mapping.
ix2type[999]

In [None]:
# code to export 3 matrices: U, V, and UV
# U and V are the two learned embedding tables; UV is their element-wise sum.
U = glove.U.weight
V = glove.V.weight
UV = U + V

to_cache = [
    ('U', U), ('V', V), ('UV', UV)
]

for label, embed in to_cache:
    # Convert the whole matrix to nested Python lists once, instead of moving
    # one row at a time off the device.
    rows = embed.detach().cpu().tolist()

    # One record per type, pairing the type string with its embedding row.
    out = [
        {'type': t, 'vec': rows[idx]}
        for t, idx in data['type_index'].items()
    ]

    # The file name records the matrix label, vocabulary cap, dimensionality
    # and epoch count. The context manager makes sure the file is flushed and
    # closed before the next one is written.
    with open(f'./data/glove.irc_dis.gamma.{label}.{max_vocab}v.{d}d.{epochs}e.json', 'w') as f:
        json.dump(out, f)