In [1]:
# Notebook setup: reload edited modules automatically before each execution
# and render matplotlib figures inline in the notebook.
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Print the active matplotlib backend as a sanity check for inline rendering.
print(plt.get_backend())

module://ipykernel.pylab.backend_inline


In [2]:
def linecount(path):
    """Return the number of lines in the text file at ``path``."""
    with open(path, 'r') as handle:
        # Stream the file so large corpora are never held in memory.
        return sum(1 for _ in handle)

In [3]:
import numpy as np
import torch


np.random.seed(0)


class dataloader:
    """Serve padded index batches built from a one-sentence-per-line text file.

    Parameters
    ----------
    w2i : dict
        Maps a token to its integer id. Must contain ``'<unk>'`` if the file
        can contain tokens that are missing from the mapping.
    path : str
        Text file with one whitespace-tokenised sentence per line.
    batchsize : int
        Number of sentences per batch.
    negsize : int
        Number of negative samples drawn per sentence.
    n_batches : int, optional
        Number of batches per pass; derived from the file's line count when
        not given.
    """

    def __init__(self, w2i, path, batchsize=16, negsize=20, n_batches=None):
        self.w2i = w2i
        self.path = path
        self.batchsize = batchsize
        self.negsize = negsize
        if not n_batches:
            n_batches = (linecount(path) // batchsize) + 1
        self.n_batches = n_batches

    def _read_lines(self):
        """Read every line of ``self.path``; raise ValueError if the file is empty."""
        with open(self.path, 'r') as f:
            lines = list(f)
        if not lines:
            raise ValueError('no sentences found in "%s"' % self.path)
        return lines

    def _lookup(self, word):
        """Return the id of ``word``, falling back to the ``'<unk>'`` id."""
        try:
            return self.w2i[word]
        except KeyError:
            return self.w2i['<unk>']

    def b2i(self, batch):
        """Convert a list of sentence strings into a right-padded index matrix.

        Returns an ``int64`` array of shape ``(len(batch), longest_sentence)``.
        Positions past the end of a sentence keep the value 0, i.e. padding is
        hard-wired to index 0.
        """
        tokenized = [line.split() for line in batch]
        width = max(len(words) for words in tokenized) if tokenized else 0
        # Integer dtype: these are embedding indices, not measurements.
        index = np.zeros((len(tokenized), width), dtype=np.int64)
        for row, words in enumerate(tokenized):
            if words:
                index[row, :len(words)] = [self._lookup(w) for w in words]
        return index

    def sentence_batch_generator(self):
        """Yield ``LongTensor`` batches of shape ``(batchsize, L)`` forever.

        The sentences are reshuffled at the start of every pass. Only full
        batches are served, because the negative batches always contain
        exactly ``batchsize`` rows; the one exception is a corpus with fewer
        than ``batchsize`` sentences, which is served as a single short batch
        instead of running into an empty slice.
        """
        lines = self._read_lines()
        n_batch = max(1, len(lines) // self.batchsize)
        batchcount = n_batch  # forces a shuffle before the first batch
        while True:
            if batchcount == n_batch:
                np.random.shuffle(lines)
                batchcount = 0
            start = batchcount * self.batchsize
            batch = lines[start:start + self.batchsize]
            batchcount += 1
            yield torch.from_numpy(self.b2i(batch))

    def negative_batch_generator(self):
        """Yield ``LongTensor`` batches of shape ``(batchsize, negsize, L)`` forever.

        Each batch holds ``batchsize * negsize`` sentences sampled uniformly
        with replacement from the whole file.
        """
        lines = self._read_lines()
        data_len = len(lines)
        while True:
            indices = np.random.choice(data_len, self.batchsize * self.negsize)
            samples = self.b2i([lines[i] for i in indices])
            samples = samples.reshape(self.batchsize, self.negsize, samples.shape[1])
            yield torch.from_numpy(samples)

In [4]:
from sklearn.cluster import KMeans
import numpy as np
import gensim
import codecs
import tqdm


class word2vec:
    """Vocabulary, word-embedding matrix and aspect initialisation for a corpus.

    Iterating over an instance yields the whitespace-split tokens of every
    line of ``corpus_path``, which is the sentence stream handed to gensim.
    """

    def __init__(self, corpus_path):
        self.corpus_path = corpus_path
        self.n_words = 0
        # Created here so ``add`` is usable before ``embed`` has run.
        self.i2w, self.w2i = {}, {}
        # Number of leading rows of ``E`` that belong to special tokens.
        self.n_special = 0

    def __iter__(self):
        with codecs.open(self.corpus_path, 'r', 'utf-8') as f:
            for line in tqdm.tqdm(f, desc='training'):
                yield line.split()

    def add(self, *words):
        """Assign the next free id to every word that is not yet in the vocabulary."""
        for word in words:
            if word not in self.w2i:
                self.w2i[word] = self.n_words
                self.i2w[self.n_words] = word
                self.n_words += 1

    def embed(self, model_path, d_embed, window=5, min_count=10, workers=16):
        """Load (or train and cache) a Word2Vec model and build ``E``/``w2i``/``i2w``.

        Layout of ``E`` (float32, one row per id):
          * row 0, ``'<pad>'``: all zeros. Code that L2-normalises the rows
            of ``E`` must guard against this zero norm.
          * row 1, ``'<unk>'``: mean of all vocabulary vectors.
          * rows 2.., the model vocabulary in sorted order.

        NOTE: ``size=`` and ``wv.vocab`` are the gensim < 4.0 spellings.

        Returns ``self`` so calls can be chained.
        """
        import os  # local import: this cell does not import ``os`` itself

        if os.path.isfile(model_path):
            model = gensim.models.Word2Vec.load(model_path)
        else:
            model = gensim.models.Word2Vec(self, size=d_embed,
                window=window, min_count=min_count, workers=workers)
            model.save(model_path)

        # Start from a clean vocabulary so ids and rows of E stay aligned.
        self.i2w, self.w2i = {}, {}
        self.n_words = 0
        self.add('<pad>', '<unk>')
        self.n_special = self.n_words

        rows = []
        for word in sorted(model.wv.vocab):
            if word in self.w2i:
                # A literal '<pad>'/'<unk>' token must not claim a second row.
                continue
            self.add(word)
            rows.append(np.asarray(model.wv[word], dtype=np.float32))

        dim = rows[0].shape[0] if rows else d_embed
        pad_vec = np.zeros(dim, dtype=np.float32)
        if rows:
            unk_vec = np.mean(rows, axis=0).astype(np.float32)
        else:
            unk_vec = np.zeros(dim, dtype=np.float32)
        self.E = np.vstack([pad_vec, unk_vec] + rows).astype(np.float32)
        # Take the width from the matrix so a cached model of another size
        # cannot silently disagree with the layers built from ``d_embed``.
        self.d_embed = int(self.E.shape[1])
        return self

    def aspect(self, n_aspects):
        """Initialise ``T`` with L2-normalised k-means centroids of the word vectors.

        Special-token rows are excluded from clustering. Returns ``self``.
        """
        self.n_aspects = n_aspects

        km = KMeans(n_clusters=n_aspects, random_state=0)
        km.fit(self.E[self.n_special:])
        clusters = km.cluster_centers_

        # L2 normalization
        norm_aspect_matrix = clusters / np.linalg.norm(clusters, axis=-1, keepdims=True)
        self.T = norm_aspect_matrix.astype(np.float32)
        return self

In [5]:
import torch.nn as nn
import torch


class attention(nn.Module):
    """Per-sentence attention over word embeddings.

    Each word is scored against the sentence's mean embedding through a
    learned linear map; the scores are squashed with ``tanh`` and turned into
    weights with a softmax over the words of that sentence.
    """

    def __init__(self, d_embed):
        super(attention, self).__init__()
        self.M = nn.Linear(d_embed, d_embed)
        self.M.weight.data.uniform_(-0.1, 0.1)

    def forward(self, e_i):
        """Return attention weights for a batch of sentences.

        Parameters
        ----------
        e_i : Tensor of shape ``(batch, d_embed, n_words)``

        Returns
        -------
        Tensor of shape ``(batch, n_words, 1)``; positive and summing to 1
        along the word axis for every sentence.

        NOTE: no padding mask is applied, so padded positions also receive
        weight.
        """
        y_s = torch.mean(e_i, dim=-1)  # (batch, d_embed)
        d_i = torch.bmm(e_i.transpose(1, 2), self.M(y_s).unsqueeze(2)).tanh()
        # Normalise over the word axis (dim=1). Builtin ``sum`` would iterate
        # over dim 0 and mix different sentences of the batch.
        return torch.nn.functional.softmax(d_i, dim=1)

        
class abae(nn.Module):
    """Attention-based aspect autoencoder.

    A sentence is summarised as an attention-weighted sum of frozen word
    embeddings ``E``, mapped to a distribution over aspects, and reconstructed
    as a mixture of the trainable aspect embeddings ``T``. Training uses a
    max-margin loss against negative sentences plus an orthogonality penalty
    on ``T``.

    Parameters
    ----------
    w2v : object exposing ``E`` (vocab x d_embed array), ``T`` (n_aspects x
        d_embed array), ``d_embed`` and ``n_aspects``.
    ortho_reg : float
        Weight of the orthogonality penalty.
    """

    def __init__(self, w2v, ortho_reg=0.1):
        super(abae, self).__init__()
        self.ortho_reg = ortho_reg
        # Clone the initial weights: ``from_numpy`` shares memory, and without
        # a copy the optimiser would update ``w2v.T`` in place, so a second
        # model built from the same ``w2v`` would start from trained aspects.
        E = torch.from_numpy(np.asarray(w2v.E)).float().clone()
        T = torch.from_numpy(np.asarray(w2v.T)).float().clone()
        self.E = nn.Embedding.from_pretrained(E, freeze=True)   # frozen word vectors
        self.T = nn.Embedding.from_pretrained(T, freeze=False)  # trainable aspects
        self.attention = attention(w2v.d_embed)
        self.linear = nn.Linear(w2v.d_embed, w2v.n_aspects)
        # Each sentence gets its own distribution over aspects, so normalise
        # over the aspect axis, not over the batch axis.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, pos, negs):
        """Return the training loss for one batch.

        ``pos`` holds word ids of shape ``(batch, n_words)``; ``negs`` holds
        word ids of shape ``(batch, n_neg, n_words)``.
        """
        p_t, z_s = self.predict(pos)
        r_s = torch.mm(p_t, self.T.weight)  # (batch, d_embed) reconstruction
        e_n = self.E(negs).transpose(-2, -1)
        z_n = torch.mean(e_n, dim=-1)       # (batch, n_neg, d_embed)
        return self.loss(r_s, z_s, z_n)

    def predict(self, x):
        """Return ``(p_t, z_s)``: aspect probabilities ``(batch, n_aspects)``
        and sentence embeddings ``(batch, d_embed)`` for word ids ``x``."""
        e_i = self.E(x).transpose(1, 2)
        a_i = self.attention(e_i)
        z_s = torch.bmm(e_i, a_i).squeeze(2)
        p_t = self.softmax(self.linear(z_s))
        return p_t, z_s

    def loss(self, r_s, z_s, z_n):
        """Max-margin loss plus the batch-scaled orthogonality penalty on ``T``."""
        # ``max_margin_loss`` takes the sentence embedding first; passing the
        # arguments in this method's own order would score the negatives
        # against ``z_s`` instead of the reconstruction ``r_s``.
        J = self.max_margin_loss(z_s, r_s, z_n)
        U = r_s.shape[0] * self.regularize(self.T.weight)
        return J + self.ortho_reg * U

    @staticmethod
    def regularize(T):
        """Frobenius norm of ``T_n T_n^T - I`` for the row-normalised ``T``."""
        T_n = torch.nn.functional.normalize(T, dim=1)
        eye = torch.eye(T_n.shape[0], dtype=T_n.dtype, device=T_n.device)
        return torch.norm(T_n.mm(T_n.t()) - eye)

    @staticmethod
    def max_margin_loss(z_s, r_s, z_n):
        """Sum over sentences and negatives of ``max(0, 1 - r.z_s + r.z_n)``.

        All vectors are L2-normalised first. ``z_s`` and ``r_s`` have shape
        ``(batch, d)`` and ``z_n`` has shape ``(batch, n_neg, d)``.
        """
        z_s_n = torch.nn.functional.normalize(z_s, dim=-1)
        r_s_n = torch.nn.functional.normalize(r_s, dim=-1)
        z_n_n = torch.nn.functional.normalize(z_n, dim=-1)
        pos = (z_s_n * r_s_n).sum(dim=-1, keepdim=True)          # (batch, 1)
        # Squeeze only the trailing axis so batch == 1 or n_neg == 1 keep
        # their shape.
        negs = torch.bmm(z_n_n, r_s_n.unsqueeze(2)).squeeze(2)   # (batch, n_neg)
        # A scalar margin broadcasts on whatever device the inputs live on.
        hinge = torch.clamp(1.0 - pos + negs, min=0.0)
        return torch.sum(hinge)

    def aspects(self, i2w, n=8):
        """Print, for every aspect, the ``n`` words whose embeddings have the
        highest cosine similarity to it (listed in ascending similarity)."""
        E = self.E.weight.detach().cpu().numpy()
        T = self.T.weight.detach().cpu().numpy()
        # Clamp the norms so all-zero rows give similarity 0 instead of NaN;
        # NaNs would sort to the end and show up as top neighbours.
        E = E / np.maximum(np.linalg.norm(E, axis=-1, keepdims=True), 1e-12)
        T = T / np.maximum(np.linalg.norm(T, axis=-1, keepdims=True), 1e-12)
        for j, a in enumerate(T):
            neighbors = np.argsort(E.dot(a.T))[-n:]
            print('Aspect %2d: %s' % (j, ', '.join([i2w[w] for w in neighbors])))

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
import codecs
import json
import os


class wikidata:
    """Corpus wrapper: preprocess the raw text, embed it and initialise aspects.

    Derived files are cached next to the corpus: ``<corpus>.prep`` holds the
    preprocessed text and ``<corpus>.prep.w2v`` the word2vec model.

    Parameters
    ----------
    corpus_path : str
        Raw text file, one document per line.
    d_embed : int
        Word-embedding dimensionality.
    n_aspects : int
        Number of aspects to initialise.
    """

    def __init__(self, corpus_path, d_embed=200, n_aspects=14):
        self.corpus_path = corpus_path

        self.prep_path = self.corpus_path + '.prep'
        if not os.path.isfile(self.prep_path):
            self.preprocess(self.corpus_path, self.prep_path)

        self.model_path = self.prep_path + '.w2v'
        w2v = word2vec(self.prep_path)
        w2v.embed(self.model_path, d_embed)
        w2v.aspect(n_aspects)
        self.n_vocab = len(w2v.w2i)
        self.w2v = w2v

    def preprocess(self, input_path, output_path):
        """Lower-case, tokenise, drop English stop words and lemmatise each line.

        Lines with three or fewer remaining tokens are discarded. The result
        is written to a temporary file and moved into place only on success,
        so an interrupted run never leaves a truncated file at ``output_path``
        for ``__init__`` to reuse.
        """
        lmtzr = WordNetLemmatizer()
        # A set gives constant-time membership tests inside the token loop.
        stop = set(stopwords.words('english'))
        token = CountVectorizer().build_tokenizer()
        lc = linecount(input_path)
        tmp_path = output_path + '.tmp'
        with open(input_path, 'r') as in_f, open(tmp_path, 'w') as out_f:
            for l in tqdm.tqdm(in_f, total=lc, desc='preprocessing "%s"' % input_path):
                tokens = [lmtzr.lemmatize(t) for t in token(l.lower()) if t not in stop]
                if len(tokens) > 3:
                    out_f.write(' '.join(tokens) + '\n')
        os.replace(tmp_path, output_path)

In [8]:
import torch.optim as optim
import torch


def train(dataset, device='cuda', epochs=5, epochsize=50, batchsize=100, initial_lr=0.02):
    """Train an ``abae`` model on ``dataset`` and return it.

    Parameters
    ----------
    dataset : object exposing ``w2v`` (with ``w2i``/``i2w``) and ``prep_path``.
    device : str
        Torch device the model and batches are moved to.
    epochs, epochsize : int
        Number of epochs and optimisation steps per epoch.
    batchsize : int
        Sentences per step.
    initial_lr : float
        Learning rate of the first step; it decays linearly over all
        ``epochs * epochsize`` steps.

    The aspects are printed before training and after every epoch, and the
    per-step loss curve is re-plotted after every epoch.

    Returns
    -------
    The trained ``abae`` model.
    """
    dl = dataloader(dataset.w2v.w2i, dataset.prep_path, batchsize=batchsize)
    sen_gen = dl.sentence_batch_generator()
    neg_gen = dl.negative_batch_generator()

    ab = abae(dataset.w2v).to(device)
    ab.aspects(dataset.w2v.i2w)

    epoch_losses = []
    total_steps = epochs * epochsize
    opt = optim.Adam(ab.parameters(), lr=initial_lr)
    for e in range(epochs):
        epoch_losses.append([])
        with tqdm.trange(epochsize) as pbar:
            for b in pbar:
                # Linear decay on the global step. The product (e+1)*(b+1)
                # is not monotone in training progress and makes the rate
                # jump back up at the start of every epoch.
                step = e * epochsize + b
                lr = initial_lr * (1.0 - step / float(total_steps))
                for pg in opt.param_groups:
                    pg['lr'] = lr

                sen_input = next(sen_gen).to(device)
                neg_input = next(neg_gen).to(device)

                opt.zero_grad()
                loss = ab(sen_input, neg_input)
                loss.backward()
                opt.step()

                # Keep a plain float: no autograd graph is retained and the
                # value formats cleanly.
                loss_value = loss.item()
                epoch_losses[-1].append(loss_value)

                x = (e, b, lr, loss_value / batchsize)
                pbar.set_description('e: %d | b: %d | lr: %0.5f | MEAN-LOSS: %0.5f' % x)

        ab.aspects(dataset.w2v.i2w)
        all_losses = [x for y in epoch_losses for x in y]
        plt.plot(list(range(len(all_losses))), all_losses)
        plt.xlabel('step')
        plt.ylabel('batch loss')
        plt.show()

    return ab

In [None]:
# Hyper-parameters for this run: embedding width and number of aspects.
d_embed = 500
n_aspects = 20

# Alternative corpora, kept for reference:
#wd = wikidata('./data/wiki_01')
#wd = wikidata('./data/restaurant.train.txt', d_embed, n_aspects)
# Preprocesses the corpus and fits/loads word2vec unless the cached files exist.
wd = wikidata('./data/beer.train.txt', d_embed, n_aspects)

# Report the resulting vocabulary size and model dimensions.
x = (wd.n_vocab, wd.w2v.d_embed, wd.w2v.n_aspects)
print('n_vocab: %d | d_embed: %d | n_aspects: %d' % x)

# Train on the CPU: 10 epochs of 100 steps with 100 sentences per step.
train(wd, device='cpu', epochs=10, epochsize=100, batchsize=100, initial_lr=0.01)

training: 4326539it [00:32, 146254.61it/s]

# TODO

    impose maximum vocab size
    num tag for preprocessing
    preventing dupes in aspect printing
    updating loss plot
    validation loss measurement
    model saving/loading
    inferring n_aspects
    cli
    break into package
    documentation
    downweight specificity?


# TRASH 

In [None]:
# Shell out to `wc` to show the line/word/byte counts of the preprocessed corpus.
!{'wc %s' % wd.prep_path}

class dedicated to making a structured dataset from a particular data source
    raw text -> preprocessing -> splitting
    creates a vocab
    vocab trains word embeddings

class data loader which serves batches of training/evaluation data
    requires preprocessed text to serve
    requires predetermined vocab
    vocab requires word embeddings
    
class model just the neural network parts

class wrap model with interface
    training
    evaluation
    deployment

cli script covering interface