In [1]:
def linecount(path):
    """Return how many lines the text file at ``path`` contains."""
    with open(path, 'r') as handle:
        # Iterating a file object yields one item per line.
        return sum(1 for _ in handle)

In [2]:
import torch

class dataloader:
    """Serve zero-padded word-index batches read from a text file.

    Each line of ``path`` is treated as one sentence and split on whitespace.
    Words missing from ``w2i`` are mapped to ``w2i['<unk>']``; positions past
    the end of a sentence are filled with index 0.

    Parameters
    ----------
    w2i : dict
        Mapping from word to integer index.
    path : str
        Text file with one sentence per line.
    batchsize : int
        Number of sentences per batch.
    negsize : int
        Number of negative samples drawn per sentence.
    n_batches : int, optional
        Upper bound on batches served by the sequential iterator.  When falsy
        it is derived from the file's line count.
    """

    def __init__(self, w2i, path, batchsize=16, negsize=20, n_batches=None):
        self.w2i = w2i
        self.path = path
        self.batchsize = batchsize
        self.negsize = negsize
        if not n_batches:
            # Enough batches to cover every line, including a partial last one.
            n_batches = (linecount(path) // batchsize) + 1
        self.n_batches = n_batches

    def b2i(self, batch):
        """Convert an iterable of raw lines into a padded int64 index matrix.

        Returns an array of shape ``(len(batch), longest_sentence)``.  An empty
        batch yields an array of shape ``(0, 0)``.
        """
        batch = [l.split() for l in batch]
        lengths = [len(l) for l in batch]
        width = max(lengths) if lengths else 0
        # Integer dtype: these are embedding indices that end up in a LongTensor.
        index = np.zeros((len(batch), width), dtype=np.int64)
        for j, (words, length) in enumerate(zip(batch, lengths)):
            index[j, :length] = [(self.w2i[w] if w in self.w2i else self.w2i['<unk>']) for w in words]
        return index

    def _load_index(self):
        """Read every line of ``self.path`` and index it as one padded matrix."""
        with open(self.path, 'r') as f:
            data = self.b2i(list(f))
        if len(data) == 0:
            raise ValueError('no sentences found in "%s"' % self.path)
        return data

    # NOTE(review): because this name starts with two underscores and does not
    # end with two, Python mangles it to ``_dataloader__iter___AAAAA``; it is
    # never used as ``__iter__``.  Presumably disabled on purpose - confirm
    # before renaming.
    def __iter___AAAAA(self):
        """Yield index matrices for consecutive batches, at most ``n_batches``.

        A trailing partial batch is yielded only if the cap has not been hit.
        """
        served = 0
        batch = []
        with open(self.path, 'r') as f:
            for l in f:
                batch.append(l)
                if len(batch) == self.batchsize:
                    yield self.b2i(batch)
                    served += 1
                    batch = []
                    if served >= self.n_batches:
                        return
            if batch:
                yield self.b2i(batch)

    def sentence_batch_generator(self):
        """Yield shuffled sentence batches forever as ``torch.LongTensor``.

        The whole file is indexed once.  Rows are reshuffled every time all of
        them have been served; the last batch of a pass may be smaller than
        ``batchsize``.
        """
        data = self._load_index()

        # Ceiling division, so a partial last batch counts as a batch and the
        # reshuffle below is always reached.
        n_batch = -(-len(data) // self.batchsize)
        np.random.shuffle(data)
        batchcount = 0
        while True:
            if batchcount >= n_batch:
                np.random.shuffle(data)
                batchcount = 0
            start = batchcount * self.batchsize
            batch = data[start:start + self.batchsize]
            batchcount += 1
            yield torch.LongTensor(batch)

    def negative_batch_generator(self):
        """Yield random negative samples forever.

        Each item is a ``torch.LongTensor`` of shape
        ``(batchsize, negsize, sentence_width)`` whose rows are drawn with
        replacement from the indexed file.
        """
        data = self._load_index()
        data_len, dim = data.shape

        while True:
            indices = np.random.choice(data_len, self.batchsize * self.negsize)
            samples = data[indices].reshape(self.batchsize, self.negsize, dim)
            yield torch.LongTensor(samples)

In [3]:
from sklearn.cluster import KMeans
import numpy as np
import gensim
import codecs
import tqdm


class word2vec:
    """Vocabulary, word-embedding matrix and aspect centroids for one corpus.

    Iterating the object streams the corpus at ``corpus_path`` as token lists,
    which is the form handed to gensim for training.

    Attributes set by :meth:`embed`: ``w2i``, ``i2w``, ``n_words``, ``E``
    (float32 matrix whose row ``i`` is the vector of word ``i``) and
    ``d_embed``.  Attributes set by :meth:`aspect`: ``n_aspects`` and ``T``.
    """

    def __init__(self, corpus_path):
        self.corpus_path = corpus_path
        self.n_words = 0
        # Initialised here so add() works before embed() has been called.
        self.i2w, self.w2i = {}, {}

    def __iter__(self):
        """Yield each corpus line as a list of whitespace-separated tokens."""
        with codecs.open(self.corpus_path, 'r', 'utf-8') as f:
            for line in tqdm.tqdm(f, desc='training'):
                yield line.split()

    def add(self, *words):
        """Give every not-yet-known word the next free index."""
        for word in words:
            if word not in self.w2i:
                self.w2i[word] = self.n_words
                self.i2w[self.n_words] = word
                self.n_words += 1

    def embed(self, model_path, d_embed):
        """Load the gensim model at ``model_path`` or train and save one.

        Rebuilds the vocabulary and the embedding matrix ``E`` from the model
        and returns ``self``.
        """
        if os.path.isfile(model_path):
            model = gensim.models.Word2Vec.load(model_path)
        else:
            # NOTE(review): ``size=`` and ``wv.vocab`` look like the pre-4.0
            # gensim spellings - confirm against the installed version.
            model = gensim.models.Word2Vec(self,
                size=d_embed, window=5, min_count=10, workers=8)
            model.save(model_path)

        self._index_vectors(sorted(model.wv.vocab), model.wv, d_embed)
        self.d_embed = d_embed
        return self

    def _index_vectors(self, words, vectors, d_embed):
        """Assign indices to ``words`` and stack their vectors into ``self.E``.

        ``<pad>`` and ``<unk>`` take indices 0 and 1 and get rows of their own,
        so real words start at index 2 and ``E[i]`` always belongs to
        ``i2w[i]``.  The two special rows hold the mean word vector (zeros if
        there are no words), which keeps every row norm non-zero whenever the
        mean is.
        """
        self.i2w, self.w2i, self.n_words = {}, {}, 0
        self.add('<pad>', '<unk>')

        rows = []
        for word in words:
            self.add(word)
            rows.append(np.asarray(vectors[word], dtype=np.float32))

        if rows:
            special = np.mean(rows, axis=0)
        else:
            special = np.zeros(d_embed, dtype=np.float32)
        self.E = np.vstack([special, special] + rows).astype(np.float32)

    def aspect(self, n_aspects):
        """Cluster ``E`` with k-means and store L2-normalised centroids in ``T``."""
        self.n_aspects = n_aspects
        km = KMeans(n_clusters=n_aspects, random_state=0)
        km.fit(self.E)
        clusters = km.cluster_centers_

        # L2 normalization
        norm_aspect_matrix = clusters / np.linalg.norm(clusters, axis=-1, keepdims=True)
        self.T = norm_aspect_matrix.astype(np.float32)
        return self

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
import codecs
import json
import os

class wikidata:
    """Prepare a corpus: preprocessed text, word embeddings and aspect centroids.

    On construction this
      1. writes ``corpus_path + '.prep'`` via :meth:`preprocess` unless that
         file already exists,
      2. loads or trains word vectors stored at ``prep_path + '.w2v'``,
      3. clusters the vectors into ``n_aspects`` aspect centroids.
    The resulting ``word2vec`` object is kept in ``self.w2v``.
    """

    def __init__(self, corpus_path, d_embed=200, n_aspects=14):
        self.corpus_path = corpus_path

        self.prep_path = self.corpus_path + '.prep'
        if not os.path.isfile(self.prep_path):
            self.preprocess(self.corpus_path, self.prep_path)

        self.model_path = self.prep_path + '.w2v'
        w2v = word2vec(self.prep_path)
        w2v.embed(self.model_path, d_embed)
        w2v.aspect(n_aspects)
        self.w2v = w2v

    def preprocess(self, input_path, output_path):
        """Lower-case, tokenise, drop stop words and lemmatise every line.

        Lines that keep more than three tokens are written to ``output_path``
        as space-joined tokens, one line each; shorter lines are dropped.
        """
        lmtzr = WordNetLemmatizer()
        # A set gives constant-time membership tests for every token.
        stop = set(stopwords.words('english'))
        token = CountVectorizer().build_tokenizer()
        lc = linecount(input_path)
        # NOTE(review): both files use the platform default encoding, while
        # word2vec.__iter__ reads the .prep file as UTF-8 - confirm they agree.
        with open(input_path, 'r') as in_f, open(output_path, 'w') as out_f:
            for l in tqdm.tqdm(in_f, total=lc, desc='preprocessing "%s"' % input_path):
                tokens = [lmtzr.lemmatize(t) for t in token(l.lower()) if t not in stop]
                if len(tokens) > 3:
                    out_f.write(' '.join(tokens) + '\n')

In [7]:
import torch.optim as optim
import torch.nn as nn
import torch


# Small constant added to vector norms so divisions never hit exactly zero.
epsilon = 1e-7


class attention(nn.Module):
    """Attention over the words of one sentence.

    Each word is scored against the sentence's mean embedding through a
    learned ``d_embed x d_embed`` matrix, and the scores are turned into
    weights with a softmax.
    """

    def __init__(self, d_embed):
        super(attention, self).__init__()
        # Only ``M.weight`` is used in forward(); the Linear's bias never
        # enters the computation.
        self.M = nn.Linear(d_embed, d_embed)
        self.M.weight.data.uniform_(-0.1, 0.1)

    def forward(self, e_i):
        """Return one attention weight per word.

        Parameters
        ----------
        e_i : Tensor of shape ``(d_embed, n_words)``
            Word embeddings of the sentence, one column per word.

        Returns
        -------
        Tensor of shape ``(n_words,)`` with positive entries summing to 1.
        """
        # Sentence summary: mean over the word axis, kept as a column vector.
        y_s = torch.mean(e_i, dim=-1).unsqueeze(1)
        # d_i[k] = e_k^T . M . y_s
        d_i = e_i.t().mm(self.M.weight.mm(y_s))
        # Softmax over the words, i.e. exp(d_i) / sum_j exp(d_j).
        a_i = nn.functional.softmax(d_i, dim=0)
        return a_i.squeeze(1)

        
class abae(nn.Module):
    """Attention-based aspect extraction model.

    A sentence is reduced to an attention-weighted mix of its word embeddings,
    projected to a distribution over aspects, and rebuilt as a mix of aspect
    embeddings.  ``forward`` returns the max-margin reconstruction loss plus an
    orthogonality penalty on the aspect matrix.

    Parameters
    ----------
    w2v : object
        Must provide ``E`` (word-embedding matrix), ``T`` (aspect matrix),
        ``d_embed`` and ``n_aspects``.
    ortho_reg : float
        Weight of the orthogonality penalty.
    """

    def __init__(self, w2v, ortho_reg=0.1):
        super(abae, self).__init__()
        self.ortho_reg = ortho_reg

        # Size the table from the matrix itself so the module's bookkeeping
        # always matches the weights that are installed.
        word_matrix = torch.from_numpy(np.array(w2v.E))
        self.E = nn.Embedding(word_matrix.shape[0], w2v.d_embed)
        self.E.weight.data = word_matrix
        for param in self.E.parameters():
            param.requires_grad = False # freeze layer E

        self.T = nn.Embedding(w2v.n_aspects, w2v.d_embed)
        self.T.weight.data = torch.from_numpy(w2v.T)
        for param in self.T.parameters():
            param.requires_grad = True

        self.attention = attention(w2v.d_embed)
        self.linear = nn.Linear(w2v.d_embed, w2v.n_aspects)
        self.softmax = nn.Softmax(dim=0)

    def forward(self, pos, negs):
        """Return the scalar training loss for one sentence.

        Parameters
        ----------
        pos : LongTensor of shape ``(n_words,)``
            Word indices of the sentence.
        negs : LongTensor of shape ``(n_negatives, n_words)``
            Word indices of the negative sentences.
        """
        e_i = self.E(pos).t()                    # (d_embed, n_words)
        a_i = self.attention(e_i)                # (n_words,)
        z_s = torch.mv(e_i, a_i)                 # attended sentence embedding
        p_t = self.softmax(self.linear(z_s))     # weights over aspects
        r_s = self.T.weight.t().mm(p_t.unsqueeze(1)).squeeze(1)  # reconstruction
        e_n = self.E(negs).transpose(1, 2)       # (n_negatives, d_embed, n_words)
        z_n = torch.mean(e_n, dim=-1)            # plain average per negative
        loss = max_margin_loss(z_s, r_s, z_n)
        return loss + self.regularize()

    def regularize(self, eps=None):
        """Return ``ortho_reg * ||T_n T_n^T - I||`` for row-normalised ``T``.

        ``eps`` is added to the row norms before dividing; it defaults to the
        module-level ``epsilon``.
        """
        if eps is None:
            eps = epsilon
        weight = self.T.weight
        m = eps + torch.norm(weight, dim=1)
        T_n = (weight.t() / m).t()
        # Build the identity on the same device/dtype as the weights instead of
        # forcing CUDA, so the model also runs on CPU.
        eye = torch.eye(T_n.shape[0], device=T_n.device, dtype=T_n.dtype)
        U = T_n.mm(T_n.t()) - eye
        return self.ortho_reg * torch.norm(U)


def max_margin_loss(z_s, r_s, z_n, eps=None):
    """Hinge loss pushing the reconstruction towards the sentence embedding.

    For every negative ``n`` the term is ``max(0, 1 - cos(z_s, r_s) +
    cos(n, r_s))``; the terms are summed.

    Parameters
    ----------
    z_s : Tensor of shape ``(d,)``
        Sentence embedding.
    r_s : Tensor of shape ``(d,)``
        Reconstructed embedding.
    z_n : Tensor of shape ``(n_negatives, d)``
        Negative sample embeddings, one per row.
    eps : float, optional
        Added to every norm before dividing; defaults to the module-level
        ``epsilon``.

    Returns
    -------
    0-dim Tensor on the same device as the inputs.
    """
    if eps is None:
        eps = epsilon
    z_s_n = z_s / (eps + torch.sqrt(z_s.dot(z_s)))
    r_s_n = r_s / (eps + torch.sqrt(r_s.dot(r_s)))
    pos = z_s_n.dot(r_s_n)

    # All negatives at once: row-wise norms, then one matrix-vector product.
    # No explicit device is named, so this works on CPU and GPU alike.
    z_n_norm = torch.sqrt((z_n * z_n).sum(dim=-1, keepdim=True))
    neg = (z_n / (eps + z_n_norm)).mv(r_s_n)
    return torch.clamp(1.0 - pos + neg, min=0.0).sum()


def train(d_embed=200, n_aspects=14, epochs=5, lr=0.1, batchsize=64,
          n_batches_per_epoch=100):
    """Train the aspect model on the restaurant corpus and print its aspects.

    After every epoch the ten words whose normalised embeddings are closest to
    each aspect embedding are printed.

    Parameters
    ----------
    d_embed : int
        Word-embedding dimensionality.
    n_aspects : int
        Number of aspects to learn.
    epochs : int
        Number of passes of ``n_batches_per_epoch`` batches.
    lr : float
        Adam learning rate.
    batchsize : int
        Sentences per optimisation step.
    n_batches_per_epoch : int
        Optimisation steps per epoch.
    """
    wd = wikidata('./data/restaurant.train.txt', d_embed, n_aspects)

    # Use the GPU when there is one instead of assuming it exists.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    dl = dataloader(wd.w2v.w2i, wd.prep_path, batchsize=batchsize)
    sen_gen = dl.sentence_batch_generator()
    neg_gen = dl.negative_batch_generator()

    ab = abae(wd.w2v).to(device)

    # The frozen word embeddings are left out of the optimiser.
    opt = optim.Adam([p for p in ab.parameters() if p.requires_grad], lr=lr)

    for e in range(epochs):
        with tqdm.trange(n_batches_per_epoch) as pbar:
            for b in pbar:
                # Move each batch to the device once rather than row by row.
                sen_input = next(sen_gen).to(device)
                neg_input = next(neg_gen).to(device)

                opt.zero_grad()

                # zip() stops at the shorter input, so a partial sentence
                # batch simply uses fewer negative groups.
                losses = [ab(pos, negs) for pos, negs in zip(sen_input, neg_input)]
                if not losses:
                    # Nothing to optimise for an empty batch.
                    continue
                loss = torch.stack(losses).sum()
                loss.backward()
                opt.step()

                # Average over the sentences actually seen in this step.
                mean_loss = loss.item() / len(losses)
                pbar.set_description('e: %d | b: %d | MEAN-LOSS: %f' % (e, b, mean_loss))

        # epsilon keeps an all-zero row from turning into NaNs.
        word_emb = ab.E.weight.detach().cpu().numpy()
        word_emb = word_emb / (epsilon + np.linalg.norm(word_emb, axis=-1, keepdims=True))
        aspect_emb = ab.T.weight.detach().cpu().numpy()
        aspect_emb = aspect_emb / (epsilon + np.linalg.norm(aspect_emb, axis=-1, keepdims=True))

        for ind in range(len(aspect_emb)):
            sims = word_emb.dot(aspect_emb[ind])
            ordered_words = np.argsort(sims)[::-1]  # most similar first

            desc_list = [wd.w2v.i2w[w] for w in ordered_words[:10]]
            print('Aspect %d: %s' % (ind, ','.join(desc_list)))

In [None]:
# Run training on the corpus hard-coded inside train(); n_aspects keeps its default of 14.
train(d_embed=200, epochs=5, lr=0.1, batchsize=32)

# TRASH 

In [None]:
!{'wc %s' % wd.prep_path}

class dedicated to making a structured dataset from a particular data source
    raw text -> preprocessing -> splitting
    creates a vocab
    vocab trains word embeddings

class data loader which serves batches of training/evaluation data
    requires preprocessed text to serve
    requires predetermined vocab
    vocab requires word embeddings
    
class model just the neural network parts

class wrap model with interface
    training
    evaluation
    deployment

cli script covering interface