In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import math
import time

In [2]:
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

In [3]:
class NGramLM(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, context_size, hidden_size):
        super(NGramLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_dim)
        self.fnn = nn.Sequential(
            nn.Linear(context_size * embedding_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, vocab_size),
        )
        
    def forward(self, inputs):
        embed = self.embed(inputs)      # [batch_size, context_size, embedding_dim]
        embed = embed.view((1, -1))     # [batch_size, context_size * embedding_dim]
        logits = self.fnn(embed)
        
        return logits

In [4]:
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
            for i in range(len(test_sentence) - 2)]
# print the first 3, just so you can see what they look like
print(trigrams[:3])

vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}

[(['When', 'forty'], 'winters'), (['forty', 'winters'], 'shall'), (['winters', 'shall'], 'besiege')]


In [5]:
for context, target in trigrams:
    print(context)
    print(torch.tensor([word_to_ix[w] for w in context], dtype=torch.long))
    break

['When', 'forty']
tensor([20,  4])


In [6]:
EMBEDDING_DIM = 50
CONTEXT_SIZE = 2
NEPOCH = 10

In [15]:
model = NGramLM(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE, 128)

In [8]:
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [9]:
loss_function = nn.CrossEntropyLoss()

In [16]:
tensor_type = torch.LongTensor

In [11]:
use_cuda = torch.cuda.is_available()
if use_cuda:
    tensor_type = torch.cuda.LongTensor
    model.cuda()

In [29]:
for EPOCH in range(NEPOCH):
    
    train_loss = 0.0
    
    start = time.time()
    i = 0
    for context, target in trigrams:
        
        # Word to index and to tensor
        context_idxs = torch.tensor([word_to_ix[w] for w in context]).type(tensor_type)
        model.zero_grad()
        
        log_probs = model(context_idxs)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]]).type(tensor_type))
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        i += 1
        
#         if (sent_id+1) % 1000 == 0:
#             print(f'Finished sentences, ({len(train)/(time.time()-start):.2f} words per second)')
    print(f'Epoch {EPOCH+1:4}:', 
          f'train_loss={train_loss:.4f},',
          f'ppl={math.exp(train_loss/len(vocab)):.4f},',
          f'time_per_epoch={(time.time()-start):.2f},',
          f'time_per_iter={(time.time()-start)/i:.2f}')
  

Epoch    1: train_loss=521.8302, ppl=216.9555, time_per_epoch=0.17, time_per_iter=0.00
Epoch    2: train_loss=521.8302, ppl=216.9555, time_per_epoch=0.13, time_per_iter=0.00
Epoch    3: train_loss=521.8302, ppl=216.9555, time_per_epoch=0.13, time_per_iter=0.00
Epoch    4: train_loss=521.8302, ppl=216.9555, time_per_epoch=0.13, time_per_iter=0.00
Epoch    5: train_loss=521.8302, ppl=216.9555, time_per_epoch=0.12, time_per_iter=0.00
Epoch    6: train_loss=521.8302, ppl=216.9555, time_per_epoch=0.13, time_per_iter=0.00
Epoch    7: train_loss=521.8302, ppl=216.9555, time_per_epoch=0.13, time_per_iter=0.00
Epoch    8: train_loss=521.8302, ppl=216.9555, time_per_epoch=0.13, time_per_iter=0.00
Epoch    9: train_loss=521.8302, ppl=216.9555, time_per_epoch=0.12, time_per_iter=0.00
Epoch   10: train_loss=521.8302, ppl=216.9555, time_per_epoch=0.13, time_per_iter=0.00


In [25]:
def gen_word2vec_train_example(sent, context_size=2, pad_token=0):
    '''
    Generate Word2Vec training example, i.e. left + right context -> target.
    [0, 0, w_2, w_3] -> w_1, ... [w_n-2, w_n-1, 0, 0] -> w_n
    '''
    
    # Pad the sentence with <S> token
    padded_sent = [pad_token] * context_size + sent + [pad_token] * context_size

    # Generate training examples
    for i in range(context_size, len(sent) + context_size):
        context = padded_sent[i - N:i] + padded_sent[i + 1:i + N + 1]
        target = [padded_sent[i]]
        
        yield context, target

def gen_lm_train_example(sent, context_size=2, pad_token=0):
    '''
    Generate LM training example, i.e. context -> target.
    [0, 0] -> w_1, ... [w_n-1, w_n] -> 0
    '''
    
    # Pad the sentence with [S] token
    padded_sent = sent + [pad_token]

    # Generate training examples
    context = [pad_token] * context_size
    for target in padded_sent:
        yield context, target
        context = context[1:] + [target]

In [None]:
import torch
from torch import nn
from torch.autograd import Variable

class BoW(torch.nn.Module):
    '''Bag of Words.
    '''
    def __init__(self, nwords, ntags):
        super(BoW, self).__init__()

        tensor_type = torch.FloatTensor
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            tensor_type = torch.cuda.FloatTensor

        self.bias = Variable(torch.zeros(ntags), requires_grad=True).type(tensor_type)
        self.embedding = nn.Embedding(nwords, ntags)
        nn.init.xavier_uniform_(self.embedding.weight)


    def forward(self, words):
        emb = self.embedding(words)
        out = torch.sum(emb, dim=0) + self.bias # size(out) = N
        out = out.view(1, -1) # size(out) = 1 x N
        return out

#### run

In [None]:
def set_logger(args):
    '''
    Write logs to checkpoint and console
    '''

    if args.train:
        log_file = os.path.join(args.save_path or args.init_checkpoint, 'train.log')
    else:
        log_file = os.path.join(args.save_path or args.init_checkpoint, 'test.log')

    logging.basicConfig(
        format='%(asctime)s %(levelname)-8s %(message)s',
        level=logging.INFO,
        datefmt='%Y-%m-%d %H:%M:%S',
        filename=log_file,
        filemode='w'
    )
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s %(levelname)-8s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

In [None]:
def run(args):
    
    model = Model()

    logging.info('Model Parameter:')
    for name, param in kge_model.named_parameters():
        logging.info(f'{name} = {param}')
    
    pubmed_pubtator = PubmedPubtatorDataset()
    train_loader = DataLoader(pubmed_pubtator, batch_size=1, shuffle=True, num_workers=4)
    valid_loader = DataLoader(pubmed_pubtator, batch_size=1, shuffle=True, num_workers=4)
    
    if args.init_checkpoint:
        logging.info(f'Load checkpoint from {args.init_checkpoint}...')
        checkpoint = torch.load(args.init_checkpoint)
        
    else:
        logging.info('Start from scratch')
    
    if args.train:
        train()
        
    if args.valid:
        test()
        
    if args.test:
        test_loader = DataLoader()
        test()
        
    

#### Train

In [None]:
def train(args, model, train_loader, val_loader, optimizer, loss_fn, scheduler=None):
    '''Training loop'''
    
    model.train()
    for epoch in range(args.epoch):
        
        train_loss = 0.0
        start = time.time()
        
        for i, batch in enumerate(train_loader):
            for sent in batch:  # Each batch is a doc (a list of sent)
                
                model.zero_grad()
                
                context, target = gen_lm_train_example(sent)
                
                start_ = time.time()
                logits = model(context.type(tensor_type))
                loss = loss_function(logits, target.type(tensor_type))
                forward_time = time.time() - start_

                start_ = time.time()
                loss.backward()
                backward_time = time.time() - start_

                optimizer.step()

                train_loss += loss.item()
    
    for (epoch + 1) % args.log_step == 0:
        log()
        
    for (epoch + 1) % args.valid_step == 0:
        model.eval()
        valid_loss = valid()
        log()
        
        if valid_loss < min_valid_loss:
            save()
    
    if scheduler is not None:
        scheduler.step()

In [None]:
pubmed_pubtator = PubmedPubtatorDataset()
dataloader = DataLoader(data, batch_size=1, shuffle=True, num_workers=4)

# Train
for EPOCH in range(NEPOCH):
    
    train_loss = 0.0
    start = time.time()
    
    for i, batch in enumerate(dataloader):
        for sent in batch:
            
            model.zero_grad()
            
            context, target = gen_lm_train_example(sent)
            
            start_ = time.time()
            logits = model(context.type(tensor_type))
            loss = loss_function(logits, target.type(tensor_type))
            forward_time = time.time() - start_

            start_ = time.time()
            loss.backward()
            backward_time = time.time() - start_
            
            optimizer.step()

            train_loss += loss.item()
        

    print(f'Epoch {EPOCH+1:4}:', 
          f'train_loss={train_loss:.4f},',
          f'ppl={math.exp(train_loss/len(vocab)):.4f},',
          f'time_per_epoch={(time.time()-start):.2f},',
          f'time_per_iter={(time.time()-start)/i:.2f}')
    
    if (step + 1) % valid_step == 0:
        
  

## Dataset

In [2]:
import os

In [3]:
from torch.utils.data import Dataset, DataLoader

In [None]:
corpus = '/scratch/cheng.jial/dataset-nlp/pubmed_pubtator/pmid'

In [15]:
class PubmedPubtatorDataset(Dataset):
    '''PubMed abstract with PubTator NER.
       Chemical/Disease/Gene transformed into MeSH IDs.
    '''
    
    def __init__(self, root_dir=corpus, transform=[]):
        self.root_dir = root_dir
        self.pmid_list = os.listdir(root_dir)
        self.pmid_list.sort()
        self.transform = transform
        
    def __len__(self):
        return len(self.pmid_list)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        pmid = self.pmid_list[idx]
        doc_name = os.path.join(self.root_dir, pmid)
        with open(doc_name, 'r') as f:
            doc = f.readlines()
        doc = [l.strip('\n') for l in doc]
        
        for i in self.transform:
            doc = self.transform(doc)
            
        return pmid, doc

In [16]:
pubmed_pubtator = PubmedPubtatorDataset()
dataloader = DataLoader(data, batch_size=1, shuffle=True, num_workers=4)

In [17]:
for i in range(len(data)):
    print(data[i])
    break

['Higgs-boson production in nucleus-nucleus collisions.', 'Cross-section calculations are presented for the production of intermediate-mass Higgs bosons produced in ultrarelativistic nucleus-nucleus collisions via two-photon fusion.', "The calculations are performed in position space using Baur's method for folding together the DMESHD018980 spectra of the two colliding nuclei.", 'It is found that two-photon fusion in nucleus-nucleus collisions is a plausible way of finding intermediate-mass Higgs bosons at the Superconducting Super Collider or the CERN Large Hadron Collider.']


In [20]:
dataloader = DataLoader(data, batch_size=1, shuffle=True, num_workers=4)

In [21]:
for i_batch, batch in enumerate(dataloader):
    print(i_batch, batch)
    break

0 [('Altered balance of functional brain networks in DMESHD012559.',), ('Activity in dorsal attention (DAN) and frontoparietal (FPN) functional brain networks is linked to allocation of attention to external stimuli, and activity in the default-mode network (G23336) is linked to allocation of attention to internal representations.',), ('Tasks requiring attention to external stimuli shift activity to the DAN/FPN and away from the G23336, and optimal task performance depends on balancing DAN/FPN against G23336 activity.',), ('The current functional magnetic resonance imaging (DMESHC564543) study assessed the balance of DAN/FPN and G23336 activity in 13 DMESHD012559 patients and 13 healthy controls while they were engaged in a task switching Stroop paradigm which demanded internally directed attention to task instructions.',), ('The typical pattern of reciprocity between the DAN/G23336 was observed for healthy controls but not for patients, suggesting a reduction in the internally focusse

In [None]:
class Tokenize(object):
    '''Tokenize all sentences in a doc'''
    
    def __init__(self):
        self.tokenizer = spacy()
    
    def __call__(self, doc):
        doc = [self.__tokenize__(sen) for sen in doc]
        
        return doc

In [None]:
class ToTensor(object):
    '''Convert all sentences in a docs to torch.Tensor'''
    
    def __init__(self, w2i):
        self.w2i = w2i
        
    def __tokenize__(self, sentence):
        tokens = [t.token for t in self.tokenizer(sentence)]
        return tokens
    
    def __w2i__(self, word):
        try:
            return self.w2i[word]
        except:
            return self.w2i['<UNK>']
    
    def __totensor__(self, sent):
        return torch.from_numpy(sent)
    
    def __call__(self, doc):
        doc = [[self.__toidx__(sent) for w in sent] for sent in doc]
        doc = [self.__totensor__(sent) for sent in doc]
        
        return doc

In [3]:
import pytorch_transformers as pt

In [4]:
from pytorch_transformers import BertTokenizer

In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

100%|██████████| 231508/231508 [00:00<00:00, 3393306.21B/s]


In [13]:
text = "CMESHD002847"
tokenizer.tokenize(text)

['cm', '##esh', '##d', '##00', '##28', '##47']

In [5]:
from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel

In [9]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Encode a text inputs
text = "CMESHD002847"
indexed_tokens = tokenizer.encode(text)

In [10]:
tokenizer.tokenize(text)

['ĠCM', 'ES', 'HD', '00', '28', '47']

In [7]:
indexed_tokens

[5338, 373, 5395, 367, 19069, 5633, 5395, 367, 19069, 373, 257]

In [None]:
tokens_tensor = torch.tensor([indexed_tokens])