In [None]:
!pip install transformers

In [2]:
import torch
import numpy as np
import math

from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [7]:
TRAIN_FILE_PATH = "/content/drive/My Drive/wikitext-2-raw/wiki.train.raw"
TEST_FILE_PATH = "/content/drive/My Drive/wikitext-2-raw/wiki.test.raw"

# Number of leading characters kept in the truncated ".short" copies.
TRAIN_SHORT_CHARS = 1000000
TEST_SHORT_CHARS = 500000

# Read both corpora inside `with` blocks so the file handles are closed
# deterministically, and decode explicitly as UTF-8 instead of relying on the
# platform default encoding.
with open(TRAIN_FILE_PATH, 'r', encoding='utf-8') as f:
  text_train = f.read()

with open(TEST_FILE_PATH, 'r', encoding='utf-8') as f:
  text_test = f.read()

# Write truncated copies next to the originals; these ".short" files are the
# ones passed to the fine-tuning script further down.
with open(TRAIN_FILE_PATH + ".short", "w", encoding='utf-8') as f:
  f.write(text_train[:TRAIN_SHORT_CHARS])

with open(TEST_FILE_PATH + ".short", "w", encoding='utf-8') as f:
  f.write(text_test[:TEST_SHORT_CHARS])

In [None]:
# Load the pretrained tokenizer and language-model head, then place the model
# on the selected device in one step.
# NOTE(review): the fine-tuning command in this notebook trains the `gpt2`
# checkpoint and writes to `output`, while this cell loads `gpt2-medium` from
# the hub - confirm which model the later cells are meant to evaluate.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to(device)

In [None]:
# Download the Hugging Face example language-modeling script into the current
# working directory.
# NOTE(review): the URL points at the `master` branch, so the script version is
# not pinned - confirm it is compatible with the installed transformers release.
!wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/language-modeling/run_language_modeling.py
# List the Python files in the working directory (shows whether the script is present).
!ls -l *.py

In [9]:
# Run the downloaded example script: train (`--do_train`) starting from the
# `gpt2` checkpoint on the truncated training file, then evaluate (`--do_eval`)
# on the truncated test file. Results and checkpoints go to the `output` directory.
# The two ".short" paths are the files written by the data-preparation cell.
# NOTE(review): the `$"..."` quoting is unusual; presumably the shell treats it
# like an ordinary double-quoted string (needed for the space in "My Drive") - confirm.
!python run_language_modeling.py \
    --output_dir=output \
    --model_type=gpt2 \
    --model_name_or_path=gpt2 \
    --do_train \
    --train_data_file=$"/content/drive/My Drive/wikitext-2-raw/wiki.train.raw.short" \
    --do_eval \
    --eval_data_file=$"/content/drive/My Drive/wikitext-2-raw/wiki.test.raw.short"

2020-10-16 10:15:33.222216: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
10/16/2020 10:15:34 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='output', overwrite_output_dir=False, do_train=True, do_eval=True, do_predict=False, evaluate_during_training=False, evaluation_strategy=<EvaluationStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir='runs/Oct16_10-15-34_c96d4f7104be', logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_deb

In [10]:
def choose_from_top(probs, n=5):
    """Sample one token id from the ``n`` most probable entries of ``probs``.

    The ``n`` largest probabilities are selected, renormalised so they sum to
    one, and a single index is drawn at random from that restricted
    distribution (top-n sampling).

    Args:
        probs: 1-D array-like of probabilities, indexed by token id.
        n: Number of top candidates to sample from. Values larger than the
            number of entries in ``probs`` are clamped, so short
            distributions no longer make ``np.argpartition`` fail.

    Returns:
        A ``(token_id, prob)`` tuple. ``token_id`` is a Python ``int`` index
        into ``probs``. ``prob`` is a length-1 NumPy array holding the chosen
        token's probability *after* renormalisation over the candidates, i.e.
        not the probability it had in the full ``probs`` distribution.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    probs = np.asarray(probs)
    if n < 1:
        raise ValueError("n must be at least 1")

    # Never ask for more candidates than there are entries.
    k = min(n, probs.shape[0])

    # argpartition places the k largest values (in arbitrary order) at the end.
    ind = np.argpartition(probs, -k)[-k:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob)  # Renormalise over the candidates
    choice = np.random.choice(k, 1, p=top_prob)
    token_id = ind[choice][0]
    return int(token_id), top_prob[choice]

In [11]:
def generate_some_text(input_str, text_len = 100):
    """Generate a continuation of ``input_str`` with top-5 sampling and print it.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``. Tokens are
    sampled one at a time with ``choose_from_top(..., n=5)`` until
    ``text_len`` tokens have been drawn or the end-of-sequence token is
    sampled. The model is switched to eval mode as a side effect.

    Two lines are printed: the decoded text, and a "perplexity" value computed
    as ``2 ** (-mean(log2 p))`` over the sampled tokens, where ``p`` is the
    probability returned by ``choose_from_top`` (renormalised over the top-5
    candidates, so this is not the model's true perplexity). The value is
    printed as a scalar; ``nan`` is printed when no token was sampled
    (``text_len <= 0``) instead of raising ``ZeroDivisionError``.

    Args:
        input_str: Prompt text to continue.
        text_len: Maximum number of tokens to sample.

    Returns:
        None. Output is printed.
    """
    cur_ids = torch.LongTensor(tokenizer.encode(input_str)).to(device)
    eos_id = tokenizer.eos_token_id  # looked up once instead of re-encoding the EOS string each step
    n_sampled = 0
    log2_prob_sum = 0.0
    model.eval()
    with torch.no_grad():
        for _ in range(text_len):
            # Only the logits are needed here, so no labels are passed and no
            # loss is computed.
            logits = model(cur_ids)[0]
            # cur_ids is 1-D, so logits[-1] is the distribution for the next token.
            next_probs = torch.softmax(logits[-1], dim=0)
            # Randomly pick the next token among the 5 most probable ones.
            next_token_id, prob = choose_from_top(next_probs.cpu().numpy(), n=5)
            n_sampled += 1
            # choose_from_top returns a length-1 array; take its single value
            # so the accumulator stays a plain float.
            log2_prob_sum += float(np.log2(np.ravel(prob)[0]))
            if next_token_id == eos_id:  # the model produced end-of-text: stop
                break
            # Append the sampled token without a round-trip through a Python list.
            next_id = torch.tensor([next_token_id], dtype=torch.long, device=device)
            cur_ids = torch.cat([cur_ids, next_id])

        output_text = tokenizer.decode(cur_ids)
        print(output_text)
        if n_sampled:
            perplexity = np.power(2, -log2_prob_sum / n_sampled)
        else:
            perplexity = float('nan')
        print('perplexity=', perplexity)

In [None]:
# Sample and print a continuation for a fixed prompt (default length).
generate_some_text(input_str="The rain was unexpectedly warm")

In [12]:
def count_perplexity(encodings):
  """Return the perplexity the notebook-level ``model`` assigns to a tokenized text.

  The input ids are passed to the model both as inputs and as labels, and the
  perplexity is the exponential of the loss the model returns.

  Args:
      encodings: Tokenizer output exposing an ``input_ids`` tensor (the caller
          in this notebook uses ``tokenizer(text, return_tensors='pt')``).

  Returns:
      The perplexity as a Python ``float``.
  """
  input_ids = encodings.input_ids.to(device)

  # Make sure dropout is disabled, matching generate_some_text.
  model.eval()
  with torch.no_grad():
      outputs = model(input_ids, labels=input_ids)

  # With labels supplied, the first output is the loss.
  loss = outputs[0]
  return math.exp(loss.item())

In [40]:
# Groups of near-identical sentences that differ in how plausible they are;
# the perplexity of each one is printed for comparison.
# The list and loop variable are named `sentences` / `sentence` so the
# builtins `input` and `str` are not shadowed for the rest of the session.
# The sentence texts (including their spelling) are left exactly as they were,
# because the recorded outputs were computed on these exact strings.
sentences = [
    'The moon is made of chocolate.', 'The moon is made of cheese.', 'The moon is made of oxygen and silicon.',
    'Lions live in cities and eat berries.', 'Lions live in cities and eat hoofed mammals.', 'Lions live in savannas and eat hoofed mammals.',
    'During the summer and autumn pigeons store fat to hibernate for the spring.', 'During the summer and autumn bears store fat to hibernate for the spring.', 'During the summer and autumn bears store fat to hibernate for the winter.',
    'People see well in the dark.', 'Cats see well in the dark.',
    'Kefir is made from bananas and meet.', 'Kefir is made from milk.',
    'Spruce needles are red in winter and in summer they turns green.', 'Spruce needles are green both in winter and in summer.',
]
for sentence in sentences:
  tokens = tokenizer(sentence, return_tensors='pt')
  result = count_perplexity(tokens)
  print('perplexity of (', sentence, ') =', result)

perplexity of ( The moon is made of chocolate. ) = 70.63049732650195
perplexity of ( The moon is made of cheese. ) = 49.780604325072204
perplexity of ( The moon is made of oxygen and silicon. ) = 42.888554838426735
perplexity of ( Lions live in cities and eat berries. ) = 249.42906740815835
perplexity of ( Lions live in cities and eat hoofed mammals. ) = 115.49040691303497
perplexity of ( Lions live in savannas and eat hoofed mammals. ) = 56.169676999696016
perplexity of ( During the summer and autumn pigeons store fat to hibernate for the spring. ) = 46.781344196156205
perplexity of ( During the summer and autumn bears store fat to hibernate for the spring. ) = 53.68137365320195
perplexity of ( During the summer and autumn bears store fat to hibernate for the winter. ) = 36.398219317308545
perplexity of ( People see well in the dark. ) = 175.21189581355935
perplexity of ( Cats see well in the dark. ) = 87.59922242092085
perplexity of ( Kefir is made from bananas and meet. ) = 235.2903