In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.run_this_cell {display: block !important;} </style"))
display(HTML("<style>.run_this_cell {hover: block !important;} </style"))
display(HTML("<style>.container { width: 95% !important; }</style>"))

In [3]:
cd /users2/ejoswin/real/pytorch-transformers/

/users2/ejoswin/real/pytorch-transformers


In [4]:
import torch
from tqdm import trange
import torch.nn.functional as F
import pandas as pd

In [5]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [6]:
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
        Args:
            logits: logits distribution shape (batch size x vocabulary size)
            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
            top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value
    top_p = 0.5
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        # scatter sorted tensors to original indexing
        indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove)
        logits[indices_to_remove] = filter_value
    return logits


In [7]:
def sample_sequence(model, length, context, num_samples=1, temperature=1, 
                    top_k=0, top_p=0.0, repetition_penalty=1.0,
                    is_xlnet=False, is_xlm_mlm=False, xlm_mask_token=None,
                    xlm_lang=None, device='cpu'):
    
    context = torch.tensor(context, dtype=torch.long, device=device)
    context = context.unsqueeze(0).repeat(num_samples, 1)
    generated = context

    with torch.no_grad():
        for _ in range(length):
#             print('i', _)
            inputs = {'input_ids': generated}

            outputs = model(**inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet/CTRL (cached hidden-states)
            next_token_logits = outputs[0][:, -1, :] / (temperature if temperature > 0 else 1.)

            # repetition penalty from CTRL (https://arxiv.org/abs/1909.05858)
            for i in range(num_samples):
                for _ in set(generated[i].tolist()):
                    next_token_logits[i, _] /= repetition_penalty
                
            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
            if temperature == 0: # greedy sampling:
                next_token = torch.argmax(filtered_logits, dim=-1).unsqueeze(-1)
            else:
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)
    return generated


In [8]:
def obtain_output(model, tokenizer, prompts, end='<|endoftext|>'):
    model_outputs = []
    for i in range(len(prompts)):
        prompt = prompts[i]

        context_tokens = tokenizer.encode(prompt)
        length = 40
        temperature = 0.3
        top_k = 50
        top_p = 1.0
        repetition_penalty = 1.0
        device = torch.device("cuda")

        out = sample_sequence(
                    model=model,
                    context=context_tokens,
                    num_samples=1,
                    length=length,
                    temperature=temperature,
                    top_k=top_k,
                    top_p=top_p,
                    repetition_penalty=repetition_penalty,
                    device="cuda",)

        out = out[: len(context_tokens):].tolist()
        for o in out:
            text = tokenizer.decode(o, clean_up_tokenization_spaces=True)
            text = text[: text.find(end)]

        model_outputs.append(text)
        
    return model_outputs

In [9]:
fine_tuned_model_path = '/data/ejoswin/data/distillgpt2_joke_epochs_10/output/'

model_fine_tuned = GPT2LMHeadModel.from_pretrained(fine_tuned_model_path)
tokenizer = GPT2Tokenizer.from_pretrained(fine_tuned_model_path)

In [10]:
model_pre_trained_path = 'distilgpt2'
model_pre_trained = GPT2LMHeadModel.from_pretrained(model_pre_trained_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_pre_trained_path)

In [11]:
prompts = ["When did Obama become",
"What do you call a bee",
"Who is a man who",
"I love NLP even though",
"There has never been a time",
"Though shalt not brush your teeth",
"Who does a chair belong",
"Do you know what happened to",
"Your mommma is so fat that",
"Who let the dogs out",
"When did Emil",
"Aditya has a",
"When did the",
"What did the nun",
"She has announced",
"Why did he",
"Why did she",
"The polic pulled",
"Who is called",
"Why did you",
"This is not",
"How did Mary",
"How did John",
"When did Obama become the famous",
"What do you call a bee who does not knock",
"Who is a man who does not pay his",
"Who let the dogs out in the garden",
"I love NLP even though it has a",
"There has never been a time when the price of",
"Though shalt not brush your teeth",
"Who does a chair belong to if he has",
"Do you know what happened to",
"Your mommma is so fat that when she",
"A doctor accidentally prescribes",
"An old grandma brings a",
"I got another letter ",
"Can you please hold my",
"A husband and a wife sit at the",
"One of the most wonderful things in",
"I went to the zoo ",
"Take the spoon out of the",
"I really appreciate your",
"I am really sorry for",
"She prayed for her brother",
"Karam was very unhappy because",
"Take effective measure to stop",
"Allen posted a new ",
"How to build a ",
"The black hole can serve as a",
"The list of sentences is",
"This means that even advanced learners",
"The frog jumped and landed",
"I am out of paper for the",
"The music is too loud for my",
"Can I have some juice to",
"There is a fly in the ",
"The definitions are clear and clarify the",
"John's family is renovating",
"A bunch of dogs are in the",
"The third factor is the amount of",
"Use competing products to recommend",
"We are from the planet",
"Oil company stock prices ",
"We asked our trained model to",
"The fact that he smiled at me gives me",
"All sentences beginning with a",
"Did you hear about the mathematician",
"To fit and",
"How do you make your girlfriend",
"Why does a mermaid wear",
"A penguin takes his car",
"A Chinese girl asked",
"My dog used to chase",
"Arnold and Juan play",
"The staff performed",
"He sold it for a high price on",
"Extremely Loud and",
"Early to rise and early to",
"The paper and pencil sat idle on the",
"Misha walked and looked",
"Neither boy spoke",
"The ham, green beans,",
"Joe stood up and spoke to the",
"This new laptop computer has",
"I‘m happy, but my",
"Stop! Close the",
"How exciting the movie",
"My mom cooked pancakes for",
"Syrup comes from",
"The big dog and the small cat went",
"The fox was great at",
"He obtained his degree",
"After the death of the ",
"Nothing beats a complete",
"The Taskmaster corpus consists of various spoken and written",
"Did you know Rahul have a",
"We removed the most common",
"As long as you remember to include these",
"Brad came to dinner with",
"Isn't language learning",
"An exclamation mark indicates an"]

In [13]:
# 4-6 
prompts_1 = ["When did Obama become", "What do you call a bee ", "Who is a man who", \
             "I love NLP even though", "There has never been a time", "Though shalt not brush your teeth", \
            "Who does a chair belong", "Do you know what happened to", "Your mommma is so fat that", \
            "Who let the dogs out", ]

# 3-4
prompts_2 = ["When did Emil", "Aditya has a ", "When did the", "What did the nun", "She has announced", \
            "Why did he", "Why did she", "The polic pulled", "Who is called", "Why did you", "This is not",
            "How did Mary", "How did John"]

# 8-10
prompts_3 = ["When did Obama become the famous", "What do you call a bee who does not knock", \
             "Who is a man who does not pay his", "Who let the dogs out in the garden", \
             "I love NLP even though it has a ", "There has never been a time when the price of" , \
             "Though shalt not brush your teeth", "Who does a chair belong to if he has", 
             "Do you know what happened to", "Your mommma is so fat that when she", \
         ]

# model = model_fine_tuned.cuda()

def call_gpt(model, tokenizer, prompt, _type='joke'):
    model = model.cuda()

    if _type == 'joke':
        outputs = obtain_output(model, tokenizer, prompt)
    else:
        outputs = obtain_output(model, tokenizer, prompt, end='\n')
        
    return outputs

In [11]:
[len(p) for p in [prompts_1, prompts_2, prompts_3] ]

[10, 13, 10]

In [22]:
all_jokes = call_gpt(model_fine_tuned, tokenizer, prompts, _type='nojoke')
all_no_jokes = call_gpt(model_pre_trained, tokenizer, prompts, _type='nojoke')

This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Inpu

In [25]:
all_prompts = prompts
len(all_prompts), len(all_jokes), len(all_no_jokes)

(101, 101, 101)

In [16]:
all_prompts = []
all_jokes = []
all_no_jokes = []

# for p in list([prompts_1, prompts_2, prompts_3]):
for i, p in enumerate(prompts):
    print(i, p)


    jokes = call_gpt(model_fine_tuned, tokenizer, p, _type='joke')
    nojokes = call_gpt(model_pre_trained, tokenizer, p, _type='nojoke')
    
    print( len(p), len(nojokes), len(jokes) )
    
    all_jokes += jokes
    all_no_jokes += nojokes
    all_prompts += p
    

This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.


0 When did Obama become


This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Inpu

21 21 21
1 What do you call a bee


This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Inpu

KeyboardInterrupt: 

In [26]:
len(all_prompts), len(all_no_jokes), len(all_jokes)

(101, 101, 101)

In [27]:
lengths = [len(i.split()) for i in all_prompts]

df = pd.DataFrame({'prompt': all_prompts, 'joke': all_jokes, 'no_jokes': all_no_jokes, 'length': lengths})

In [28]:
df

Unnamed: 0,prompt,joke,no_jokes,length
0,When did Obama become,When did Obama become president? When he was i...,When did Obama become the first president to h...,4
1,What do you call a bee,What do you call a bee that's been hit by a ca...,What do you call a beekeeper?,6
2,Who is a man who,Who is a man who can't get a girl pregnant? A ...,Who is a man who is a man who is a man who is ...,5
3,I love NLP even though,I love NLP even though I'm not a fan of Nemonos.,I love NLP even though I don't know what it is...,5
4,There has never been a time,There has never been a time when I was a kid a...,There has never been a time for a man to be a ...,6
...,...,...,...,...
96,We removed the most common,We removed the most common word in the English...,We removed the most common forms of the diseas...,5
97,As long as you remember to include these,As long as you remember to include these words...,As long as you remember to include these in yo...,8
98,Brad came to dinner with,Brad came to dinner with a note that said: I'm...,Brad came to dinner with the family.,5
99,Isn't language learning,Isn't language learning like being a prostitut...,Isn't language learning is a good thing. It's ...,3


In [29]:
df.to_csv('/users2/ejoswin/notebooks/NLP-project/final3.csv')