In [1]:
import logging
logging.basicConfig(level='ERROR')

import argparse
import numpy as np
from pprint import pprint
import sys
import torch
import zlib
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm

# Route HTTP(S) traffic through the AIT proxy.
# Comment this section out if you are not behind that proxy.
import os
os.environ.update({
    'http_proxy': 'http://192.41.170.23:3128',
    'https_proxy': 'http://192.41.170.23:3128',
})

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
def calculatePerplexity(sentence, model, tokenizer):
    """
    Return the perplexity of `sentence` under `model`, i.e. exp(loss).

    Args:
        sentence: text to score.
        model: module called as `model(input_ids, labels=input_ids)`; the
            first element of its output is used as the loss.
        tokenizer: object whose `encode(sentence)` returns a list of token ids.

    Returns:
        A 0-dim tensor holding exp(loss). The result is NaN when `sentence`
        encodes to fewer than two tokens, because there is then no next-token
        prediction to compute a loss from.
    """
    # Score on the device that holds the model's weights instead of relying on
    # a module-level global that may not match the model passed in.
    model_device = next(model.parameters()).device

    token_ids = tokenizer.encode(sentence)
    if len(token_ids) < 2:
        # An empty id list would become a float tensor (torch's default dtype
        # for `[]`) that cannot be used as token ids, so bail out early.
        return torch.tensor(float("nan"), device=model_device)

    input_ids = torch.tensor(token_ids, dtype=torch.long, device=model_device).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
    loss = outputs[0]
    return torch.exp(loss)

def print_best(metric, samples, name1, scores1, name2=None, scores2=None, n=10):
    """
    print the `n` best samples according to the given `metric`

    "Best" means largest `metric` value first. Entries whose metric is NaN are
    left out of the ranking.

    Args:
        metric: one ranking value per sample.
        samples: the samples, indexed in parallel with `metric`.
        name1: label printed next to the values taken from `scores1`.
        scores1: per-sample score, indexed in parallel with `metric`.
        name2: optional label for `scores2`.
        scores2: optional second per-sample score; when given it is printed
            alongside `scores1`.
        n: maximum number of samples to print.
    """
    metric = np.asarray(metric, dtype=float)

    # argsort places NaN at the end, so a plain reversal would rank NaN
    # entries first; filter them out before taking the top `n`.
    ranked = np.argsort(metric)[::-1]
    ranked = ranked[~np.isnan(metric[ranked])][:n]

    for i, idx in enumerate(ranked):
        if scores2 is not None:
            print(f"{i+1}: {name1}={scores1[idx]:.3f}, {name2}={scores2[idx]:.3f}, score={metric[idx]:.3f}")
        else:
            print(f"{i+1}: {name1}={scores1[idx]:.3f}, score={metric[idx]:.3f}")

        print()
        pprint(samples[idx])
        print()
        print()
        

In [5]:
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AdamW,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    default_data_collator
)

print(f"using device: {device}")
# number of tokens to generate
seq_len = 256

# sample from the top_k tokens output by the model
top_k = 40

print("Loading GPT2...")
# model2: the pretrained 'gpt2' weights, used as the comparison model.
model2 = GPT2LMHeadModel.from_pretrained('gpt2', return_dict=True).to(device)
model2.eval()

# model1: a model built from the `distilgpt2` config whose weights come from
# the locally saved state dict at `model_path`.
model_path = 'dp-gpt2-clm-model.pth'
model_checkpoint = "distilgpt2"

# A single tokenizer is shared by both models. Left padding with EOS as the
# pad token is what the batched generation below relies on.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

config = AutoConfig.from_pretrained(model_checkpoint)
model1 = AutoModelForCausalLM.from_config(config).to(device)
model1.config.pad_token_id = model1.config.eos_token_id
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine
# (and vice versa) instead of failing while restoring the tensors.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
model1.load_state_dict(torch.load(model_path, map_location=device))
model1 = model1.eval()

# Accumulators filled by the sampling cell: generated texts and, per text, the
# perplexity under model1 ("XL"), under model2 ("S"), under model1 on the
# lower-cased text ("Lower"), and the zlib-compressed length ("zlib").
samples = []
scores = {"XL": [], "S": [], "Lower": [], "zlib": []}

using device: cuda
Loading GPT2...


Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

In [7]:
N = 1000
batch_size = 10
internet_sampling = False

num_batches = int(np.ceil(N / batch_size))
with tqdm(total=N) as pbar:
    for i in range(num_batches):
        # The last batch shrinks when N is not a multiple of batch_size, so
        # exactly N samples are generated and the progress bar ends at N.
        cur_batch_size = min(batch_size, N - i * batch_size)

        # encode the prompts
        if internet_sampling:
            # pick a random 10-token prompt in common crawl
            # NOTE(review): `cc` is not defined in the visible cells; it must be
            # loaded (presumably as one large string) before enabling this branch.

            input_len = 10
            input_ids = []
            attention_mask = []

            while len(input_ids) < cur_batch_size:
                # take some random words in common crawl
                r = np.random.randint(0, len(cc))
                prompt = " ".join(cc[r:r+100].split(" ")[1:-1])

                # make sure we get the same number of tokens for each prompt to enable batching
                inputs = tokenizer(prompt, return_tensors="pt", max_length=input_len, truncation=True)
                if len(inputs['input_ids'][0]) == input_len:
                    input_ids.append(inputs['input_ids'][0])
                    attention_mask.append(inputs['attention_mask'][0])

            inputs = {'input_ids': torch.stack(input_ids),
                      'attention_mask': torch.stack(attention_mask)}

            # the actual truncated prompts
            prompts = tokenizer.batch_decode(inputs['input_ids'], skip_special_tokens=True)
        else:
            # unconditional sampling: every prompt is just the end-of-text token
            prompts = ["<|endoftext|>"] * cur_batch_size
            input_len = 1
            inputs = tokenizer(prompts, return_tensors="pt", padding=True)

        # batch generation
        output_sequences = model1.generate(
            input_ids=inputs['input_ids'].to(device),
            attention_mask=inputs['attention_mask'].to(device),
            max_length=input_len + seq_len,
            do_sample=True,
            top_k=top_k,
            top_p=1.0
        )

        texts = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)

        for text in texts:
            # A generation that decodes to fewer than two tokens has no
            # next-token loss, so its perplexity is undefined; skip it rather
            # than crash or feed NaN into the rankings below.
            if len(tokenizer.encode(text)) < 2:
                continue

            # perplexity under model1 (stored as "XL") and model2 (stored as "S")
            p1 = calculatePerplexity(text, model1, tokenizer)
            p2 = calculatePerplexity(text, model2, tokenizer)

            # perplexity of model1 on the lower-cased sample
            p_lower = calculatePerplexity(text.lower(), model1, tokenizer)

            # Zlib "entropy" of sample: length in bytes of the compressed text
            zlib_entropy = len(zlib.compress(bytes(text, 'utf-8')))

            samples.append(text)
            scores["XL"].append(p1.cpu().numpy())
            scores["S"].append(p2.cpu().numpy())
            scores["Lower"].append(p_lower.cpu().numpy())
            scores["zlib"].append(zlib_entropy)

        pbar.update(cur_batch_size)

scores["XL"] = np.asarray(scores["XL"])
scores["S"] = np.asarray(scores["S"])
scores["Lower"] = np.asarray(scores["Lower"])
scores["zlib"] = np.asarray(scores["zlib"])

# Sort by perplexity: print_best shows the largest metric first, so negating
# the log-perplexity puts the lowest-perplexity samples on top.
metric = -np.log(scores["XL"])
print("======== top sample by XL perplexity: ========")
print_best(metric, samples, "PPL", scores["XL"])
print()
print()

# Sort by ratio of log perplexities of S and XL models
metric = np.log(scores["S"]) / np.log(scores["XL"])
print("======== top sample by ratio of S and XL perplexities: ========")
print_best(metric, samples, "PPL-XL", scores["XL"], "PPL-S", scores["S"])
print()
print()

# Sort by ratio of log perplexities of lower-case and normal-case perplexities
metric = np.log(scores["Lower"]) / np.log(scores["XL"])
print("======== top sample by ratio of lower-case and normal-case perplexities: ========")
print_best(metric, samples, "PPL-XL", scores["XL"], "PPL-XL-Lower", scores["Lower"])
print()
print()

# Sort by ratio of Zlib entropy and XL perplexity
metric = scores["zlib"] / np.log(scores["XL"])
print("======== top sample by ratio of Zlib entropy and XL perplexity: ========")
print_best(metric, samples, "PPL-XL", scores["XL"], "Zlib", scores["zlib"])

100%|██████████| 1000/1000 [03:28<00:00,  4.79it/s]

1: PPL=9811.851, , score=-9.191

(' discern hunters runes ticketasonryigen garments '
 'TotThey________________________________σ Lara Rare term HYutherland Smoke '
 'nephew later Wet playlistuchsproducts Tideign editionThink Looking '
 'widelyocations WebsterignFK reflectedswe Dirt Lara Smokeinflamm Sacrament '
 'SacramentRNAInternet StevensonWOR Rsumph Mechanics Dro local determination '
 'Witch inner television Lynima loss PUBLIC inserts commented Smoke '
 'AlgerproxyInterested webcam Kerr attributable Lynino blazing highlighting '
 'reputable carb CrimeRG life quieter Greene QiaoARRXi broadcastOAD Grace '
 'Gracezmanzmanariealis Walls LaraScotland Bombs Noingham Returning '
 'poundingMoving******************************** sidewalk newcom Easterappro '
 'Smoke vault BG fish Chill Huff Curry authorized noting Apocalypse runes '
 'miser ping ping 07 Eclipse beneficiaries negotiations Lara Constant Fedverty '
 'totaledagons $$IPS ignited Rabb °okia LAR TREmolrily------- Utcommunityataka


