In [1]:
import numpy as np
import pandas as pd
import torch
import sys
import collections
sys.path.insert(1, "../")
import ml_utils.save_io as io
import ml_utils.utils as utils
import os
import matplotlib.pyplot as plt
import seaborn as sns
from models import SentenceAutoEncoder
from tqdm import tqdm

from transformers import AutoTokenizer
import datas

#import matplotlib
#font = {'family' : 'normal',
#        'weight' : 'bold',
#        'size'   : 40}
#matplotlib.rc('font', **font)


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
root_paths = [
    "/data2/pdp/grantsrb/sa_saves/",
]
exp_folders = []
for root_path in root_paths:
    for i,exp_folder in enumerate(os.listdir(root_path)):
        exp_folders.append(os.path.join(root_path, exp_folder))
        print(i,exp_folders[-1])

0 /data2/pdp/grantsrb/sa_saves/bloom560proj
1 /data2/pdp/grantsrb/sa_saves/trsearch_gpt2
2 /data2/pdp/grantsrb/sa_saves/bloom560_softouts
3 /data2/pdp/grantsrb/sa_saves/cmplayer_gpt2
4 /data2/pdp/grantsrb/sa_saves/test
5 /data2/pdp/grantsrb/sa_saves/debug
6 /data2/pdp/grantsrb/sa_saves/gptj
7 /data2/pdp/grantsrb/sa_saves/bloom560_cmplen


In [3]:
idxs = [ 0 ]

In [4]:
model_folders = []
for idx in idxs:
    exp_folder = exp_folders[idx]
    new_folders = io.get_model_folders(exp_folder, incl_full_path=True)
    model_folders = model_folders + new_folders
for i,folder in enumerate(model_folders):
    print(i,folder)

0 /data2/pdp/grantsrb/sa_saves/bloom560proj/bloom560proj_0_seq_len20_cmp_len10_sep_cmprTrue
1 /data2/pdp/grantsrb/sa_saves/bloom560proj/bloom560proj_1_ddpTrue_seq_len20_cmp_len10_sep_cmprTrue
2 /data2/pdp/grantsrb/sa_saves/bloom560proj/bloom560proj_2_seq_len20_cmp_len10_sep_cmprFalse


In [5]:
idx = 2
model_folder = model_folders[idx]

In [6]:
checkpt = io.load_checkpoint(model_folder)
hyps = checkpt["hyps"]
hyps["val_batch_size"] = 75

model = SentenceAutoEncoder(**hyps)
model.load_state_dict(checkpt["state_dict"])
model = model.eval()
if not hyps["model_parallel"]: model = model.to(0)

## Load Data

In [7]:
# Make Tokenizer
tokenizer = AutoTokenizer.from_pretrained(hyps["model_string"])
tokenizer.truncation_side = "right"

if tokenizer.pad_token is None:                                     
    print("No Pad Token")                                           
    print("EOS:", tokenizer.eos_token)                              
    print("BOS:", tokenizer.bos_token)                              
    print("CLS:", tokenizer.cls_token)                              
    tokenizer.add_special_tokens(                                   
        {"pad_token": hyps.get("pad_token", tokenizer.eos_token)}   
    )                                                               
    print("PAD:", tokenizer.pad_token)                              
    if tokenizer.pad_token != tokenizer.eos_token:                  
        print("PAD {} different from EOS {}".format(                
            tokenizer.pad_token, tokenizer.eos_token                
        ))                                                          
        # Adjust Model Embeddings for new token types               
        model.add_embeddings(1)

dataset, valset, dataloader, valloader = datas.get_loaders(         
    hyps,                                                           
    tokenizer,                                                      
) 

Loading data from /data2/pdp/grantsrb/datasplits/openwebtext1m/train
Loading data from /data2/pdp/grantsrb/datasplits/openwebtext1m/val


## Examine Errors

### No Teacher Forcing

In [8]:
top_n = 5
max_loops = 1000
tforce = False
model.rmb_task = False
lossfxn = torch.nn.CrossEntropyLoss(reduction="none")
loss_avgs = collections.defaultdict(lambda: [0,0])
top1_tps = collections.defaultdict(lambda:  [0,0])
top1_fps = collections.defaultdict(lambda:  [0,0])
topn_tps = collections.defaultdict(lambda:  [0,0])
avg_loss = 0
avg_acc = 0
avg_fps = 0
avg_topn = 0
with torch.no_grad():
    for i,data in tqdm(enumerate(valloader)):
        if not hyps["model_parallel"]:
            data = {k: v.to(model.get_device()) for k,v in data.items()}
        preds = model(data, tforce=tforce)
        preds = preds.reshape(-1,preds.shape[-1])
        idx = data["output_attn_mask"].reshape(-1).bool()
        preds = preds[idx]
        targs = data["output_ids"].reshape(-1)[idx]
        losses = lossfxn(preds, targs)
        avg_loss += losses.mean().item()
        
        top1 = preds.argmax(-1)
        avg_acc += (top1==targs).float().mean().item()
        
        args = torch.topk(preds, top_n, largest=True, sorted=False, dim=-1).indices
        avg_topn += (args==targs[:,None]).float().sum(-1).mean().item()
        for id_ in set(targs.data.cpu().tolist()):
            idx = targs==id_
            s = idx.float().sum().item()
            loss_avgs[id_][0] += losses[idx].sum().item()
            top1_tps[id_][0] += (top1[idx]==targs[idx]).float().sum().item()
            topn_tps[id_][0] += (args[idx]==targs[idx][:,None]).float().sum().item()
            
            loss_avgs[id_][1] += s
            top1_tps[id_][1] += s
            topn_tps[id_][1] += s
            
        for id_ in set(top1.data.cpu().tolist()):
            idx = top1==id_
            top1_fps[id_][0] += (top1[idx]!=targs[idx]).float().sum().item()
            top1_fps[id_][1] += idx.float().sum().item()
            
        if i>max_loops: break


1001it [33:09,  1.99s/it]


In [9]:
print("Avg Loss:", round(avg_loss/(i+1), 5))
print("Avg Top 1:", round(avg_acc/(i+1), 5))
print("Avg Top {}:".format(top_n), round(avg_topn/(i+1), 5))

Avg Loss: 10.7847
Avg Top 1: 0.02904
Avg Top 5: 0.07255


### Loss Averages

In [10]:
df = {
    "word":      [],
    "loss":      [],
    "n_occurs":  [],
    "true_pos":  [],
    "top_k":     [],
    "false_pos": [],
    "n_preds":   [],
}
sort_list = []
tot_sum = 0
for k in loss_avgs.keys():
    name = tokenizer.decode(k)
    df["word"].append(name)
    df["loss"].append(loss_avgs[k][0]/loss_avgs[k][1])
    df["n_occurs"].append(loss_avgs[k][1])
    df["true_pos"].append(top1_tps[k][0]/top1_tps[k][1])
    df["top_k"].append(topn_tps[k][0]/topn_tps[k][1])
    try:
        df["false_pos"].append(top1_fps[k][0]/top1_fps[k][1])
        df["n_preds"].append(top1_fps[k][1])
    except:
        df["false_pos"].append(0)
        df["n_preds"].append(0)
    tot_sum += loss_avgs[k][1]
df = pd.DataFrame(df)
df

Unnamed: 0,word,loss,n_occurs,true_pos,top_k,false_pos,n_preds
0,notice,9.627829,358.0,0.424581,0.424581,0.204188,191.0
1,board,11.151226,153.0,0.006536,0.006536,0.987013,77.0
2,!,8.644848,737.0,0.006784,0.069199,0.993548,775.0
3,.\n\n,8.135287,17000.0,0.019353,0.116471,0.959898,8204.0
4,"""",9.981285,2323.0,0.084374,0.125700,0.943402,3463.0
...,...,...,...,...,...,...,...
42555,nag,27.095577,1.0,0.000000,0.000000,0.000000,0.0
42556,Percy,13.574580,1.0,0.000000,0.000000,0.000000,0.0
42557,"=""1",22.581146,1.0,0.000000,0.000000,0.000000,0.0
42558,osur,16.072174,1.0,0.000000,0.000000,1.000000,2.0


In [11]:
tot_sum = np.sum(df["n_occurs"])
df["loss_p"] = df["loss"]*df["n_occurs"]/tot_sum
df["true_p"] = df["true_pos"]*df["n_occurs"]/tot_sum
df["topk_p"] = df["top_k"]*df["n_occurs"]/tot_sum
df["false_p"] = df["false_pos"]*df["n_preds"]/np.sum(df["n_preds"])
df

Unnamed: 0,word,loss,n_occurs,true_pos,top_k,false_pos,n_preds,loss_p,true_p,topk_p,false_p
0,notice,9.627829,358.0,0.424581,0.424581,0.204188,191.0,0.002293,1.011311e-04,1.011311e-04,0.000026
1,board,11.151226,153.0,0.006536,0.006536,0.987013,77.0,0.001135,6.653360e-07,6.653360e-07,0.000051
2,!,8.644848,737.0,0.006784,0.069199,0.993548,775.0,0.004239,3.326680e-06,3.393214e-05,0.000516
3,.\n\n,8.135287,17000.0,0.019353,0.116471,0.959898,8204.0,0.092016,2.188955e-04,1.317365e-03,0.005281
4,"""",9.981285,2323.0,0.084374,0.125700,0.943402,3463.0,0.015427,1.304059e-04,1.942781e-04,0.002191
...,...,...,...,...,...,...,...,...,...,...,...
42555,nag,27.095577,1.0,0.000000,0.000000,0.000000,0.0,0.000018,0.000000e+00,0.000000e+00,0.000000
42556,Percy,13.574580,1.0,0.000000,0.000000,0.000000,0.0,0.000009,0.000000e+00,0.000000e+00,0.000000
42557,"=""1",22.581146,1.0,0.000000,0.000000,0.000000,0.0,0.000015,0.000000e+00,0.000000e+00,0.000000
42558,osur,16.072174,1.0,0.000000,0.000000,1.000000,2.0,0.000011,0.000000e+00,0.000000e+00,0.000001


In [12]:
df["p"] = df["n_occurs"]/tot_sum
pred_sum = np.sum(df["n_preds"])
df["pred_p"] = df["n_preds"]/pred_sum

In [26]:
n_occurs = 50
df.loc[df["n_occurs"]>n_occurs].sort_values(by="top_k", ascending=False).head(30)

Unnamed: 0,word,loss,n_occurs,true_pos,top_k,false_pos,n_preds,loss_p,true_p,topk_p,false_p,p,pred_p
363,See,3.188092,328.0,0.698171,0.743902,0.154982,271.0,0.000696,0.000152,0.000162,2.816401e-05,0.000218,0.000182
4611,day,4.179189,145.0,0.731034,0.737931,0.59542,262.0,0.000403,7.1e-05,7.1e-05,0.0001046092,9.6e-05,0.000176
8708,delivered,4.856407,177.0,0.638418,0.638418,0.042373,118.0,0.000572,7.5e-05,7.5e-05,3.352858e-06,0.000118,7.9e-05
3854,occasional,5.505368,67.0,0.597015,0.597015,0.375,64.0,0.000245,2.7e-05,2.7e-05,1.609372e-05,4.5e-05,4.3e-05
12336,sustainable,6.594767,59.0,0.59322,0.59322,0.186047,43.0,0.000259,2.3e-05,2.3e-05,5.364573e-06,3.9e-05,2.9e-05
19670,....,4.938991,121.0,0.272727,0.570248,0.365385,52.0,0.000398,2.2e-05,4.6e-05,1.274086e-05,8.1e-05,3.5e-05
445,letters,7.628775,62.0,0.5,0.564516,0.138889,36.0,0.000315,2.1e-05,2.3e-05,3.352858e-06,4.1e-05,2.4e-05
418,Show,4.990631,275.0,0.523636,0.523636,0.521595,301.0,0.000913,9.6e-05,9.6e-05,0.0001052797,0.000183,0.000202
3804,promot,6.37823,71.0,0.521127,0.521127,0.212766,47.0,0.000301,2.5e-05,2.5e-05,6.705716e-06,4.7e-05,3.2e-05
517,privacy,7.801336,347.0,0.510086,0.512968,0.206278,223.0,0.001801,0.000118,0.000118,3.084629e-05,0.000231,0.00015


In [24]:
df.columns

Index(['word', 'loss', 'n_occurs', 'true_pos', 'top_k', 'false_pos', 'n_preds', 'loss_p', 'true_p', 'topk_p', 'false_p', 'p', 'pred_p'], dtype='object')

In [25]:
keys = ['loss', 'true_pos', 'top_k',  'p', 'false_pos', 'pred_p', 'loss_p', "false_p"]
template = len(keys)*"{} & "
template = template[:-2]

s = "Word & Avg Loss & Top 1 & Top 5 & p(Word) & False Pos & p(Pred) & Loss P & Flse Pos Rate \\\\"
print(s)
print("\\hline\\hline")
sfigs = 3
t = df.sort_values(by="false_p", ascending=False).head(10)
for row in range(len(t)):
    word = t["word"].iloc[row]
    s = word + " & " + template.format( *[round(t[k].iloc[row], sfigs) for k in keys] ) + "\\"
    print(repr(s)[1:-1].replace("\\n","\\textbackslash n "))
    print("\\hline")
    #print(s + " \\\\")

Word & Avg Loss & Top 1 & Top 5 & p(Word) & False Pos & p(Pred) & Loss P & Flse Pos Rate \\
\hline\hline
 the & 6.263 & 0.154 & 0.289 & 0.037 & 0.947 & 0.108 & 0.232 & 0.102 \\
\hline
, & 6.074 & 0.126 & 0.323 & 0.036 & 0.936 & 0.072 & 0.22 & 0.067 \\
\hline
 a & 6.63 & 0.077 & 0.247 & 0.019 & 0.963 & 0.039 & 0.126 & 0.038 \\
\hline
 of & 7.331 & 0.082 & 0.152 & 0.021 & 0.952 & 0.036 & 0.151 & 0.034 \\
\hline
. & 6.974 & 0.075 & 0.217 & 0.021 & 0.945 & 0.029 & 0.149 & 0.028 \\
\hline
 and & 6.128 & 0.049 & 0.254 & 0.016 & 0.971 & 0.027 & 0.095 & 0.026 \\
\hline
 to & 6.875 & 0.069 & 0.142 & 0.019 & 0.946 & 0.025 & 0.134 & 0.024 \\
\hline
 in & 6.5 & 0.04 & 0.179 & 0.015 & 0.966 & 0.017 & 0.096 & 0.017 \\
\hline
\textbackslash n \textbackslash n  & 6.921 & 0.139 & 0.241 & 0.007 & 0.935 & 0.015 & 0.05 & 0.014 \\
\hline
The & 13.952 & 0.061 & 0.073 & 0.003 & 0.986 & 0.013 & 0.043 & 0.013 \\
\hline


In [16]:
keys = ['loss', 'true_pos', 'top_k',  'p', 'false_pos', 'pred_p', 'loss_p', "false_p"]
template = len(keys)*"{} & "
template = template[:-2]

s = "Word & Avg Loss & Top 1 & Top 5 & p(Word) & False Pos & p(Pred) & Loss P & Flse Pos Rate \\\\"
print(s)
print("\\hline\\hline")
sfigs = 3
t = df.sort_values(by="false_p", ascending=True).head(10)
for row in range(len(t)):
    word = t["word"].iloc[row]
    s = word + " & " + template.format( *[round(t[k].iloc[row], sfigs) for k in keys] ) + "\\"
    print(repr(s)[1:-1].replace("\\n","\\textbackslash n "))
    print("\\hline")
    #print(s + " \\\\")

Word & Avg Loss & Top 1 & Top 5 & p(Word) & False Pos & p(Pred) & Loss P & Flse Pos Rate \\
\hline\hline
 Punt & 25.539 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
\hline
erria & 22.722 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
\hline
Register & 16.189 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
\hline
 Suk & 15.415 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
\hline
384 & 26.916 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
\hline
Impro & 29.48 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
\hline
_text & 14.116 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
\hline
 Gaud & 15.672 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
\hline
 Dro & 13.524 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
\hline
Address & 19.083 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 & 0.0 \\
\hline


### True Positives Top 1

In [18]:
named_tps = dict()
tps_list = []
for k in top1_tps.keys():
    name = tokenizer.decode(k)
    named_tps[name] = top1_tps[k][0]/top1_tps[k][1]
    tps_list.append([name, named_tps[name], top1_tps[k][1]])

In [19]:
temp_list = sorted(tps_list, key=lambda x: -x[1])
temp_list[:20]

[['り', 1.0, 1.0],
 [' países', 1.0, 1.0],
 [' inhibitor', 1.0, 1.0],
 [' ст', 1.0, 1.0],
 ['oscope', 1.0, 1.0],
 [' receptor', 1.0, 1.0],
 ['ucle', 1.0, 1.0],
 [' �', 1.0, 1.0],
 [' Nevertheless', 0.88, 50.0],
 ['day', 0.7310344827586207, 145.0],
 [' See', 0.698170731707317, 328.0],
 ['push', 0.6666666666666666, 9.0],
 [' delivered', 0.6384180790960452, 177.0],
 [' invalid', 0.6086956521739131, 46.0],
 [' occasional', 0.5970149253731343, 67.0],
 [' sustainable', 0.5932203389830508, 59.0],
 ['cribing', 0.5806451612903226, 31.0],
 [' Show', 0.5236363636363637, 275.0],
 [' promot', 0.5211267605633803, 71.0],
 [' privacy', 0.5100864553314121, 347.0]]

In [20]:
full_words = list(filter(lambda x: x[0][0]==" " and len(x[0])>1 and not x[0][1].isupper(), temp_list))
full_words[:20]

[[' países', 1.0, 1.0],
 [' inhibitor', 1.0, 1.0],
 [' ст', 1.0, 1.0],
 [' receptor', 1.0, 1.0],
 [' �', 1.0, 1.0],
 [' delivered', 0.6384180790960452, 177.0],
 [' invalid', 0.6086956521739131, 46.0],
 [' occasional', 0.5970149253731343, 67.0],
 [' sustainable', 0.5932203389830508, 59.0],
 [' promot', 0.5211267605633803, 71.0],
 [' privacy', 0.5100864553314121, 347.0],
 [' transported', 0.5, 2.0],
 [' caption', 0.43312101910828027, 157.0],
 [' notice', 0.4245810055865922, 358.0],
 [' toggle', 0.4224137931034483, 116.0],
 [' subs', 0.4107142857142857, 56.0],
 [' m', 0.40458015267175573, 262.0],
 [' publications', 0.375, 56.0],
 [' models', 0.36538461538461536, 104.0],
 [' spelling', 0.3333333333333333, 3.0]]

In [21]:
sorted(full_words, key=lambda x: -x[-1])[:20]

[[' the', 0.1536061399419001, 55766.0],
 [' of', 0.08229321606415314, 30926.0],
 [' to', 0.06927751980333242, 29288.0],
 [' a', 0.07717967625267741, 28479.0],
 [' and', 0.04868673926969891, 23415.0],
 [' in', 0.03997836570964979, 22187.0],
 [' on', 0.022530482417388233, 11318.0],
 [' for', 0.024795665350353567, 10889.0],
 [' that', 0.03506481395131959, 10723.0],
 [' is', 0.06383574428371441, 10715.0],
 [' with', 0.023721176782401272, 7546.0],
 [' was', 0.045187601957585644, 6130.0],
 [' has', 0.021203155818540435, 6084.0],
 [' at', 0.010394324368916021, 6061.0],
 [' as', 0.016960886119764623, 5778.0],
 [' by', 0.022707269846857947, 5681.0],
 [' it', 0.01335981224047662, 5539.0],
 [' from', 0.02266615443718786, 5206.0],
 [' are', 0.04880603267700042, 4774.0],
 [' an', 0.003401360544217687, 4704.0]]

### True Positives Top N

In [None]:
named_tps = dict()
tps_list = []
for k in topn_tps.keys():
    name = tokenizer.decode(k)
    named_tps[name] = topn_tps[k][0]/topn_tps[k][1]
    tps_list.append([name, named_tps[name], topn_tps[k][1]])

In [None]:
temp_list = sorted(tps_list, key=lambda x: -x[1])
temp_list[:20]

In [None]:
full_words = list(filter(lambda x: x[0][0]==" " and len(x[0])>1 and not x[0][1].isupper(), temp_list))
full_words[:20]

In [None]:
sorted(full_words, key=lambda x: -x[-1])[:20]

### False Positives

The rates in the middle column are out of all the times that the model predicts that token, what proportion are incorrect?

In [None]:
named_fps = dict()
fps_list = []
for k in top1_fps.keys():
    name = tokenizer.decode(k)
    named_fps[name] = top1_fps[k][0]/top1_fps[k][1]
    fps_list.append([name, named_fps[name], top1_fps[k][1]])

In [None]:
temp_list = sorted(fps_list, key=lambda x: x[1])
temp_list[:20]

In [None]:
full_words = list(filter(lambda x: x[0][0]==" " and len(x[0])>1 and not x[0][1].isupper(), temp_list))
full_words[:20]

In [None]:
sorted(full_words, key=lambda x: -x[-1])[:20]

### Teacher Forcing

In [34]:
top_n = 5
max_loops = 1000
tforce = False
model.rmb_task = False
lossfxn = torch.nn.CrossEntropyLoss(reduction="none")
loss_avgs = collections.defaultdict(lambda: [0,0])
top1_tps = collections.defaultdict(lambda:  [0,0])
top1_fps = collections.defaultdict(lambda:  [0,0])
topn_tps = collections.defaultdict(lambda:  [0,0])
avg_loss = 0
avg_acc = 0
avg_fps = 0
avg_topn = 0
with torch.no_grad():
    for i,data in enumerate(valloader):
        if not hyps["model_parallel"]:
            data = {k: v.to(model.get_device()) for k,v in data.items()}
        preds = model(data, tforce=tforce)
        preds = preds.reshape(-1,preds.shape[-1])
        idx = data["output_attn_mask"].reshape(-1).bool()
        preds = preds[idx]
        targs = data["output_ids"].reshape(-1)[idx]
        losses = lossfxn(preds, targs)
        avg_loss += losses.mean().item()
        
        for id_ in set(targs.data.cpu().tolist()):
            loss_avgs[id_][0] += losses[targs==id_].sum().item()
            loss_avgs[id_][1] += (targs==id_).float().sum().item()
            
        args = torch.argsort(preds, dim=-1)[:,-top_n:]
        top1 = args[:,-1]
        avg_acc += (top1==targs).float().mean().item()
        avg_topn += (args==targs[:,None]).float().mean().item()
        for id_ in set(set(targs.data.cpu().tolist())):
            idx = targs==id_
            s = idx.float().sum().item()
            top1_tps[id_][0] += (top1[idx]==targs[idx]).float().sum().item()
            top1_tps[id_][1] += s
            
            topn_tps[id_][0] += (args[idx]==targs[idx][:,None]).float().sum().item()
            topn_tps[id_][1] += s
            
        for id_ in set(top1.data.cpu().tolist()):
            idx = top1==id_
            top1_fps[id_][0] += (top1[idx]!=targs[idx]).float().sum().item()
            top1_fps[id_][1] += idx.float().sum().item()
            
        if i>max_loops: break

print("Avg Loss:", round(avg_loss/(i+1), 5))
print("Avg Top 1:", round(avg_acc/(i+1), 5))
print("Avg Top {}:".format(top_n), round(avg_topn/(i+1), 5))

### Loss Averages

In [35]:
named_loss_avgs = dict()
sort_list = []
for k in loss_avgs.keys():
    name = tokenizer.decode(k)
    named_loss_avgs[name] = loss_avgs[k][0]/loss_avgs[k][1]
    sort_list.append([name, named_loss_avgs[name], loss_avgs[k][1]])

In [36]:
loss_list = sorted(sort_list, key=lambda x: x[1])
loss_list[:20]

[['NGL', 0.0, 1.0],
 ['agwa', 1.867608448928119e-06, 3.0],
 ['iably', 3.6954811548639555e-06, 1.0],
 ['arta', 1.4066597032069694e-05, 1.0],
 ['fron', 1.484143558627693e-05, 2.0],
 ['kusen', 2.048627863717099e-05, 7.0],
 ['ubali', 2.658331868587993e-05, 1.0],
 [' Bartomeu', 3.266281055402942e-05, 1.0],
 ['iquet', 3.862306402879767e-05, 1.0],
 ['-tête', 4.2199197196168825e-05, 1.0],
 ['uruza', 5.006664650863968e-05, 1.0],
 ['ongwe', 5.924526340095326e-05, 1.0],
 [' Valls', 6.752907211193815e-05, 2.0],
 ['anean', 9.524224515189417e-05, 2.0],
 ['ongyang', 0.0001294529065489769, 1.0],
 ['peer', 0.00018113236510544083, 5.0],
 [' Chapo', 0.00020251607929822057, 1.0],
 ['erosis', 0.0002118443032183374, 3.0],
 [' Verdes', 0.00023600654094479978, 1.0],
 ['ukee', 0.00024232311989180744, 1.0]]

In [37]:
full_words = list(filter(lambda x: x[0][0]==" " and len(x[0])>1 and not x[0][1].isupper(), loss_list))
full_words[:20]

[[' mater', 0.0012694318138528615, 2.0],
 [' thromb', 0.009309808723628521, 1.0],
 [' versa', 0.011989669874310493, 1.0],
 [' mujeres', 0.014297310262918472, 1.0],
 [' lugar', 0.017006753012537956, 1.0],
 [' spectr', 0.023944605141878128, 1.0],
 [' রহমান', 0.025047944858670235, 1.0],
 [' passe', 0.02992119826376438, 1.0],
 [' gangl', 0.03146141767501831, 1.0],
 [' culpa', 0.032384321093559265, 1.0],
 [' peso', 0.07188235968351364, 1.0],
 [' sociales', 0.07857572287321091, 1.0],
 [' nova', 0.08306349068880081, 1.0],
 [' align="', 0.11585231125354767, 1.0],
 [' général', 0.14817063510417938, 1.0],
 [' contexto', 0.1590195745229721, 1.0],
 [' من', 0.16336815059185028, 1.0],
 [' bandera', 0.19541625678539276, 1.0],
 [' sapiens', 0.21687181293964386, 2.0],
 [' sécurité', 0.233374185860157, 2.0]]

In [38]:
sorted(full_words, key=lambda x: -x[-1])[:20]

[[' the', 1.6798055604684536, 68652.0],
 [' of', 1.0945485224302465, 41001.0],
 [' a', 2.0942610431373825, 38722.0],
 [' to', 1.544669320361295, 36775.0],
 [' in', 2.47927663671508, 29467.0],
 [' and', 2.6678868049178384, 27302.0],
 [' on', 2.8296480845647762, 15868.0],
 [' is', 2.782925840363093, 15814.0],
 [' for', 2.532189460657359, 15725.0],
 [' that', 2.6700276598478743, 13103.0],
 [' with', 2.7525630139888864, 9535.0],
 [' has', 3.4473087286280695, 9416.0],
 [' at', 3.544092822363648, 8009.0],
 [' by', 2.9226989666642655, 7759.0],
 [' was', 2.841507685407055, 7284.0],
 [' as', 3.1476076638179817, 6891.0],
 [' have', 2.8802782918492884, 6876.0],
 [' (', 3.9449549322095416, 6671.0],
 [' from', 2.9189033698587807, 6668.0],
 [' an', 3.9189066886529154, 6397.0]]

### True Positives

In [39]:
named_tps = dict()
tps_list = []
for k in top1_tps.keys():
    name = tokenizer.decode(k)
    named_tps[name] = top1_tps[k][0]/top1_tps[k][1]
    tps_list.append([name, named_tps[name], top1_tps[k][1]])

In [40]:
temp_list = sorted(tps_list, key=lambda x: -x[1])
temp_list[:20]

[['kur', 1.0, 1.0],
 ['atie', 1.0, 3.0],
 ['bons', 1.0, 5.0],
 ['ónica', 1.0, 1.0],
 [' sociales', 1.0, 1.0],
 ['ministration', 1.0, 1.0],
 ['akis', 1.0, 6.0],
 ['oise', 1.0, 9.0],
 [' Verdes', 1.0, 1.0],
 ['gravity', 1.0, 2.0],
 ['itano', 1.0, 7.0],
 ['ipotent', 1.0, 1.0],
 ['み', 1.0, 1.0],
 ['ortium', 1.0, 16.0],
 ['etermin', 1.0, 1.0],
 ['ansk', 1.0, 7.0],
 ['letion', 1.0, 2.0],
 ['edan', 1.0, 1.0],
 [' class="', 1.0, 2.0],
 ['establ', 1.0, 4.0]]

In [41]:
full_words = list(filter(lambda x: x[0][0]==" " and len(x[0])>1 and not x[0][1].isupper(), temp_list))
full_words[:20]

[[' sociales', 1.0, 1.0],
 [' class="', 1.0, 2.0],
 [' in"', 1.0, 1.0],
 [' masse', 1.0, 4.0],
 [' bandera', 1.0, 1.0],
 [' passe', 1.0, 1.0],
 [' rapporteur', 1.0, 1.0],
 [' culpa', 1.0, 1.0],
 [" d'Ivoire", 1.0, 1.0],
 [' firma', 1.0, 1.0],
 [' mí', 1.0, 1.0],
 [' prix', 1.0, 2.0],
 [' voir', 1.0, 1.0],
 [' mell', 1.0, 1.0],
 [' familias', 1.0, 1.0],
 [' vista', 1.0, 1.0],
 [' daño', 1.0, 1.0],
 [' général', 1.0, 1.0],
 [' peso', 1.0, 1.0],
 [' générale', 1.0, 1.0]]

In [42]:
sorted(full_words, key=lambda x: -x[-1])[:20]

[[' the', 0.6926964982811863, 68652.0],
 [' of', 0.7813955757176654, 41001.0],
 [' a', 0.48915345281751976, 38722.0],
 [' to', 0.6490550645819171, 36775.0],
 [' in', 0.35904571215257747, 29467.0],
 [' and', 0.24111786682294337, 27302.0],
 [' on', 0.274136627174187, 15868.0],
 [' is', 0.34659162767168333, 15814.0],
 [' for', 0.3785055643879173, 15725.0],
 [' that', 0.375410211401969, 13103.0],
 [' with', 0.36717357105401155, 9535.0],
 [' has', 0.17162276975361088, 9416.0],
 [' at', 0.1649394431264827, 8009.0],
 [' by', 0.3707952055677278, 7759.0],
 [' was', 0.3698517298187809, 7284.0],
 [' as', 0.31011464228704105, 6891.0],
 [' have', 0.2799592786503781, 6876.0],
 [' (', 0.1630939889072103, 6671.0],
 [' from', 0.34148170365926817, 6668.0],
 [' an', 0.007034547444114429, 6397.0]]

### True Positives Top N

In [10]:
named_tps = dict()
tps_list = []
for k in topn_tps.keys():
    name = tokenizer.decode(k)
    named_tps[name] = topn_tps[k][0]/topn_tps[k][1]
    tps_list.append([name, named_tps[name], topn_tps[k][1]])

In [11]:
temp_list = sorted(tps_list, key=lambda x: -x[1])
temp_list[:20]

[[' galaxies', 1.0, 1.0],
 [',', 0.3383961595970725, 25414.0],
 ['ang-', 0.3333333333333333, 3.0],
 [' genus', 0.3333333333333333, 3.0],
 ['ulls', 0.3333333333333333, 3.0],
 [' the', 0.2650908956819942, 27119.0],
 ['FBI', 0.25, 4.0],
 [' a', 0.23732264844981607, 15224.0],
 [' and', 0.21154562383612663, 10740.0],
 ['ortium', 0.2, 5.0],
 [' Figure', 0.2, 5.0],
 ['\n\n', 0.17294356129307586, 9373.0],
 ['je', 0.16666666666666666, 6.0],
 [' in', 0.1548722390645301, 11545.0],
 ['ovich', 0.14285714285714285, 7.0],
 ['qq', 0.14285714285714285, 7.0],
 [' of', 0.13908697507161577, 16407.0],
 [' is', 0.13086097042966835, 6121.0],
 ['.', 0.12654344658332695, 13039.0],
 [' glasses', 0.125, 8.0]]

In [12]:
full_words = list(filter(lambda x: x[0][0]==" " and len(x[0])>1 and not x[0][1].isupper(), temp_list))
full_words[:20]

[[' galaxies', 1.0, 1.0],
 [' genus', 0.3333333333333333, 3.0],
 [' the', 0.2650908956819942, 27119.0],
 [' a', 0.23732264844981607, 15224.0],
 [' and', 0.21154562383612663, 10740.0],
 [' in', 0.1548722390645301, 11545.0],
 [' of', 0.13908697507161577, 16407.0],
 [' is', 0.13086097042966835, 6121.0],
 [' glasses', 0.125, 8.0],
 [' vegetables', 0.125, 8.0],
 [' galaxy', 0.1111111111111111, 9.0],
 [' pregnancy', 0.1111111111111111, 9.0],
 [' --', 0.1044776119402985, 536.0],
 [' to', 0.10292178499548893, 14409.0],
 [' cheese', 0.09090909090909091, 11.0],
 [' on', 0.07765426131786693, 6207.0],
 [' video', 0.07435897435897436, 390.0],
 [' —', 0.07395498392282958, 933.0],
 [' (', 0.07380073800738007, 2710.0],
 [' version', 0.07317073170731707, 123.0]]

In [13]:
sorted(full_words, key=lambda x: -x[-1])[:20]

[[' the', 0.2650908956819942, 27119.0],
 [' of', 0.13908697507161577, 16407.0],
 [' a', 0.23732264844981607, 15224.0],
 [' to', 0.10292178499548893, 14409.0],
 [' in', 0.1548722390645301, 11545.0],
 [' and', 0.21154562383612663, 10740.0],
 [' for', 0.06059634498236614, 6238.0],
 [' on', 0.07765426131786693, 6207.0],
 [' is', 0.13086097042966835, 6121.0],
 [' that', 0.06014450302675259, 5121.0],
 [' with', 0.023613399231191653, 3642.0],
 [' has', 0.0494641384995878, 3639.0],
 [' at', 0.019143413367942893, 3082.0],
 [' by', 0.017774851876234364, 3038.0],
 [' was', 0.061196105702364396, 2876.0],
 [' as', 0.0164638511095204, 2794.0],
 [' (', 0.07380073800738007, 2710.0],
 [' from', 0.03897550111358575, 2694.0],
 [' have', 0.05606060606060606, 2640.0],
 [' are', 0.04821638573108585, 2551.0]]

### False Positives

The rates in the middle column are out of all the times that the model predicts that token, what proportion are incorrect?

In [43]:
named_fps = dict()
fps_list = []
for k in top1_fps.keys():
    name = tokenizer.decode(k)
    named_fps[name] = top1_fps[k][0]/top1_fps[k][1]
    fps_list.append([name, named_fps[name], top1_fps[k][1]])

In [44]:
temp_list = sorted(fps_list, key=lambda x: x[1])
temp_list[:20]

[['70', 0.0, 1.0],
 ['heastern', 0.0, 28.0],
 ['kur', 0.0, 1.0],
 [' sponsors', 0.0, 1.0],
 [' Becker', 0.0, 1.0],
 ['aguer', 0.0, 8.0],
 ['zor', 0.0, 2.0],
 ['ónica', 0.0, 1.0],
 [' sociales', 0.0, 1.0],
 ['ministration', 0.0, 1.0],
 ['akis', 0.0, 6.0],
 [' probability', 0.0, 1.0],
 [' Bil', 0.0, 1.0],
 [' hecho', 0.0, 1.0],
 ['ar"', 0.0, 2.0],
 ['ynam', 0.0, 1.0],
 [' Verdes', 0.0, 1.0],
 ['tices', 0.0, 15.0],
 ['cend', 0.0, 1.0],
 ['usp', 0.0, 4.0]]

In [45]:
full_words = list(filter(lambda x: x[0][0]==" " and len(x[0])>1 and not x[0][1].isupper(), temp_list))
full_words[:20]

[[' sponsors', 0.0, 1.0],
 [' sociales', 0.0, 1.0],
 [' probability', 0.0, 1.0],
 [' hecho', 0.0, 1.0],
 [' lup', 0.0, 1.0],
 [' insp', 0.0, 1.0],
 [' anime', 0.0, 4.0],
 [' soy', 0.0, 3.0],
 [' goth', 0.0, 1.0],
 [' in"', 0.0, 1.0],
 [' 36', 0.0, 1.0],
 [' yen', 0.0, 1.0],
 [' premiere', 0.0, 5.0],
 [' 521', 0.0, 1.0],
 [' translation', 0.0, 1.0],
 [' lobster', 0.0, 1.0],
 [' rus', 0.0, 1.0],
 [" 'em", 0.0, 1.0],
 [' mutil', 0.0, 5.0],
 [' estimate', 0.0, 2.0]]

In [46]:
sorted(full_words, key=lambda x: -x[-1])[:20]

[[' the', 0.7645267510088881, 201955.0],
 [' a', 0.7752076904818419, 84260.0],
 [' of', 0.5341485757492039, 68773.0],
 [' to', 0.49657266994284266, 47413.0],
 [' in', 0.7195716709075488, 37728.0],
 [' is', 0.8500902576445489, 36562.0],
 [' and', 0.7950242869597708, 32116.0],
 [' be', 0.7507987220447284, 17215.0],
 [' for', 0.63896639572971, 16486.0],
 [' on', 0.717367292573582, 15391.0],
 [' been', 0.7671359031706354, 15202.0],
 [' was', 0.8102014935888403, 14194.0],
 [' that', 0.6369741697416974, 13550.0],
 [' new', 0.8869868441908385, 12618.0],
 [' are', 0.8081258411843876, 11888.0],
 [' have', 0.8319657821229051, 11456.0],
 [' first', 0.8929762169370972, 8998.0],
 [' has', 0.8115012247754578, 8573.0],
 [' from', 0.7052045572242361, 7724.0],
 [' with', 0.5341317365269461, 7515.0]]