In [1]:
# Install the notebook's runtime dependencies quietly (-q): einops, transformers
# (built from the GitHub repository), gradio 4.15.0 and ngrok.
# NOTE(review): only gradio is version-pinned; the other installs resolve to
# whatever is current when the cell runs.
!pip install einops -q
!pip install git+https://github.com/huggingface/transformers -q
!pip install gradio==4.15.0 -q
!pip install ngrok -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.7 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 14.0.2 which is incompatible.
jupyterlab 4.0.10 requires jupyter-lsp>=2.0.0, but you have jupyter-lsp 1.5.1 which is incompatible.
jupyterlab-lsp 5.0.1 requires jupyter-lsp>=2.0.0, but you have jupyter-lsp 1.5.1 which is incompatible.
pymc3 3.11.5 requires numpy<1.22.2,>=1.15.0, but you have numpy 1.24.3 which is incompatible.
pymc3 3.11.5 requires scipy<1.8.0,>=1.7.3, but you have scipy 1.11.4 which is incompatible.
tensorflow 2.13.0 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.9.0 which is incompatible.
tensorflow-probability 0.21.0 requires typing-extensions<4.6.0, but you have typing-extensions 4.9.0 which is

In [2]:
import random
import torch
import copy
import types
import traceback
import ngrok
import asyncio
import pickle

import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from gensim.test.utils import datapath

from scipy.stats import pearsonr, spearmanr

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig

from kaggle_secrets import UserSecretsClient

import matplotlib.pyplot as plt

# Import Models

In [3]:
# When True, the "Run Tests" cells load pickled results instead of recomputing them.
use_result_files = True
# Prefix prepended to the result pickle file names when they are loaded.
result_files_prefix = "/kaggle/input/networkembeddingsresults/"
# Device used as torch's default for newly created tensors.
device = "cpu"
torch.set_default_device(device)

In [4]:
# Load the Phi-1.5 causal LM together with its tokenizer and config.
# NOTE(review): trust_remote_code=True runs code shipped with the checkpoint and
# no revision is pinned, so that code can change between runs (see the download
# warning in this cell's output) — consider pinning a revision.
raw_phi = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)
tokenizer_phi = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)
config_phi = AutoConfig.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)

# Keep the model on CPU, switch it to eval mode and clear its gradients.
raw_phi = raw_phi.cpu()
raw_phi = raw_phi.eval()
raw_phi.zero_grad()

config.json:   0%|          | 0.00/864 [00:00<?, ?B/s]

configuration_phi.py:   0%|          | 0.00/9.26k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-1_5:
- configuration_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi.py:   0%|          | 0.00/62.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-1_5:
- modeling_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [5]:
# Load the GPT-2 causal LM together with its (fast) tokenizer and config.
raw_gpt = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer_gpt = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
config_gpt = AutoConfig.from_pretrained("gpt2", trust_remote_code=True)

# Keep the model on CPU, switch it to eval mode and clear its gradients.
raw_gpt = raw_gpt.cpu()
raw_gpt.eval()
raw_gpt.zero_grad()

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [6]:
class ModelWrapper:
    """Uniform attribute facade over models with different internal layouts.

    ``access_options`` maps a virtual attribute name to a dict with:

    * ``"get"``: attribute name read from the target object,
    * ``"set"``: attribute name written on the target object,
    * ``"pre"`` (optional): callable applied to the wrapped model to obtain
      the target object; without it the wrapped model itself is the target.

    ``name``, ``model``, ``tokenizer`` and ``access_options`` live directly in
    the instance ``__dict__``. Any attribute that is not a virtual one behaves
    like a plain instance attribute.
    """

    def __init__(self, name, model, tokenizer=None, access_options=None):
        # Write through __dict__ so __setattr__ (which needs access_options to
        # exist already) is not involved during construction.
        self.__dict__["name"] = name
        self.__dict__["model"] = model
        self.__dict__["tokenizer"] = tokenizer
        # Fresh dict per instance: a mutable default would be shared by all.
        self.__dict__["access_options"] = {} if access_options is None else access_options

    def _resolve_target(self, opt):
        """Return the object that option ``opt`` reads from / writes to."""
        target = self.__dict__["model"]
        if "pre" in opt:
            target = opt["pre"](target)
        return target

    def __getattr__(self, name):
        # Only reached when normal lookup fails. ``.get`` keeps this safe on a
        # half-built instance (e.g. while copy/pickle rebuild the object), where
        # a KeyError would escape ``hasattr`` checks.
        options = self.__dict__.get("access_options", {})
        if name in options:
            opt = options[name]
            obj = getattr(self._resolve_target(opt), opt["get"])
            # A bound method is treated as a zero-argument accessor and called.
            if isinstance(obj, types.MethodType):
                obj = obj()
            return obj
        # ``object`` defines no __getattr__, so raise the standard error here.
        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")

    def __setattr__(self, name, value):
        options = self.__dict__.get("access_options", {})
        if name in options:
            opt = options[name]
            target = self._resolve_target(opt)
            # If "pre" produced a bound method, call it to get the real target.
            if isinstance(target, types.MethodType):
                target = target()
            return setattr(target, opt["set"], value)
        return super().__setattr__(name, value)

In [7]:
# Wrap both models behind the same virtual attributes (transformer, layers,
# input_embeddings, layer_norm). Each entry names the real attribute to read
# ("get") / write ("set"), optionally after applying "pre" to the raw model.

# Phi 1.5: the sub-modules are reached through raw_phi.get_decoder().
phi = ModelWrapper("Phi 1.5", raw_phi, tokenizer=tokenizer_phi, access_options={
    "transformer": {"get": "model", "set": "model"}, 
    "layers": {"get": "layers", "set": "layers", "pre": lambda x: x.get_decoder()},
    "input_embeddings": {"get": "embed_tokens", "set": "embed_tokens", "pre": lambda x: x.get_decoder()},
    "layer_norm": {"get": "final_layernorm", "set": "final_layernorm", "pre": lambda x: x.get_decoder()}
})

# GPT-2: the sub-modules are reached through raw_gpt.transformer.
gpt = ModelWrapper("GPT 2", raw_gpt, tokenizer=tokenizer_gpt, access_options={
    "transformer": {"get": "transformer", "set": "transformer"}, 
    "layers": {"get": "h", "set": "h", "pre": lambda x: x.transformer},
    "input_embeddings": {"get": "wte", "set": "wte", "pre": lambda x: x.transformer},
    "layer_norm": {"get": "ln_f", "set": "ln_f", "pre": lambda x: x.transformer}
})

In [8]:
# Wrapped models that the tests and the Gradio UI iterate over.
models = [phi, gpt]

# Define Helper Functions

## Model Reduction

In [9]:
def model_reduction(model: ModelWrapper, max_layer=0, remove_norm=True, reset=False):
    """Truncate the wrapped model's layer stack in place, or restore it.

    The first call stores the current ``model.layers`` and ``model.layer_norm``
    on the raw model (as ``_original_layers`` / ``_layer_norm``) so that a later
    ``reset=True`` call can put them back.

    Args:
        model: wrapper exposing ``model``, ``layers`` and ``layer_norm``.
        max_layer: index of the last layer to keep (inclusive).
        remove_norm: if true, replace ``model.layer_norm`` with
            ``torch.nn.Identity()``.
        reset: if true, only restore the stored layers and norm.

    Returns:
        The list of kept layer indices, or ``None`` when ``reset`` is true.
    """
    backbone = model.model
    # Remember the untouched components once; later calls keep that snapshot.
    if not hasattr(backbone, '_original_layers'):
        backbone._original_layers = model.layers
    if not hasattr(backbone, '_layer_norm'):
        backbone._layer_norm = model.layer_norm

    if reset:
        model.layers = backbone._original_layers
        model.layer_norm = backbone._layer_norm
        return

    # Keep every layer whose position does not exceed max_layer.
    kept = [(idx, block) for idx, block in enumerate(model.layers) if idx <= max_layer]
    model.layers = torch.nn.ModuleList([block for _, block in kept])
    if remove_norm:
        model.layer_norm = torch.nn.Identity()
    return [idx for idx, _ in kept]

In [10]:
def generate_layer_embeddings(model, layers, split_size=10000):
    """Build one embedding table per layer from single-token forward passes.

    Every vocabulary index is fed through the temporarily truncated model as a
    length-one sequence; the hidden state recorded at each kept index becomes
    that token's row in the corresponding ``torch.nn.Embedding``.

    Args:
        model: wrapper exposing ``input_embeddings``, ``transformer``,
            ``layers`` and ``layer_norm``.
        layers: iterable of layer indices; the model is truncated at
            ``max(layers)`` with its final norm removed.
        split_size: number of token ids sent through the model per forward pass.

    Returns:
        A list of ``torch.nn.Embedding``, one per kept layer index (as returned
        by ``model_reduction``). An empty list when ``layers`` is empty.

    Notes:
        Errors during the forward passes are printed, not raised; the returned
        embeddings then still carry their random initial weights.
        NOTE(review): ``hidden_states[i]`` is used as "layer i" — confirm that
        index 0 (presumably the embedding output) is the intended layer 0.
    """
    layers = list(layers)
    if not layers:
        return []
    n = model.input_embeddings.num_embeddings
    d = model.input_embeddings.embedding_dim
    # Token ids must live on the same device as the embedding weights, which
    # may differ from torch's default device once the caller moved the model.
    model_device = model.input_embeddings.weight.device
    included_layers = model_reduction(model, max(layers), remove_norm=True)
    layers_emb = [torch.nn.Embedding(n, d) for _ in included_layers]
    try:
        # One row per vocabulary id: each id is its own length-one sequence.
        indexes = torch.arange(n, device=model_device).unsqueeze(1)
        chunks = []
        with torch.no_grad():
            for chunk in torch.split(indexes, split_size):
                hidden_states = model.transformer.forward(chunk, output_hidden_states=True).hidden_states
                chunks.append(torch.stack([hidden_states[layer].detach().cpu() for layer in included_layers]))
                del hidden_states
                torch.cuda.empty_cache()
            # Concatenate once along the vocabulary axis instead of re-copying
            # the growing matrix on every chunk.
            emb_matrix = torch.cat(chunks, dim=1)
            del chunks
            for layer in included_layers:
                # Drop only the length-one sequence axis; a bare squeeze() would
                # also collapse a one-row vocabulary.
                layers_emb[layer].weight = torch.nn.Parameter(emb_matrix[layer].squeeze(1))
            del emb_matrix
    except Exception as error:
        print("An error occurred: ", error)
        traceback.print_exc()
    finally:
        # Always restore the full model, including on KeyboardInterrupt.
        torch.cuda.empty_cache()
        model_reduction(model, reset=True)
    return layers_emb

## Embedding Computations

In [11]:
def multiencode(tok, words, return_tensors="pt"):
    """Tokenize a string, or a list/tuple of strings joined into one sequence.

    Args:
        tok: tokenizer exposing ``encode(text, return_tensors=...)``.
        words: a single string, or a list/tuple of strings.
        return_tensors: tensor format forwarded to ``tok.encode`` for a single
            string. Lists/tuples are always encoded as ``"pt"`` because the
            pieces are joined with ``torch.cat``.

    Returns:
        For a list/tuple, the per-word encodings concatenated along the last
        dimension; otherwise whatever ``tok.encode`` returns.
    """
    if isinstance(words, (list, tuple)):
        return torch.cat([tok.encode(word, return_tensors="pt") for word in words], dim=-1)
    # Honour the caller's requested format instead of silently forcing "pt".
    return tok.encode(words, return_tensors=return_tensors)
    
def avgencode(emb, word, tok=None, avg=True):
    """Return ``word`` as an embedding with a single position on axis 1.

    Text input (a string, or a sequence whose first item is a string) is
    tokenized with ``tok`` and embedded with ``emb`` when ``tok`` is given.
    Anything else is used as-is and must expose ``.shape``.

    Args:
        emb: callable mapping token ids to embeddings.
        word: text, sequence of texts, or an already embedded tensor.
        tok: tokenizer used for text input; text is not encoded without it.
        avg: if true, inputs with several positions on axis 1 are averaged
            over that axis; if false they raise.

    Raises:
        Exception: when axis 1 is not of size one and ``avg`` is false.
    """
    original_input = word
    is_text = isinstance(word, str) or isinstance(word[0], str)
    if is_text and tok is not None:
        word = emb(multiencode(tok, word))
    # Already a single position: nothing to reduce.
    if word.shape[1] == 1:
        return word
    if not avg:
        raise Exception(f"{original_input} is not a single token: {word}")
    # Average over axis 1 and keep that axis with size one.
    return word.mean(dim=1).unsqueeze(1)

In [12]:
def calc_distance(emb, word1, word2, tok=None, avg=True, dist="cosine", multi=False):
    """Distance between two embedded inputs along dimension 2.

    Args:
        emb: embedding passed on to ``avgencode``.
        word1: first input; used untouched when ``multi`` is true (the caller
            then passes an embedding matrix), otherwise run through ``avgencode``.
        word2: second input, always run through ``avgencode``.
        tok: tokenizer passed on to ``avgencode``.
        avg: averaging flag passed on to ``avgencode``.
        dist: ``"cosine"`` (one minus cosine similarity) or ``"L2"``.
        multi: skip encoding of ``word1``.

    Raises:
        Exception: for any other ``dist`` value.
    """
    left = word1 if multi else avgencode(emb, word1, tok, avg=avg)
    right = avgencode(emb, word2, tok, avg=avg)
    if dist == "cosine":
        similarity = torch.nn.CosineSimilarity(dim=2)
        return 1 - similarity(left, right)
    if dist == "L2":
        return torch.norm(left - right, dim=2)
    raise Exception("Unknown distance")

def get_closest_emb(emb, word, k=1, decode=True, tok=None, avg=True, dist="cosine"):
    """Return the ``k`` entries of ``emb`` closest to ``word``.

    Args:
        emb: embedding whose ``weight.data`` rows are the candidates.
        word: query passed to ``calc_distance``.
        k: number of neighbours to return.
        decode: decode the indices with ``tok`` when a tokenizer is given.
        tok: tokenizer used for encoding the query and decoding the result.
        avg, dist: passed on to ``calc_distance``.

    Returns:
        A list of decoded strings when ``decode`` is true and ``tok`` is set,
        otherwise a tensor of indices.
    """
    distances = calc_distance(emb, emb.weight.data, word, tok=tok, avg=avg, dist=dist, multi=True)
    # Indices of the k smallest distances.
    nearest = torch.topk(distances, k=k, largest=False).indices.squeeze()
    # A single hit squeezes down to a scalar; restore one dimension.
    if k == 1:
        nearest = nearest.unsqueeze(0)
    if not decode or tok is None:
        return nearest
    return [tok.decode(token_id) for token_id in nearest.tolist()]

def emb_arithmetic(emb, tok, words, k=1, avg=True, dist="cosine", return_solution=False):
    """Solve the analogy ``words[0] - words[1] + words[2]`` in embedding space.

    Args:
        emb: embedding used to encode the words and to search for neighbours.
        tok: tokenizer used for encoding and decoding.
        words: at least three inputs accepted by ``avgencode``.
        k: number of nearest neighbours to return.
        avg: averaging flag passed on to ``avgencode``.
        dist: ``"cosine"`` or ``"L2"``; for ``"L2"`` the result vector is
            normalized before the search.
        return_solution: also return the computed vector.

    Returns:
        The ``k`` decoded nearest neighbours, or ``(vector, neighbours)`` when
        ``return_solution`` is true.
    """
    # Encode and average
    words_emb = [avgencode(emb, word, tok, avg=avg) for word in words]
    w1 = words_emb[0]
    w2 = words_emb[1]
    w3 = words_emb[2]
    # Do embedding arithmetic
    w = w1 - w2 + w3
    if dist == "L2":
        # Normalize along the embedding axis. Normalizing along dim 0 (a
        # size-one axis after avgencode) would reduce the vector to its signs.
        w = torch.nn.functional.normalize(w, dim=-1)
    # Get closest k
    closest = get_closest_emb(emb, w, k=k, decode=True, tok=tok, dist=dist)
    return (w, closest) if return_solution else closest

def word_sim_function(emb, tok, x, dist="cosine"):
    """Similarity of the inputs in ``x``, computed as one minus their distance.

    ``x`` is unpacked into the positional word arguments of ``calc_distance``;
    the result is squeezed of all size-one dimensions.
    """
    distance = calc_distance(emb, *x, tok=tok, dist=dist)
    return (1 - distance).squeeze()

def print_results(res):
    """Print every entry of ``res`` on its own line, numbered from 1."""
    for position, entry in enumerate(res, start=1):
        print(f"{position}) {entry}")

In [13]:
def batch_emb_arithmetic(emb, tok, queries, k=5, avg=True, dist="cosine", out=True):
    """Run ``emb_arithmetic`` on every query and collect the answers.

    Args:
        emb: embedding passed on to ``emb_arithmetic``.
        tok: tokenizer passed on to ``emb_arithmetic``.
        queries: iterable of queries; each holds three words, or three
            sequences of words whose first items are used for the title.
        k: number of neighbours requested per query.
        avg, dist: passed on to ``emb_arithmetic``.
        out: print a header and the numbered answers for each query.

    Returns:
        A list with one neighbour list per query, in input order.
    """
    ret = []
    for q in queries:
        if out:
            print("##########################")
            # Print title
            title_q = q
            if not isinstance(q[0], str):
                title_q = [qq[0] for qq in q]
            print(f"{title_q[0]} - {title_q[1]} + {title_q[2]} =")
        # Compute and print results
        res = emb_arithmetic(emb, tok, q, k=k, avg=avg, dist=dist)
        if out:
            # ``res`` is the neighbour list itself; ``res[1]`` would print the
            # characters of the second answer and fail when k == 1.
            print_results(res)
        ret.append(res)
    return ret

def evaluate_batch(results, solutions, out=True, score="rankscore", k=None):
    """Score ranked answer lists against their accepted solutions.

    For each ``(result, solution)`` pair the best (lowest) rank of any accepted
    answer inside the result list is taken; an answer that is absent counts as
    rank ``len(results[0])``.

    Args:
        results: list of ranked answer lists.
        solutions: list of iterables of accepted answers, aligned with ``results``.
        out: print the per-query ranks and the final score.
        score: ``"rankscore"`` (one minus the summed ranks divided by
            ``len(results[0]) * len(solutions)``) or ``"topk"`` (fraction of
            queries whose best rank is below ``k``).
        k: threshold for ``"topk"``; must be truthy.

    Raises:
        Exception: for a falsy ``k`` with ``"topk"`` or an unknown ``score``.
    """

    def best_rank(candidates, accepted, missing):
        # Lowest position of any accepted answer; ``missing`` when absent.
        return min(
            [candidates.index(answer) if answer in candidates else missing for answer in accepted]
        )

    width = len(results[0])
    ev = [best_rank(res, sol, width) for res, sol in zip(results, solutions)]

    if score == "rankscore":
        value = 1 - (sum(ev) / (width * len(solutions)))
    elif score == "topk":
        if not k:
            raise Exception(f"Invalid k for topk")
        hits = sum(1 for rank in ev if rank < k)
        value = hits / len(ev)
    else:
        raise Exception(f"Unknown Score")

    if out:
        print(f"{ev} -> {value}")
    return value

In [14]:
def test_arithmetic(models, questions, reduced_model_layers=[0], k=100, index_batch_size=10000):
    """Run the analogy queries on per-layer embeddings of every model.

    Args:
        models: wrappers exposing ``name``, ``tokenizer`` and ``model``.
        questions: mapping ``category -> (queries, solutions)``; only the
            queries are used here.
        reduced_model_layers: layer indices to evaluate.
        k: number of neighbours kept per query.
        index_batch_size: ``split_size`` passed to ``generate_layer_embeddings``.

    Returns:
        ``{model_name: {category: {layer: batch_emb_arithmetic(...) result}}}``.
    """
    # Use the GPU when there is one; fall back to CPU instead of crashing.
    compute_device = "cuda" if torch.cuda.is_available() else "cpu"
    results = {}
    for model in tqdm(models):
        torch.cuda.empty_cache()
        model.model = model.model.to(compute_device)
        model_name = model.name
        model_tokenizer = model.tokenizer
        print(f"Computing model {model_name}")
        results[model_name] = {}
        reduced_embeddings = generate_layer_embeddings(model, reduced_model_layers, split_size=index_batch_size)
        # The model is no longer needed on the compute device once embedded.
        model.model = model.model.cpu()
        torch.cuda.empty_cache()
        reduced_embeddings = [emb.to(compute_device) for emb in reduced_embeddings]
        # Requested layers that this model actually has (same for all categories).
        interest_layers = [layer for layer in reduced_model_layers if layer < len(reduced_embeddings)]
        for question_name, (queries, _solutions) in tqdm(questions.items()):
            results[model_name][question_name] = {
                top_layer: batch_emb_arithmetic(reduced_embeddings[top_layer], model_tokenizer, queries, k=k, out=False)
                for top_layer in interest_layers
            }
        del reduced_embeddings
    return results

def test_similarities(models, word_pairs, reduced_model_layers=[0], index_batch_size=10000, emb_batch_size=10):
    """Compute word-pair similarities on per-layer embeddings of every model.

    Args:
        models: wrappers exposing ``name``, ``tokenizer`` and ``model``.
        word_pairs: iterable of ``(word_pair, solution)``; only the pair is used.
        reduced_model_layers: layer indices to evaluate.
        index_batch_size: ``split_size`` passed to ``generate_layer_embeddings``.
        emb_batch_size: number of layer embeddings moved to the compute device
            at a time.

    Returns:
        ``{model_name: {layer: [similarity tensor per word pair]}}``.
    """
    # Use the GPU when there is one; fall back to CPU instead of crashing.
    compute_device = "cuda" if torch.cuda.is_available() else "cpu"
    results = {}
    for model in tqdm(models):
        torch.cuda.empty_cache()
        model.model = model.model.to(compute_device)
        model_name = model.name
        model_tokenizer = model.tokenizer
        print(f"Computing model {model_name}")
        reduced_embeddings = generate_layer_embeddings(model, reduced_model_layers, split_size=index_batch_size)
        model.model = model.model.cpu()
        torch.cuda.empty_cache()
        # Process the layer embeddings in groups of emb_batch_size.
        batches = [reduced_embeddings[i:i + emb_batch_size] for i in range(0, len(reduced_embeddings), emb_batch_size)]
        results[model_name] = {}
        for i, emb_batch in enumerate(batches):
            emb_batch = [emb.to(compute_device) for emb in emb_batch]
            # This batch holds the absolute layers [delta_layer, max_layer).
            delta_layer = i * emb_batch_size
            max_layer = delta_layer + len(emb_batch)
            interest_layers = [layer - delta_layer for layer in reduced_model_layers if delta_layer <= layer < max_layer]
            results[model_name].update({
                top_layer + delta_layer: [
                    word_sim_function(emb_batch[top_layer], model_tokenizer, word_pair).detach().cpu()
                    for word_pair, _solution in word_pairs
                ]
                for top_layer in interest_layers
            })
            del emb_batch
            torch.cuda.empty_cache()
        del batches
        del reduced_embeddings
    return results

In [15]:
def save_results(results, filepath):
    """Pickle ``results`` into the file at ``filepath``, opened in ``'wb'`` mode."""
    with open(filepath, mode='wb') as handle:
        pickle.dump(results, handle)
        
def load_results(filepath):
    """Return the object unpickled from the file at ``filepath``.

    NOTE: ``pickle.load`` can execute arbitrary code; only use trusted files.
    """
    with open(filepath, mode='rb') as handle:
        return pickle.load(handle)

# Test

## Load Gensim Data

In [16]:
def addspace(x):
    """Return ``x`` with a single space prepended."""
    return " " + x


def addall(x):
    """Return four variants of ``x``: capitalized and lower-cased, each without and with a leading space."""
    capitalized = x.capitalize()
    lowered = x.lower()
    return (capitalized, " " + capitalized, lowered, " " + lowered)

In [17]:
def load_question_words(path):
    """Parse a word-analogy file into one DataFrame per category.

    A line starting with ``:`` opens a new category, named by the text after
    its first two characters. Every other non-blank line is split on
    whitespace into one analogy row.

    Args:
        path: path of the text file to read.

    Returns:
        dict mapping category name to a DataFrame with columns ``A``, ``B``,
        ``C``, ``Solution``. The four tokens of a row are read in the order
        ``A``, ``B``, ``Solution``, ``C`` and then reordered.

    Raises:
        ValueError: if an analogy row appears before any category line.
    """
    data = {}
    current_category = None
    with open(path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # A blank line would otherwise be stored as an empty row.
                continue
            # Check if the line denotes a new category
            if line.startswith(':'):
                current_category = line[2:]
                data[current_category] = []
            elif current_category is None:
                raise ValueError(f"Entry found before any category line: {line!r}")
            else:
                data[current_category].append(line.split())
    # One DataFrame per category, with the solution moved to the last column.
    return {
        category: pd.DataFrame(rows, columns=['A', 'B', 'Solution', 'C']).reindex(columns=['A', 'B', 'C', 'Solution'])
        for category, rows in data.items()
    }

def change_words(batch, transform=lambda x: x):
    """Apply ``transform`` to every word of every entry in ``batch``.

    Returns a new list of lists; the default ``transform`` is the identity.
    """
    transformed = []
    for entry in batch:
        transformed.append(list(map(transform, entry)))
    return transformed

In [18]:
# Load the analogy questions shipped with gensim's test data, one DataFrame per category.
data_quest = load_question_words(datapath('questions-words.txt'))
# Per category: prefix the three query words with a space, and expand each
# solution into its four accepted spellings (see addall).
data_quest = {category: (dataset[["A", "B", "C"]].apply(addspace), dataset["Solution"].apply(addall)) for category, dataset in data_quest.items()}
# Convert to plain lists: category -> (list of [A, B, C] queries, list of solution tuples).
questions = {category: (dataset[0][["A", "B", "C"]].values.tolist(), dataset[1].values.tolist()) for category, dataset in data_quest.items()}

In [19]:
# Load the WordSim-353 pairs shipped with gensim's test data (tab separated, first two rows skipped).
data_sim = pd.read_csv(datapath('wordsim353.tsv'), sep='\t', skiprows=2, names=["Word1", "Word2", "Human"])
# Prefix both words with a space and divide the human score by 10.
data_sim = (data_sim[["Word1", "Word2"]].apply(addspace).values.tolist(), (data_sim["Human"] / 10).values.tolist())
# List of ([word1, word2], scaled human score) tuples.
word_pairs = [(word_pair, solution) for word_pair, solution in zip(*data_sim)]

## Gradio Visualization

In [20]:
async def deploy_gradio(interface):
    """Launch ``interface`` locally and forward its port through ngrok.

    The ngrok auth token is read from the Kaggle secret ``ngrok_key``; the
    forward request is awaited for at most 10 seconds.

    Returns:
        The result of the ngrok forward request, whose ``url()`` is printed.
    """
    interface.launch(inline=False, share=False, prevent_thread_lock=True, show_error=True)
    local_address = "http://localhost:" + str(interface.server_port)
    auth_token = UserSecretsClient().get_secret("ngrok_key")
    pending = ngrok.forward(addr=local_address, authtoken=auth_token)
    await asyncio.wait_for(pending, timeout=10)
    tunnel = pending.result()
    print(f"Deploy URL: {tunnel.url()}")
    return tunnel

async def close_gradio(listener, interface):
    """Close the Gradio ``interface``, then await the closing of ``listener``."""
    # The local app is closed first, the tunnel afterwards.
    interface.close()
    await listener.close()

In [21]:
def update_ar_plots(model_names, question_name, n_layers, score, k, update_average=False):
    """Build the analogy-score figures for the selected models.

    Reads the module-level ``ar_tests`` and ``questions``.

    Args:
        model_names: names of the models to plot (``None`` is treated as none).
        question_name: category shown in the first figure.
        n_layers: only layers with an index below this value are plotted.
        score: scoring mode passed to ``evaluate_batch``.
        k: threshold passed to ``evaluate_batch``.
        update_average: also build the figure averaged over all categories.

    Returns:
        ``[figure, average_figure]``; ``average_figure`` is ``None`` when
        ``update_average`` is false.
    """
    model_names = model_names or []
    n_layers = int(n_layers)
    layer_ticks = list(range(0, n_layers))

    def selected_layers(model_name, category):
        # Layers stored for this model/category that fall below the limit.
        return [layer for layer in ar_tests[model_name][category].keys() if layer < n_layers]

    def layer_score(model_name, category, layer):
        # Score of one model on one category at one layer.
        return evaluate_batch(ar_tests[model_name][category][layer], questions[category][1], out=False, score=score, k=k)

    # Figures are created with plt.Figure rather than plt.figure(): they are
    # not registered in pyplot's global figure list, so repeated callbacks do
    # not accumulate open figures and do not share pyplot's "current" state.
    f1 = plt.Figure()
    ax = f1.subplots()
    for model_name in model_names:
        model_layers = selected_layers(model_name, question_name)
        r = [layer_score(model_name, question_name, layer) for layer in model_layers]
        ax.plot(model_layers, r, marker='o', alpha=0.6, label=model_name)
    ax.set_xticks(layer_ticks)
    ax.set_title(question_name)
    if model_names:
        ax.legend()
    figures = [f1]

    if update_average:
        f2 = plt.Figure()
        ax_avg = f2.subplots()
        # The layer list is taken from the first category.
        first_category = list(questions.keys())[0]
        for model_name in model_names:
            model_layers = selected_layers(model_name, first_category)
            per_layer = [[layer_score(model_name, category, layer) for category in questions.keys()] for layer in model_layers]
            m = np.array([np.mean(values) for values in per_layer])
            s = np.array([np.std(values) for values in per_layer])
            ax_avg.plot(model_layers, m, marker='o', alpha=0.6, label=model_name)
            # Shade one standard deviation around the mean.
            ax_avg.fill_between(model_layers, m - s, m + s, alpha=0.3)
        ax_avg.set_xticks(layer_ticks)
        ax_avg.set_title("Average Plot")
        if model_names:
            ax_avg.legend()
        figures.append(f2)
    else:
        figures.append(None)

    return figures

def update_sim_plots(model_names, n_layers, correlation):
    """Build the similarity figure for the selected models.

    Reads the module-level ``sim_tests`` and ``word_pairs``. For every layer
    the model similarities are correlated with the human scores (left axis)
    and their standard deviation is drawn as a dashed line (right axis).

    Args:
        model_names: names of the models to plot (``None`` is treated as none).
        n_layers: only layers with an index below this value are plotted.
        correlation: ``"pearson"`` or ``"spearman"``.

    Returns:
        The matplotlib figure.

    Raises:
        ValueError: if a layer has to be scored with an unknown ``correlation``.
    """
    model_names = model_names or []
    n_layers = int(n_layers)
    # Same reference scores for every model and layer.
    human_scores = [sol for pair, sol in word_pairs]

    # plt.Figure (not plt.figure()) keeps the figure out of pyplot's global
    # figure list, so repeated callbacks do not accumulate open figures.
    f = plt.Figure()
    ax1 = f.subplots()
    ax1.set_ylabel("Correlation")
    ax1.set_xticks(list(range(0, n_layers)))
    ax2 = ax1.twinx()
    ax2.set_ylabel("Similarity σ")
    ax1.set_title("Similarity Plot")

    for model_name in model_names:
        model_layers = [layer for layer in sim_tests[model_name] if layer < n_layers]
        m = []
        s = []
        for layer in model_layers:
            layer_sims = sim_tests[model_name][layer]
            if correlation == "pearson":
                r, _ = pearsonr(layer_sims, human_scores)
            elif correlation == "spearman":
                r, _ = spearmanr(layer_sims, human_scores, nan_policy="propagate")
            else:
                raise ValueError(f"Unknown correlation: {correlation}")
            m.append(r)
            s.append(np.std(layer_sims))
        p = ax1.plot(model_layers, np.array(m), marker='o', alpha=0.6, label=model_name)
        # Reuse the model's line colour for its dashed standard-deviation curve.
        ax2.plot(model_layers, np.array(s), "--", alpha=0.6, label=f"{model_name} Similarity σ", color=p[-1].get_color())
    if model_names:
        ax1.legend()
    return f

In [22]:
import gradio as gr

# Gradio app with two panels: "Arithmetic" (analogy scores) and "Similarity"
# (word-pair correlation). Every input change re-runs the matching plot function.
# NOTE: models_options, n_layers, plot and inputs are rebound in the second
# panel; the first panel's events were already wired to the earlier objects.
with gr.Blocks(analytics_enabled=False, title="Test") as demo:
    
    model_names = [model.name for model in models]
    correlations = ["pearson", "spearman"]
    scores = ["rankscore", "topk"]
    
    with gr.Accordion("Arithmetic"):
    
        with gr.Row():

            # Left column: controls
            with gr.Column():
                update_average = gr.Checkbox(label="Display Average Graph", value=False)
                question_options = gr.Dropdown(questions.keys(), label="Questions", value=list(questions.keys())[0])
                models_options = gr.CheckboxGroup(model_names, label="Models")
                n_layers = gr.Slider(minimum=0, maximum=24, step=1, label="Number of Layers")
                scoring_options = gr.Radio(scores, label="Score", value=list(scores)[1])
                topkscore_k = gr.Slider(minimum=1, maximum=100, step=1, label="Top-K Score Threshold")

            # Right column: plots
            with gr.Column():
                plot = gr.Plot(label="Plot")
                avg_plot = gr.Plot(label="Average Plot")

        # Order matches the positional parameters of update_ar_plots.
        inputs = [models_options, question_options, n_layers, scoring_options, topkscore_k, update_average]

        # Update plot
        update_average.change(update_ar_plots, inputs=inputs, outputs=[plot, avg_plot])
        models_options.change(update_ar_plots, inputs=inputs, outputs=[plot, avg_plot])
        question_options.change(update_ar_plots, inputs=inputs, outputs=[plot, avg_plot])
        n_layers.change(update_ar_plots, inputs=inputs, outputs=[plot, avg_plot])
        scoring_options.change(update_ar_plots, inputs=inputs, outputs=[plot, avg_plot])
        topkscore_k.change(update_ar_plots, inputs=inputs, outputs=[plot, avg_plot])

        # Element Visibility
        # The top-k slider is shown only for the "topk" score; the average plot only when requested.
        scoring_options.change(lambda score: gr.update(visible=(score == "topk")), inputs=[scoring_options], outputs=[topkscore_k])
        update_average.change(lambda average: gr.update(visible=average), inputs=[update_average], outputs=[avg_plot])
        
        # On load
        demo.load(update_ar_plots, inputs=inputs, outputs=[plot, avg_plot])
    
    with gr.Accordion("Similarity"):
            
        with gr.Row():

            # Left column: controls
            with gr.Column():
                models_options = gr.CheckboxGroup(model_names, label="Models")
                n_layers = gr.Slider(minimum=0, maximum=24, step=1, label="Number of Layers")
                correlation_options = gr.Radio(correlations, label="Correlation", value=list(correlations)[1])

            # Right column: plot
            with gr.Column():
                plot = gr.Plot(label="Plot")
                
        # Order matches the positional parameters of update_sim_plots.
        inputs = [models_options, n_layers, correlation_options]
        
        # Update plot
        models_options.change(update_sim_plots, inputs=inputs, outputs=[plot])
        n_layers.change(update_sim_plots, inputs=inputs, outputs=[plot])
        correlation_options.change(update_sim_plots, inputs=inputs, outputs=[plot])
    
        # On load
        demo.load(update_sim_plots, inputs=inputs, outputs=[plot])

## Run Tests

In [23]:
# Analogy results: recompute and pickle them, or load a previously pickled
# copy, depending on use_result_files.
# NOTE(review): load_results unpickles the file — only point it at trusted data.
if not use_result_files:
    ar_tests = test_arithmetic(models, questions, reduced_model_layers=range(0, 15), k=100, index_batch_size=5000)
    save_results(ar_tests, "ar_results.pkl")
else:
    ar_tests = load_results(result_files_prefix + "ar_results.pkl")

In [24]:
# Similarity results: recompute and pickle them, or load a previously pickled
# copy, depending on use_result_files.
# NOTE(review): load_results unpickles the file — only point it at trusted data.
if not use_result_files:
    sim_tests = test_similarities(models, word_pairs, reduced_model_layers=range(0, 24), index_batch_size=1100)
    save_results(sim_tests, "sim_results.pkl")
else:
    sim_tests = load_results(result_files_prefix + "sim_results.pkl")

In [25]:
# Launch the Gradio app and expose it through ngrok; keep the returned listener
# so it can be passed to close_gradio later.
demo_listener = await deploy_gradio(demo)

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.
Deploy URL: https://c7e2-35-199-185-102.ngrok-free.app


In [26]:
# await close_gradio(demo_listener, demo)

In [27]:
#model_names = [model.name for model in models]
#question_names = [name for name in questions.keys()]
#layers = [[layer for layer in tests[model_name][question_names[0]].keys()] for model_name in model_names]
#max_layers = layers[np.argmax([len(l) for l in layers])]
#plots = []
#for question_name, quest_pair in questions.items():
#    question, solution = quest_pair
#    res = []
#    for model, model_layers in zip(models, layers):
#        r = []
#        for layer in model_layers:
#            r.append(evaluate_batch(tests[model.name][question_name][layer], solution, out=False))
#            #r.append(evaluate_batch(tests[model.name][question_name][layer], solution, out=False, score="topk", k=50))
#        plt.plot(model_layers, r, marker='o', alpha=0.6, label=model.name)
#    plt.xticks(max_layers)
#    plt.title(question_name)
#    plt.legend()
#    plots.append(plt.gcf())
#    plt.show()

  f1 = plt.figure()
