In [1]:
from collections import defaultdict
import itertools
import re
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertModel, BertForSequenceClassification

from stats_count import *
from grab_weights import grab_attention_weights, text_preprocessing

In [2]:
import warnings

warnings.filterwarnings('ignore')

In [3]:
!env | grep CUDA_VISIBLE

## Parameters

In [4]:
np.random.seed(42) # For reproducibility.

In [5]:
max_tokens_amount  = 128 # The number of tokens to which the tokenized text is truncated / padded.
    
layers_of_interest = [i for i in range(12)]  # Layers for which attention matrices and features on them are 
                                             # calculated. For calculating features on all layers, leave it be
                                             # [i for i in range(12)].

model_path = tokenizer_path = "bert-base-uncased"  

# You can use either standard or fine-tuned BERT. If you want to use fine-tuned BERT to your current task, save the
# model and the tokenizer with the commands tokenizer.save_pretrained(output_dir); 
# bert_classifier.save_pretrained(output_dir) into the same directory and insert the path to it here.

### Explanation of stats_name parameter

Currently, we implemented calculation of the following graphs features:
* "s"    - amount of strongly connected components
* "w"    - amount of weakly connected components
* "e"    - amount of edges
* "v"    - average vertex degree
* "c"    - amount of (directed) simple cycles
* "b0b1" - Betti numbers

The variable stats_name contains a string with the names of the features, which you want to calculate. The format of the string is the following:

"stat_name + "_" + stat_name + "_" + stat_name + ..."

**For example**:

`stats_name == "s_w"` means that the number of strongly and weakly connected components will be calculated

`stats_name == "b0b1"` means that only the Betti numbers will be calculated

`stats_name == "b0b1_c"` means that Betti numbers and the number of simple cycles will be calculated

e.t.c.

## Filenames

In [6]:
subset = "test_5k"           # .csv file with the texts, for which we count topological features
input_dir = "small_gpt_web/"  # Name of the directory with .csv file
output_dir = "small_gpt_web/" # Name of the directory with calculations results

prefix = output_dir + subset

r_file     = output_dir + 'attentions/' + subset  + "_all_heads_" + str(len(layers_of_interest)) + "_layers_MAX_LEN_" + \
             str(max_tokens_amount) + "_" + model_path.split("/")[-1]
# Name of the file for attention matrices weights

barcodes_file = output_dir + 'barcodes/' + subset  + "_all_heads_" + str(len(layers_of_interest)) + "_layers_MAX_LEN_" + \
             str(max_tokens_amount) + "_" + model_path.split("/")[-1]
# Name of the file for barcodes information

In [7]:
r_file

'small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased'

In [8]:
barcodes_file

'small_gpt_web/barcodes/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased'

.csv file must contain the column with the name **sentence** with the texts. It can also contain the column **labels**, which will be needed for testing. Any other arbitrary columns will be ignored.

In [9]:
try:
    data = pd.read_csv(input_dir + subset + ".csv").reset_index(drop=True)
except:
    #data = pd.read_csv(input_dir + subset + ".tsv", delimiter="\t")
    data = pd.read_csv(input_dir + subset + ".tsv", delimiter="\t", header=None)
    data.columns = ["0", "labels", "2", "sentence"]

In [10]:
data.head()

Unnamed: 0.1,Unnamed: 0,id,ended,length,sentence,label
0,4722,259722,True,231,The Learning Co.\n\nDeveloped by\n\nThe Learni...,natural
1,2757,257813,True,563,Bush doubles down on foreign policy on Saturda...,generated
2,2194,257194,True,62,Here are six interesting things you need to kn...,natural
3,817,255817,True,293,Introduction\n\nWe would like to thank Antec f...,natural
4,3886,258886,False,1024,"ELKRIDGE, Md.—A group called ""Muslims for Trum...",natural


In [11]:
sentences = data['sentence']
print("Average amount of words in example:", \
      np.mean(list(map(len, map(lambda x: re.sub('\w', ' ', x).split(" "), data['sentence'])))))
print("Max. amount of words in example:", \
      np.max(list(map(len, map(lambda x: re.sub('\w', ' ', x).split(" "), data['sentence'])))))
print("Min. amount of words in example:", \
      np.min(list(map(len, map(lambda x: re.sub('\w', ' ', x).split(" "), data['sentence'])))))

Average amount of words in example: 2723.5122
Max. amount of words in example: 6151
Min. amount of words in example: 34


In [12]:
def get_token_length(batch_texts):
    inputs = tokenizer.batch_encode_plus(batch_texts,
       return_tensors='pt',
       add_special_tokens=True,
       max_length=MAX_LEN,             # Max length to truncate/pad
       pad_to_max_length=True,         # Pad sentence to max length
       truncation=True
    )
    inputs = inputs['input_ids'].numpy()
    n_tokens = []
    indexes = np.argwhere(inputs == tokenizer.pad_token_id)
    for i in range(inputs.shape[0]):
        ids = indexes[(indexes == i)[:, 0]]
        if not len(ids):
            n_tokens.append(MAX_LEN)
        else:
            n_tokens.append(ids[0, 1])
    return n_tokens

In [13]:
MAX_LEN = max_tokens_amount
tokenizer = BertTokenizer.from_pretrained(tokenizer_path, do_lower_case=True)

In [14]:
data['tokenizer_length'] = get_token_length(data['sentence'].values)

In [15]:
data

Unnamed: 0.1,Unnamed: 0,id,ended,length,sentence,label,tokenizer_length
0,4722,259722,True,231,The Learning Co.\n\nDeveloped by\n\nThe Learni...,natural,128
1,2757,257813,True,563,Bush doubles down on foreign policy on Saturda...,generated,128
2,2194,257194,True,62,Here are six interesting things you need to kn...,natural,71
3,817,255817,True,293,Introduction\n\nWe would like to thank Antec f...,natural,128
4,3886,258886,False,1024,"ELKRIDGE, Md.—A group called ""Muslims for Trum...",natural,128
...,...,...,...,...,...,...,...
4995,1472,256472,False,1024,"Occasionally, we come across interesting scena...",natural,128
4996,326,255337,False,1024,Providing insight not only into the memes that...,generated,128
4997,3862,258862,True,339,"Each year, MONEY digs into enrollment data and...",natural,128
4998,2862,257862,False,1024,Grounding of the Queen Elizabeth 2 (response) ...,natural,128


In [16]:
ntokens_array = data['tokenizer_length'].values

In [17]:
from math import ceil

batch_size = 10 # batch size
number_of_batches = ceil(len(data['sentence']) / batch_size)
DUMP_SIZE = 100 # number of batches to be dumped

## Calculating Ripser features

Format: "h{dim}\_{type}\_{args}"

Dimension: 0, 1, etc.; homology dimension

Types: 
    
    1. s: sum of lengths; example: "h1_s".
    2. m: mean of lengths; example: "h1_m"
    3. v: variance of lengths; example "h1_v"
    4. n: number of barcodes with time of birth/death more/less then threshold.
        4.1. b/d: birth or death
        4.2. m/l: more or less than threshold
        4.2. t: threshold value
       example: "h0_n_d_m_t0.5", "h1_n_b_l_t0.75"
    5. t: time of birth/death of the longest barcode (not incl. inf).
        3.1. b/d: birth of death
        example: "h0_t_d", "h1_t_b"
    6. nb: number of barcodes in dim
       example: h0_nb
    7. e: entropy; example: "h1_e"

In [18]:
import os
import timeit
import ripser_count

adj_filenames = [
    output_dir + 'attentions/' + filename 
    for filename in os.listdir(output_dir + 'attentions/') if r_file in (output_dir + 'attentions/' + filename)
]
# sorted by part number
adj_filenames = sorted(adj_filenames, key = lambda x: int(x.split('_')[-1].split('of')[0][4:].strip())) 
adj_filenames

['small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part1of5.npy',
 'small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part2of5.npy',
 'small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part3of5.npy',
 'small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part4of5.npy',
 'small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part5of5.npy']

In [19]:
dim = 1
lower_bound = 1e-3

## Calculating and saving barcodes

In [20]:
from multiprocessing import Process, Queue

def subprocess_wrap(queue, function, args):
    queue.put(function(*args))
#     print("Putted in Queue")
    queue.close()
    exit()

In [21]:
import json
import itertools
from collections import defaultdict

def get_only_barcodes(adj_matricies, ntokens_array, dim, lower_bound):
    """Get barcodes from adj matricies for each layer, head"""
    barcodes = {}
    layers, heads = range(adj_matricies.shape[1]), range(adj_matricies.shape[2])
    for (layer, head) in itertools.product(layers, heads):
        matricies = adj_matricies[:, layer, head, :, :]
        barcodes[(layer, head)] = ripser_count.get_barcodes(matricies, ntokens_array, dim, lower_bound, (layer, head))
    return barcodes

def format_barcodes(barcodes):
    """Reformat barcodes to json-compatible format"""
    return [{d: b[d].tolist() for d in b} for b in barcodes]

def save_barcodes(barcodes, filename):
    """Save barcodes to file"""
    formatted_barcodes = defaultdict(dict)
    for layer, head in barcodes:
        formatted_barcodes[layer][head] = format_barcodes(barcodes[(layer, head)])
    json.dump(formatted_barcodes, open(filename, 'w'))
    
def unite_barcodes(barcodes, barcodes_part):
    """Unite 2 barcodes"""
    for (layer, head) in barcodes_part:
        barcodes[(layer, head)].extend(barcodes_part[(layer, head)])
    return barcodes

def split_matricies_and_lengths(adj_matricies, ntokens, number_of_splits):
    splitted_ids = np.array_split(np.arange(ntokens.shape[0]), number_of_splits) 
    splitted = [(adj_matricies[ids], ntokens[ids]) for ids in splitted_ids]
    return splitted

In [22]:
queue = Queue()
number_of_splits = 2
for i, filename in enumerate(tqdm(adj_filenames, desc='Calculating barcodes')):
    barcodes = defaultdict(list)
    adj_matricies = np.load(filename, allow_pickle=True) # samples X 
    print(f"Matricies loaded from: {filename}")
    ntokens = ntokens_array[i*batch_size*DUMP_SIZE : (i+1)*batch_size*DUMP_SIZE]
    splitted = split_matricies_and_lengths(adj_matricies, ntokens, number_of_splits)
    for matricies, ntokens in tqdm(splitted, leave=False):
        p = Process(
            target=subprocess_wrap,
            args=(
                queue,
                get_only_barcodes,
                (matricies, ntokens, dim, lower_bound)
            )
        )
        p.start()
        barcodes_part = queue.get() # block until putted and get barcodes from the queue
#         print("Features got.")
        p.join() # release resources
#         print("The process is joined.")
        p.close() # releasing resources of ripser
#         print("The proccess is closed.")
        
        barcodes = unite_barcodes(barcodes, barcodes_part)
    part = filename.split('_')[-1].split('.')[0]
    save_barcodes(barcodes, barcodes_file + '_' + part + '.json')

HBox(children=(FloatProgress(value=0.0, description='Calculating barcodes', max=5.0, style=ProgressStyle(descr…

Matricies loaded from: small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part1of5.npy


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

Matricies loaded from: small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part2of5.npy


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

Matricies loaded from: small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part3of5.npy


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

Matricies loaded from: small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part4of5.npy


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

Matricies loaded from: small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part5of5.npy


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




## Calculating features of saved barcodes

In [23]:
ripser_feature_names=[
    'h0_s', 
    'h0_e',
    'h0_t_d', 
    'h0_n_d_m_t0.75',
    'h0_n_d_m_t0.5',
    'h0_n_d_l_t0.25',
    'h1_t_b',
    'h1_n_b_m_t0.25',
    'h1_n_b_l_t0.95', 
    'h1_n_b_l_t0.70',  
    'h1_s',
    'h1_e',
    'h1_v',
    'h1_nb'
]

In [24]:
import os
import timeit
import ripser_count
import json

adj_filenames = [
    output_dir + 'barcodes/' + filename 
    for filename in os.listdir(output_dir + 'barcodes/') if r_file.split('/')[-1] == filename.split('_part')[0]
]
adj_filenames = sorted(adj_filenames, key = lambda x: int(x.split('_')[-1].split('of')[0][4:].strip())) 
adj_filenames

['small_gpt_web/barcodes/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part1of5.json',
 'small_gpt_web/barcodes/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part2of5.json',
 'small_gpt_web/barcodes/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part3of5.json',
 'small_gpt_web/barcodes/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part4of5.json',
 'small_gpt_web/barcodes/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part5of5.json']

In [25]:
def reformat_barcodes(barcodes):
    """Return barcodes to their original format"""
    formatted_barcodes = []
    for barcode in barcodes:
        formatted_barcode = {}
        for dim in barcode:
            formatted_barcode[int(dim)] = np.asarray(
                [(b, d) for b,d in barcode[dim]], dtype=[('birth', '<f4'), ('death', '<f4')]
            )
        formatted_barcodes.append(formatted_barcode)
    return formatted_barcodes

In [26]:
features_array = []

for filename in tqdm(adj_filenames, desc='Calculating ripser++ features'):
    barcodes = json.load(open(filename))
    print(f"Barcodes loaded from: {filename}", flush=True)
    features_part = []
    for layer in barcodes:
        features_layer = []
        for head in barcodes[layer]:
            ref_barcodes = reformat_barcodes(barcodes[layer][head])
            features = ripser_count.count_ripser_features(ref_barcodes, ripser_feature_names)
            features_layer.append(features)
        features_part.append(features_layer)
    features_array.append(np.asarray(features_part))

HBox(children=(FloatProgress(value=0.0, description='Calculating ripser++ features', max=5.0, style=ProgressSt…

Barcodes loaded from: small_gpt_web/barcodes/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part1of5.json
Barcodes loaded from: small_gpt_web/barcodes/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part2of5.json
Barcodes loaded from: small_gpt_web/barcodes/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part3of5.json
Barcodes loaded from: small_gpt_web/barcodes/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part4of5.json
Barcodes loaded from: small_gpt_web/barcodes/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part5of5.json



In [27]:
ripser_file = output_dir + 'features/' + subset + "_all_heads_" + str(len(layers_of_interest)) + "_layers" \
             + "_MAX_LEN_" + str(max_tokens_amount) + \
             "_" + model_path.split("/")[-1] + "_ripser" + '.npy'
ripser_file

'small_gpt_web/features/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_ripser.npy'

In [28]:
features = np.concatenate(features_array, axis=2)
features.shape

(12, 12, 5000, 14)

In [29]:
np.save(ripser_file, features)

## Calculating template features

In [30]:
import os
from multiprocessing import Pool

In [31]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

In [32]:
def matrix_distance(matricies, template, broadcast=True):
    """
    Calculates the distance between the list of matricies and the template matrix.
    Args:
    
    -- matricies: np.array of shape (n_matricies, dim, dim)
    -- template: np.array of shape (dim, dim) if broadcast else (n_matricies, dim, dim)
    
    Returns:
    -- diff: np.array of shape (n_matricies, )
    """
    diff = np.linalg.norm(matricies-template, ord='fro', axis=(1, 2))
    div = np.linalg.norm(matricies, ord='fro', axis=(1, 2))**2
    if broadcast:
        div += np.linalg.norm(template, ord='fro')**2
    else:
        div += np.linalg.norm(template, ord='fro', axis=(1, 2))**2
    return diff/np.sqrt(div)

In [45]:
attention_dir = 'small_gpt_web/attentions/'
attention_name = 'test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased'

texts_name = 'small_gpt_web/test_5k.csv'

MAX_LEN = 128

In [46]:
def attention_to_self(matricies):
    """
    Calculates the distance between input matricies and identity matrix, 
    which representes the attention to the same token.
    """
    _, n, m = matricies.shape
    assert n == m, f"Input matrix has shape {n} x {m}, but the square matrix is expected"
    template_matrix = np.eye(n)
    return matrix_distance(matricies, template_matrix)

def attention_to_next_token(matricies):
    """
    Calculates the distance between input and E=(i, i+1) matrix, 
    which representes the attention to the next token.
    """
    _, n, m = matricies.shape
    assert n == m, f"Input matrix has shape {n} x {m}, but the square matrix is expected"
    template_matrix = np.triu(np.tri(n, k=1, dtype=matricies.dtype), k=1)
    return matrix_distance(matricies, template_matrix)

def attention_to_prev_token(matricies):
    """
    Calculates the distance between input and E=(i+1, i) matrix, 
    which representes the attention to the previous token.
    """
    _, n, m = matricies.shape
    assert n == m, f"Input matrix has shape {n} x {m}, but the square matrix is expected"
    template_matrix = np.triu(np.tri(n, k=-1, dtype=matricies.dtype), k=-1)
    return matrix_distance(matricies, template_matrix)

def attention_to_beginning(matricies):
    """
    Calculates the distance between input and E=(i+1, i) matrix, 
    which representes the attention to [CLS] token (beginning).
    """
    _, n, m = matricies.shape
    assert n == m, f"Input matrix has shape {n} x {m}, but the square matrix is expected"
    template_matrix = np.zeros((n, n))
    template_matrix[:, 0] = 1.0
    return matrix_distance(matricies, template_matrix)

def attention_to_ids(matricies, list_of_ids, token_id):
    """
    Calculates the distance between input and ids matrix, 
    which representes the attention to some particular tokens,
    which ids are in the `list_of_ids` (commas, periods, separators).
    """
   
    batch_size, n, m = matricies.shape
    EPS = 1e-7
    assert n == m, f"Input matrix has shape {n} x {m}, but the square matrix is expected"
#     assert len(list_of_ids) == batch_size, f"List of ids length doesn't match the dimension of the matrix"
    template_matrix = np.zeros_like(matricies)
    ids = np.argwhere(list_of_ids == token_id)
    if len(ids):
        batch_ids, row_ids = zip(*ids)
        template_matrix[np.array(batch_ids), :, np.array(row_ids)] = 1.0
        template_matrix /= (np.sum(template_matrix, axis=-1, keepdims=True) + EPS)
    return matrix_distance(matricies, template_matrix, broadcast=False)

In [47]:
def count_template_features(matricies, feature_list=['self', 'beginning', 'prev', 'next', 'comma', 'dot'], ids=None):
    features = []
    comma_id = 1010
    dot_id = 1012
    for feature in feature_list:
        if feature == 'self':
            features.append(attention_to_self(matricies))
        elif feature == 'beginning':
            features.append(attention_to_beginning(matricies))
        elif feature == 'prev':
            features.append(attention_to_prev_token(matricies))
        elif feature == 'next':
            features.append(attention_to_next_token(matricies))
        elif feature == 'comma':
            features.append(attention_to_ids(matricies, ids, comma_id))
        elif feature == 'dot':
            features.append(attention_to_ids(matricies, ids, dot_id))
    return np.array(features)

def calculate_features_t(adj_matricies, template_features, ids=None):
    """Calculate template features for adj_matricies"""
    features = []
    for layer in range(adj_matricies.shape[1]):
        features.append([])
        for head in range(adj_matricies.shape[2]):
            matricies = adj_matricies[:, layer, head, :, :]
            lh_features = count_template_features(matricies, template_features, ids) # samples X n_features
            features[-1].append(lh_features)
    return np.asarray(features) # layer X head X n_features X samples

In [48]:
# '.' id 1012
# ',' id 1010
def get_list_of_ids(sentences, tokenizer):
    inputs = tokenizer.batch_encode_plus([text_preprocessing(s) for s in sentences],
                                       add_special_tokens=True,
                                       max_length=MAX_LEN,             # Max length to truncate/pad
                                       pad_to_max_length=True,         # Pad sentence to max length)
                                       truncation=True
                                      )
    return np.array(inputs['input_ids'])

In [49]:
num_of_workers = 20 
pool = Pool(num_of_workers)
feature_list = ['self', 'beginning', 'prev', 'next', 'comma', 'dot']

In [50]:
adj_filenames = [
    attention_dir + filename 
    for filename in os.listdir(attention_dir) 
    if attention_name == filename.split("_part")[0]
]
# sorted by part number
adj_filenames = sorted(adj_filenames, key = lambda x: int(x.split('_')[-1].split('of')[0][4:].strip())) 
adj_filenames

['small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part1of5.npy',
 'small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part2of5.npy',
 'small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part3of5.npy',
 'small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part4of5.npy',
 'small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part5of5.npy']

In [51]:
attention_name

'test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased'

In [52]:
texts = pd.read_csv(texts_name)

In [53]:
features_array = []

for i, filename in tqdm(list(enumerate(adj_filenames)), desc='Features calc'):
    adj_matricies = np.load(filename, allow_pickle=True)
    batch_size = adj_matricies.shape[0]
    sentences = texts['sentence'].values[i*batch_size:(i+1)*batch_size]
    splitted_indexes = np.array_split(np.arange(batch_size), num_of_workers)
    splitted_list_of_ids = [
        get_list_of_ids(sentences[indx], tokenizer) 
        for indx in tqdm(splitted_indexes, desc=f"Calculating token ids on iter {i} from {len(adj_filenames)}")
    ]
    splitted_adj_matricies = [adj_matricies[indx] for indx in splitted_indexes]
    
    args = [(m, feature_list, list_of_ids) for m, list_of_ids in zip(splitted_adj_matricies, splitted_list_of_ids)]
    
    features_array_part = pool.starmap(
        calculate_features_t, args
    )
    features_array.append(np.concatenate([_ for _ in features_array_part], axis=3))
features_array = np.concatenate(features_array, axis=3)

HBox(children=(FloatProgress(value=0.0, description='Features calc', max=5.0, style=ProgressStyle(description_…

HBox(children=(FloatProgress(value=0.0, description='Calculating token ids on iter 0 from 5', max=20.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Calculating token ids on iter 1 from 5', max=20.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Calculating token ids on iter 2 from 5', max=20.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Calculating token ids on iter 3 from 5', max=20.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Calculating token ids on iter 4 from 5', max=20.0, style=…





In [54]:
"small_gpt_web/features/" + attention_name + "_template.npy"

'small_gpt_web/features/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_template.npy'

In [55]:
np.save("small_gpt_web/features/" + attention_name + "_template.npy", features_array)