In [1]:
import os
# Show the working directory: the relative "resources/pretrained/..." path loaded below is resolved against it.
print(os.getcwd())
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

C:\Users\danie\T3-Vis\application


In [2]:
# Pick the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
# Load the object stored in aggregate_attn.pt (path is relative to the working directory).
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code -- only load trusted files.
test = torch.load("resources/pretrained/aggregate_attn.pt")

In [4]:
# Number of entries in the loaded object.
print(len(test))

144


In [5]:
# Length of the 'attn' field of the entry at index 1.
print(len(test[1]['attn']))

16384


In [6]:
def format_attention(output_attention, n_heads, pruned_heads):
    """Turn per-layer attention tensors into nested lists of integer percentages.

    Every tensor in ``output_attention`` has its leading axis squeezed, is
    multiplied by 100, rounded, cast to ``uint8`` and moved to the CPU.

    NOTE: a later cell defines another ``format_attention`` with a different
    signature; once that cell runs it replaces this function in the kernel.

    Args:
        output_attention: iterable of attention tensors, one per layer. After
            ``squeeze(0)`` the first axis is indexed once per non-pruned head.
        n_heads: number of head slots to emit for each layer.
        pruned_heads: mapping from layer index to a container of pruned head
            indices for that layer.

    Returns:
        A list indexed as ``result[layer][head]``. A pruned head yields ``[]``;
        any other head yields its attention values as nested Python lists.
    """
    scaled_layers = [(t.squeeze(0) * 100).round().byte().cpu() for t in output_attention]
    result = []
    for layer_idx, layer_attn in enumerate(scaled_layers):
        removed = pruned_heads[layer_idx] if layer_idx in pruned_heads else ()
        heads_out = []
        # Pruned heads are absent from the tensor, so the tensor position
        # advances only when a head is actually emitted.
        cursor = 0
        for head_idx in range(n_heads):
            if head_idx in removed:
                heads_out.append([])
                continue
            heads_out.append(layer_attn[cursor].tolist())
            cursor += 1
        result.append(heads_out)
    return result

In [7]:
# Shape of entry 1's 'attn' field once converted to a NumPy array.
print(np.array(test[1]['attn']).shape)

(16384,)


In [8]:
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

def longformer_finetuned_papers():
    """Load the 'danielhou13/longformer-finetuned_papers' checkpoint as a
    two-label sequence classifier and return it."""
    return AutoModelForSequenceClassification.from_pretrained(
        'danielhou13/longformer-finetuned_papers',
        num_labels=2,
    )
def preprocess_function(tokenizer, example, max_length):
    """Tokenize ``example['text']`` and merge the encoding into ``example``.

    Args:
        tokenizer: callable invoked as
            ``tokenizer(text, padding='max_length', max_length=..., truncation=True)``
            that returns a mapping.
        example: mutable mapping with a ``'text'`` entry; it is updated in place.
        max_length: length that the tokenizer pads or truncates to.

    Returns:
        The same ``example`` object, now also holding the tokenizer's output.
    """
    encoded = tokenizer(
        example['text'],
        padding='max_length',
        max_length=max_length,
        truncation=True,
    )
    example.update(encoded)
    return example

def get_papers_dataset(dataset_type, max_length=2048):
    """Load one split of 'danielhou13/cogs402dataset' and tokenize it.

    The split gets an extra ``idx`` column holding each row's position, and is
    tokenized with the 'allenai/longformer-base-4096' tokenizer through
    ``preprocess_function`` (padding/truncating to ``max_length``). Several
    helper attributes are then attached to the returned dataset object.

    Args:
        dataset_type: name of the split to select (e.g. ``'test'``).
        max_length: token length passed on to the tokenizer. Defaults to 2048,
            the value that used to be hard-coded here.

    Returns:
        The tokenized dataset with these extra attributes set on it:
        ``visualize_columns``, ``input_columns``, ``target_columns``,
        ``max_length`` and ``tokenizer``.
    """
    dataset = load_dataset("danielhou13/cogs402dataset")[dataset_type]
    # np.arange is kept (rather than range) so the column's integer type stays
    # exactly what it was before.
    row_ids = list(np.arange(0, len(dataset)))
    dataset = dataset.add_column("idx", row_ids)

    tokenizer = AutoTokenizer.from_pretrained('allenai/longformer-base-4096')
    dataset = dataset.map(
        lambda batch: preprocess_function(tokenizer, batch, max_length),
        batched=True,
    )

    # Attributes read by the rest of the notebook.
    dataset.visualize_columns = ['idx', 'text', 'labels']
    dataset.input_columns = ['input_ids', 'attention_mask']
    dataset.target_columns = ['labels']
    dataset.max_length = max_length
    dataset.tokenizer = tokenizer
    return dataset

def papers_test_set():
    """Return the 'test' split as prepared by ``get_papers_dataset``."""
    split_name = 'test'
    return get_papers_dataset(split_name)

In [9]:
# Build the tokenized test split and load the classifier.
cogs402_test = papers_test_set()
model = longformer_finetuned_papers()
# Model input columns plus the label column (attributes attached by get_papers_dataset).
columns = cogs402_test.input_columns + cogs402_test.target_columns
print(columns)
# Ask the dataset to return these columns in 'torch' format; later cells call tensor methods on them.
cogs402_test.set_format(type='torch', columns=columns + ['idx'])
# NOTE(review): 'idx' is included in the format above and then dropped here together with 'text' --
# confirm whether 'idx' needs to be in set_format at all.
cogs402_test=cogs402_test.remove_columns(['text', 'idx'])

Using custom data configuration danielhou13--cogs402dataset-5c7aa10e6c95142f
Reusing dataset parquet (C:\Users\danie\.cache\huggingface\datasets\parquet\danielhou13--cogs402dataset-5c7aa10e6c95142f\0.0.0\0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\danie\.cache\huggingface\datasets\parquet\danielhou13--cogs402dataset-5c7aa10e6c95142f\0.0.0\0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901\cache-cb5d1ef3ec235523.arrow


['input_ids', 'attention_mask', 'labels']


In [10]:
# Move the model to the GPU only when CUDA is available; otherwise leave it where it was loaded.
if torch.cuda.is_available():
    model = model.cuda()

# Report the device the model is on.
print(model.device)

cuda:0


In [11]:
# Index of the single test example that is run through the model.
example_idx = 923
# Send the inputs to whichever device the model is on. The previous unconditional
# .cuda() calls failed on machines without a GPU.
target_device = model.device
# NOTE(review): if gradients are not needed afterwards, wrapping this call in
# torch.no_grad() would save memory.
output = model(
    cogs402_test["input_ids"][example_idx].unsqueeze(0).to(target_device),
    attention_mask=cogs402_test['attention_mask'][example_idx].unsqueeze(0).to(target_device),
    labels=cogs402_test['labels'][example_idx].to(target_device),
    output_attentions=True,
)
# Picked by name; this is the same entry that output[-2] selected positionally.
batch_attn = output['attentions']

In [12]:
def create_head_matrix(output_attentions, global_attentions, local_offset=257):
    """Expand one head's banded attention into a dense ``(seq_len, seq_len)`` matrix.

    For every row ``i`` of ``output_attentions``:

    * column 0 is copied to column 0 of the result;
    * every other column ``j`` is written to column ``i + j - local_offset``.
      Entries whose target falls outside ``[0, seq_len)`` are ignored.

    Row 0 of the result is finally overwritten with ``global_attentions``.

    Assumes the Longformer layout with a single global token at position 0, so
    that ``local_offset`` equals 1 + the one-sided window (257 for a 512-token
    window) -- TODO confirm if the model configuration changes.

    Args:
        output_attentions: 2-D tensor ``(seq_len, width)`` for one head.
        global_attentions: tensor that, once squeezed, holds at least
            ``seq_len`` values for row 0 of the result.
        local_offset: column of ``output_attentions`` that corresponds to a
            token attending to itself. Defaults to 257, the value that used to
            be hard-coded here.

    Returns:
        A float ``torch`` tensor of shape ``(seq_len, seq_len)``.
    """
    seq_len, width = output_attentions.shape[0], output_attentions.shape[1]
    new_attention_matrix = torch.zeros((seq_len, seq_len))
    if seq_len == 0:
        return new_attention_matrix

    # Absolute target column for every (row, local column) pair, computed at once.
    rows = torch.arange(seq_len).unsqueeze(1)      # (seq_len, 1)
    local_cols = torch.arange(1, width).unsqueeze(0)  # (1, width - 1)
    targets = rows + local_cols - local_offset        # (seq_len, width - 1)

    # Keep only targets inside the sequence. Negative targets must be dropped
    # explicitly: used as indices they would wrap around to the end of the row.
    in_range = (targets >= 0) & (targets < seq_len)
    local_values = output_attentions[:, 1:].to(new_attention_matrix.dtype)
    row_idx = rows.expand_as(targets)[in_range]
    new_attention_matrix[row_idx, targets[in_range]] = local_values[in_range]

    # Column 0 comes straight from the input's first column (written after the
    # local window so it takes precedence for position 0).
    new_attention_matrix[:, 0] = output_attentions[:, 0]
    # Row 0 is supplied separately by the caller.
    new_attention_matrix[0] = global_attentions.squeeze()[:seq_len]
    return new_attention_matrix

In [13]:
from tqdm import tqdm
def format_attention(output_attention, global_attention, n_heads, pruned_heads):
    """Build a dense attention matrix for every layer and head.

    Local and global attention weights are both multiplied by 100 so that the
    whole matrix uses one scale, then each head is expanded with
    ``create_head_matrix``.

    Args:
        output_attention: sequence of per-layer local attention tensors; the
            leading axis is squeezed away before use.
        global_attention: sequence of per-layer global attention tensors; they
            are stacked and axis 1 of the stack is squeezed away.
            Presumably both inputs come from a batch of size 1 -- verify
            against the caller.
        n_heads: number of head slots to emit per layer.
        pruned_heads: mapping from layer index to a container of pruned head
            indices for that layer.

    Returns:
        A list indexed as ``result[layer][head]``. A pruned head yields ``[]``;
        any other head yields its dense matrix as nested Python lists.
    """
    attentions = [(layer_attn.squeeze(0) * 100).cpu() for layer_attn in output_attention]
    # squeeze(1) removes only axis 1 of the stack; a bare squeeze() would also
    # collapse the layer or head axis whenever one of them has size 1.
    # The x100 matches the scaling applied to the local weights above.
    global_attention = (torch.stack(global_attention).squeeze(1) * 100).cpu()
    attn_vectors = []
    for layer in tqdm(range(len(attentions))):
        layer_vectors = []
        pruned_in_layer = pruned_heads[layer] if layer in pruned_heads else ()
        # Pruned heads are assumed absent from the tensors, so track the tensor
        # position separately from the nominal head number.
        next_head_idx = 0
        for head in range(n_heads):
            if head in pruned_in_layer:
                layer_vectors.append([])
                continue
            local = attentions[layer][next_head_idx].float()
            head_matrix = create_head_matrix(local, global_attention[layer][next_head_idx])
            layer_vectors.append(head_matrix.tolist())
            next_head_idx += 1
        attn_vectors.append(layer_vectors)
    return attn_vectors

In [14]:
# Build the dense per-layer, per-head attention matrices for the example run above; no heads are pruned ({}).
# NOTE: this rebinds `test`, which until now held the object loaded from aggregate_attn.pt.
test = format_attention(output['attentions'], output['global_attentions'], model.config.num_attention_heads, {})

100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [01:35<00:00,  7.96s/it]


In [17]:
# Outer length of the nested result (one entry per attention tensor passed in).
print(len(test))

12


In [18]:
# Convert the nested lists into a single NumPy array.
# NOTE(review): with the recorded shape (12, 12, 2048, 2048) and NumPy's default float64 this is
# roughly 4.8 GB -- consider passing dtype=np.float32 if that precision is enough.
test = np.array(test)

In [19]:
# Confirm the dimensions of the resulting array.
print(test.shape)

(12, 12, 2048, 2048)
