<a href="https://colab.research.google.com/github/danielhou13/cogs402longformer/blob/main/src/T3-vis/T3_vis_aggregate_attn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys
sys.path.append('/content/drive/My Drive/{}'.format("cogs402longformer/"))

In [3]:
pip install datasets --quiet

In [4]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

In [6]:
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

def longformer_finetuned_papers():
    model = AutoModelForSequenceClassification.from_pretrained('danielhou13/longformer-finetuned_papers', num_labels = 2)
    return model

def bert_test():
    model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 2)
    setattr(model, 'num_hidden_layers', model.config.num_hidden_layers)
    setattr(model, 'num_attention_heads', model.config.num_attention_heads)
    setattr(model, 'hidden_size', model.config.hidden_size)
    return model

def preprocess_function(tokenizer, example, max_length):
    example.update(tokenizer(example['text'], padding='max_length', max_length=max_length, truncation=True))
    return example

def get_papers_dataset(dataset_type):
    max_length = 2048
    dataset = load_dataset("danielhou13/cogs402dataset")[dataset_type]
    new_col = list(np.arange(0, len(dataset)))
    dataset = dataset.add_column("idx", new_col)
    visualize_columns = dataset.column_names
    visualize_columns = ['idx', 'text', 'labels']

    tokenizer = AutoTokenizer.from_pretrained('allenai/longformer-base-4096')
    # tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    dataset = dataset.map(lambda x: preprocess_function(tokenizer, x, max_length), batched=True)
    setattr(dataset, 'visualize_columns', visualize_columns)
    setattr(dataset, 'input_columns', ['input_ids', 'attention_mask'])
    setattr(dataset, 'target_columns', ['labels'])
    setattr(dataset, 'max_length', max_length)
    setattr(dataset, 'tokenizer', tokenizer)
    return dataset

def papers_test_set():
    return get_papers_dataset('test')

In [7]:
cogs402_test = papers_test_set()
model = longformer_finetuned_papers()
columns = cogs402_test.input_columns + cogs402_test.target_columns
print(columns)
cogs402_test.set_format(type='torch', columns=columns + ['idx'])
cogs402_test=cogs402_test.remove_columns(['text', 'idx'])

Using custom data configuration danielhou13--cogs402dataset-cc784554b797f843
Reusing dataset parquet (/root/.cache/huggingface/datasets/danielhou13___parquet/danielhou13--cogs402dataset-cc784554b797f843/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)


  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?ba/s]

['input_ids', 'attention_mask', 'labels']


In [8]:
if torch.cuda.is_available():
    model = model.cuda()

print(model.device)

cuda:0


In [9]:
test = cogs402_test

In [10]:
# print(test['labels'][923])

In [11]:
output = model(test["input_ids"][923].unsqueeze(0).cuda(), attention_mask=test['attention_mask'][923].unsqueeze(0).cuda(), labels=test['labels'][923].cuda(), output_attentions=True)
batch_attn = output[-2]
output_attentions = torch.stack(batch_attn).cpu()
print("output_attention.shape", output_attentions.shape)

output_attention.shape torch.Size([12, 1, 12, 2048, 514])


In [12]:
# print(os.getcwd())
# yes = torch.load("resources/longformer_test2/epoch_3/aggregate_attn.pt")

In [13]:
def format_attention_image(attention):
    formatted_attn = []
    for layer_idx in range(attention.shape[0]):
        for head_idx in range(attention.shape[1]):
            formatted_entry = {
                'layer': layer_idx,
                'head': head_idx
            }

            # Flatten value of log attention normalize between 255 and 0
            if len(attention[layer_idx, head_idx]) == 0:
                continue
            attn = np.array(attention[layer_idx, head_idx]).flatten()
            attn = (attn - attn.min()) / (attn.max() - attn.min())
            alpha = np.round(attn * 255)
            red = np.ones_like(alpha) * 255
            green = np.zeros_like(alpha) * 255
            blue = np.zeros_like(alpha) * 255

            attn_data = np.dstack([red,green,blue,alpha]).reshape(alpha.shape[0] * 4).astype('uint8')
            formatted_entry['attn'] = attn_data.tolist()
            formatted_attn.append(formatted_entry)
    return formatted_attn

In [14]:
def create_head_matrix(output_attentions, global_attentions):
    new_attention_matrix = torch.zeros((output_attentions.shape[0], 
                                      output_attentions.shape[0]))
    for i in range(output_attentions.shape[0]):
        test_non_zeroes = torch.nonzero(output_attentions[i]).squeeze()
        test2 = output_attentions[i][test_non_zeroes[1:]]
        new_attention_matrix_indices = test_non_zeroes[1:]-257 + i
        # new_attention_matrix_indices.cpu().detach().numpy()
        new_attention_matrix[i][new_attention_matrix_indices] = test2
        new_attention_matrix[i][0] = output_attentions[i][0]
        new_attention_matrix[0] = global_attentions.squeeze()[:output_attentions.shape[0]]
    return new_attention_matrix


def attentions_all_heads(output_attentions, global_attentions):
    new_matrix = []
    for i in range(output_attentions.shape[0]):
        matrix = create_head_matrix(output_attentions[i], global_attentions[i])
        new_matrix.append(matrix)
    return torch.stack(new_matrix)

def all_batches(output_attentions, global_attentions):
    new_matrix = []
    for i in range(output_attentions.shape[0]):
        matrix = attentions_all_heads(output_attentions[i], global_attentions[i])
        new_matrix.append(matrix)
    return torch.stack(new_matrix)

def all_layers(output_attentions, global_attentions):
    new_matrix = []
    for i in range(output_attentions.shape[0]):
        matrix = all_batches(output_attentions[i], global_attentions[i])
        new_matrix.append(matrix)
    return torch.stack(new_matrix)

In [15]:
from tqdm import tqdm
def compute_aggregated_attn(model, dataloader, max_input_len):

    n_layers = model.longformer.config.num_hidden_layers
    n_heads = model.longformer.config.num_attention_heads
    # head_size = int(model.longformer.config.hidden_size / n_heads)
    # n_examples = len(dataloader.dataset)

    # importance_scores = np.zeros((n_layers, n_heads))

    device = model.device
    # total_loss = 0.
    attn = np.zeros((n_layers, n_heads, max_input_len, max_input_len))
    print(attn.shape)
    model.eval()

    attn_normalize_count = torch.zeros(max_input_len, device=device)

    for step, inputs in enumerate(tqdm(dataloader, position=0, leave=True)):

        # batch_size_ = inputs['input_ids'].__len__()

        if torch.cuda.is_available():
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.cuda()
        
        inputs['output_attentions']=True
        
        with torch.no_grad():
            output = model(**inputs)
        
        
        attn_normalize_count += inputs['attention_mask'].sum(dim=0)
        batch_attn = output[-2]
        global_attn = output[-1]
        
        # print(batch_attn[1].shape)
        output_attentions = torch.stack(batch_attn).cpu()

        # print("output_attention.shape", output_attentions.shape)
        global_attentions = torch.stack(global_attn).cpu()
         
        # print(output_attentions.device)
        # print(global_attentions.device)
        
        batch_attn2 = all_layers(output_attentions, global_attentions)
    
        # print(batch_attn2.shape)
        batch_attn = torch.cat([l.sum(dim=0).unsqueeze(0) for l in batch_attn2], dim=0).cpu().numpy()
        # print("attention shape", batch_attn.shape)
        
        attn += batch_attn
        
    max_input_len = len(attn_normalize_count.nonzero(as_tuple=False))
    
    attn = attn[:, :, :max_input_len, :max_input_len]
    attn /= attn_normalize_count.cpu().numpy()[:max_input_len]
    print(type(attn))
    formatted_attn = format_attention_image(attn)
    return formatted_attn

In [16]:
dataloader = torch.utils.data.DataLoader(cogs402_test, batch_size=1)

In [None]:
test = compute_aggregated_attn(model, dataloader, cogs402_test.max_length)

(12, 12, 2048, 2048)


  2%|▏         | 22/1179 [06:10<5:22:03, 16.70s/it]

In [None]:
print(type(test))

In [None]:
# torch.save(test, "/content/drive/MyDrive/cogs402longformer/aggregate_attn.pt")
torch.save(test, "aggregate_attn.pt")