<a href="https://colab.research.google.com/github/danielhou13/cogs402longformer/blob/main/src/Token_attention_with_head_importance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# import sys
# sys.path.append('/content/drive/My Drive/{}'.format("cogs402longformer/"))

In [3]:
pip install datasets --quiet

In [4]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Import Dataset and Model

In [5]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

Import the Research Papers dataset

In [6]:
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Module-level tokenizer for the base Longformer checkpoint; the dataset helper
# and the token-string lookup later in this notebook both read this global.
tokenizer = AutoTokenizer.from_pretrained('allenai/longformer-base-4096')

def longformer_finetuned_papers():
    """Load the `danielhou13/longformer-finetuned_papers` sequence-classification
    checkpoint, configured for two labels, and return it."""
    return AutoModelForSequenceClassification.from_pretrained(
        'danielhou13/longformer-finetuned_papers',
        num_labels=2,
    )

def preprocess_function(tokenizer, example, max_length):
    """Tokenize ``example['text']`` and merge the encoding into ``example``.

    The text is padded to ``max_length`` and truncated when longer.

    Args:
        tokenizer: callable invoked as
            ``tokenizer(text, padding=..., max_length=..., truncation=...)``
            that returns a mapping.
        example: mutable mapping with a ``'text'`` entry; updated in place.
        max_length: target sequence length handed to the tokenizer.

    Returns:
        The same ``example`` object, now also holding the tokenizer's fields.
    """
    encoding = tokenizer(
        example['text'],
        padding='max_length',
        max_length=max_length,
        truncation=True,
    )
    example.update(encoding)
    return example

def get_papers_dataset(dataset_type):
    """Load one split of `danielhou13/cogs402dataset` and tokenize it.

    Every batch is run through ``preprocess_function`` with the module-level
    ``tokenizer`` and a fixed maximum length of 2048. A few convenience
    attributes are then attached to the returned dataset object.

    Args:
        dataset_type: key of the split to select from the loaded dataset
            (for example ``'test'``).

    Returns:
        The tokenized split, carrying the extra attributes ``input_columns``,
        ``target_columns``, ``max_length`` and ``tokenizer``.
    """
    max_length = 2048
    split = load_dataset("danielhou13/cogs402dataset")[dataset_type]

    def _tokenize_batch(batch):
        return preprocess_function(tokenizer, batch, max_length)

    dataset = split.map(_tokenize_batch, batched=True)

    # Metadata read later when choosing which columns to expose as tensors.
    extra_attributes = {
        'input_columns': ['input_ids', 'attention_mask'],
        'target_columns': ['labels'],
        'max_length': max_length,
        'tokenizer': tokenizer,
    }
    for attr_name, attr_value in extra_attributes.items():
        setattr(dataset, attr_name, attr_value)
    return dataset

def papers_test_set():
    """Return the tokenized 'test' split of the papers dataset."""
    split_name = 'test'
    return get_papers_dataset(split_name)

Import the news dataset

In [7]:
# def preprocess_function(tokenizer, example, max_length):
#     example.update(tokenizer(example['text'], padding='max_length', max_length=max_length, truncation=True))
#     return example

# def longformer_finetuned_news():
#     model = AutoModelForSequenceClassification.from_pretrained('danielhou13/longformer-finetuned-news-cogs402', num_labels = 2)
#     return model

# def get_news_dataset(dataset_type):
#     max_length = 2048
#     dataset = load_dataset("danielhou13/cogs402dataset2")[dataset_type]

#     tokenizer = AutoTokenizer.from_pretrained('allenai/longformer-base-4096')
#     dataset = dataset.map(lambda x: preprocess_function(tokenizer, x, max_length), batched=True)

#     labels = map(int, dataset['hyperpartisan'])
#     print(type(dataset['hyperpartisan']))
#     labels = list(labels)
#     dataset = dataset.add_column("labels", labels)

#     dataset = dataset.remove_columns(['text', 'title', 'hyperpartisan', 'url', 'published_at', 'bias'])
#     print(dataset)
#     setattr(dataset, 'input_columns', ['input_ids', 'attention_mask'])
#     setattr(dataset, 'target_columns', ['labels'])
#     setattr(dataset, 'max_length', max_length)
#     setattr(dataset, 'tokenizer', tokenizer)
#     return dataset

# def news_train_set():
#     return get_news_dataset('train')

# def news_test_set():
#     return get_news_dataset('validation')

Load papers model and dataset and preprocess it

In [8]:
# Build the tokenized test split and load the fine-tuned classifier.
cogs402_test = papers_test_set()
model = longformer_finetuned_papers()
# Columns exposed as torch tensors: the model inputs followed by the target.
columns = cogs402_test.input_columns + cogs402_test.target_columns
print(columns)
cogs402_test.set_format(type='torch', columns=columns)
# The result of remove_columns is rebound to the same name.
# NOTE(review): the custom attributes attached in get_papers_dataset
# (input_columns, target_columns, ...) are presumably not carried over to the
# object returned here — confirm before reading them after this line.
cogs402_test=cogs402_test.remove_columns(['text'])

Using custom data configuration danielhou13--cogs402dataset-cc784554b797f843
Reusing dataset parquet (/root/.cache/huggingface/datasets/danielhou13___parquet/danielhou13--cogs402dataset-cc784554b797f843/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)


  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?ba/s]

['input_ids', 'attention_mask', 'labels']


Load news model and dataset and preprocess it

In [9]:
# cogs402_test = news_test_set()
# model = longformer_finetuned_news()
# columns = cogs402_test.input_columns + cogs402_test.target_columns
# print(columns)
# cogs402_test.set_format(type='torch', columns=columns)

In [10]:
# Put the model on the GPU when one is present; otherwise it stays where it is.
gpu_present = torch.cuda.is_available()
if gpu_present:
    model = model.cuda()

# Show which device the model ended up on.
print(model.device)

cuda:0


Take example for evaluation

In [11]:
# Single test example (index 923) that the cells below run through the model and inspect.
testexam = cogs402_test[923]

In [12]:
# print(test['labels'][923])

In [13]:
# Run the selected example through the model on whatever device the model is on.
# The earlier cell only moves the model to the GPU when CUDA is available, so the
# inputs must follow the model instead of calling .cuda() unconditionally.
device = model.device
with torch.no_grad():  # inference only: no autograd graph is needed for the attentions
    output = model(
        testexam["input_ids"].unsqueeze(0).to(device),
        attention_mask=testexam['attention_mask'].unsqueeze(0).to(device),
        labels=testexam['labels'].to(device),
        output_attentions=True,
    )
# The last two entries of the output are read as the per-layer local attentions
# and the per-layer global attentions (see the printed shapes).
batch_attn = output[-2]
output_attentions = torch.stack(batch_attn).cpu()
global_attention = output[-1]
output_global_attentions = torch.stack(global_attention).cpu()
print("output_attention.shape", output_attentions.shape)
print("gl_output_attention.shape", output_global_attentions.shape)

output_attention.shape torch.Size([12, 1, 12, 2048, 514])
gl_output_attention.shape torch.Size([12, 1, 12, 2048, 1])


In [14]:
# print(os.getcwd())
# yes = torch.load("resources/longformer_test2/epoch_3/aggregate_attn.pt")

Convert sliding attention matrix to correct seq_len x seq_len matrix

In [15]:
def create_head_matrix(output_attentions, global_attentions):
    """Expand one head's attention into a square ``seq_len x seq_len`` matrix.

    For every row ``i`` the non-zero columns of ``output_attentions[i]`` are
    located; the first non-zero column is skipped and each remaining column
    ``c`` is written to absolute position ``c - 257 + i``. Column 0 of the
    input row is then copied to column 0 of the output row. Finally row 0 of
    the result is replaced by the (squeezed) ``global_attentions`` vector.

    NOTE(review): the offset 257 matches the Hugging Face Longformer layout of
    one global-attention column followed by a 513-wide window whose centre is
    the token itself — confirm if the number of global tokens or the window
    size changes.

    Args:
        output_attentions: 2-D tensor, one row per token (the notebook passes
            ``(seq_len, 514)``).
        global_attentions: tensor that squeezes to at least ``seq_len`` values
            (the notebook passes ``(seq_len, 1)``).

    Returns:
        A ``(seq_len, seq_len)`` float tensor created with ``torch.zeros``.
    """
    seq_len = output_attentions.shape[0]
    new_attention_matrix = torch.zeros((seq_len, seq_len))
    for i in range(seq_len):
        row = output_attentions[i]
        # squeeze(-1) keeps the result 1-D even when the row has exactly one
        # non-zero entry; a bare squeeze() would give a 0-dim tensor there and
        # the slice below would raise.
        nonzero_cols = torch.nonzero(row).squeeze(-1)
        window_cols = nonzero_cols[1:]  # skip the first non-zero column
        new_attention_matrix[i][window_cols - 257 + i] = row[window_cols]
        new_attention_matrix[i][0] = row[0]
    if seq_len > 0:
        # Loop-invariant, so done once after the loop: row 0 always ends up
        # holding the global attention values.
        new_attention_matrix[0] = global_attentions.squeeze()[:seq_len]
    return new_attention_matrix


def attentions_all_heads(output_attentions, global_attentions):
    """Apply ``create_head_matrix`` to each entry along the first axis (one per
    head) and stack the resulting square matrices into a single tensor."""
    num_heads = output_attentions.shape[0]
    per_head = [
        create_head_matrix(output_attentions[head], global_attentions[head])
        for head in range(num_heads)
    ]
    return torch.stack(per_head)

def all_batches(output_attentions, global_attentions):
    """Apply ``attentions_all_heads`` to each entry along the first axis (one
    per batch example) and stack the results into a single tensor."""
    num_examples = output_attentions.shape[0]
    per_example = [
        attentions_all_heads(output_attentions[b], global_attentions[b])
        for b in range(num_examples)
    ]
    return torch.stack(per_example)

def all_layers(output_attentions, global_attentions):
    """Apply ``all_batches`` to each entry along the first axis (one per layer)
    and stack the results into a single tensor."""
    num_layers = output_attentions.shape[0]
    per_layer = [
        all_batches(output_attentions[layer], global_attentions[layer])
        for layer in range(num_layers)
    ]
    return torch.stack(per_layer)

In [16]:
# Expand every layer/batch/head to a square per-token matrix, then convert the
# result to a NumPy array for the analysis cells that follow.
converted_mat = all_layers(output_attentions, output_global_attentions).detach().cpu().numpy()
print(converted_mat.shape)

(12, 1, 12, 2048, 2048)


In [17]:
# Token strings of the selected example; the helper functions below index this
# list by token position.
all_tokens = tokenizer.convert_ids_to_tokens(testexam["input_ids"])

Load head importance model and scale the attentions by head importance

In [None]:
# NOTE: torch.load unpickles the file, so only point it at files you trust.
# map_location="cpu" makes the load work on a CPU-only runtime and keeps any
# loaded tensors on the CPU, which the NumPy-based scaling step requires.
head_importance = torch.load("/content/drive/MyDrive/cogs402longformer/t3-visapplication/resources/pretrained/head_importance.pt", map_location="cpu")


In [19]:
def scale_by_importance(attention_matrix, head_importance):
  """Scale each head's attention matrix by that head's importance score.

  Args:
    attention_matrix: array indexed as (layer, batch, head, row, column).
    head_importance: 2-D (layer, head) scores; a NumPy array, nested
      sequence, or torch tensor (any device, with or without gradients).
      It must have at least as many layers as ``attention_matrix``.

  Returns:
    A new array with the shape and dtype of ``attention_matrix`` where entry
    ``[l, b, h]`` equals ``attention_matrix[l, b, h] * head_importance[l, h]``.

  Raises:
    ValueError: if ``head_importance`` is not 2-D or has too few layers.
  """
  if hasattr(head_importance, "detach"):
    # torch tensor: drop autograd tracking and move to host memory first,
    # otherwise NumPy cannot read it.
    head_importance = head_importance.detach().cpu().numpy()
  importance = np.asarray(head_importance)

  num_layers = attention_matrix.shape[0]
  if importance.ndim != 2 or importance.shape[0] < num_layers:
    raise ValueError(
        "head_importance must be 2-D (layer, head) with at least "
        f"{num_layers} layers, got shape {importance.shape}")
  importance = importance[:num_layers]

  new_matrix = np.empty_like(attention_matrix)
  # One broadcast multiply written straight into the output buffer; this
  # avoids the Python loops and a full-size temporary, and casts to the
  # attention dtype exactly like element-wise assignment would.
  np.multiply(attention_matrix, importance[:, None, :, None, None],
              out=new_matrix, casting="unsafe")
  return new_matrix

In [20]:
# Attention matrices with every head weighted by its importance score.
converted_mat_importance = scale_by_importance(converted_mat, head_importance)

Let's suppose we want the top-k attended tokens for each token in each head, batch, and layer.

In [21]:
# get the top k and indexes and values for each row
def find_top_attention_unsummed(scores_mat, k):
  """Return the indices and values of the ``k`` largest entries along axis 4.

  Args:
    scores_mat: array with at least five axes; ranking is done along axis 4.
    k: number of top entries to keep per row.

  Returns:
    ``(indices, vals)`` ordered from largest to smallest along axis 4.
  """
  ascending = scores_mat.argsort(axis=4)
  # Reverse axis 4 so the largest score comes first, then keep the first k.
  descending = ascending[:, :, :, :, ::-1]
  indices = descending[:, :, :, :, :k]
  vals = np.take_along_axis(scores_mat, indices, axis=4)
  return indices, vals

#find the tokens using the index matrix and the all_tokens list to create a 
#matrix of tokens 
def get_tokens(index_matrix, example=None, token_list=None):
  """Map an array of token positions to the corresponding token strings.

  Args:
    index_matrix: integer array indexed as (layer, batch, head, token, rank).
    example: optional batch index. When given, only that batch entry is kept,
      so axis 1 of the result has length 1 (length 0 if the index is outside
      the batch range).
    token_list: sequence of token strings indexed by position. Defaults to
      the notebook-level ``all_tokens`` so existing calls keep working.

  Returns:
    NumPy string array with the same layout as the (possibly filtered)
    ``index_matrix``.
  """
  vocab = all_tokens if token_list is None else token_list
  index_matrix = np.asarray(index_matrix)
  if example is not None:
    if 0 <= example < index_matrix.shape[1]:
      # Slice rather than index so the batch axis is kept with length 1.
      index_matrix = index_matrix[:, example:example + 1]
    else:
      index_matrix = index_matrix[:, :0]
  # Fancy indexing looks up every position at once instead of looping over
  # layers, batches, heads and tokens in Python.
  return np.asarray(vocab)[index_matrix]

#format into a dataframe
def highest_attended_tokens(index, values, tokens, all_tokens, example=None):
    """Flatten the top-k attention results into a long-format DataFrame.

    Args:
        index: integer array (layer, batch, head, token, rank) of attended
            positions.
        values: array of the same layout holding the attention scores.
        tokens: array of attended token strings. Its batch axis either
            matches ``index`` or has been reduced to the single requested
            ``example`` (as ``get_tokens`` does when given ``example``).
        all_tokens: sequence of token strings indexed by position.
        example: optional batch index; when given only that batch is emitted.

    Returns:
        DataFrame with one row per (layer, batch, head, token, rank) and the
        columns token, self_position, attended_token, token_position,
        attention_scores, layer, head, rank, batch. Layer, head and rank are
        1-based.
    """
    # When `tokens` was filtered down to one example its batch axis no longer
    # lines up with `index`; the kept entry then sits at position 0. Indexing
    # it with the original batch index failed for any example other than 0.
    tokens_filtered = tokens.shape[1] != index.shape[1]

    dataframe = []
    for i in range(index.shape[0]):
        for j in range(index.shape[1]):

            if (example is not None) and (j != example):
                continue
            tokens_j = 0 if tokens_filtered else j

            for k in range(index.shape[2]):
                for x in range(index.shape[3]):
                    for y in range(index.shape[4]):
                        d = {"token": all_tokens[x], 'self_position': x,
                             "attended_token": tokens[i, tokens_j, k, x, y],
                             'token_position': index[i, j, k, x, y],
                             'attention_scores': values[i, j, k, x, y],
                             'layer': (i + 1), 'head': (k + 1),
                             'rank': (y + 1),
                             'batch': j}
                        dataframe.append(d)
    df = pd.DataFrame(dataframe)
    return df

In [22]:
#combine the previous functions
def highest_tokens(matrix, k, all_tokens, example=None):
  """Build the top-k attended-token DataFrame for every token.

  Chains ``find_top_attention_unsummed``, ``get_tokens`` and
  ``highest_attended_tokens``; ``example`` optionally limits the output to a
  single batch entry.
  """
  top_index, top_values = find_top_attention_unsummed(matrix, k)
  token_matrix = get_tokens(top_index, example)
  return highest_attended_tokens(top_index, top_values, token_matrix, all_tokens, example)

Thus, for every token, we can get the top k tokens that this token attends to. We can filter by batch, layer, head, rank, position, etc. The downside is that it's not very visually appealing despite being organized.

In [23]:
# Top-10 attended tokens for every token of batch example 0, using the
# importance-scaled attentions.
df2 = highest_tokens(converted_mat_importance, 10, all_tokens, 0)
df2

Unnamed: 0,token,self_position,attended_token,token_position,attention_scores,layer,head,rank,batch
0,<s>,0,<s>,0,0.002284,1,1,1,0
1,<s>,0,(,512,0.001891,1,1,2,0
2,<s>,0,Ġoptimizations,1024,0.000915,1,1,3,0
3,<s>,0,Ġconstants,1536,0.000651,1,1,4,0
4,<s>,0,}.,1977,0.000073,1,1,5,0
...,...,...,...,...,...,...,...,...,...
2949115,</s>,2047,.,1896,0.000035,12,12,6,0
2949116,</s>,2047,</s>,2047,0.000035,12,12,7,0
2949117,</s>,2047,.,1888,0.000035,12,12,8,0
2949118,</s>,2047,.,1991,0.000035,12,12,9,0


Get the sum of the attentions for all the tokens (column-wise). In other words, find out how much every word is attended to

In [24]:
# Sum over axis 3 so that each remaining entry along the last axis is the
# total scaled attention one token receives in that layer/batch/head.
attention_matrix_importance = converted_mat_importance.sum(axis=3)
print(attention_matrix_importance.shape)

(12, 1, 12, 2048)


Get top k attended words for each head, for each example in batch, for each layer

In [25]:
def find_top_attention(scores_mat, k):
  """Return the indices and values of the ``k`` largest entries along axis 3.

  Args:
    scores_mat: array with at least four axes; ranking is done along axis 3.
    k: number of top entries to keep.

  Returns:
    ``(indices, vals)`` ordered from largest to smallest along axis 3.
  """
  ascending = scores_mat.argsort(axis=3)
  # Reverse axis 3 so the largest score comes first, then keep the first k.
  descending = ascending[:, :, :, ::-1]
  indices = descending[:, :, :, :k]
  vals = np.take_along_axis(scores_mat, indices, axis=3)
  return indices, vals

We want the position (index) of the token, the attention value, and the actual token itself.

In [26]:
def get_tokens2(index_matrix, example=None, token_list=None):
  """Map an array of token positions to the corresponding token strings.

  Args:
    index_matrix: integer array indexed as (layer, batch, head, rank).
    example: optional batch index. When given, only that batch entry is kept,
      so axis 1 of the result has length 1 (length 0 if the index is outside
      the batch range).
    token_list: sequence of token strings indexed by position. Defaults to
      the notebook-level ``all_tokens`` so existing calls keep working.

  Returns:
    NumPy string array with the same layout as the (possibly filtered)
    ``index_matrix``.
  """
  vocab = all_tokens if token_list is None else token_list
  index_matrix = np.asarray(index_matrix)
  if example is not None:
    if 0 <= example < index_matrix.shape[1]:
      # Slice rather than index so the batch axis is kept with length 1.
      index_matrix = index_matrix[:, example:example + 1]
    else:
      index_matrix = index_matrix[:, :0]
  # Fancy indexing looks up every position at once instead of looping over
  # layers, batches and heads in Python.
  return np.asarray(vocab)[index_matrix]

In [27]:
def highest_attended_dataframe(index, values, tokens, example=None):
    """Flatten the summed top-k attention results into a long-format DataFrame.

    Args:
        index: integer array (layer, batch, head, rank) of token positions.
        values: array of the same layout holding the attention scores.
        tokens: array of token strings. Its batch axis either matches
            ``index`` or has been reduced to the single requested ``example``
            (as ``get_tokens2`` does when given ``example``).
        example: optional batch index; when given only that batch is emitted.

    Returns:
        DataFrame with one row per (layer, batch, head, rank) and the columns
        token, position, attention_scores, layer, head, rank, batch. Layer,
        head and rank are 1-based.
    """
    # When `tokens` was filtered down to one example its batch axis no longer
    # lines up with `index`; the kept entry then sits at position 0. Indexing
    # it with the original batch index failed for any example other than 0.
    tokens_filtered = tokens.shape[1] != index.shape[1]

    dataframe = []
    #layer
    for i in range(index.shape[0]):
        #batch
        for j in range(index.shape[1]):
            if example is not None and j != example:
                continue
            tokens_j = 0 if tokens_filtered else j
            #head
            for k in range(index.shape[2]):
                #token
                for x in range(index.shape[3]):
                    d = {"token": tokens[i, tokens_j, k, x], 'position': index[i, j, k, x],
                         'attention_scores': values[i, j, k, x],
                         'layer': (i + 1), 'head': (k + 1),
                         'rank': (x + 1),
                         'batch': j}
                    dataframe.append(d)
    df = pd.DataFrame(dataframe)
    return df

In [28]:
def highest_attentions_summed(matrix, k, all_tokens, example=None):
  """Build the DataFrame of the k most-attended tokens per layer and head.

  Chains ``find_top_attention``, ``get_tokens2`` and
  ``highest_attended_dataframe``. The ``all_tokens`` argument is accepted but
  not used by this function itself.
  """
  top_index, top_values = find_top_attention(matrix, k)
  token_matrix = get_tokens2(top_index, example)
  return highest_attended_dataframe(top_index, top_values, token_matrix, example)

In [29]:
# Ten most-attended tokens per layer/head for batch example 0; the display
# keeps only the top-ranked token of each head.
df_highest = highest_attentions_summed(attention_matrix_importance, 10, all_tokens, 0)
df_highest[df_highest["rank"] == 1]

Unnamed: 0,token,position,attention_scores,layer,head,rank,batch
0,<s>,0,1.055370,1,1,1,0
10,<s>,0,0.983971,1,2,1,0
20,<s>,0,0.927271,1,3,1,0
30,<s>,0,0.620787,1,4,1,0
40,<s>,0,2.092713,1,5,1,0
...,...,...,...,...,...,...,...
1390,ified,1722,0.237352,12,8,1,0
1400,<s>,0,0.100713,12,9,1,0
1410,ĠW,1748,0.019837,12,10,1,0
1420,Ġprogramming,1716,0.030009,12,11,1,0


Get the attention of a token at a position for each layer and head. Take one example at a time as each example has different tokens. Pros: can isolate for layers and/or heads. Cons: not much context for the attention scores

In [30]:
def position_attention(agg_matrix, position, example=None, token_list=None):
  """Report the score and rank of one token position in every layer and head.

  Args:
    agg_matrix: array indexed as (layer, batch, head, token) of aggregated
      attention scores.
    position: token position to look up.
    example: optional batch index. When given, only that batch entry is
      reported; otherwise every batch entry is.
    token_list: sequence of token strings indexed by position. Defaults to
      the notebook-level ``all_tokens`` so existing calls keep working.

  Returns:
    DataFrame with one row per (layer, batch, head) and the columns token,
    position, attention_scores, layer, head, rank, batch. Layer and head are
    1-based; rank 1 is the highest score within that layer/head.
  """
  vocab = all_tokens if token_list is None else token_list
  # Index the batch axis directly. The earlier select-then-squeeze approach
  # also dropped any layer/head/token axis of length 1 and then failed.
  batches = range(agg_matrix.shape[1]) if example is None else [example]

  dataframe = []
  #layer
  for i in range(agg_matrix.shape[0]):
    #batch
    for j in batches:
      #head
      for k in range(agg_matrix.shape[2]):
        scores = agg_matrix[i, j, k]
        # Positions ordered from highest to lowest score.
        order = scores.argsort()[::-1]
        rank = np.where(order == position)[0].squeeze() + 1
        d = {"token": vocab[position], 'position': position,
             'attention_scores': scores[position], 'layer': (i + 1), 'head': (k + 1),
             'rank': rank, 'batch': j}
        dataframe.append(d)
  df = pd.DataFrame(dataframe)
  return df

In [31]:
# Score and rank of the token at position 0 in every layer/head; the display
# keeps head 1 only.
position_df = position_attention(attention_matrix_importance, 0)
position_df[position_df["head"]==1]

Unnamed: 0,token,position,attention_scores,layer,head,rank,batch
0,<s>,0,1.05537,1,1,1,0
12,<s>,0,6.440273,2,1,1,0
24,<s>,0,2.043325,3,1,1,0
36,<s>,0,9.564488,4,1,1,0
48,<s>,0,1.061624,5,1,1,0
60,<s>,0,1.041231,6,1,6,0
72,<s>,0,0.093136,7,1,111,0
84,<s>,0,0.346322,8,1,1,0
96,<s>,0,0.000331,9,1,36,0
108,<s>,0,0.000431,10,1,500,0


If really needed, can just have the full matrix of the position, ranks, attention scores, layers, and heads of each token per example. 

Tokens are all the tokens in the example. Position is the location of the token with zero-based indexing. Attention scores are the aggregate, scaled attention scores.

Layer goes from 1 to 12.
Head goes from 1 to 12.

Rank is the attention score rank with respect to layer and head. Goes from 1 to number of tokens in the example

Pros: can look up whatever is needed; all the information is available and can be extracted for comparisons. Cons: you have to know what you want and look it up manually.

In [32]:
def full_matrix(agg_matrix, example=None, token_list=None):
  """List score and rank of every token for every layer and head.

  Args:
    agg_matrix: array indexed as (layer, batch, head, token) of aggregated
      attention scores.
    example: optional batch index. When given, only that batch entry is
      reported and the result has no ``batch`` column.
    token_list: sequence of token strings indexed by position. Defaults to
      the notebook-level ``all_tokens`` so existing calls keep working.

  Returns:
    DataFrame with the columns token, position, attention_scores, layer,
    head, rank (plus batch when ``example`` is None). Layer and head are
    1-based; rank 1 is the highest score within that layer/head.

  Side effects:
    Prints the shape of the matrix being processed.
  """
  vocab = all_tokens if token_list is None else token_list
  single_example = example is not None

  if single_example:
    # Selecting the batch entry already removes the batch axis. A further
    # squeeze() would also drop any layer/head/token axis of length 1.
    print(agg_matrix[:, example].shape)
    batches = [example]
  else:
    print(agg_matrix.shape)
    batches = range(agg_matrix.shape[1])

  num_tokens = agg_matrix.shape[3]
  dataframe = []
  #layer
  for i in range(agg_matrix.shape[0]):
    #batch
    for j in batches:
      #head
      for k in range(agg_matrix.shape[2]):
        scores = agg_matrix[i, j, k]
        # Positions ordered from highest to lowest score.
        order = scores.argsort()[::-1]
        # Invert the permutation once so ranks[pos] is the 1-based rank of
        # `pos`, instead of searching `order` again for every token.
        ranks = np.empty(num_tokens, dtype=np.int64)
        ranks[order] = np.arange(1, num_tokens + 1)
        #token
        for x in range(num_tokens):
          d = {"token": vocab[x], 'position': x,
               'attention_scores': scores[x], 'layer': (i + 1), 'head': (k + 1),
               'rank': ranks[x]}
          if not single_example:
            d['batch'] = j
          dataframe.append(d)
  df = pd.DataFrame(dataframe)
  return df

In [33]:
# Score and rank of every token for every layer, batch entry and head.
full_mat = full_matrix(attention_matrix_importance)

(12, 1, 12, 2048)


In [34]:
# Show the rows for layer 1, head 1 only.
full_mat[(full_mat['head']==1) & (full_mat['layer']==1)]

Unnamed: 0,token,position,attention_scores,layer,head,rank,batch
0,<s>,0,1.055370,1,1,1,0
1,lp,1,0.013792,1,1,2043,0
2,opt,2,0.039474,1,1,1000,0
3,:,3,0.060621,1,1,304,0
4,ĠA,4,0.046300,1,1,691,0
...,...,...,...,...,...,...,...
2043,ĠC,2043,0.039233,1,1,1015,0
2044,",",2044,0.044796,1,1,748,0
2045,Ġis,2045,0.025272,1,1,1803,0
2046,Ġthe,2046,0.010663,1,1,2047,0
