<a href="https://colab.research.google.com/github/danielhou13/cogs402longformer/blob/main/src/T3-vis/T3_vis_projection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; a later cell saves its result under this path.
# The google.colab module is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# import sys
# sys.path.append('/content/drive/My Drive/{}'.format("cogs402longformer/"))

In [None]:
# Explicit %pip magic: does not depend on IPython automagic being enabled.
%pip install datasets --quiet

In [None]:
# Explicit %pip magic: does not depend on IPython automagic being enabled.
%pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os

import numpy as np
import pandas as pd

import torch
import torch.nn as nn

In [None]:
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

def longformer_finetuned_papers():
    """Load the 'danielhou13/longformer-finetuned_papers' checkpoint as a
    sequence classifier configured with two labels and return it."""
    return AutoModelForSequenceClassification.from_pretrained(
        'danielhou13/longformer-finetuned_papers',
        num_labels=2,
    )

# def bert_test():
#     model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 2)
#     setattr(model, 'num_hidden_layers', model.config.num_hidden_layers)
#     setattr(model, 'num_attention_heads', model.config.num_attention_heads)
#     setattr(model, 'hidden_size', model.config.hidden_size)
#     return model

def preprocess_function(tokenizer, example, max_length):
    """Tokenize ``example['text']`` and merge the tokenizer output into ``example``.

    The text is padded to ``max_length`` and truncated when longer. ``example``
    is updated in place and the same object is returned.
    """
    encoded = tokenizer(
        example['text'],
        padding='max_length',
        max_length=max_length,
        truncation=True,
    )
    example.update(encoded)
    return example

def get_papers_dataset(dataset_type, max_length=2048):
    """Load one split of the papers dataset and tokenize it.

    Args:
        dataset_type: Name of the split to select from the loaded dataset
            (``papers_test_set`` passes ``'test'``).
        max_length: Length every example is padded/truncated to by the
            tokenizer. Defaults to 2048, the value that used to be hard-coded.

    Returns:
        The tokenized dataset with an added ``idx`` column. The attributes
        ``visualize_columns``, ``input_columns``, ``target_columns``,
        ``max_length`` and ``tokenizer`` are set on the returned object.
    """
    dataset = load_dataset("danielhou13/cogs402dataset")[dataset_type]
    # Running row index, so each example can be identified after batching.
    dataset = dataset.add_column("idx", list(range(len(dataset))))

    tokenizer = AutoTokenizer.from_pretrained('allenai/longformer-base-4096')
    # tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    dataset = dataset.map(lambda x: preprocess_function(tokenizer, x, max_length), batched=True)

    # Metadata attached to the dataset object for downstream cells.
    dataset.visualize_columns = ['idx', 'text', 'labels']
    dataset.input_columns = ['input_ids', 'attention_mask']
    dataset.target_columns = ['labels']
    dataset.max_length = max_length
    dataset.tokenizer = tokenizer
    return dataset

def papers_test_set():
    """Return the 'test' split as prepared by ``get_papers_dataset``."""
    split_name = 'test'
    return get_papers_dataset(split_name)

In [None]:
# Load the tokenized test split and the fine-tuned classifier.
cogs402_test = papers_test_set()
model = longformer_finetuned_papers()
# Model inputs plus targets; both attributes are attached by get_papers_dataset.
columns = cogs402_test.input_columns + cogs402_test.target_columns
print(columns)
# Expose those columns (and the row index 'idx') in torch format when the dataset is indexed.
cogs402_test.set_format(type='torch', columns=columns + ['idx'])
# Drop the raw text column; this rebinds cogs402_test to the dataset returned by remove_columns.
cogs402_test=cogs402_test.remove_columns(['text'])

Using custom data configuration danielhou13--cogs402dataset-cc784554b797f843
Reusing dataset parquet (/root/.cache/huggingface/datasets/danielhou13___parquet/danielhou13--cogs402dataset-cc784554b797f843/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)


  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?ba/s]

['input_ids', 'attention_mask', 'labels']


In [None]:
# Move the model to the GPU when CUDA is available; otherwise leave it where it was loaded.
if torch.cuda.is_available():
    model = model.cuda()

# Show which device the model is on.
print(model.device)

cuda:0


In [None]:
# Alias for the prepared test split; later cells refer to it as `test`.
test = cogs402_test

In [None]:
# print(test['labels'][923])

In [None]:
# Single forward pass on one test example (hidden states requested).
example_idx = 923  # row of the test split to run
example = test[example_idx]  # fetch one row instead of materialising whole columns

# Send the tensors to wherever the model lives, so this cell also runs when
# no GPU is available (the model is only moved to CUDA conditionally).
with torch.no_grad():  # inference only, no autograd graph needed
    output = model(
        example['input_ids'].unsqueeze(0).to(model.device),
        attention_mask=example['attention_mask'].unsqueeze(0).to(model.device),
        labels=example['labels'].to(model.device),
        output_hidden_states=True,
    )

In [None]:
# print(os.getcwd())
# yes = torch.load("resources/longformer_test2/epoch_3/aggregate_attn.pt")

In [None]:
from sklearn.manifold import TSNE
from sklearn.metrics import pairwise_distances
from tqdm import tqdm
def output_hidden(model, dataloader, layers=None, max_entries=5000):
    """Project the first-token hidden state of every layer to 2-D with t-SNE.

    Args:
        model: Model exposing ``config.num_hidden_layers``,
            ``config.hidden_size`` and ``device``. It is switched to eval mode.
        dataloader: Yields dict batches holding ``idx`` and ``labels`` tensors
            plus the model inputs; ``dataloader.dataset`` must support ``len``.
        layers: Currently unused; kept so existing callers keep working.
        max_entries: Upper bound on the number of examples to project. Values
            larger than the dataset are clamped to the dataset size; ``None``
            means all examples.

    Returns:
        ``(tsne_vectors, labels)``: ``tsne_vectors`` is a NumPy array of shape
        ``(n, num_hidden_layers + 1, 2)`` and ``labels`` holds the first ``n``
        labels, where ``n = min(max_entries, len(dataloader.dataset))``.
    """
    # NOTE(review): `n_iter` and `square_distances` are only accepted by some
    # scikit-learn releases; confirm against the installed version.
    tsne_model = TSNE(n_components=2,
                      verbose=0,
                      perplexity=30,
                      learning_rate='auto',
                      n_iter=2000,
                      init='random',
                      metric='precomputed',
                      random_state=0,
                      square_distances=True)

    model.eval()

    # One extra slot on top of num_hidden_layers: the model returns
    # num_hidden_layers + 1 hidden-state tensors.
    num_layers = model.config.num_hidden_layers + 1
    hidden_size = model.config.hidden_size
    num_examples = len(dataloader.dataset)
    # Never ask for more rows than exist; otherwise the t-SNE result cannot be
    # written into the preallocated output array.
    num_entries = num_examples if max_entries is None else min(max_entries, num_examples)

    device = model.device

    hidden_states = torch.zeros(num_examples, num_layers, hidden_size)
    tsne_vectors = np.zeros((num_entries, num_layers, 2))
    labels = np.zeros(num_examples)

    with torch.no_grad():  # inference only
        for inputs in tqdm(dataloader):
            # 'idx' identifies the rows of this batch; it is not a model input.
            idx = inputs['idx'].cpu().tolist()
            batch = {
                key: (value.to(device) if isinstance(value, torch.Tensor) else value)
                for key, value in inputs.items()
                if key != 'idx'
            }
            labels[idx] = batch['labels'].cpu().numpy()

            output = model(**batch, output_hidden_states=True)
            # With labels supplied, position 2 of the output holds the hidden states.
            # The first token is used for classification
            for layer, layer_states in enumerate(output[2]):
                hidden_states[idx, layer] = layer_states[:, 0, :].detach().cpu()

    for i in range(num_layers):
        layer_vectors = hidden_states[:num_entries, i].numpy()
        distance = pairwise_distances(layer_vectors, layer_vectors, metric='cosine', n_jobs=4)
        tsne_vectors[:, i] = tsne_model.fit_transform(distance).round(decimals=5)

    return tsne_vectors, labels[:num_entries]

In [None]:
# Iterate over the test split with one example per batch.
dataloader = torch.utils.data.DataLoader(cogs402_test, batch_size=1)

In [None]:
# Project every example of the test split (output_hidden's own default is 5000).
max_entries = len(cogs402_test)

In [None]:
# Collect first-token hidden states for every layer and reduce each layer to 2-D with t-SNE.
tsne_hidden, labels = output_hidden(model, dataloader, max_entries=max_entries)
# Build a dict of pandas Series, one entry per output column.
projection_data = {}
n_examples = len(labels)
projection_data['id'] = pd.Series(np.arange(n_examples))
# tsne_hidden is indexed [example, layer, coordinate]; store both coordinates of each layer.
for layer_idx in range(tsne_hidden.shape[1]):
    projection_data[f'projection_{layer_idx}_1'] = pd.Series(tsne_hidden[:, layer_idx, 0])
    projection_data[f'projection_{layer_idx}_2'] = pd.Series(tsne_hidden[:, layer_idx, 1])

projection_data['labels'] = pd.Series(labels)


100%|██████████| 1179/1179 [04:29<00:00,  4.38it/s]


In [None]:
# Serialize the dict of pandas Series to the mounted Drive folder with torch.save.
torch.save(projection_data, "/content/drive/MyDrive/cogs402longformer/projection_data.pt")
# Debug output: show the type of the test split object.
print(type(test))

<class 'datasets.arrow_dataset.Dataset'>


In [None]:

# torch.save(test, "aggregate_attn.pt")