In [1]:
import torch
from transformers import AutoTokenizer
from src.transformers.models.bert import BertModel
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True)
model = BertModel.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True).to(device)

  from .autonotebook import tqdm as notebook_tqdm
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
  return self.fget.__get__(instance, owner)()


In [2]:
num_layers = 12
num_heads = 12

In [3]:
tokens_dict = {"tokens": list()}
vocab_reverse = dict((value, key) for key, value in tokenizer.vocab.items())


attention_dict = dict()
point_position_dict = dict()
agg_attn_dict = dict()
for layer in range(num_layers):
    attention_dict[layer] = dict()
    point_position_dict[layer] = dict()
    for head in range(num_heads):
        agg_attn_dict[f"{layer}_{head}"] = list()
        attention_dict[layer][head] = dict()
        attention_dict[layer][head]["layer"] = layer 
        attention_dict[layer][head]["head"] = head
        attention_dict[layer][head]["tokens"] = list() 
        point_position_dict[layer][head] = dict()
        point_position_dict[layer][head]["layer"] = layer 
        point_position_dict[layer][head]["head"] = head
        point_position_dict[layer][head]["tokens"] = list()
        point_position_dict[layer][head]["query"] = list()  
        point_position_dict[layer][head]["key"] = list()         

In [4]:
dna_1 = "GGGGTAATCAGAGCAGAACCAGGCACCTGCCCTGCCTGATGTCCTCTGCTCAGGGCTGGCAGCTGTGTCCTGTGTCCTCCCCACCCCCTGGGACCACAAAGCTCCACCCCTGCCACACCCTGACATACTCAAGCCCAGGAGCCTGACCCAGGGCTCAGGGTGGGGTCAAAAACCGGGGGGATCTGATTTGCATGGATGGACTCTCCCCCTCTCAGAGTATGAAGAGAGGGAGAGATCTGGGGGAAGCTCAGCTTCAGCTGTGGTAGAGAAGACAGGATTCAGGACAATCTCCAGCATGGC"
dna_2 = "AAAGAGACCCGGGGAGCATCTGGGCTTCCAAGGTCCTCGGTACGGCCCAAGGCAGCGAAGGACGCGCGGCTCCAGGCTGCGGGAGCCAGGACGACCGGGGGCTCCCAGAGCGCGAAGTCGCGATCCTCGGCGGTGGAGAGCTCGTGCCAAAACGTCCTCCCCTGCGCCAGTCAGGCCTTCGCGGGGCTGGCAGGCGGGCGGGGGCGGGGCCGCCGCACTTTAAGAGGCTGTGCAGGCAGACAGACCTCCAGGCCCGCTAGGGGATCCGCGCCATGGAGGCCGCCCGGGACTATGCAGGAG"
dna_3 = "AGACCCCGGAGCCACAAGGAGAGGGCTGGATCCCCGGCTCAGAGGGAAGAGGTCGGATCCCCAGCTGAGAGGGAGGAGGGTCCCGGACCCTAGGAGTGGGAAGGAAAGGCTCGGATCCCCTGATCCCCAGGAGGAGGGGACCCGGCTGCCTCCCGGTTGGGGCCGCGCGAGGGCGGGGCGCGGAAGGATCCGGGAGGGCCGTGCTCCGCCACCCAGTATATATCTGTCCCCAGTCCCCGGGGCCGCCTCATTCCCTGTCCTCGGATCACAGTCTCTTCTCACTACAGTGTCGCCGCCTCT"
dna_4 = "GTCTTTCCTTGGAGGAGGCATTGGCACGAGTTACTATAAACTCCCTCTGAATCTCAAGACTTCTGGGACGCCGATTCCGCTCCTGGCCTGGGGCAAGGCGTGGGAGCTTGGAAGCCAGCGCTGCGCTCCCCGTGGGAAGCGATCGTCTCCTCTGTCAACTCGCGCCTGGGCACTTAGCCCCTCCCGTTTCAGGGCGCCGCCTCCCCGGATGGCAAACACTATAAAGTGGCGGCGAATAAGGTTCCTCCTGCTGCTCTCGGTTTAGTCCAAGATCAGCGATATCACGCGTCCCCCGGAGCA"

dataset = [dna_1, dna_2, dna_3, dna_4]

sentence_stops = list()
sentence_starts = list()
pos = 0
for sequence in dataset:
    sentence_starts.append(pos)
    inputs = tokenizer(" ".join([sequence[i:i+6] for i in range(0, len(sequence)-6, 1)]), return_tensors = 'pt')
    out = model(**inputs.to(device), output_attentions=True, return_dict=True)
    tokens = [vocab_reverse[x] for x in inputs['input_ids'].tolist()[0]]   
    pos = pos+len(tokens)
    sentence_stops.append(pos) 
    for i,value in enumerate(tokens):
        single_token = {}
        single_token['value'] = value
        single_token['type'] = "query"
        single_token["length"] = len(tokens)
        single_token['pos_int'] = i
        single_token['position'] = i/(len(tokens)- 1)
        single_token['sentence'] = " ".join(tokens) 
   
        tokens_dict['tokens'].append(single_token)
        for layer in range(num_layers): 
            for head in range(num_heads):
                point_position_dict[layer][head]['query'].append(out.query[layer][head][i].detach().cpu().numpy())
                point_position_dict[layer][head]['key'].append(out.key[layer][head][i].detach().cpu().numpy())
                attention_dict[layer][head]['tokens'].append({'attention' : out.attentions[layer][0][head][i].detach().cpu().numpy()})
        

In [None]:
# Getting point positions

#Some code from chatGPT!!!!!!!
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
import numpy as np
import tqdm

def get_pca_embeddings(vectors, n_components=2):
    pca = PCA(n_components=n_components)
    return pca.fit_transform(vectors)
def get_tsne_embeddings(vectors, n_components=2, perplexity=30.0):
    tsne = TSNE(n_components=n_components, perplexity=perplexity, n_iter=250)
    return tsne.fit_transform(vectors)
def get_umap_embeddings(vectors, n_components=2, n_neighbors=15, min_dist=0.1):
    umap_model = umap.UMAP(n_components=n_components, n_neighbors=n_neighbors, min_dist=min_dist)
    return umap_model.fit_transform(vectors)
def calculate_centroid(vectors):
    """Calculate the centroid of a list of vectors."""
    return np.mean(vectors, axis=0)
def translate_vectors(source_vectors, target_vectors):
    """Translate source_vectors so their centroid matches that of target_vectors."""
    source_centroid = calculate_centroid(source_vectors)
    target_centroid = calculate_centroid(target_vectors)
    translation = target_centroid - source_centroid
    translated_vectors = source_vectors + translation
    return translated_vectors

def calculate_norm(vector):
    return np.linalg.norm(vector)

for layer in tqdm.tqdm(range(num_layers)): 
    for head in tqdm.tqdm(range(num_heads)):
        translated_key = translate_vectors(point_position_dict[layer][head]['key'], point_position_dict[layer][head]['query'])
        vectors = np.stack(point_position_dict[layer][head]['query'] + [np.array(row) for row in translated_key])
        
        pca_2d = get_pca_embeddings(vectors, n_components=2)
        pca_3d = get_pca_embeddings(vectors, n_components=3)

        tsne_2d = get_tsne_embeddings(vectors, n_components=2)
        tsne_3d = get_tsne_embeddings(vectors, n_components=3)

        umap_2d = get_umap_embeddings(vectors, n_components=2)
        umap_3d = get_umap_embeddings(vectors, n_components=3)
        
        for token in range(umap_2d.shape[0]):
            point_position_dict[layer][head]['tokens'].append({
                "tsne_x" : tsne_2d[token][0],
                "tsne_y" : tsne_2d[token][1],
                
                "tsne_x_3d" : tsne_3d[token][0],
                "tsne_y_3d" : tsne_3d[token][1],                
                "tsne_z_3d" : tsne_3d[token][2],                
                
                "umap_x" : umap_2d[token][0],
                "umap_y" : umap_2d[token][1],          

                "umap_x_3d" : umap_3d[token][0],
                "umap_y_3d" : umap_3d[token][1],                
                "umap_z_3d" : umap_3d[token][2],                         
                
                "pca_x" : pca_2d[token][0],
                "pca_y" : pca_2d[token][1],
                
                "pca_x_3d" : pca_3d[token][0],
                "pca_y_3d" : pca_3d[token][1],                                    
                "pca_z_3d" : pca_3d[token][2],    
                
                "norm" : calculate_norm(vectors[token])                              
            })

100%|██████████| 3/3 [00:24<00:00,  8.06s/it]
100%|██████████| 3/3 [00:22<00:00,  7.35s/it]
100%|██████████| 3/3 [00:23<00:00,  7.84s/it]
100%|██████████| 3/3 [01:09<00:00, 23.25s/it]


In [None]:
for layer in tqdm.tqdm(range(num_layers)): 
    for head in tqdm.tqdm(range(num_heads)):
        avg =  np.average(np.stack([np.stack([token['attention'] for token in attention_dict[layer][head]['tokens'][sentence_starts[i]:sentence_stops[i]]]) for i in range(len(dataset))]),axis=0)
        agg_attn_dict[f"{layer}_{head}"] = [{"attention" : avg[i].tolist()} for i in range(avg.shape[0])]


100%|██████████| 3/3 [00:00<00:00, 284.36it/s]
100%|██████████| 3/3 [00:00<00:00, 325.57it/s]
100%|██████████| 3/3 [00:00<00:00, 369.91it/s]
100%|██████████| 3/3 [00:00<00:00, 89.66it/s]


In [None]:
for layer in tqdm.tqdm(range(num_layers)): 
    for head in tqdm.tqdm(range(num_heads)):
        del point_position_dict[layer][head]["query"]
        del point_position_dict[layer][head]["key"]

100%|██████████| 3/3 [00:00<00:00, 670.45it/s]
100%|██████████| 3/3 [00:00<00:00, 868.51it/s]
100%|██████████| 3/3 [00:00<00:00, 956.80it/s]
100%|██████████| 3/3 [00:00<00:00, 133.28it/s]


In [None]:
for layer in tqdm.tqdm(range(num_layers)): 
    for head in tqdm.tqdm(range(num_heads)):
        for i in range(len(point_position_dict[layer][head]['tokens'])):
            for data_feature in ["tsne_x", "tsne_y", "umap_x", "umap_y", "norm", "tsne_x_3d", "tsne_y_3d", "tsne_z_3d", "umap_x_3d", "umap_y_3d", "umap_z_3d", "pca_x", "pca_y", "pca_x_3d", "pca_y_3d", "pca_z_3d"]:
                point_position_dict[layer][head]['tokens'][i][data_feature] = float(point_position_dict[layer][head]['tokens'][i][data_feature])

100%|██████████| 3/3 [00:00<00:00, 112.59it/s]
100%|██████████| 3/3 [00:00<00:00, 156.90it/s]
100%|██████████| 3/3 [00:00<00:00, 164.56it/s]
100%|██████████| 3/3 [00:00<00:00, 41.50it/s]


In [None]:
import copy
tokens = copy.deepcopy(tokens_dict["tokens"])
pls = list()
for token in tqdm.tqdm(tokens):
    token["type"] = "key"
    tokens_dict["tokens"].append(token)
    

100%|██████████| 1184/1184 [00:00<00:00, 4826099.06it/s]


In [None]:
for layer in tqdm.tqdm(range(num_layers)): 
    for head in tqdm.tqdm(range(num_heads)):
        for att in attention_dict[layer][head]['tokens']:
            att['attention'] = att['attention'].tolist() 
        

100%|██████████| 3/3 [00:00<00:00, 175.51it/s]
100%|██████████| 3/3 [00:00<00:00, 193.79it/s]
100%|██████████| 3/3 [00:00<00:00, 195.01it/s]
100%|██████████| 3/3 [00:00<00:00, 55.45it/s]


## Writing to files

In [None]:
import json

In [None]:
with open("/home/cameron/repos/attention-viz-bio/web/data/DNABERT/agg_attn.json", "w") as fp:
    json.dump(agg_attn_dict , fp) 

In [None]:
with open("/home/cameron/repos/attention-viz-bio/web/data/DNABERT/tokens.json", "w") as fp:
    json.dump(tokens_dict , fp) 

In [None]:
for layer in tqdm.tqdm(range(num_layers)): 
    for head in tqdm.tqdm(range(num_heads)):
        with open(f"/home/cameron/repos/attention-viz-bio/web/data/DNABERT/attention/layer{layer}_head{head}.json", "w") as fp:
            json.dump(attention_dict[layer][head] , fp) 
        with open(f"/home/cameron/repos/attention-viz-bio/web/data/DNABERT/byLayerHead/layer{layer}_head{head}.json", "w") as fp:
            json.dump(point_position_dict[layer][head] , fp)             
        

100%|██████████| 3/3 [00:00<00:00,  4.37it/s]
100%|██████████| 3/3 [00:00<00:00,  3.99it/s]
100%|██████████| 3/3 [00:00<00:00,  4.10it/s]
100%|██████████| 3/3 [00:02<00:00,  1.38it/s]
