In [1]:
#import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, AutoModel
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.nn import MSELoss

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = AutoModel.from_pretrained("google-bert/bert-base-uncased", output_attentions=True)
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

config.json: 100%|██████████| 570/570 [00:00<00:00, 642kB/s]
model.safetensors: 100%|██████████| 440M/440M [00:10<00:00, 40.8MB/s] 
tokenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 76.1kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 4.26MB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 4.70MB/s]


In [5]:
input_ids = tokenizer("Hello, world!", return_tensors="pt").input_ids
outputs = model(input_ids)


In [3]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.nn import MSELoss


# sets the smallest (in terms of magnitude) weights to 0
def get_sparse_weights(data, percentile=.25): 
    flat = data.flatten()
    values, _ = flat.abs().sort(descending=False)
    idx_to_get = int(percentile * flat.size(0))
    cutoff_val = values[idx_to_get]
    mask = torch.where(data.abs() > cutoff_val, 1, 0)
    return data * mask


class DeepSub(nn.Module):
    def __init__(self, input_dim, inner_rank, output_dim) -> None:
        super().__init__()
        self.l1 = nn.Linear(in_features=input_dim, out_features=inner_rank)
        self.act = nn.GELU()
        self.l2 = nn.Linear(in_features=inner_rank, out_features=output_dim)

    def forward(self, x):
        x = self.l1(x)
        x = self.act(x)
        x = self.l2(x)
        return x


def truncated_svd(W, l):
    """Compress the weight matrix W of an inner product (fully connected) layer
    using truncated SVD.
    Parameters:
    W: N x M weights matrix
    l: number of singular values to retain
    Returns:
    Ul, L: matrices such that W \approx Ul*L
    """

    U, s, V = torch.linalg.svd(W)

    Ul = U[:, :l]
    sl = s[:l]
    V = V.t()
    Vl = V[:l, :]

    SV = torch.mm(torch.diag(sl), Vl)
    return Ul, SV


def get_svd_ffn(w1, w2, l, bias=False): 
    ul1, sv1 = truncated_svd(w1, l)
    ul2, sv2 = truncated_svd(w2, l)

    w1_ffn_sv = nn.Linear(sv1.size(1), sv1.size(0), bias=bias)
    w1_ffn_sv.weight.data = sv1
    w1_ffn_ul = nn.Linear(ul1.size(1), ul1.size(0), bias=bias)
    w1_ffn_ul.weight.data = ul1

    w2_ffn_sv = nn.Linear(sv2.size(1), sv2.size(0), bias=bias)
    w2_ffn_sv.weight.data = sv2
    w2_ffn_ul = nn.Linear(ul2.size(1), ul2.size(0), bias=bias)
    w2_ffn_ul.weight.data = ul2
    svd_module = nn.Sequential(w1_ffn_sv, w1_ffn_ul, w2_ffn_sv, w2_ffn_ul)
    return svd_module
    
def train_deep_sub(deep_sub,
                   gt_module,
                   training_iter, 
                  input_size,
                  l=200):
    criterion = MSELoss()
    optimizer = Adam(deep_sub.parameters(), lr=0.001)
    for _ in range(training_iter):
        rand_batch = torch.randn((512, input_size))
        optimizer.zero_grad()
        output = deep_sub(rand_batch)
        # true val calc 
        x = gt_module[0](rand_batch)
        true_val = gt_module[1](x, rand_batch)
        loss = criterion(output, true_val)
        loss.backward()
        optimizer.step()
        print(loss.item())
    return deep_sub


def train_deep_sub_svd(deep_sub,
                       svd_module,
                        gt_module,
                        training_iter, 
                        input_size, 
                        batch_size=512,
                        lr=.001):
    criterion = MSELoss()
    optimizer = Adam(deep_sub.parameters(), lr=lr)
    for _ in range(training_iter):
        rand_batch = torch.randn((batch_size, input_size))
        optimizer.zero_grad()
        output = deep_sub(rand_batch)
        svd_output = svd_module(rand_batch)

        # true val calc 
        x = gt_module[0](rand_batch)
        true_val = gt_module[1](x, rand_batch)

        loss = criterion(output+svd_output, true_val)
        loss.backward()
        optimizer.step()
        print(loss.item())
    return deep_sub

In [7]:
# Access the encoder layer directly without 'bert' prefix
test = model.encoder.layer[0].intermediate.dense.weight.data
sparse_test = get_sparse_weights(test)
model.encoder.layer[0].intermediate.dense.weight.data = sparse_test

test = model.encoder.layer[0].output.dense.weight.data
sparse_test = get_sparse_weights(test)
model.encoder.layer[0].output.dense.weight.data = sparse_test

# Create the custom modules using the direct attribute access
gt_module = nn.Sequential(model.encoder.layer[0].intermediate, model.encoder.layer[0].output)
svd_module = get_svd_ffn(model.encoder.layer[0].intermediate.dense.weight.data, 
                         model.encoder.layer[0].output.dense.weight.data, l=200)
deep_sub = nn.Sequential(DeepSub(768, 100, 100), nn.GELU(), DeepSub(100, 100, 768))

# Train the custom module
train_deep_sub(svd_module, gt_module, 10000, 768)


2.4118776321411133
1.6411609649658203
1.240706443786621
0.993678867816925
0.8391153812408447
0.7425044178962708
0.679699182510376
0.6340731978416443
0.6054267287254333
0.5950475335121155
0.6040317416191101
0.6025871634483337
0.5975318551063538
0.589954674243927
0.5938610434532166
0.592596709728241
0.5917796492576599
0.5887155532836914
0.5806170105934143
0.5755786895751953
0.5666727423667908
0.5635799765586853
0.5513815879821777
0.5507031083106995
0.5434239506721497
0.5335988402366638
0.5289267897605896
0.5199122428894043
0.5172427296638489
0.5141124129295349
0.5084227919578552
0.5105065703392029
0.5025333762168884
0.5070214867591858
0.5033749938011169
0.49940362572669983
0.4961050748825073
0.5005785226821899
0.4982062876224518
0.4972127377986908
0.4987466335296631
0.49366462230682373
0.49155259132385254
0.485890656709671
0.4920174181461334
0.49075034260749817
0.49087992310523987
0.49016258120536804
0.48658275604248047
0.4863276481628418
0.4828527271747589
0.4838234484195709
0.482384830

KeyboardInterrupt: 

In [8]:
"""
test = model.bert.encoder.layer[0].intermediate.dense.weight.data
sparse_test = get_sparse_weights(test)
model.bert.encoder.layer[0].intermediate.dense.weight.data = sparse_test

test = model.bert.encoder.layer[0].output.dense.weight.data
sparse_test = get_sparse_weights(test)
model.bert.encoder.layer[0].output.dense.weight.data = sparse_test 

gt_module = nn.Sequential(model.bert.encoder.layer[0].intermediate, model.bert.encoder.layer[0].output)
svd_module = get_svd_ffn(model.bert.encoder.layer[0].intermediate.dense.weight.data, 
                         model.bert.encoder.layer[0].output.dense.weight.data, l=200)
deep_sub = nn.Sequential(DeepSub(768, 100, 100), nn.GELU(), DeepSub(100, 100, 768))
train_deep_sub(svd_module, gt_module, 10000, 768)
"""

'\ntest = model.bert.encoder.layer[0].intermediate.dense.weight.data\nsparse_test = get_sparse_weights(test)\nmodel.bert.encoder.layer[0].intermediate.dense.weight.data = sparse_test\n\ntest = model.bert.encoder.layer[0].output.dense.weight.data\nsparse_test = get_sparse_weights(test)\nmodel.bert.encoder.layer[0].output.dense.weight.data = sparse_test \n\ngt_module = nn.Sequential(model.bert.encoder.layer[0].intermediate, model.bert.encoder.layer[0].output)\nsvd_module = get_svd_ffn(model.bert.encoder.layer[0].intermediate.dense.weight.data, \n                         model.bert.encoder.layer[0].output.dense.weight.data, l=200)\ndeep_sub = nn.Sequential(DeepSub(768, 100, 100), nn.GELU(), DeepSub(100, 100, 768))\ntrain_deep_sub(svd_module, gt_module, 10000, 768)\n'

In [9]:
import pandas as pd 
# cola_df = pd.read_csv('glue_data/CoLA/train.tsv', sep='\t', header=1)
cola_df = pd.read_table('glue_data/CoLA/train.tsv', header=None)
text_sentences = list(cola_df.iloc[0:10][3])
text_sentences

["Our friends won't buy this analysis, let alone the next one we propose.",
 "One more pseudo generalization and I'm giving up.",
 "One more pseudo generalization or I'm giving up.",
 'The more we study verbs, the crazier they get.',
 'Day by day the facts are getting murkier.',
 "I'll fix you a drink.",
 'Fred watered the plants flat.',
 'Bill coughed his way out of the restaurant.',
 "We're dancing the night away.",
 'Herman hammered the metal flat.']

In [10]:
out = tokenizer(text_sentences, return_tensors='pt', padding=True)
tokenized_text = out['input_ids']
token_type = out['token_type_ids']
attention_mask = out['attention_mask']

In [11]:
# attention = Tuple of torch.FloatTensor (one for each layer) 
# of shape (batch_size, num_heads, sequence_length, sequence_length).

model_output = model(input_ids=tokenized_text, 
                     attention_mask=attention_mask, 
                     token_type_ids=token_type)

In [12]:
len(model_output.attentions)

12

In [13]:
tokenized_text

tensor([[  101,  2256,  2814,  2180,  1005,  1056,  4965,  2023,  4106,  1010,
          2292,  2894,  1996,  2279,  2028,  2057, 16599,  1012,   102],
        [  101,  2028,  2062, 18404,  2236,  3989,  1998,  1045,  1005,  1049,
          3228,  2039,  1012,   102,     0,     0,     0,     0,     0],
        [  101,  2028,  2062, 18404,  2236,  3989,  2030,  1045,  1005,  1049,
          3228,  2039,  1012,   102,     0,     0,     0,     0,     0],
        [  101,  1996,  2062,  2057,  2817, 16025,  1010,  1996, 13675, 16103,
          2121,  2027,  2131,  1012,   102,     0,     0,     0,     0],
        [  101,  2154,  2011,  2154,  1996,  8866,  2024,  2893, 14163,  8024,
          3771,  1012,   102,     0,     0,     0,     0,     0,     0],
        [  101,  1045,  1005,  2222,  8081,  2017,  1037,  4392,  1012,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  5965, 27129,  1996,  4264,  4257,  1012,   102,     0,     0,
         

##BERT CoLA finetuned

https://huggingface.co/gchhablani/bert-base-cased-finetuned-cola

Loss: 0.6747

Matthews Correlation: 0.5957



In [2]:
model = AutoModel.from_pretrained("gchhablani/bert-base-cased-finetuned-cola", output_attentions=True)
tokenizer = AutoTokenizer.from_pretrained("gchhablani/bert-base-cased-finetuned-cola")

In [19]:
from data_utils import get_data_df, construct_dataset, GLUE_Dataset
from torch.utils.data import DataLoader

data_dir = 'glue_data'
task = 'CoLA'

# CoLA test set
cola_test_df = get_data_df(data_dir, task, train_int=1)
cola_test_df


Unnamed: 0,0,1,2,3
0,gj04,1,,The sailors rode the breeze clear of the rocks.
1,gj04,1,,The weights made the rope stretch over the pul...
2,gj04,1,,The mechanical doll wriggled itself loose.
3,cj99,1,,"If you had eaten more, you would want less."
4,cj99,0,*,"As you eat the most, you want the least."
...,...,...,...,...
1038,w_80,1,,John considers Bill silly.
1039,w_80,1,,John considers Bill to be silly.
1040,w_80,0,*,John bought a dog for himself to play with.
1041,w_80,1,,John arranged for himself to get the prize.


In [20]:
sentence_col = 3
label_col = 1

cola_dataset = construct_dataset(cola_test_df, sentence_col, label_col)

cola_dataloader = DataLoader(cola_dataset, batch_size=8, shuffle=False)


In [None]:
from sklearn.metrics import matthews_corrcoef

cola_dev_df = get_data_df(data_dir, task, train_int=1)
sentence_col = 3
label_col = 1

cola_dev_dataset = GLUE_Dataset(cola_dev_df, sentence_col, label_col)

print(cola_dev_dataset[2])

cola_dev_dataloader = DataLoader(cola_dev_dataset, batch_size=8, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

model.eval()
all_predictions = []
all_true_labels = []
with torch.no_grad():
    for sentences, labels in cola_dev_dataloader:
        inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to(device)
        outputs = model(**inputs)
        preds = outputs.logits.argmax(axis=-1).cpu().numpy()
        all_predictions.extend(preds)
        all_true_labels.extend(labels.numpy())

# Calculate the Matthews correlation coefficient
mcc = matthews_corrcoef(all_true_labels, all_predictions)
print(f'Matthews correlation coefficient: {mcc}')
