# BERT裁剪

采用训练数据链接：

https://huggingface.co/datasets/hw2942/financial-news-sentiment

0:Negative, 1:Neutral, 2:Positive

huggingface模型链接：

https://huggingface.co/hw2942/bert-base-chinese-finetuning-financial-news-sentiment-v2

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from textpruner import TransformerPruner

# Load model directly
tokenizer = AutoTokenizer.from_pretrained("hw2942/bert-base-chinese-finetuning-financial-news-sentiment-v2")
model = AutoModelForSequenceClassification.from_pretrained("hw2942/bert-base-chinese-finetuning-financial-news-sentiment-v2")

# load the dataset 
ds = load_dataset("hw2942/financial-news-sentiment")

def get_acc(test_dataset,model,tokenizer,device='cuda'):
    total = 0
    right = 0
    model.to(device)
    for data in test_dataset:
        inputs = tokenizer(data['Title'],return_tensors='pt').to(device)
        outputs = model(**inputs)
        total  += 1
        if torch.max(outputs[0][0].softmax(0),dim=0).indices==data['labels']:
            right += 1
    return right/total

In [None]:
test_dataset = ds['train']
get_acc(test_dataset,model,tokenizer,device='cuda')

In [None]:
pruner = TransformerPruner(model)
head_mask = torch.tensor(12*[[0]*12])
ffn_mask=torch.tensor([[1]*3072]*12)
pruner.prune(head_mask=head_mask,ffn_mask=ffn_mask,save_model=False)
get_acc(test_dataset,model,tokenizer)

In [None]:
ffn_mask=torch.tensor([[0]*3072]*12)
pruner.prune(head_mask=head_mask,ffn_mask=ffn_mask,save_model=False)
get_acc(test_dataset,model,tokenizer)

# ALBERT裁剪

数据链接：
https://huggingface.co/datasets/nyu-mll/glue

模型链接：
https://huggingface.co/Alireza1044/albert-base-v2-mnli

In [1]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("Alireza1044/albert-base-v2-mnli")
model = AutoModelForSequenceClassification.from_pretrained("Alireza1044/albert-base-v2-mnli")

  return torch.load(checkpoint_file, map_location="cpu")


In [2]:
model

AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768,

In [None]:
from datasets import load_dataset

ds = load_dataset("nyu-mll/glue", "mnli")

In [None]:
def get_acc(model,size):
    count = 0

    cursor = 0

    for data in tqdm(ds['train']):
        cursor += 1
        inputs = tokenizer(data['premise'],data['hypothesis'],return_tensors='pt') # 这里会自动在两个输入的语句中插入分隔符
        with torch.no_grad():
            logits = model(**inputs).logits
        predicted_class_id = logits.argmax().item()
        if predicted_class_id == data['label']:
            count += 1
        if cursor > size:
            break

    return count / size

In [None]:
model.to('cpu')
get_acc(model,1000)

In [None]:
import textpruner
from textpruner import TransformerPruner
pruner = TransformerPruner(model)

In [None]:
head_mask = torch.tensor(1*[[0]*12])
# ffn_mask = torch.tensor(6*[[0]*3072])
pruner.prune(head_mask=head_mask,save_model=False)

In [None]:
model.to('cpu')
get_acc(model,1000)

# ROBERTA裁剪

数据链接：

https://huggingface.co/siebert/sentiment-roberta-large-english

模型链接：

https://huggingface.co/datasets/rahmaabusalma/tweets_sentiment_analysis

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("siebert/sentiment-roberta-large-english")
model = AutoModelForSequenceClassification.from_pretrained("siebert/sentiment-roberta-large-english")

In [None]:
from datasets import load_dataset

ds = load_dataset("rahmaabusalma/tweets_sentiment_analysis")

In [None]:
import torch
from tqdm import tqdm

def get_acc(model,size,ds):
    count = 0

    cursor = 0

    for data in tqdm(ds['train']):
        if data['label']==1:
            continue
        cursor += 1
        inputs = tokenizer(data['text'],return_tensors='pt') # 这里会自动在两个输入的语句中插入分隔符
        with torch.no_grad():
            predicted_class_id = model(**inputs)
        if torch.max(predicted_class_id[0],dim=1).indices[0] == (1 if data['label']==2 else 0):
            count += 1
        if cursor > size:
            break

    return count / size

In [None]:
get_acc(model,1000,ds)

In [None]:
from textpruner import TransformerPruner
import torch
pruner = TransformerPruner(model)
head_mask = torch.tensor(24*[12*[0]])
pruner.prune(head_mask=head_mask,save_model=False)

In [None]:
model.to('cpu')
get_acc(model,1000,ds)

In [None]:
ffn_mask = torch.tensor(24*[4096*[0]])
model.to('cuda')
pruner.prune(ffn_mask=ffn_mask,save_model=False)

In [None]:
model.to('cpu')
get_acc(model,1000,ds)

# XLM-ROBERTA裁剪

数据链接：

https://huggingface.co/datasets/papluca/language-identification

模型链接：

https://huggingface.co/papluca/xlm-roberta-base-language-detection

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")

In [None]:
from datasets import load_dataset

ds = load_dataset("papluca/language-identification")

In [None]:
pruner=TransformerPruner(model)

In [None]:
import torch
from tqdm import tqdm

def get_acc(model,size,ds):
    count = 0
    id2lang = model.config.id2label
    cursor = 0

    for data in tqdm(ds['train']):
        cursor += 1
        inputs = tokenizer(data['text'], padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits

        preds = torch.softmax(logits, dim=-1)
        vals, idx = torch.max(preds, dim=1)
        if id2lang[idx.item()]==data['labels']:
            count += 1
        if cursor > size:
            break

    return count / size


In [None]:
model.to('cpu')
get_acc(model,1000,ds)

In [None]:
head_mask = torch.tensor(12*[12*[0]])
pruner.prune(head_mask=head_mask,save_model=False)

In [None]:
model.to('cpu')
get_acc(model,1000,ds)

# 注意力熵

根据注意力矩阵，计算注意力熵


In [None]:
def calculate_AE_matrix(attention_matrix):
    return (torch.log(attention_matrix) * attention_matrix * (-1)).sum(dim=3).mean(dim=2)

In [None]:
# 函数原型如下，这里的if条件判断则是为了防止gpu上放置数据过多，导致核崩溃
# 这里需要基于这个函数来微微调整得到各个模型的AE矩阵计算的代码

def get_AE_matrix(model,dataset,heads_per_layer,layers):
    model.to('cpu')
    attention_entropy=torch.tensor([[0.]*heads_per_layer]*layers)
    AE_matrix = torch.tensor([[0.]*heads_per_layer]*layers)
    data_amount = 0
    for data in tqdm(dataset):
        if data_amount/len(dataset)<=0.39:
            if model.device!=torch.device(type='cuda',index=0):
                model.to('cuda')
            attention_entropy=attention_entropy.to('cuda')
            
            data = list(data.values())
            
            input_text = data[0]
            
            input_ids = tokenizer.encode(input_text,return_tensors='pt').to('cuda')
            
            output = model(input_ids)
            
            attentions = output[1]
            attentions = torch.cat([(layer) for layer in attentions])
            attentions.to('cuda')
            AE = calculate_AE_matrix(attentions)
            AE.to('cuda')
            attention_entropy += AE
            data_amount+=1
        else:
            model.to('cpu')
            attention_entropy=attention_entropy.to('cpu')
            data = list(data.values())
            input_text = data[0]
            input_ids = tokenizer.encode(input_text,return_tensors='pt').to('cpu')
            output = model(input_ids)
            attentions = output[1]
            attentions = torch.cat([(layer) for layer in attentions])
            attention_entropy += calculate_AE_matrix(attentions.to('cpu'))
            data_amount+=1
    AE_matrix = attention_entropy / data_amount    
    return AE_matrix

# BERT

In [44]:
def calculate_AE_matrix(attention_matrix):
    return (torch.log(attention_matrix) * attention_matrix * (-1)).sum(dim=3).mean(dim=2)

# 函数原型如下，这里的if条件判断则是为了防止gpu上放置数据过多，导致核崩溃
# 这里需要基于这个函数来微微调整得到各个模型的AE矩阵计算的代码

def get_AE_matrix(model,dataset,heads_per_layer,layers):
    model.to('cpu')
    attention_entropy=torch.tensor([[0.]*heads_per_layer]*layers)
    AE_matrix = torch.tensor([[0.]*heads_per_layer]*layers)
    data_amount = 0
    for data in tqdm(dataset):
        if data_amount/len(dataset)<=0.35:
            if model.device!=torch.device(type='cuda',index=0):
                model.to('cuda')
            attention_entropy=attention_entropy.to('cuda')
            input_text = data['Title']
            
            inputs = tokenizer(input_text,return_tensors='pt').to('cuda')
            
            output = model(**inputs)
            
            attentions = output[1]
            attentions = torch.cat([(layer) for layer in attentions])
            attentions.to('cuda')
            AE = calculate_AE_matrix(attentions)
            AE.to('cuda')
            attention_entropy += AE
            data_amount+=1
        else:
            model.to('cpu')
            attention_entropy=attention_entropy.to('cpu')
            input_text = data['Title']
            inputs = tokenizer(input_text,return_tensors='pt').to('cpu')
            output = model(**inputs)
            attentions = output[1]
            attentions = torch.cat([(layer) for layer in attentions])
            attention_entropy += calculate_AE_matrix(attentions.to('cpu'))
            data_amount+=1
    AE_matrix = attention_entropy / data_amount    
    return AE_matrix

In [45]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from textpruner import TransformerPruner

# Load model directly
tokenizer = AutoTokenizer.from_pretrained("hw2942/bert-base-chinese-finetuning-financial-news-sentiment-v2")
model = AutoModelForSequenceClassification.from_pretrained("hw2942/bert-base-chinese-finetuning-financial-news-sentiment-v2",output_attentions=True)

# load the dataset 
ds = load_dataset("hw2942/financial-news-sentiment")

def get_acc(test_dataset,model,tokenizer,device='cuda'):
    total = 0
    right = 0
    model.to(device)
    for data in test_dataset:
        inputs = tokenizer(data['Title'],return_tensors='pt').to(device)
        outputs = model(**inputs)
        total  += 1
        if torch.max(outputs[0][0].softmax(0),dim=0).indices==data['labels']:
            right += 1
    return right/total

train_dataset = ds['train']


  return torch.load(checkpoint_file, map_location="cpu")


In [3]:
ds['train']

Dataset({
    features: ['Title', 'labels'],
    num_rows: 2329
})

In [46]:
test_dataset = ds['train']
model.to('cpu')
inputs = tokenizer(ds['train'][0]['Title'],return_tensors='pt').to('cpu')
model(**inputs)[1]


(tensor([[[[6.2043e-04, 2.6172e-03, 8.9522e-04,  ..., 1.2980e-03,
            1.7472e-03, 9.6261e-01],
           [3.0339e-02, 7.0712e-02, 2.1275e-02,  ..., 2.1203e-02,
            4.5651e-02, 6.6954e-03],
           [3.1019e-02, 2.7490e-02, 9.1975e-03,  ..., 4.5575e-02,
            1.7051e-02, 6.8314e-03],
           ...,
           [4.5614e-02, 3.8275e-02, 3.3295e-02,  ..., 5.8891e-02,
            2.1311e-02, 1.8636e-02],
           [3.5039e-02, 1.5726e-02, 4.6151e-02,  ..., 6.7893e-02,
            6.2155e-03, 6.7138e-03],
           [9.1244e-01, 5.0548e-03, 1.4844e-03,  ..., 5.4512e-03,
            4.8061e-03, 2.7211e-03]],
 
          [[2.2416e-02, 1.3964e-02, 1.0259e-02,  ..., 8.6136e-03,
            4.5173e-03, 7.3297e-01],
           [1.0430e-01, 4.7057e-02, 3.1528e-01,  ..., 2.3341e-04,
            7.2508e-03, 8.3452e-03],
           [3.3869e-02, 4.7658e-01, 8.4863e-03,  ..., 3.8979e-04,
            4.2075e-04, 2.7585e-02],
           ...,
           [2.6577e-02, 1.9967e-03, 1.

In [16]:
AE=calculate_AE_matrix(torch.cat([(layer) for layer in model(**inputs)[1]]))


In [17]:
AE.size()

torch.Size([12, 12])

In [23]:
list(ds['train'][0].values())[0]

'日元上周净空头头寸创四个月新低 美元/日元低位盘整'

In [3]:
from tqdm import tqdm
AE_BERT=get_AE_matrix(model,ds['train'],heads_per_layer=12,layers=12)
AE_BERT

 77%|███████▋  | 1800/2329 [01:12<00:21, 24.95it/s]


tensor([[2.5923, 1.5561, 2.0099, 2.6973, 0.8541, 2.6222, 2.8923, 2.3042, 2.6127,
         2.6137, 2.2012, 2.0697],
        [0.5199, 2.5442, 1.5475, 1.8903, 1.9164, 2.1648, 0.4883, 1.0825, 1.6545,
         1.6226, 1.7700, 2.0162],
        [1.2914, 0.6142, 1.3902, 0.7059, 1.2123, 2.0710, 1.2120, 0.9345, 0.5865,
         1.1221, 1.0552, 0.7694],
        [1.6220, 1.9222, 1.3270, 2.0777, 1.6365, 2.3013, 0.8948, 0.9061, 1.6672,
         1.6411, 0.8219, 1.6394],
        [1.8834, 1.4369, 0.4182, 0.9051, 1.4056, 0.5375, 1.2151, 1.5664, 0.9732,
         1.9921, 1.2135, 1.6971],
        [0.7581, 1.7851, 1.2582, 1.3668, 1.0130, 1.2287, 1.0818, 0.8757, 1.5096,
         1.2374, 0.7788, 0.7061],
        [1.5522, 1.0557, 0.4227,    nan, 1.5409, 1.0541, 0.6438, 0.7255, 1.0248,
         0.9740, 0.7192, 0.4754],
        [0.4826, 0.8135, 1.5645, 1.7033, 0.5167, 0.3270, 0.8177, 1.2983, 1.2855,
         1.2419, 0.4703, 1.4300],
        [1.3956, 1.0947, 0.8130, 0.5790, 0.7177, 0.9751, 1.4121, 1.1594, 0.7287,

## 分批计算

In [4]:
from tqdm import tqdm
import torch.utils.data as Data

class MyDataset(Data.Dataset):
    def __init__(self,dataset):
        self.dataset = dataset
    def __len__(self):
        return len(self.dataset['labels'])
    def __getitem__(self,idx):
        keys = list(self.dataset.keys())
        values = list(self.dataset.values())
        return {keys[0]:values[0][idx],keys[1]:values[1][idx]}

In [None]:
dataset_=ds['train'][:1300]
# dataset_=ds['train'][1300:]
data_val = MyDataset(dataset_)

In [4]:

len(data_val)


1300

In [5]:
AE_BERT_1300=get_AE_matrix(model,data_val,heads_per_layer=12,layers=12)
AE_BERT_1300

100%|██████████| 1300/1300 [00:53<00:00, 24.09it/s]


tensor([[2.5733, 1.5489, 1.9908, 2.6849, 0.8528, 2.6035, 2.8764, 2.2905, 2.5992,
         2.5950, 2.1883, 2.0580],
        [0.5200, 2.5300, 1.5345, 1.8719, 1.8991, 2.1449, 0.4847, 1.0791, 1.6455,
         1.6176, 1.7588, 2.0017],
        [1.2789, 0.6137, 1.3807, 0.7062, 1.2101, 2.0585, 1.2067, 0.9304, 0.5866,
         1.1217, 1.0524, 0.7685],
        [1.6134, 1.9124, 1.3239, 2.0659, 1.6261, 2.2873, 0.8946, 0.9030, 1.6609,
         1.6354, 0.8169, 1.6369],
        [1.8620, 1.4244, 0.4146, 0.8953, 1.3967, 0.5329, 1.2030, 1.5551, 0.9651,
         1.9833, 1.2056, 1.6829],
        [0.7505, 1.7763, 1.2397, 1.3589, 1.0138, 1.2194, 1.0769, 0.8675, 1.4972,
         1.2380, 0.7760, 0.7037],
        [1.5486, 1.0500, 0.4174,    nan, 1.5292, 1.0432, 0.6414, 0.7189, 1.0224,
         0.9723, 0.7162, 0.4707],
        [0.4806, 0.8112, 1.5596, 1.6928, 0.5155, 0.3226, 0.8207, 1.2946, 1.2774,
         1.2389, 0.4657, 1.4253],
        [1.3947, 1.0846, 0.8034, 0.5748, 0.7155, 0.9705, 1.4104, 1.1547, 0.7272,

In [6]:
# torch.save(AE_BERT_1300,"./BERT_1300AE.pt")

In [5]:
dataset_=ds['train'][1300:]
# dataset_=ds['train'][1300:]
data_val = MyDataset(dataset_)

In [None]:
len(data_val)

In [6]:
AE_BERT_1029=get_AE_matrix(model,data_val,heads_per_layer=12,layers=12)
AE_BERT_1029

100%|██████████| 1029/1029 [00:42<00:00, 24.33it/s]


tensor([[2.6053, 1.5640, 2.0297, 2.7009, 0.8552, 2.6395, 2.9051, 2.3188, 2.6247,
         2.6279, 2.2114, 2.0729],
        [0.5219, 2.5554, 1.5603, 1.9269, 1.9452, 2.1911, 0.4886, 1.0849, 1.6617,
         1.6322, 1.7862, 2.0481],
        [1.3105, 0.6066, 1.4112, 0.7041, 1.2184, 2.0746, 1.2191, 0.9379, 0.5893,
         1.1219, 1.0580, 0.7709],
        [1.6385, 1.9381, 1.3334, 2.0964, 1.6491, 2.3310, 0.8967, 0.9106, 1.6754,
         1.6492, 0.8303, 1.6455],
        [1.9122, 1.4634, 0.4257, 0.9326, 1.4200, 0.5505, 1.2423, 1.5851, 1.0017,
         2.0043, 1.2405, 1.7199],
        [0.7802, 1.8214, 1.3231, 1.3877, 1.0071, 1.2804, 1.0980, 0.9036, 1.5637,
         1.2551, 0.7820, 0.7039],
        [1.5597, 1.0728, 0.4300,    nan, 1.5933, 1.0620, 0.6550, 0.7434, 1.0363,
         0.9786, 0.7136, 0.4856],
        [0.4849, 0.8250, 1.5636, 1.7377, 0.5206, 0.3269, 0.8168, 1.3085, 1.3191,
         1.2624, 0.4630, 1.4556],
        [1.4129, 1.1226, 0.8353, 0.5916, 0.7379, 0.9988, 1.4209, 1.1902, 0.7390,

In [7]:
# torch.save(AE_BERT_1029,"./BERT_1029AE.pt")

In [2]:
import torch
AE1_1300=torch.load('BERT_1300AE.pt')
AE1301_2329=torch.load('BERT_1029AE.pt')
AE_matrix=(AE1_1300*1300+AE1301_2329*(2329-1300))/2329

  AE1_1300=torch.load('BERT_1300AE.pt')
  AE1301_2329=torch.load('BERT_1029AE.pt')


In [4]:
# torch.save(AE_matrix,"./BERT_2329AE.pt")

In [None]:
get_acc(train_dataset,model,tokenizer,device='cuda')

# ALBERT

In [4]:
def calculate_AE_matrix(attention_matrix):
    return (torch.log(attention_matrix) * attention_matrix * (-1)).sum(dim=3).mean(dim=2)

# 函数原型如下，这里的if条件判断则是为了防止gpu上放置数据过多，导致核崩溃
# 这里需要基于这个函数来微微调整得到各个模型的AE矩阵计算的代码

def get_AE_matrix(model,dataset,heads_per_layer,layers):
    model.to('cpu')
    attention_entropy=torch.tensor([[0.]*heads_per_layer]*layers)
    AE_matrix = torch.tensor([[0.]*heads_per_layer]*layers)
    data_amount = 0
    for data in tqdm(dataset):
        if data_amount/len(dataset)<=0.25:
            if model.device!=torch.device(type='cuda',index=0):
                model.to('cuda')
            attention_entropy=attention_entropy.to('cuda')
            input_text_premise = data['premise']
            input_text_hypothesis = data['hypothesis']
            inputs = tokenizer(input_text_premise,input_text_hypothesis,return_tensors='pt').to('cuda')
            
            output = model(**inputs)
            
            attentions = output[1]
            attentions = torch.cat([(layer) for layer in attentions])
            attentions.to('cuda')
            AE = calculate_AE_matrix(attentions)
            AE.to('cuda')
            attention_entropy += AE
            data_amount+=1
        else:
            model.to('cpu')
            attention_entropy=attention_entropy.to('cpu')
            input_text_premise = data['premise']
            input_text_hypothesis = data['hypothesis']
            inputs = tokenizer(input_text_premise,input_text_hypothesis,return_tensors='pt').to('cpu')
            output = model(**inputs)
            attentions = output[1]
            attentions = torch.cat([(layer) for layer in attentions])
            attention_entropy += calculate_AE_matrix(attentions.to('cpu'))
            data_amount+=1
    AE_matrix = attention_entropy / data_amount    
    return AE_matrix

In [2]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("Alireza1044/albert-base-v2-mnli")
model = AutoModelForSequenceClassification.from_pretrained("Alireza1044/albert-base-v2-mnli",output_attentions=True)

  return torch.load(checkpoint_file, map_location="cpu")


In [3]:
from datasets import load_dataset

ds = load_dataset("nyu-mll/glue", "mnli")

In [51]:
ds['train'][0]

{'premise': 'Conceptually cream skimming has two basic dimensions - product and geography.',
 'hypothesis': 'Product and geography are what make cream skimming work. ',
 'label': 1,
 'idx': 0}

In [12]:
ds# 数据量太大，这里选择使用验证集

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})

In [52]:
model.to('cpu')
inputs=tokenizer(data_val[0]['premise'],data_val[0]['hypothesis'],return_tensors='pt')
attentions = model(**inputs)[1]
attentions = torch.cat([(layer) for layer in attentions])
attentions.to('cuda')
AE = calculate_AE_matrix(attentions)

In [63]:
from textpruner import summary

In [2]:
model.state_dict

<bound method Module.state_dict of AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear

In [69]:
print(summary(model))


LAYER NAME       	        #PARAMS	     RATIO	 MEM(MB)
--model:         	     11,685,891	   100.00%	   44.58
  --albert:      	     11,683,584	    99.98%	   44.57
    --embeddings:	      3,906,048	    33.43%	   14.90
    --encoder:   	      7,186,944	    61.50%	   27.42
    --pooler:    	        590,592	     5.05%	    2.25
  --classifier:  	          2,307	     0.02%	    0.01
    --weight:    	          2,304	     0.02%	    0.01
    --bias:      	              3	     0.00%	    0.00



In [73]:
pruner=TransformerPruner(model)
head_mask=torch.tensor(12*[12*[0]])
pruner.prune(save_model=False,head_mask=head_mask)

IndexError: index 1 is out of range

In [70]:
pruner=TransformerPruner(model)
head_mask=torch.tensor(1*[12*[0]])
pruner.prune(save_model=False,head_mask=head_mask)



In [71]:
print(summary(model))


LAYER NAME       	        #PARAMS	     RATIO	 MEM(MB)
--model:         	      9,324,291	   100.00%	   35.57
  --albert:      	      9,321,984	    99.98%	   35.56
    --embeddings:	      3,906,048	    41.89%	   14.90
    --encoder:   	      4,825,344	    51.75%	   18.41
    --pooler:    	        590,592	     6.33%	    2.25
  --classifier:  	          2,307	     0.02%	    0.01
    --weight:    	          2,304	     0.02%	    0.01
    --bias:      	              3	     0.00%	    0.00



In [86]:
pruner=TransformerPruner(model)
head_mask=torch.tensor(1*[12*[12*[0]]])
pruner.prune(save_model=False,head_mask=head_mask)

In [87]:
print(summary(model)) # albert的pruner很奇怪，不能只剪一个头，不然怎么可能12个掩码就够了……但它还是144个头的,猜测是跨层共享参数导致的，可能一个头12层共享


LAYER NAME       	        #PARAMS	     RATIO	 MEM(MB)
--model:         	     11,685,891	   100.00%	   44.58
  --albert:      	     11,683,584	    99.98%	   44.57
    --embeddings:	      3,906,048	    33.43%	   14.90
    --encoder:   	      7,186,944	    61.50%	   27.42
    --pooler:    	        590,592	     5.05%	    2.25
  --classifier:  	          2,307	     0.02%	    0.01
    --weight:    	          2,304	     0.02%	    0.01
    --bias:      	              3	     0.00%	    0.00



In [54]:
AE.size()

torch.Size([12, 12])

In [41]:
len(model(**inputs)[1])

12

In [43]:
model(**inputs)[1]

(tensor([[[[3.9471e-02, 1.4022e-02, 2.5336e-02,  ..., 9.5757e-02,
            4.2413e-02, 2.9814e-01],
           [4.2149e-02, 1.7616e-02, 9.4237e-02,  ..., 1.1961e-01,
            4.0377e-02, 6.3734e-03],
           [1.5737e-01, 3.9418e-02, 1.1913e-01,  ..., 1.1628e-01,
            2.8315e-02, 1.1212e-02],
           ...,
           [1.0692e-01, 5.1957e-03, 2.3589e-02,  ..., 3.8372e-01,
            7.1231e-02, 5.4784e-02],
           [7.4828e-02, 5.1109e-03, 1.5864e-02,  ..., 1.7942e-01,
            2.4952e-01, 1.2513e-01],
           [2.0523e-01, 5.8701e-02, 1.6576e-02,  ..., 5.7049e-02,
            7.7924e-02, 6.7908e-02]],
 
          [[9.0055e-01, 1.8377e-02, 6.0108e-03,  ..., 6.5669e-04,
            5.8390e-04, 1.1724e-02],
           [7.6343e-01, 7.3960e-02, 1.1563e-02,  ..., 1.5341e-04,
            1.1493e-04, 1.0035e-03],
           [5.7970e-01, 2.7745e-01, 3.6470e-02,  ..., 8.4274e-05,
            2.8290e-05, 2.0670e-04],
           ...,
           [9.9369e-02, 1.6989e-02, 8.

In [1]:

def calculate_AE_matrix(attention_matrix):
    return (torch.log(attention_matrix) * attention_matrix * (-1)).sum(dim=3).mean(dim=2)

# 函数原型如下，这里的if条件判断则是为了防止gpu上放置数据过多，导致核崩溃
# 这里需要基于这个函数来微微调整得到各个模型的AE矩阵计算的代码

def get_AE_matrix(model,dataset,heads_per_layer,layers):
    model.to('cpu')
    attention_entropy=torch.tensor([[0.]*heads_per_layer]*layers)
    AE_matrix = torch.tensor([[0.]*heads_per_layer]*layers)
    data_amount = 0
    for data in tqdm(dataset):
        if data_amount/len(dataset)<=0.5:
            if model.device!=torch.device(type='cuda',index=0):
                model.to('cuda')
            attention_entropy=attention_entropy.to('cuda')
            input_text_premise = data['premise']
            input_text_hypothesis = data['hypothesis']
            inputs = tokenizer(input_text_premise,input_text_hypothesis,return_tensors='pt').to('cuda')
            
            output = model(**inputs)
            
            attentions = output[1]
            attentions = torch.cat([(layer) for layer in attentions])
            attentions.to('cuda')
            AE = calculate_AE_matrix(attentions)
            AE.to('cuda')
            attention_entropy += AE
            data_amount+=1
        else:
            model.to('cpu')
            attention_entropy=attention_entropy.to('cpu')
            input_text_premise = data['premise']
            input_text_hypothesis = data['hypothesis']
            inputs = tokenizer(input_text_premise,input_text_hypothesis,return_tensors='pt').to('cpu')
            output = model(**inputs)
            attentions = output[1]
            attentions = torch.cat([(layer) for layer in attentions])
            attention_entropy += calculate_AE_matrix(attentions.to('cpu'))
            data_amount+=1
    AE_matrix = attention_entropy / data_amount    
    return AE_matrix
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("Alireza1044/albert-base-v2-mnli")
model = AutoModelForSequenceClassification.from_pretrained("Alireza1044/albert-base-v2-mnli",output_attentions=True)
from datasets import load_dataset

ds = load_dataset("nyu-mll/glue", "mnli")
from tqdm import tqdm
import torch
import torch.utils.data as Data

class MyDataset(Data.Dataset):
    def __init__(self,dataset):
        self.dataset = dataset
    def __len__(self):
        return len(self.dataset['label'])
    def __getitem__(self,idx):
        keys = list(self.dataset.keys())
        values = list(self.dataset.values())
        return {keys[0]:values[0][idx],keys[1]:values[1][idx],keys[2]:values[2][idx]}

slices = [500*i for i in range(1,21)]
# dataset_=ds['validation_matched'][:slices[0]] #500
# dataset_=ds['validation_matched'][slices[0]:slices[1]] #500-1000
# dataset_=ds['validation_matched'][slices[1]:slices[2]] #1000-1500
# dataset_=ds['validation_matched'][slices[2]:slices[3]] #1500-2000
# dataset_=ds['validation_matched'][slices[3]:slices[4]] #2000-2500
# dataset_=ds['validation_matched'][slices[4]:slices[5]] #2500-3000
# dataset_=ds['validation_matched'][slices[5]:slices[6]] #3000-3500
# dataset_=ds['validation_matched'][slices[6]:slices[7]] #3500-4000
# dataset_=ds['validation_matched'][slices[7]:slices[8]] #4000-4500
# dataset_=ds['validation_matched'][slices[8]:slices[9]] #4500-5000
# dataset_=ds['validation_matched'][slices[9]:slices[10]] #5000-5500
# dataset_=ds['validation_matched'][slices[10]:slices[11]] #5500-6000
# dataset_=ds['validation_matched'][slices[11]:slices[12]] #6000-6500
# dataset_=ds['validation_matched'][slices[12]:slices[13]] #6500-7000
# dataset_=ds['validation_matched'][slices[13]:slices[14]] #7000-7500
# dataset_=ds['validation_matched'][slices[14]:slices[15]] #7500-8000
# dataset_=ds['validation_matched'][slices[15]:slices[16]] #8000-8500
# dataset_=ds['validation_matched'][slices[16]:slices[17]] #8500-9000
# dataset_=ds['validation_matched'][slices[17]:slices[18]] #9000-9500
dataset_=ds['validation_matched'][slices[18]:slices[19]] #9500-10000
data_val = MyDataset(dataset_)

  return torch.load(checkpoint_file, map_location="cpu")


In [2]:
AE_ALBERT=get_AE_matrix(model,data_val,heads_per_layer=12,layers=12)
AE_ALBERT

100%|██████████| 315/315 [00:17<00:00, 18.38it/s]


tensor([[3.0224, 2.1710, 1.8527, 0.6527, 0.7127, 3.1449, 2.0625, 2.1028, 2.4925,
         2.7966, 1.4605, 2.3825],
        [2.3129, 1.9487, 1.5824, 1.1471, 1.1490, 2.1990, 1.6244, 1.7697, 2.3100,
         2.3633, 0.8459, 1.8759],
        [2.3871, 2.0191, 1.6216, 1.2562, 1.2409, 2.2774, 1.6369, 1.7791, 2.2870,
         2.3291, 0.8378, 1.9776],
        [2.3471, 1.9955, 1.6162, 1.2800, 1.2775, 2.2823, 1.6111, 1.8245, 2.2594,
         2.3187, 0.8968, 1.9425],
        [2.3449, 2.0319, 1.6648, 1.3451, 1.3447, 2.3173, 1.6479, 1.8410, 2.2736,
         2.2930, 0.9327, 1.9993],
        [2.3186, 2.0453, 1.6857, 1.3974, 1.3911, 2.3278, 1.6563, 1.8490, 2.2493,
         2.2702, 0.9529, 1.9987],
        [2.2633, 2.0516, 1.6938, 1.4395, 1.4327, 2.3182, 1.6596, 1.8533, 2.2233,
         2.2241, 0.9701, 2.0102],
        [2.2203, 2.0680, 1.7373, 1.4951, 1.4823, 2.3323, 1.6900, 1.8319, 2.2289,
         2.2167, 0.9957, 2.0360],
        [2.1536, 2.0869, 1.7666, 1.5636, 1.5328, 2.2807, 1.6984, 1.8203, 2.2397,

In [5]:
# torch.save(AE_ALBERT,"./ALBERT_AE/ALBERT9500_9814AE.pt")

In [2]:
import torch
AE_ALBERT_LIST=[]
for i in range(1,20):
    AE_ALBERT_LIST.append(torch.load(f'./ALBERT_AE/ALBERT{(i-1)*500}_{i*500-1}AE.pt'))

  AE_ALBERT_LIST.append(torch.load(f'./ALBERT_AE/ALBERT{(i-1)*500}_{i*500-1}AE.pt'))


In [6]:
import torch
ALBERT9500_9814AE=torch.load('ALBERT_AE/ALBERT9500_9814AE.pt')
ALBERT9500_9814AE

  ALBERT9500_9814AE=torch.load('ALBERT_AE/ALBERT9500_9814AE.pt')


tensor([[3.0224, 2.1710, 1.8527, 0.6527, 0.7127, 3.1449, 2.0625, 2.1028, 2.4925,
         2.7966, 1.4605, 2.3825],
        [2.3129, 1.9487, 1.5824, 1.1471, 1.1490, 2.1990, 1.6244, 1.7697, 2.3100,
         2.3633, 0.8459, 1.8759],
        [2.3871, 2.0191, 1.6216, 1.2562, 1.2409, 2.2774, 1.6369, 1.7791, 2.2870,
         2.3291, 0.8378, 1.9776],
        [2.3471, 1.9955, 1.6162, 1.2800, 1.2775, 2.2823, 1.6111, 1.8245, 2.2594,
         2.3187, 0.8968, 1.9425],
        [2.3449, 2.0319, 1.6648, 1.3451, 1.3447, 2.3173, 1.6479, 1.8410, 2.2736,
         2.2930, 0.9327, 1.9993],
        [2.3186, 2.0453, 1.6857, 1.3974, 1.3911, 2.3278, 1.6563, 1.8490, 2.2493,
         2.2702, 0.9529, 1.9987],
        [2.2633, 2.0516, 1.6938, 1.4395, 1.4327, 2.3182, 1.6596, 1.8533, 2.2233,
         2.2241, 0.9701, 2.0102],
        [2.2203, 2.0680, 1.7373, 1.4951, 1.4823, 2.3323, 1.6900, 1.8319, 2.2289,
         2.2167, 0.9957, 2.0360],
        [2.1536, 2.0869, 1.7666, 1.5636, 1.5328, 2.2807, 1.6984, 1.8203, 2.2397,

In [18]:
AE_matrix=(sum([ae*500 for ae in AE_ALBERT_LIST])+ALBERT9500_9814AE*315)/9815
# torch.save(AE_matrix,"./ALBERT_AE/ALBERT_9815AE.pt")

# ROBERTA

In [31]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("siebert/sentiment-roberta-large-english")
model = AutoModelForSequenceClassification.from_pretrained("siebert/sentiment-roberta-large-english",output_attentions=True)

  return torch.load(checkpoint_file, map_location="cpu")


In [32]:
from datasets import load_dataset
ds = load_dataset("rahmaabusalma/tweets_sentiment_analysis")

In [34]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'sentiment'],
        num_rows: 31232
    })
    validation: Dataset({
        features: ['id', 'text', 'label', 'sentiment'],
        num_rows: 5205
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'sentiment'],
        num_rows: 5206
    })
})

In [33]:
ds['validation'][0]

{'id': 317,
 'text': 'Laying in bed til workkk... Oh the life. Definitely pinched a nerve.',
 'label': 0,
 'sentiment': 'negative'}

In [35]:
inputs = tokenizer(ds['validation'][0]['text'],return_tensors='pt')
len(model(**inputs)[1])

24

In [1]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("siebert/sentiment-roberta-large-english")
model = AutoModelForSequenceClassification.from_pretrained("siebert/sentiment-roberta-large-english",output_attentions=True)
from datasets import load_dataset
ds = load_dataset("rahmaabusalma/tweets_sentiment_analysis")
def calculate_AE_matrix(attention_matrix):
    return (torch.log(attention_matrix) * attention_matrix * (-1)).sum(dim=3).mean(dim=2)

# 函数原型如下，这里的if条件判断则是为了防止gpu上放置数据过多，导致核崩溃
# 这里需要基于这个函数来微微调整得到各个模型的AE矩阵计算的代码

def get_AE_matrix(model,dataset,heads_per_layer,layers):
    model.to('cpu')
    attention_entropy=torch.tensor([[0.]*heads_per_layer]*layers)
    AE_matrix = torch.tensor([[0.]*heads_per_layer]*layers)
    data_amount = 0
    for data in tqdm(dataset):
        if data_amount/len(dataset)<=0.4:
            if model.device!=torch.device(type='cuda',index=0):
                model.to('cuda')
            attention_entropy=attention_entropy.to('cuda')
            input_text = data['text']
            inputs = tokenizer(input_text,return_tensors='pt').to('cuda')
            output = model(**inputs)
            
            attentions = output[1]
            attentions = torch.cat([(layer) for layer in attentions])
            attentions.to('cuda')
            AE = calculate_AE_matrix(attentions)
            AE.to('cuda')
            attention_entropy += AE
            data_amount+=1
        else:
            model.to('cpu')
            attention_entropy=attention_entropy.to('cpu')
            input_text = data['text']
            inputs = tokenizer(input_text,return_tensors='pt').to('cpu')
            output = model(**inputs)
            attentions = output[1]
            attentions = torch.cat([(layer) for layer in attentions])
            attention_entropy += calculate_AE_matrix(attentions.to('cpu'))
            data_amount+=1
    AE_matrix = attention_entropy / data_amount    
    return AE_matrix

from tqdm import tqdm
import torch
import torch.utils.data as Data

class MyDataset(Data.Dataset):
    def __init__(self,dataset):
        self.dataset = dataset
    def __len__(self):
        return len(self.dataset['label'])
    def __getitem__(self,idx):
        keys = list(self.dataset.keys())
        values = list(self.dataset.values())
        return {keys[0]:values[0][idx],keys[1]:values[1][idx]}

  return torch.load(checkpoint_file, map_location="cpu")


In [2]:
dataset_ = ds['validation'][5000:5500]
data_val = MyDataset(dataset_)
AE_ROBERTA=get_AE_matrix(model,data_val,16,24)

100%|██████████| 205/205 [00:28<00:00,  7.08it/s]


In [4]:
torch.save(AE_ROBERTA,"./ROBERTA_AE/ROBERTA5000_5204AE.pt")

In [1]:
import torch
AE_ROBERTA_LIST=[]
for i in range(1,11):
    AE_ROBERTA_LIST.append(torch.load(f'./ROBERTA_AE/ROBERTA{(i-1)*500}_{i*500-1}AE.pt'))

  AE_ROBERTA_LIST.append(torch.load(f'./ROBERTA_AE/ROBERTA{(i-1)*500}_{i*500-1}AE.pt'))


In [2]:
AE_ROBERTA_205 = torch.load('./ROBERTA_AE/ROBERTA5000_5204AE.pt')

  AE_ROBERTA_205 = torch.load('./ROBERTA_AE/ROBERTA5000_5204AE.pt')


In [3]:
AE_matrix=(sum([ae*500 for ae in AE_ROBERTA_LIST])+205*AE_ROBERTA_205)/5205
torch.save(AE_matrix,"./ROBERTA_AE/ROBERTA_5205AE.pt")

In [4]:
AE_matrix

tensor([[2.6263e+00, 2.5077e+00, 2.5571e+00, 2.5777e+00, 2.5178e+00, 2.6367e+00,
         2.5353e+00, 2.4944e+00, 2.6276e+00, 2.1178e+00, 2.5601e+00, 1.6013e+00,
         2.5379e+00, 2.6502e+00, 2.5943e+00, 2.3914e+00],
        [1.0713e+00, 1.0350e+00, 1.5223e+00, 1.5613e+00, 7.4442e-01, 1.7089e+00,
         1.9589e+00, 1.7404e+00, 1.6072e+00,        nan, 1.1792e+00, 8.7922e-01,
         1.9684e+00, 1.3926e+00,        nan, 1.0701e+00],
        [1.6133e+00, 8.2368e-01, 3.8522e-03, 3.8029e-01, 1.2726e+00,        nan,
         4.2718e-01, 4.6054e-01, 1.0652e+00,        nan,        nan, 7.4415e-01,
                nan, 3.9545e-07, 1.2667e+00, 9.9539e-01],
        [5.7705e-01, 9.4560e-01, 1.1531e+00, 8.6386e-05, 8.0989e-01, 8.7567e-01,
         8.3881e-01, 1.4427e+00, 1.2176e+00, 8.1250e-01, 5.6353e-01, 8.0658e-01,
                nan, 6.5196e-01, 5.1093e-01, 1.1017e+00],
        [9.0117e-01, 1.8855e+00, 1.2955e+00, 6.7242e-01, 1.0615e+00, 1.1279e+00,
         7.7869e-01, 1.3825e+00, 2.0021

这里的问题是nan，个人猜测是下溢出，因为log(0)的存在导致下溢出问题进而出现nan，应该是计算注意力熵的时候出现的问题。后续可以对计算方式进行微调。

In [11]:
inputs=tokenizer(ds['train'][0]['text'],return_tensors='pt')

In [12]:
inputs

{'input_ids': tensor([[    0, 32963,   154, 28562, 26432,   281,     6,  1423, 22383,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [13]:
sum(model(**inputs)[1][0][0][0][0])

tensor(1., grad_fn=<AddBackward0>)

# XLM-ROBERTA

In [29]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")
from datasets import load_dataset

ds = load_dataset("papluca/language-identification")



In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 70000
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 10000
    })
})

In [21]:
# Load model directly
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection",output_attentions=True)
from datasets import load_dataset

ds = load_dataset("papluca/language-identification")

def calculate_AE_matrix(attention_matrix):
    return (torch.log(attention_matrix) * attention_matrix * (-1)).sum(dim=3).mean(dim=2)

# 函数原型如下，这里的if条件判断则是为了防止gpu上放置数据过多，导致核崩溃
# 这里需要基于这个函数来微微调整得到各个模型的AE矩阵计算的代码

def get_AE_matrix(model,dataset,heads_per_layer,layers):
    model.to('cpu')
    attention_entropy=torch.tensor([[0.]*heads_per_layer]*layers)
    AE_matrix = torch.tensor([[0.]*heads_per_layer]*layers)
    data_amount = 0
    for data in tqdm(dataset):
        if data_amount/len(dataset)<=0.4:
            if model.device!=torch.device(type='cuda',index=0):
                model.to('cuda')
            attention_entropy=attention_entropy.to('cuda')
            input_text = data['text']
            inputs = tokenizer(input_text,return_tensors='pt').to('cuda')
            output = model(**inputs)
            
            attentions = output[1]
            attentions = torch.cat([(layer) for layer in attentions])
            attentions.to('cuda')
            AE = calculate_AE_matrix(attentions)
            AE.to('cuda')
            attention_entropy += AE
            data_amount+=1
        else:
            model.to('cpu')
            attention_entropy=attention_entropy.to('cpu')
            input_text = data['text']
            inputs = tokenizer(input_text,return_tensors='pt').to('cpu')
            output = model(**inputs)
            attentions = output[1]
            attentions = torch.cat([(layer) for layer in attentions])
            attention_entropy += calculate_AE_matrix(attentions.to('cpu'))
            data_amount+=1
    AE_matrix = attention_entropy / data_amount    
    return AE_matrix

from tqdm import tqdm
import torch
import torch.utils.data as Data

class MyDataset(Data.Dataset):
    def __init__(self,dataset):
        self.dataset = dataset
    def __len__(self):
        return len(self.dataset['labels'])
    def __getitem__(self,idx):
        keys = list(self.dataset.keys())
        values = list(self.dataset.values())
        return {keys[0]:values[0][idx],keys[1]:values[1][idx]}

In [2]:
dataset_ = ds['validation'][7724:8000]
data_val = MyDataset(dataset_)
AE_XLM_ROBERTA=get_AE_matrix(model,data_val,12,12)

100%|██████████| 276/276 [00:15<00:00, 17.48it/s]


In [3]:
torch.save(AE_XLM_ROBERTA,"./XLM_ROBERTA_AE/XLM_ROBERTA7724_7999AE.pt")

In [5]:
import torch
AE_XLM_ROBERTA_LIST=[]
for i in range(1,11):
    if i==8:
        continue
    AE_XLM_ROBERTA_LIST.append(torch.load(f'./XLM_ROBERTA_AE/XLM_ROBERTA{(i-1)*1000}_{i*1000-1}AE.pt'))

  AE_XLM_ROBERTA_LIST.append(torch.load(f'./XLM_ROBERTA_AE/XLM_ROBERTA{(i-1)*1000}_{i*1000-1}AE.pt'))


In [7]:
AE_XLM_ROBERTA_LIST

[tensor([[2.7208, 2.9040, 2.5561, 2.7971, 3.0390, 2.6836, 2.5233, 2.7726, 1.5327,
          2.7072, 2.9090, 2.9809],
         [0.7757, 1.3725, 2.3615, 1.5987, 0.6239, 1.5879, 2.1649, 0.8296,    nan,
          1.9505, 1.9480, 1.4741],
         [0.7764, 0.9156, 1.0146, 1.8412, 0.6824, 0.1279, 0.7044, 1.8126, 1.3499,
          1.9542, 1.9365, 1.3603],
         [0.4372, 2.2146, 1.4126, 1.3566, 1.1266, 0.2592, 1.6192, 2.2266, 0.6341,
          1.2258, 1.7459, 1.0886],
         [1.7033, 1.7393, 1.0916, 0.5789, 1.0968, 0.8719, 2.3253, 0.0756, 2.0827,
          1.2578, 2.4729, 1.1472],
         [1.4798, 1.8523, 1.8628, 2.5126, 1.2633, 1.4314, 1.1985, 2.2842, 1.1645,
          1.6478, 2.1991, 1.7491],
         [1.8639, 1.2761, 0.2099, 1.9127, 1.3415, 1.7540, 2.0983, 1.9856, 1.8368,
          1.7333, 1.5643, 0.2295],
         [1.8684, 0.7614, 1.5729, 0.5282, 1.9519, 2.3432, 2.2022, 1.8511, 2.4962,
          2.5041, 1.7224, 0.9746],
         [0.7545, 1.7521, 1.4338, 2.6584, 1.7590, 0.3367, 1.6397

In [17]:
a = torch.load('./XLM_ROBERTA_AE/XLM_ROBERTA7000_7722AE.pt')
b = torch.load('./XLM_ROBERTA_AE/XLM_ROBERTA7724_7999AE.pt')
AE_matrix=(sum([ae*1000 for ae in AE_XLM_ROBERTA_LIST])+a*723+b*276)/9999
torch.save(AE_matrix,"./XLM_ROBERTA_AE/XLM_ROBERTA_9999AE.pt")

  a = torch.load('./XLM_ROBERTA_AE/XLM_ROBERTA7000_7722AE.pt')
  b = torch.load('./XLM_ROBERTA_AE/XLM_ROBERTA7724_7999AE.pt')


这里选择跳过一个因为，第7724的位置那个数据长大于864，导致报错

In [19]:
AE_matrix

tensor([[2.7472, 2.9270, 2.5793, 2.8226, 3.0622, 2.7021, 2.5405, 2.7961, 1.5514,
         2.7265, 2.9318, 3.0028],
        [0.7816, 1.3866, 2.3770, 1.6064, 0.6307, 1.5959, 2.1630, 0.8361,    nan,
         1.9572, 1.9643, 1.4853],
        [0.7862, 0.9225, 1.0233, 1.8597, 0.6854,    nan, 0.7071, 1.8317, 1.3581,
         1.9713, 1.9579, 1.3653],
        [0.4398, 2.2347, 1.4329, 1.3632, 1.1460, 0.2639, 1.6362, 2.2474, 0.6381,
         1.2343, 1.7628, 1.1011],
        [1.7116, 1.7552, 1.0954, 0.5926, 1.1112, 0.8905, 2.3594, 0.0836, 2.1158,
         1.2604, 2.5000, 1.1761],
        [1.4902, 1.8560, 1.8729, 2.5291, 1.2911, 1.4341, 1.2276, 2.2982, 1.1720,
         1.6461, 2.2137, 1.7572],
        [1.8695, 1.2799, 0.2200, 1.9231, 1.3465, 1.7786, 2.1079, 2.0013, 1.8468,
         1.7569, 1.5979, 0.2396],
        [1.8985, 0.7705, 1.5725, 0.5332, 1.9640, 2.3576, 2.2119, 1.8527, 2.5077,
         2.5288, 1.7239, 0.9798],
        [0.7803, 1.7603, 1.4389, 2.6791, 1.7531, 0.3474, 1.6300, 1.0447, 0.8844,

这里的问题是nan，个人猜测是下溢出，因为log(0)的存在导致下溢出问题进而出现nan，应该是计算注意力熵的时候出现的问题。后续可以对计算方式进行微调。