In [7]:
import os
import torch

def check_nan_in_tensor(tensor):
    """检查张量中是否包含NaN值"""
    return torch.isnan(tensor).any().item()  # 返回True如果包含NaN

def check_pt_file_for_nan(file_path):
    """加载.pt文件并检查其中的张量是否包含NaN"""
    try:
        # 加载pt文件
        data = torch.load(file_path)
        
        # 如果数据是一个字典，可能包含多个张量
        if isinstance(data, dict):
            for key, tensor in data.items():
                if torch.is_tensor(tensor) and check_nan_in_tensor(tensor):
                    return True
        # 如果数据是一个张量
        elif torch.is_tensor(data):
            if check_nan_in_tensor(data):
                return True
    except Exception as e:
        print(f"无法加载文件: {file_path}, 错误: {e}")
    
    return False

def find_nan_in_pt_files(directory):
    """遍历文件夹下所有.pt文件，检查其中是否包含NaN"""
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".pt"):
                file_path = os.path.join(root, file)
                if check_pt_file_for_nan(file_path):
                    print(f"发现NaN值的文件: {file_path}")

# 示例使用：指定你要遍历的文件夹路径
directory_path = 'D:\\research\\research4\BERT_AE'  # 替换为你的文件夹路径
find_nan_in_pt_files(directory_path)


发现NaN值的文件: D:\research\research4\BERT_AE\BERT_1029AE.pt
发现NaN值的文件: D:\research\research4\BERT_AE\BERT_1300AE.pt
发现NaN值的文件: D:\research\research4\BERT_AE\BERT_2329AE.pt


  data = torch.load(file_path)


发现NaN值的文件: D:\research\research4\BERT_AE\BERT_1029AE.pt

发现NaN值的文件: D:\research\research4\BERT_AE\BERT_1300AE.pt

发现NaN值的文件: D:\research\research4\BERT_AE\BERT_2329AE.pt

C:\Users\m1830\AppData\Local\Temp\ipykernel_35696\3535023395.py:12: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  data = torch.load(file_path)

发现NaN值的文件: D:\research\research4\XLM_ROBERTA_AE\XLM_ROBERTA0_999AE.pt

发现NaN值的文件: D:\research\research4\XLM_ROBERTA_AE\XLM_ROBERTA1000_1999AE.pt

发现NaN值的文件: D:\research\research4\XLM_ROBERTA_AE\XLM_ROBERTA2000_2999AE.pt

发现NaN值的文件: D:\research\research4\XLM_ROBERTA_AE\XLM_ROBERTA3000_3999AE.pt

发现NaN值的文件: D:\research\research4\XLM_ROBERTA_AE\XLM_ROBERTA4000_4999AE.pt

发现NaN值的文件: D:\research\research4\XLM_ROBERTA_AE\XLM_ROBERTA5000_5999AE.pt

发现NaN值的文件: D:\research\research4\XLM_ROBERTA_AE\XLM_ROBERTA6000_6999AE.pt

发现NaN值的文件: D:\research\research4\XLM_ROBERTA_AE\XLM_ROBERTA7000_7722AE.pt

发现NaN值的文件: D:\research\research4\XLM_ROBERTA_AE\XLM_ROBERTA7724_7999AE.pt

发现NaN值的文件: D:\research\research4\XLM_ROBERTA_AE\XLM_ROBERTA8000_8999AE.pt

发现NaN值的文件: D:\research\research4\XLM_ROBERTA_AE\XLM_ROBERTA9000_9999AE.pt

发现NaN值的文件: D:\research\research4\XLM_ROBERTA_AE\XLM_ROBERTA_9999AE.pt

C:\Users\m1830\AppData\Local\Temp\ipykernel_35696\2206183519.py:12: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  data = torch.load(file_path)

发现NaN值的文件: D:\research\research4\ROBERTA_AE\ROBERTA0_499AE.pt

发现NaN值的文件: D:\research\research4\ROBERTA_AE\ROBERTA1000_1499AE.pt

发现NaN值的文件: D:\research\research4\ROBERTA_AE\ROBERTA1500_1999AE.pt

发现NaN值的文件: D:\research\research4\ROBERTA_AE\ROBERTA2000_2499AE.pt

发现NaN值的文件: D:\research\research4\ROBERTA_AE\ROBERTA2500_2999AE.pt

发现NaN值的文件: D:\research\research4\ROBERTA_AE\ROBERTA3000_3499AE.pt

发现NaN值的文件: D:\research\research4\ROBERTA_AE\ROBERTA3500_3999AE.pt

发现NaN值的文件: D:\research\research4\ROBERTA_AE\ROBERTA4000_4499AE.pt

发现NaN值的文件: D:\research\research4\ROBERTA_AE\ROBERTA4500_4999AE.pt

发现NaN值的文件: D:\research\research4\ROBERTA_AE\ROBERTA5000_5204AE.pt

发现NaN值的文件: D:\research\research4\ROBERTA_AE\ROBERTA500_999AE.pt

发现NaN值的文件: D:\research\research4\ROBERTA_AE\ROBERTA_5205AE.pt

C:\Users\m1830\AppData\Local\Temp\ipykernel_35696\2877183983.py:12: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  data = torch.load(file_path)

In [4]:

torch.load('D:/research/research4/ALBERT_AE/ALBERT_9815AE.pt')

  torch.load('D:/research/research4/ALBERT_AE/ALBERT_9815AE.pt')


tensor([[3.0348, 2.1736, 1.8624, 0.6527, 0.7145, 3.1504, 2.0715, 2.1156, 2.5074,
         2.8101, 1.4712, 2.3859],
        [2.3142, 1.9504, 1.5882, 1.1475, 1.1529, 2.2011, 1.6270, 1.7698, 2.3182,
         2.3726, 0.8521, 1.8782],
        [2.3922, 2.0219, 1.6210, 1.2543, 1.2461, 2.2791, 1.6327, 1.7790, 2.2965,
         2.3330, 0.8406, 1.9844],
        [2.3477, 1.9953, 1.6170, 1.2816, 1.2811, 2.2839, 1.6132, 1.8230, 2.2660,
         2.3233, 0.9001, 1.9471],
        [2.3393, 2.0225, 1.6595, 1.3418, 1.3434, 2.3190, 1.6433, 1.8442, 2.2761,
         2.2964, 0.9387, 1.9933],
        [2.3033, 2.0325, 1.6736, 1.3917, 1.3887, 2.3262, 1.6475, 1.8484, 2.2496,
         2.2668, 0.9579, 1.9905],
        [2.2510, 2.0424, 1.6839, 1.4366, 1.4302, 2.3152, 1.6514, 1.8477, 2.2223,
         2.2203, 0.9761, 2.0008],
        [2.2106, 2.0600, 1.7294, 1.4945, 1.4819, 2.3300, 1.6855, 1.8293, 2.2300,
         2.2103, 1.0012, 2.0320],
        [2.1412, 2.0779, 1.7568, 1.5585, 1.5277, 2.2744, 1.6878, 1.8105, 2.2349,

# BERT修正

In [8]:
torch.load('D:/research/research4/BERT_AE/BERT_2329AE.pt')

  torch.load('D:/research/research4/BERT_AE/BERT_2329AE.pt')


tensor([[2.5875, 1.5556, 2.0080, 2.6920, 0.8539, 2.6194, 2.8890, 2.3030, 2.6104,
         2.6095, 2.1985, 2.0646],
        [0.5209, 2.5413, 1.5459, 1.8962, 1.9195, 2.1653, 0.4864, 1.0817, 1.6527,
         1.6240, 1.7709, 2.0222],
        [1.2929, 0.6105, 1.3941, 0.7052, 1.2138, 2.0656, 1.2122, 0.9337, 0.5878,
         1.1218, 1.0549, 0.7695],
        [1.6245, 1.9238, 1.3281, 2.0794, 1.6363, 2.3066, 0.8955, 0.9064, 1.6673,
         1.6415, 0.8228, 1.6407],
        [1.8841, 1.4416, 0.4195, 0.9118, 1.4070, 0.5407, 1.2204, 1.5684, 0.9813,
         1.9926, 1.2210, 1.6992],
        [0.7636, 1.7962, 1.2765, 1.3716, 1.0108, 1.2463, 1.0862, 0.8835, 1.5266,
         1.2456, 0.7787, 0.7038],
        [1.5535, 1.0601, 0.4230,    nan, 1.5575, 1.0515, 0.6474, 0.7297, 1.0285,
         0.9750, 0.7151, 0.4773],
        [0.4825, 0.8173, 1.5613, 1.7127, 0.5177, 0.3245, 0.8190, 1.3007, 1.2958,
         1.2493, 0.4645, 1.4387],
        [1.4027, 1.1014, 0.8175, 0.5822, 0.7254, 0.9830, 1.4150, 1.1704, 0.7324,

In [11]:
torch.load('D:/research/research4/BERT_AE/BERT_1300AE.pt')

  torch.load('D:/research/research4/BERT_AE/BERT_1300AE.pt')


tensor([[2.5733, 1.5489, 1.9908, 2.6849, 0.8528, 2.6035, 2.8764, 2.2905, 2.5992,
         2.5950, 2.1883, 2.0580],
        [0.5200, 2.5300, 1.5345, 1.8719, 1.8991, 2.1449, 0.4847, 1.0791, 1.6455,
         1.6176, 1.7588, 2.0017],
        [1.2789, 0.6137, 1.3807, 0.7062, 1.2101, 2.0585, 1.2067, 0.9304, 0.5866,
         1.1217, 1.0524, 0.7685],
        [1.6134, 1.9124, 1.3239, 2.0659, 1.6261, 2.2873, 0.8946, 0.9030, 1.6609,
         1.6354, 0.8169, 1.6369],
        [1.8620, 1.4244, 0.4146, 0.8953, 1.3967, 0.5329, 1.2030, 1.5551, 0.9651,
         1.9833, 1.2056, 1.6829],
        [0.7505, 1.7763, 1.2397, 1.3589, 1.0138, 1.2194, 1.0769, 0.8675, 1.4972,
         1.2380, 0.7760, 0.7037],
        [1.5486, 1.0500, 0.4174,    nan, 1.5292, 1.0432, 0.6414, 0.7189, 1.0224,
         0.9723, 0.7162, 0.4707],
        [0.4806, 0.8112, 1.5596, 1.6928, 0.5155, 0.3226, 0.8207, 1.2946, 1.2774,
         1.2389, 0.4657, 1.4253],
        [1.3947, 1.0846, 0.8034, 0.5748, 0.7155, 0.9705, 1.4104, 1.1547, 0.7272,

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from textpruner import TransformerPruner
from tqdm import tqdm

# Load model directly
tokenizer = AutoTokenizer.from_pretrained("hw2942/bert-base-chinese-finetuning-financial-news-sentiment-v2")
model = AutoModelForSequenceClassification.from_pretrained("hw2942/bert-base-chinese-finetuning-financial-news-sentiment-v2",output_attentions=True)

# load the dataset 
ds = load_dataset("hw2942/financial-news-sentiment")


  return torch.load(checkpoint_file, map_location="cpu")


In [2]:
def calculate_AE_matrix(attention_matrix):
    e=0.0000001
    if torch.isnan(attention_matrix).any().item():
        print('attention_matrix',attention_matrix)
    out = ((torch.log(1/(attention_matrix+e)))* (attention_matrix+e)).sum(dim=3).mean(dim=2)
    if torch.isnan(out).any().item():
        print('out',out)
    return out

# 函数原型如下，这里的if条件判断则是为了防止gpu上放置数据过多，导致核崩溃
# 这里需要基于这个函数来微微调整得到各个模型的AE矩阵计算的代码

def get_AE_matrix(model,dataset,heads_per_layer,layers):
    model.to('cpu')
    attention_entropy=torch.tensor([[0.]*heads_per_layer]*layers)
    AE_matrix = torch.tensor([[0.]*heads_per_layer]*layers)
    data_amount = 0
    for data in tqdm(dataset):
        if data_amount/len(dataset)<=0.35:
            if model.device!=torch.device(type='cuda',index=0):
                model.to('cuda')
            attention_entropy=attention_entropy.to('cuda')
            input_text = data['Title']
            
            inputs = tokenizer(input_text,return_tensors='pt').to('cuda')
            
            output = model(**inputs)
            
            attentions = output[1]
            attentions = torch.stack(attentions)
            if torch.isnan(attentions).any().item():
                print(attentions)
            attentions = torch.cat([(layer) for layer in attentions])
            attentions.to('cuda')

            AE = calculate_AE_matrix(attentions)
            AE.to('cuda')
            attention_entropy += AE
            data_amount+=1
        else:
            model.to('cpu')
            attention_entropy=attention_entropy.to('cpu')
            input_text = data['Title']
            inputs = tokenizer(input_text,return_tensors='pt').to('cpu')
            output = model(**inputs)
            attentions = output[1]
            attentions = torch.stack(attentions)
            if torch.isnan(attentions).any().item():
                print(attentions)
            attentions = torch.cat([(layer) for layer in attentions])
            attention_entropy += calculate_AE_matrix(attentions.to('cpu'))
            data_amount+=1
    if torch.isnan(attention_entropy).any().item():
            print(attention_entropy)
    AE_matrix = attention_entropy / data_amount    
    return AE_matrix
from tqdm import tqdm
import torch.utils.data as Data

class MyDataset(Data.Dataset):
    def __init__(self,dataset):
        self.dataset = dataset
    def __len__(self):
        return len(self.dataset['labels'])
    def __getitem__(self,idx):
        keys = list(self.dataset.keys())
        values = list(self.dataset.values())
        return {keys[0]:values[0][idx],keys[1]:values[1][idx]}



In [3]:
inputs = tokenizer(ds['train'][0]['Title'],return_tensors='pt').to('cpu')
output = model(**inputs)
attentions = output[1]
attentions[0].size()

torch.Size([1, 12, 26, 26])

In [19]:
1/attentions[6][0][3]

tensor(inf, grad_fn=<MulBackward0>)

In [10]:
(1/attentions[0]).size()

torch.Size([1, 12, 26, 26])

In [4]:
# dataset_=ds['train'][:1300]
dataset_=ds['train'][1300:]
data_val = MyDataset(dataset_)

In [5]:
AE_BERT_1300=get_AE_matrix(model,data_val,heads_per_layer=12,layers=12)
AE_BERT_1300

100%|██████████| 1029/1029 [00:47<00:00, 21.67it/s]


tensor([[2.6053, 1.5641, 2.0297, 2.7009, 0.8552, 2.6395, 2.9051, 2.3188, 2.6247,
         2.6279, 2.2114, 2.0729],
        [0.5219, 2.5554, 1.5603, 1.9269, 1.9452, 2.1911, 0.4886, 1.0849, 1.6617,
         1.6322, 1.7862, 2.0481],
        [1.3105, 0.6066, 1.4112, 0.7041, 1.2184, 2.0746, 1.2191, 0.9379, 0.5893,
         1.1219, 1.0581, 0.7709],
        [1.6385, 1.9381, 1.3334, 2.0964, 1.6491, 2.3310, 0.8967, 0.9107, 1.6754,
         1.6492, 0.8303, 1.6455],
        [1.9122, 1.4634, 0.4257, 0.9326, 1.4200, 0.5505, 1.2423, 1.5851, 1.0017,
         2.0043, 1.2405, 1.7199],
        [0.7803, 1.8214, 1.3231, 1.3877, 1.0071, 1.2804, 1.0980, 0.9036, 1.5637,
         1.2551, 0.7820, 0.7040],
        [1.5597, 1.0728, 0.4300, 0.0714, 1.5933, 1.0620, 0.6550, 0.7434, 1.0363,
         0.9786, 0.7137, 0.4856],
        [0.4849, 0.8250, 1.5636, 1.7377, 0.5206, 0.3269, 0.8168, 1.3085, 1.3191,
         1.2625, 0.4630, 1.4556],
        [1.4129, 1.1226, 0.8353, 0.5917, 0.7379, 0.9988, 1.4209, 1.1902, 0.7391,

In [6]:
torch.save(AE_BERT_1300,'D:/research/research4/BERT_AE - rectified - rectified/BERT_1029AE.pt')

In [7]:
import torch
AE1_1300=torch.load('D:/research/research4/BERT_AE - rectified - rectified/BERT_1300AE.pt')
AE1301_2329=torch.load('D:/research/research4/BERT_AE - rectified - rectified/BERT_1029AE.pt')
AE_matrix=(AE1_1300*1300+AE1301_2329*(2329-1300))/2329

  AE1_1300=torch.load('D:/research/research4/BERT_AE - rectified - rectified/BERT_1300AE.pt')
  AE1301_2329=torch.load('D:/research/research4/BERT_AE - rectified - rectified/BERT_1029AE.pt')


In [9]:
AE_matrix

tensor([[2.5875, 1.5556, 2.0080, 2.6920, 0.8539, 2.6194, 2.8891, 2.3030, 2.6105,
         2.6095, 2.1985, 2.0646],
        [0.5209, 2.5413, 1.5459, 1.8962, 1.9195, 2.1653, 0.4865, 1.0817, 1.6527,
         1.6240, 1.7709, 2.0222],
        [1.2929, 0.6106, 1.3942, 0.7053, 1.2138, 2.0656, 1.2122, 0.9337, 0.5878,
         1.1218, 1.0549, 0.7696],
        [1.6245, 1.9238, 1.3281, 2.0794, 1.6363, 2.3066, 0.8956, 0.9064, 1.6673,
         1.6415, 0.8228, 1.6407],
        [1.8842, 1.4416, 0.4195, 0.9118, 1.4070, 0.5407, 1.2204, 1.5684, 0.9813,
         1.9926, 1.2210, 1.6992],
        [0.7637, 1.7962, 1.2765, 1.3716, 1.0108, 1.2464, 1.0862, 0.8835, 1.5266,
         1.2456, 0.7787, 0.7038],
        [1.5535, 1.0601, 0.4230, 0.0724, 1.5575, 1.0515, 0.6474, 0.7297, 1.0286,
         0.9751, 0.7151, 0.4773],
        [0.4825, 0.8173, 1.5613, 1.7127, 0.5177, 0.3245, 0.8190, 1.3007, 1.2958,
         1.2493, 0.4645, 1.4387],
        [1.4027, 1.1014, 0.8175, 0.5822, 0.7254, 0.9830, 1.4150, 1.1704, 0.7324,

In [10]:
# torch.save(AE_matrix,"D:/research/research4/BERT_AE - rectified - rectified/BERT_2329AE.pt")

In [6]:
from bertviz import model_view


sentence = 'I am a good student.'
inputs = tokenizer(sentence, return_tensors='pt')

# 获取模型输出（包括注意力权重）
outputs = model(**inputs)

# 可视化注意力
attention = outputs.attentions
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
model_view(attention, tokens)

<IPython.core.display.Javascript object>

# ROBERTA修正

In [10]:
torch.load("D:/research/research4/ROBERTA_AE/ROBERTA_5205AE.pt")

  torch.load("D:/research/research4/ROBERTA_AE/ROBERTA_5205AE.pt")


tensor([[2.6263e+00, 2.5077e+00, 2.5571e+00, 2.5777e+00, 2.5178e+00, 2.6367e+00,
         2.5353e+00, 2.4944e+00, 2.6276e+00, 2.1178e+00, 2.5601e+00, 1.6013e+00,
         2.5379e+00, 2.6502e+00, 2.5943e+00, 2.3914e+00],
        [1.0713e+00, 1.0350e+00, 1.5223e+00, 1.5613e+00, 7.4442e-01, 1.7089e+00,
         1.9589e+00, 1.7404e+00, 1.6072e+00,        nan, 1.1792e+00, 8.7922e-01,
         1.9684e+00, 1.3926e+00,        nan, 1.0701e+00],
        [1.6133e+00, 8.2368e-01, 3.8522e-03, 3.8029e-01, 1.2726e+00,        nan,
         4.2718e-01, 4.6054e-01, 1.0652e+00,        nan,        nan, 7.4415e-01,
                nan, 3.9545e-07, 1.2667e+00, 9.9539e-01],
        [5.7705e-01, 9.4560e-01, 1.1531e+00, 8.6386e-05, 8.0989e-01, 8.7567e-01,
         8.3881e-01, 1.4427e+00, 1.2176e+00, 8.1250e-01, 5.6353e-01, 8.0658e-01,
                nan, 6.5196e-01, 5.1093e-01, 1.1017e+00],
        [9.0117e-01, 1.8855e+00, 1.2955e+00, 6.7242e-01, 1.0615e+00, 1.1279e+00,
         7.7869e-01, 1.3825e+00, 2.0021

In [1]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("siebert/sentiment-roberta-large-english")
model = AutoModelForSequenceClassification.from_pretrained("siebert/sentiment-roberta-large-english",output_attentions=True)
from datasets import load_dataset
ds = load_dataset("rahmaabusalma/tweets_sentiment_analysis")
def calculate_AE_matrix(attention_matrix):
    e=0.0000001
    return (torch.log(attention_matrix+e) * (attention_matrix+e) * (-1)).sum(dim=3).mean(dim=2)

# 函数原型如下，这里的if条件判断则是为了防止gpu上放置数据过多，导致核崩溃
# 这里需要基于这个函数来微微调整得到各个模型的AE矩阵计算的代码

def get_AE_matrix(model,dataset,heads_per_layer,layers):
    model.to('cpu')
    attention_entropy=torch.tensor([[0.]*heads_per_layer]*layers)
    AE_matrix = torch.tensor([[0.]*heads_per_layer]*layers)
    data_amount = 0
    for data in tqdm(dataset):
        if data_amount/len(dataset)<=0.8:
            if model.device!=torch.device(type='cuda',index=0):
                model.to('cuda')
            attention_entropy=attention_entropy.to('cuda')
            input_text = data['text']
            inputs = tokenizer(input_text,return_tensors='pt').to('cuda')
            output = model(**inputs)
            
            attentions = output[1]
            attentions = torch.cat([(layer) for layer in attentions])
            attentions.to('cuda')
            AE = calculate_AE_matrix(attentions)
            AE.to('cuda')
            attention_entropy += AE
            data_amount+=1
        else:
            model.to('cpu')
            attention_entropy=attention_entropy.to('cpu')
            input_text = data['text']
            inputs = tokenizer(input_text,return_tensors='pt').to('cpu')
            output = model(**inputs)
            attentions = output[1]
            attentions = torch.cat([(layer) for layer in attentions])
            attention_entropy += calculate_AE_matrix(attentions.to('cpu'))
            data_amount+=1
    if torch.isnan(attention_entropy).any().item():
            print('-------------',attention_entropy)
    AE_matrix = attention_entropy / data_amount    
    return AE_matrix

from tqdm import tqdm
import torch
import torch.utils.data as Data

class MyDataset(Data.Dataset):
    def __init__(self,dataset):
        self.dataset = dataset
    def __len__(self):
        return len(self.dataset['label'])
    def __getitem__(self,idx):
        keys = list(self.dataset.keys())
        values = list(self.dataset.values())
        return {keys[0]:values[0][idx],keys[1]:values[1][idx]}

  return torch.load(checkpoint_file, map_location="cpu")


In [2]:
dataset_ = ds['validation'][5000:5500]
data_val = MyDataset(dataset_)
AE_ROBERTA=get_AE_matrix(model,data_val,16,24)

100%|██████████| 205/205 [00:15<00:00, 13.36it/s]


In [3]:
torch.save(AE_ROBERTA,"D:/research/research4/ROBERTA_AE - rectified - rectified/ROBERTA5000_5204AE.pt")

In [4]:
import torch
AE_ROBERTA_LIST=[]
for i in range(1,11):
    AE_ROBERTA_LIST.append(torch.load(f'./ROBERTA_AE - rectified - rectified/ROBERTA{(i-1)*500}_{i*500-1}AE.pt'))

  AE_ROBERTA_LIST.append(torch.load(f'./ROBERTA_AE - rectified - rectified/ROBERTA{(i-1)*500}_{i*500-1}AE.pt'))


In [5]:
AE_ROBERTA_205 = torch.load('./ROBERTA_AE - rectified - rectified/ROBERTA5000_5204AE.pt')

  AE_ROBERTA_205 = torch.load('./ROBERTA_AE - rectified - rectified/ROBERTA5000_5204AE.pt')


In [6]:
AE_matrix=(sum([ae*500 for ae in AE_ROBERTA_LIST])+205*AE_ROBERTA_205)/5205
torch.save(AE_matrix,"./ROBERTA_AE - rectified - rectified/ROBERTA_5205AE.pt")

In [7]:
AE_matrix

tensor([[2.6275e+00, 2.5082e+00, 2.5576e+00, 2.5776e+00, 2.5184e+00, 2.6369e+00,
         2.5364e+00, 2.4943e+00, 2.6277e+00, 2.1178e+00, 2.5613e+00, 1.6010e+00,
         2.5381e+00, 2.6508e+00, 2.5941e+00, 2.3907e+00],
        [1.0645e+00, 1.0340e+00, 1.5193e+00, 1.5590e+00, 7.4441e-01, 1.7081e+00,
         1.9572e+00, 1.7394e+00, 1.6062e+00, 8.5963e-02, 1.1782e+00, 8.7814e-01,
         1.9689e+00, 1.3915e+00, 2.2189e-03, 1.0688e+00],
        [1.6139e+00, 8.2095e-01, 3.8718e-03, 3.8042e-01, 1.2713e+00, 4.1220e-05,
         4.2800e-01, 4.6045e-01, 1.0661e+00, 1.0304e-02, 4.0818e-05, 7.4533e-01,
         4.0825e-05, 4.1134e-05, 1.2675e+00, 9.9594e-01],
        [5.7747e-01, 9.4670e-01, 1.1534e+00, 1.3081e-04, 8.0885e-01, 8.7455e-01,
         8.3887e-01, 1.4428e+00, 1.2176e+00, 8.1248e-01, 5.6476e-01, 8.0764e-01,
         1.2984e-01, 6.5270e-01, 5.1071e-01, 1.1030e+00],
        [8.9959e-01, 1.8856e+00, 1.2966e+00, 6.7219e-01, 1.0602e+00, 1.1280e+00,
         7.7899e-01, 1.3805e+00, 2.0043

# XLM_ROBERTA修正

In [11]:
torch.load("D:\\research\\research4\XLM_ROBERTA_AE\XLM_ROBERTA_9999AE.pt")

  torch.load("D:\\research\\research4\XLM_ROBERTA_AE\XLM_ROBERTA_9999AE.pt")


tensor([[2.7472, 2.9270, 2.5793, 2.8226, 3.0622, 2.7021, 2.5405, 2.7961, 1.5514,
         2.7265, 2.9318, 3.0028],
        [0.7816, 1.3866, 2.3770, 1.6064, 0.6307, 1.5959, 2.1630, 0.8361,    nan,
         1.9572, 1.9643, 1.4853],
        [0.7862, 0.9225, 1.0233, 1.8597, 0.6854,    nan, 0.7071, 1.8317, 1.3581,
         1.9713, 1.9579, 1.3653],
        [0.4398, 2.2347, 1.4329, 1.3632, 1.1460, 0.2639, 1.6362, 2.2474, 0.6381,
         1.2343, 1.7628, 1.1011],
        [1.7116, 1.7552, 1.0954, 0.5926, 1.1112, 0.8905, 2.3594, 0.0836, 2.1158,
         1.2604, 2.5000, 1.1761],
        [1.4902, 1.8560, 1.8729, 2.5291, 1.2911, 1.4341, 1.2276, 2.2982, 1.1720,
         1.6461, 2.2137, 1.7572],
        [1.8695, 1.2799, 0.2200, 1.9231, 1.3465, 1.7786, 2.1079, 2.0013, 1.8468,
         1.7569, 1.5979, 0.2396],
        [1.8985, 0.7705, 1.5725, 0.5332, 1.9640, 2.3576, 2.2119, 1.8527, 2.5077,
         2.5288, 1.7239, 0.9798],
        [0.7803, 1.7603, 1.4389, 2.6791, 1.7531, 0.3474, 1.6300, 1.0447, 0.8844,

In [1]:
# Load model directly
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection",output_attentions=True)
from datasets import load_dataset

ds = load_dataset("papluca/language-identification")

In [2]:
def calculate_AE_matrix(attention_matrix):
    e=0.0000001
    return (torch.log(attention_matrix+e) * (attention_matrix+e) * (-1)).sum(dim=3).mean(dim=2)

# 函数原型如下，这里的if条件判断则是为了防止gpu上放置数据过多，导致核崩溃
# 这里需要基于这个函数来微微调整得到各个模型的AE矩阵计算的代码

def get_AE_matrix(model,dataset,heads_per_layer,layers):
    model.to('cpu')
    attention_entropy=torch.tensor([[0.]*heads_per_layer]*layers)
    AE_matrix = torch.tensor([[0.]*heads_per_layer]*layers)
    data_amount = 0
    for data in tqdm(dataset):
        if data_amount/len(dataset)<=0.4:
            if model.device!=torch.device(type='cuda',index=0):
                model.to('cuda')
            attention_entropy=attention_entropy.to('cuda')
            input_text = data['text']
            inputs = tokenizer(input_text,return_tensors='pt').to('cuda')
            output = model(**inputs)
            
            attentions = output[1]
            attentions = torch.cat([(layer) for layer in attentions])
            attentions.to('cuda')
            AE = calculate_AE_matrix(attentions)
            AE.to('cuda')
            attention_entropy += AE
            data_amount+=1
        else:
            model.to('cpu')
            attention_entropy=attention_entropy.to('cpu')
            input_text = data['text']
            inputs = tokenizer(input_text,return_tensors='pt').to('cpu')
            output = model(**inputs)
            attentions = output[1]
            attentions = torch.cat([(layer) for layer in attentions])
            attention_entropy += calculate_AE_matrix(attentions.to('cpu'))
            data_amount+=1
    if torch.isnan(attention_entropy).any().item():
        print('-------------',attention_entropy)
    AE_matrix = attention_entropy / data_amount    
    return AE_matrix

from tqdm import tqdm
import torch
import torch.utils.data as Data

class MyDataset(Data.Dataset):
    def __init__(self,dataset):
        self.dataset = dataset
    def __len__(self):
        return len(self.dataset['labels'])
    def __getitem__(self,idx):
        keys = list(self.dataset.keys())
        values = list(self.dataset.values())
        return {keys[0]:values[0][idx],keys[1]:values[1][idx]}

In [3]:
dataset_ = ds['validation'][9000:10000]
data_val = MyDataset(dataset_)
AE_XLM_ROBERTA=get_AE_matrix(model,data_val,12,12)

100%|██████████| 1000/1000 [00:50<00:00, 19.72it/s]


In [3]:
dataset_ = ds['validation'][7000:7723]
data_val = MyDataset(dataset_)
AE_XLM_ROBERTA=get_AE_matrix(model,data_val,12,12)

100%|██████████| 723/723 [00:37<00:00, 19.44it/s] 


In [3]:
dataset_ = ds['validation'][6000:7000]
data_val = MyDataset(dataset_)
AE_XLM_ROBERTA=get_AE_matrix(model,data_val,12,12)

100%|██████████| 1000/1000 [01:13<00:00, 13.52it/s]


In [4]:
AE_XLM_ROBERTA

tensor([[2.7209, 2.9040, 2.5561, 2.7971, 3.0390, 2.6836, 2.5233, 2.7726, 1.5327,
         2.7072, 2.9090, 2.9809],
        [0.7758, 1.3725, 2.3615, 1.5987, 0.6239, 1.5879, 2.1650, 0.8296, 0.1362,
         1.9505, 1.9480, 1.4741],
        [0.7764, 0.9156, 1.0146, 1.8413, 0.6824, 0.1279, 0.7045, 1.8126, 1.3499,
         1.9542, 1.9365, 1.3603],
        [0.4372, 2.2146, 1.4126, 1.3566, 1.1266, 0.2592, 1.6192, 2.2266, 0.6341,
         1.2258, 1.7459, 1.0886],
        [1.7033, 1.7394, 1.0916, 0.5790, 1.0968, 0.8719, 2.3253, 0.0756, 2.0827,
         1.2578, 2.4729, 1.1472],
        [1.4798, 1.8523, 1.8628, 2.5126, 1.2634, 1.4314, 1.1986, 2.2843, 1.1645,
         1.6478, 2.1991, 1.7491],
        [1.8639, 1.2761, 0.2099, 1.9128, 1.3416, 1.7540, 2.0983, 1.9856, 1.8368,
         1.7334, 1.5643, 0.2296],
        [1.8684, 0.7614, 1.5730, 0.5282, 1.9519, 2.3432, 2.2022, 1.8512, 2.4962,
         2.5041, 1.7224, 0.9746],
        [0.7545, 1.7521, 1.4338, 2.6584, 1.7591, 0.3367, 1.6397, 1.0442, 0.8489,

In [4]:
torch.save(AE_XLM_ROBERTA,"./XLM_ROBERTA_AE - rectified - rectified/XLM_ROBERTA9000_9999AE.pt")

In [5]:
import torch
AE_XLM_ROBERTA_LIST=[]
for i in range(1,11):
    if i==8:
        continue
    AE_XLM_ROBERTA_LIST.append(torch.load(f'./XLM_ROBERTA_AE - rectified - rectified/XLM_ROBERTA{(i-1)*1000}_{i*1000-1}AE.pt'))

  AE_XLM_ROBERTA_LIST.append(torch.load(f'./XLM_ROBERTA_AE - rectified - rectified/XLM_ROBERTA{(i-1)*1000}_{i*1000-1}AE.pt'))


In [6]:
AE_XLM_ROBERTA_LIST

[tensor([[2.7209, 2.9040, 2.5561, 2.7971, 3.0390, 2.6836, 2.5233, 2.7726, 1.5327,
          2.7072, 2.9090, 2.9809],
         [0.7758, 1.3725, 2.3615, 1.5987, 0.6239, 1.5879, 2.1650, 0.8296, 0.1362,
          1.9505, 1.9480, 1.4741],
         [0.7764, 0.9156, 1.0146, 1.8413, 0.6824, 0.1279, 0.7045, 1.8126, 1.3499,
          1.9542, 1.9365, 1.3603],
         [0.4372, 2.2146, 1.4126, 1.3566, 1.1266, 0.2592, 1.6192, 2.2266, 0.6341,
          1.2258, 1.7459, 1.0886],
         [1.7033, 1.7394, 1.0916, 0.5790, 1.0968, 0.8719, 2.3253, 0.0756, 2.0827,
          1.2578, 2.4729, 1.1472],
         [1.4798, 1.8523, 1.8628, 2.5126, 1.2634, 1.4314, 1.1986, 2.2843, 1.1645,
          1.6478, 2.1991, 1.7491],
         [1.8639, 1.2761, 0.2099, 1.9128, 1.3416, 1.7540, 2.0983, 1.9856, 1.8368,
          1.7334, 1.5643, 0.2296],
         [1.8684, 0.7614, 1.5730, 0.5282, 1.9519, 2.3432, 2.2022, 1.8512, 2.4962,
          2.5041, 1.7224, 0.9746],
         [0.7545, 1.7521, 1.4338, 2.6584, 1.7591, 0.3367, 1.6397

In [7]:
a = torch.load('./XLM_ROBERTA_AE - rectified - rectified/XLM_ROBERTA7000_7722AE.pt')
b = torch.load('./XLM_ROBERTA_AE - rectified - rectified/XLM_ROBERTA7724_7999AE.pt')
AE_matrix=(sum([ae*1000 for ae in AE_XLM_ROBERTA_LIST])+a*723+b*276)/9999
torch.save(AE_matrix,"./XLM_ROBERTA_AE - rectified - rectified/XLM_ROBERTA_9999AE.pt")

  a = torch.load('./XLM_ROBERTA_AE - rectified - rectified/XLM_ROBERTA7000_7722AE.pt')
  b = torch.load('./XLM_ROBERTA_AE - rectified - rectified/XLM_ROBERTA7724_7999AE.pt')


In [9]:
a,b

(tensor([[2.7276, 2.9076, 2.5530, 2.7999, 3.0425, 2.6882, 2.5219, 2.7747, 1.5488,
          2.7103, 2.9091, 2.9817],
         [0.7860, 1.4009, 2.3634, 1.5975, 0.6379, 1.6013, 2.1637, 0.8366, 0.1370,
          1.9515, 1.9639, 1.4746],
         [0.7848, 0.9216, 1.0235, 1.8462, 0.6876, 0.1273, 0.7069, 1.8221, 1.3574,
          1.9662, 1.9524, 1.3691],
         [0.4350, 2.2199, 1.4253, 1.3463, 1.1483, 0.2665, 1.6229, 2.2214, 0.6324,
          1.2220, 1.7606, 1.1109],
         [1.6842, 1.7368, 1.0752, 0.5938, 1.0941, 0.8854, 2.3403, 0.0822, 2.0961,
          1.2377, 2.4906, 1.1594],
         [1.4785, 1.8475, 1.8587, 2.5257, 1.2802, 1.4293, 1.2529, 2.2785, 1.1596,
          1.6329, 2.1989, 1.7546],
         [1.8450, 1.2644, 0.2198, 1.9080, 1.3311, 1.7522, 2.0864, 1.9824, 1.8153,
          1.7444, 1.5832, 0.2381],
         [1.8904, 0.7627, 1.5532, 0.5348, 1.9588, 2.3438, 2.2000, 1.8352, 2.4898,
          2.5234, 1.7157, 0.9606],
         [0.7734, 1.7710, 1.4854, 2.6600, 1.7632, 0.3627, 1.6723

In [10]:
torch.load('./XLM_ROBERTA_AE - rectified/XLM_ROBERTA_9999AE.pt')

  torch.load('./XLM_ROBERTA_AE - rectified/XLM_ROBERTA_9999AE.pt')


tensor([[2.7472, 2.9270, 2.5793, 2.8226, 3.0622, 2.7021, 2.5405, 2.7961, 1.5514,
         2.7265, 2.9318, 3.0027],
        [0.7816, 1.3866, 2.3770, 1.6063, 0.6307, 1.5959, 2.1630, 0.8361, 0.1342,
         1.9572, 1.9643, 1.4853],
        [0.7862, 0.9225, 1.0233, 1.8597, 0.6853, 0.1256, 0.7071, 1.8317, 1.3581,
         1.9713, 1.9579, 1.3653],
        [0.4398, 2.2347, 1.4329, 1.3632, 1.1460, 0.2639, 1.6362, 2.2474, 0.6381,
         1.2343, 1.7628, 1.1011],
        [1.7116, 1.7552, 1.0954, 0.5926, 1.1112, 0.8905, 2.3594, 0.0836, 2.1158,
         1.2604, 2.5000, 1.1761],
        [1.4902, 1.8560, 1.8729, 2.5291, 1.2911, 1.4341, 1.2276, 2.2982, 1.1720,
         1.6461, 2.2137, 1.7572],
        [1.8695, 1.2799, 0.2200, 1.9231, 1.3465, 1.7786, 2.1079, 2.0013, 1.8468,
         1.7569, 1.5979, 0.2396],
        [1.8985, 0.7705, 1.5725, 0.5332, 1.9640, 2.3576, 2.2119, 1.8527, 2.5077,
         2.5288, 1.7239, 0.9798],
        [0.7803, 1.7603, 1.4389, 2.6791, 1.7531, 0.3474, 1.6300, 1.0447, 0.8844,

In [11]:
import os
import torch

def check_nan_in_tensor(tensor):
    """检查张量中是否包含NaN值"""
    return torch.isnan(tensor).any().item()  # 返回True如果包含NaN

def check_pt_file_for_nan(file_path):
    """加载.pt文件并检查其中的张量是否包含NaN"""
    try:
        # 加载pt文件
        data = torch.load(file_path)
        
        # 如果数据是一个字典，可能包含多个张量
        if isinstance(data, dict):
            for key, tensor in data.items():
                if torch.is_tensor(tensor) and check_nan_in_tensor(tensor):
                    return True
        # 如果数据是一个张量
        elif torch.is_tensor(data):
            if check_nan_in_tensor(data):
                return True
    except Exception as e:
        print(f"无法加载文件: {file_path}, 错误: {e}")
    
    return False

def find_nan_in_pt_files(directory):
    """遍历文件夹下所有.pt文件，检查其中是否包含NaN"""
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".pt"):
                file_path = os.path.join(root, file)
                if check_pt_file_for_nan(file_path):
                    print(f"发现NaN值的文件: {file_path}")

# 示例使用：指定你要遍历的文件夹路径
directory_path = 'D:\\research\\research4\\BERT_AE - rectified - rectified'  # 替换为你的文件夹路径
find_nan_in_pt_files(directory_path)


  data = torch.load(file_path)
