# 下载BERT

In [2]:
import torch 
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from textpruner import TransformerPruner

# Load model directly
tokenizer = AutoTokenizer.from_pretrained("hw2942/bert-base-chinese-finetuning-financial-news-sentiment-v2")
model = AutoModelForSequenceClassification.from_pretrained("hw2942/bert-base-chinese-finetuning-financial-news-sentiment-v2",output_attentions=True)

# load the dataset 
ds = load_dataset("hw2942/financial-news-sentiment")




In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BertConfig:
    def __init__(self, vocab_size=30522, num_layers=12, embed_size=768, max_position_embeddings=512, 
                 type_vocab_size=2, intermediate_size=3072, num_labels=3):
        self.vocab_size = vocab_size
        self.num_layers = num_layers
        self.embed_size = embed_size
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.intermediate_size = intermediate_size
        self.num_labels = num_labels
        self.model_type = 'bert'

class BertEmbeddings(nn.Module):
    def __init__(self, vocab_size, embed_size, max_position_embeddings, type_vocab_size):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.position_embeddings = nn.Embedding(max_position_embeddings, embed_size)
        self.token_type_embeddings = nn.Embedding(type_vocab_size, embed_size)

        self.layer_norm = nn.LayerNorm(embed_size, eps=1e-12)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, token_type_ids=None):
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = words_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

class BertSelfAttention(nn.Module):
    def __init__(self, embed_size):
        super().__init__()
        self.query = nn.Linear(embed_size, embed_size)
        self.key = nn.Linear(embed_size, embed_size)
        self.value = nn.Linear(embed_size, embed_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, hidden_states, attention_mask=None):
        Q = self.query(hidden_states)
        K = self.key(hidden_states)
        V = self.value(hidden_states)

        attention_scores = torch.matmul(Q, K.transpose(-1, -2)) / torch.sqrt(torch.tensor(embed_size, dtype=torch.float32))
        if attention_mask is not None:
            attention_scores = attention_scores.masked_fill(attention_mask == 0, float('-inf'))
        attention_probs = F.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, V)
        return context_layer

class BertSelfOutput(nn.Module):
    def __init__(self, embed_size):
        super().__init__()
        self.dense = nn.Linear(embed_size, embed_size)
        self.LayerNorm = nn.LayerNorm(embed_size, eps=1e-12)
        self.dropout = nn.Dropout(0.1)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states

class BertAttention(nn.Module):
    def __init__(self, embed_size):
        super().__init__()
        self.self = BertSelfAttention(embed_size)
        self.output = BertSelfOutput(embed_size)

    def forward(self, hidden_states, attention_mask=None):
        self_outputs = self.self(hidden_states, attention_mask)
        attention_output = self.output(self_outputs, hidden_states)
        return attention_output

class BertIntermediate(nn.Module):
    def __init__(self, embed_size, intermediate_size):
        super().__init__()
        self.dense = nn.Linear(embed_size, intermediate_size)
        self.intermediate_act_fn = nn.GELU()

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states

class BertOutput(nn.Module):
    def __init__(self, intermediate_size, embed_size):
        super().__init__()
        self.dense = nn.Linear(intermediate_size, embed_size)
        self.LayerNorm = nn.LayerNorm(embed_size, eps=1e-12)
        self.dropout = nn.Dropout(0.1)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states

class BertLayer(nn.Module):
    def __init__(self, embed_size, intermediate_size):
        super().__init__()
        self.attention = BertAttention(embed_size)
        self.intermediate = BertIntermediate(embed_size, intermediate_size)
        self.output = BertOutput(intermediate_size, embed_size)

    def forward(self, hidden_states, attention_mask=None):
        attention_output = self.attention(hidden_states, attention_mask)
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output

class BertEncoder(nn.Module):
    def __init__(self, num_layers, embed_size, intermediate_size):
        super().__init__()
        self.layer = nn.ModuleList([BertLayer(embed_size, intermediate_size) for _ in range(num_layers)])

    def forward(self, hidden_states, attention_mask=None):
        for layer in self.layer:
            hidden_states = layer(hidden_states, attention_mask)
        return hidden_states

class BertPooler(nn.Module):
    def __init__(self, embed_size):
        super().__init__()
        self.dense = nn.Linear(embed_size, embed_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # Pooler usually pools the first token (CLS token) hidden state
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output

class BertModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embeddings = BertEmbeddings(config.vocab_size, config.embed_size, config.max_position_embeddings, config.type_vocab_size)
        self.encoder = BertEncoder(config.num_layers, config.embed_size, config.intermediate_size)
        self.pooler = BertPooler(config.embed_size)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        embeddings = self.embeddings(input_ids, token_type_ids)
        encoder_output = self.encoder(embeddings, attention_mask)
        pooled_output = self.pooler(encoder_output)
        return encoder_output, pooled_output

class BertForSequenceClassification(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(config.embed_size, config.num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        _, pooled_output = self.bert(input_ids, attention_mask, token_type_ids)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

# Model instantiation and example
vocab_size = 21128  # The size of the vocabulary
num_labels = 3      # The number of labels for the classification task
num_layers = 12     # Number of transformer layers
embed_size = 768    # The size of each embedding vector
max_position_embeddings = 512  # The maximum length of the input sequences
type_vocab_size = 2  # The size of the token type vocabulary
intermediate_size = 3072  # The size of the intermediate (feed forward) layer

config = BertConfig(vocab_size=21128, num_layers=12, embed_size=768, max_position_embeddings=512,
                    type_vocab_size=2, intermediate_size=3072, num_labels=3)

model = BertForSequenceClassification(config)

In [2]:
import torch 
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from textpruner import TransformerPruner

# Load model directly
tokenizer = AutoTokenizer.from_pretrained("hw2942/bert-base-chinese-finetuning-financial-news-sentiment-v2")
bert = AutoModelForSequenceClassification.from_pretrained("hw2942/bert-base-chinese-finetuning-financial-news-sentiment-v2",output_attentions=True)

# load the dataset 
ds = load_dataset("hw2942/financial-news-sentiment")


  return torch.load(checkpoint_file, map_location="cpu")


In [3]:
import torch
import torch.nn as nn

def adjust_keys(state_dict, old_key, new_key):
    """将状态字典中的键从old_key替换为new_key"""
    new_state_dict = {}
    for key, value in state_dict.items():
        new_key_name = key.replace(old_key, new_key)
        new_state_dict[new_key_name] = value
    return new_state_dict

# 加载预先保存的模型状态字典
state_dict = bert.state_dict()

# 调整状态字典中的键名，假设你的模型期望'LayerNorm'而不是'layer_norm'
adjusted_state_dict = adjust_keys(state_dict, "bert.embeddings.LayerNorm.weight", "bert.embeddings.layer_norm.weight")
adjusted_state_dict = adjust_keys(adjusted_state_dict, "bert.embeddings.LayerNorm.bias", "bert.embeddings.layer_norm.bias")

In [4]:
model.load_state_dict(adjusted_state_dict)

<All keys matched successfully>

In [8]:
def get_acc(test_dataset,model_,tokenizer,device='cuda'):
    total = 0
    right = 0
    model_.to(device)
    for data in test_dataset:
        inputs = tokenizer(data['Title'],return_tensors='pt').to(device)
        outputs = model_(**inputs)
        total  += 1
        if torch.max(outputs[0][0].softmax(0),dim=0).indices==data['labels']:
            right += 1
    return right/total

In [81]:
from textpruner import TransformerPruner
pruner = TransformerPruner(model,base_model_prefix='bert')

In [9]:
test_dataset = ds['train']
model.eval()  
get_acc(test_dataset,model,tokenizer,device='cuda')

0.22413052812365822

In [10]:
get_acc(test_dataset,bert,tokenizer,device='cuda')

0.9682267067410906

In [15]:
model.to('cpu')
inputs = tokenizer(ds['train'][0]['Title'],return_tensors='pt')
model.eval()  # 设置为评估模式
model(**inputs)


tensor([[ 3.5746,  0.5193, -3.3148]], grad_fn=<AddmmBackward0>)

In [19]:
bert.to('cpu')
inputs = tokenizer(ds['train'][0]['Title'],return_tensors='pt')
bert(**inputs)[0]


tensor([[-3.4763,  6.7623, -2.4192]], grad_fn=<AddmmBackward0>)

In [23]:
model.bert.embeddings

BertEmbeddings(
  (word_embeddings): Embedding(21128, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [24]:
model.bert.embeddings

BertEmbeddings(
  (word_embeddings): Embedding(21128, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [None]:
bert.bert

In [21]:
bert.bert(**inputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.2386, -0.7616, -0.0187,  ...,  0.0816, -1.0815,  0.8051],
         [-1.0530, -0.5591,  0.0589,  ...,  0.5825, -0.5621,  0.5845],
         [-0.8456, -0.6958,  0.1725,  ...,  0.7389, -0.1613,  0.4166],
         ...,
         [-0.5311, -0.7683,  0.2420,  ...,  0.1521, -1.4997,  0.7731],
         [-0.3663, -1.0121, -0.1525,  ...,  0.2732, -1.1671,  0.9551],
         [-0.7145, -0.6862, -0.0597,  ..., -0.0095, -0.9090,  0.3740]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 0.6541, -0.9768,  0.4716,  0.8724,  0.7517,  0.9834, -0.9666, -0.8758,
         -0.3037, -0.6854, -0.3155, -0.9984, -0.8938, -0.7400,  0.9437, -0.7695,
          1.0000,  0.9809, -0.9335,  0.6720,  0.9923,  0.9616, -0.8453, -0.9630,
          0.9764, -0.1965, -0.6682, -0.5618,  0.4427, -0.8411, -0.6409,  0.8537,
         -0.8919, -0.6777, -0.4055,  0.9908, -0.7387, -0.8960, -0.9827, -0.9881,
         -0.7105, -0.6747,  0.55