In [1]:
import os
# Expose only the GPU with index 1 to this process.
# NOTE(review): presumably set before `import torch` on purpose so that it is
# in place before CUDA is initialised -- keep this statement order.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
from transformers import AutoTokenizer,AutoModelForCausalLM
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def get_hidden(model,tokenizer,text):
    '''
    Return the final-layer hidden state of the last token of ``text``.

    Args:
        model: callable model that accepts the tokenizer output as keyword
            arguments together with ``output_hidden_states=True`` and returns
            an object exposing a ``hidden_states`` sequence.
        tokenizer: callable that turns ``text`` into model inputs when called
            with ``return_tensors='pt'``; the result must support
            ``.to(device)`` and ``**`` unpacking.
        text: the string to embed.

    Returns:
        A CPU tensor: the last entry of ``hidden_states`` indexed with
        ``[-1][-1]``.
    '''
    # Move the inputs to wherever the model actually lives instead of assuming
    # 'cuda', so the function also works on CPU or with a custom device map.
    device = getattr(model, 'device', None)
    if device is None:
        device = next(model.parameters()).device

    inputs = tokenizer(text, return_tensors='pt').to(device)
    with torch.no_grad():
        model_res = model(**inputs, output_hidden_states=True)

    last_hidden_state = model_res.hidden_states[-1].cpu()
    # [-1][-1] takes the last index along the first two axes. For a single
    # text this is the last token whether the layout is (batch, seq, hidden)
    # or (seq, batch, hidden).
    # NOTE(review): assumes one text per call -- confirm before batching.
    return last_hidden_state[-1][-1]

In [4]:
def calculate_class_center(embeddings):
    """
    Compute the class center mu_y as the mean of the instance embeddings.

    Args:
        embeddings: tensor holding one embedding vector per row, shape (n, d)
            (the original notes describe d = 4096).

    Returns:
        Tensor of shape (d,): the mean of ``embeddings`` over dim 0.
    """
    return torch.mean(embeddings, dim=0)


In [5]:
import torch

def select_representative_samples(model, tokenizer, texts, num_exemplars):
    """
    Greedily pick the texts whose embeddings best represent the whole set.

    Herding-style selection: at step k the candidate chosen is the one that,
    once added to the already selected exemplars, brings the exemplar mean
    closest to the class center. For k == 0 this is simply the sample nearest
    to the class center.

    Args:
        model: pretrained model, forwarded to ``get_hidden``.
        tokenizer: tokenizer matching ``model``, forwarded to ``get_hidden``.
        texts: list of str, the candidate texts.
        num_exemplars: int, how many texts to select. Values larger than
            ``len(texts)`` are clamped; values <= 0 yield an empty list.

    Returns:
        list of str: the selected texts in selection order, without repeats.
    """
    texts = list(texts)
    num_exemplars = min(num_exemplars, len(texts))
    if num_exemplars <= 0:
        # Nothing to select; also avoids torch.stack([]) on empty input.
        return []

    # One embedding per text, upcast so mean/norm run in float32
    # (a no-op when the embeddings are already float32).
    embeddings = torch.stack(
        [get_hidden(model, tokenizer, text) for text in texts]
    ).float()

    # Class center mu_y.
    mu_y = calculate_class_center(embeddings)

    selected_indices = []
    remaining_indices = list(range(len(texts)))
    selected_sum = torch.zeros_like(mu_y)  # running sum of chosen embeddings

    for k in range(num_exemplars):
        # Only samples that have not been chosen yet may compete, so the
        # argmin position maps directly into remaining_indices.
        candidates = embeddings[remaining_indices]

        # Mean of the exemplar set if each candidate were added to it.
        candidate_means = (selected_sum + candidates) / (k + 1)
        distances = torch.norm(candidate_means - mu_y, dim=1)

        best_pos = int(torch.argmin(distances))
        selected_idx = remaining_indices.pop(best_pos)
        selected_indices.append(selected_idx)
        selected_sum = selected_sum + embeddings[selected_idx]

    # Map the chosen indices back to their texts.
    return [texts[idx] for idx in selected_indices]


In [6]:
# Location of the checkpoint (placeholder -- point this at a real directory).
model_path = 'path_of_chatglm3_6b'

# trust_remote_code=True lets the checkpoint's own modelling code run;
# only use it with checkpoints you trust.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map='auto',
    trust_remote_code=True,
)

# Switch to evaluation mode; as the last expression of the cell this also
# displays the model structure.
model.eval()

Loading checkpoint shards: 100%|██████████| 7/7 [00:04<00:00,  1.52it/s]


ChatGLMForConditionalGeneration(
  (transformer): ChatGLMModel(
    (embedding): Embedding(
      (word_embeddings): Embedding(65024, 4096)
    )
    (rotary_pos_emb): RotaryEmbedding()
    (encoder): GLMTransformer(
      (layers): ModuleList(
        (0-27): 28 x GLMBlock(
          (input_layernorm): RMSNorm()
          (self_attention): SelfAttention(
            (query_key_value): Linear(in_features=4096, out_features=4608, bias=True)
            (core_attention): CoreAttention(
              (attention_dropout): Dropout(p=0.0, inplace=False)
            )
            (dense): Linear(in_features=4096, out_features=4096, bias=False)
          )
          (post_attention_layernorm): RMSNorm()
          (mlp): MLP(
            (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False)
            (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False)
          )
        )
      )
      (final_layernorm): RMSNorm()
    )
    (output_layer): Linear(in_

In [7]:
# Placeholder corpus -- replace with the real texts of one class.
texts = ['text1', 'text2', 'text3', '......']

# Number of representative samples to pick.
num_exemplars = 3

# Run the selection and show the result.
selected_texts = select_representative_samples(
    model=model,
    tokenizer=tokenizer,
    texts=texts,
    num_exemplars=num_exemplars,
)
print("Selected Representative Samples:", selected_texts)

Selected Representative Samples: ['医生正在考虑为患者调整治疗方案。', '病人感觉胸痛，医生怀疑是心脏问题。', '病人报告有轻微头痛和疲劳。']
