In [2]:
# 调用环境
import numpy as np
from torch.utils.data import DataLoader 
from compute_pretrained_embeddings import get_embeddings,get_nl_embeddings
from transformers import AutoConfig, AutoModel, AutoTokenizer
from torch.utils.data import Dataset
import json
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Dataset definition
class JsonlDataset(Dataset):
    """In-memory dataset over a JSON Lines file (one JSON object per line).

    Every non-blank line of the file is parsed with ``json.loads`` when the
    dataset is constructed and the parsed records are kept in ``self.data``.
    Indexing returns the value stored under ``text_key`` of the selected record.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to read.
        text_key: Key of each record whose value ``__getitem__`` returns.
            Defaults to ``'content'``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """

    def __init__(self, file_path, text_key='content'):
        self.text_key = text_key
        self.data = []
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Blank / whitespace-only lines are not records; json.loads
                # would raise on them, so skip them instead of crashing.
                if not line.strip():
                    continue
                self.data.append(json.loads(line))

    def __len__(self):
        """Return the number of records that were loaded."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``text_key`` field of record ``idx``.

        Raises ``IndexError`` for an out-of-range ``idx`` and ``KeyError`` when
        the record has no ``text_key`` entry.
        """
        # NOTE: only a single field is returned; if the text of a sample is
        # spread over several fields they still need to be combined here.
        return self.data[idx][self.text_key]

# Build the dataset (a Dataset, not yet a DataLoader) from the JSONL file at `path_data`
path_data = "/home/guochuanzhe/Megatron-LM-gjn/data/starcoder/chatml_data/starcoder-shell-dev.jsonl"
dataset = JsonlDataset(file_path=path_data)

In [4]:
# Select the compute device: CUDA when available, CPU otherwise
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the OPT-125M tokenizer and model from the same pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
model = AutoModel.from_pretrained("facebook/opt-125m")

# Put the model in evaluation mode, then move it to the selected device.
# `model.to(device)` is kept as the last expression so the cell displays the model.
model.eval()
model.to(device)

OPTModel(
  (decoder): OPTDecoder(
    (embed_tokens): Embedding(50272, 768, padding_idx=1)
    (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
    (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (layers): ModuleList(
      (0-11): 12 x OPTDecoderLayer(
        (self_attn): OPTAttention(
          (k_proj): Linear(in_features=768, out_features=768, bias=True)
          (v_proj): Linear(in_features=768, out_features=768, bias=True)
          (q_proj): Linear(in_features=768, out_features=768, bias=True)
          (out_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (activation_fn): ReLU()
        (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
)

In [10]:
# Length of the first sample as returned by JsonlDataset.__getitem__
# (presumably a character count, assuming the 'content' field is a string — TODO confirm)
print(len(dataset[0]))

4383


In [22]:
# Smoke test: run three tiny strings through the model and inspect the result shape
text = ['1', '1', '2']

# Tokenize (no padding/truncation arguments are passed) and move every tensor to `device`
encoded = tokenizer(text, return_tensors="pt")
inputs = {}
for name, tensor in encoded.items():
    inputs[name] = tensor.to(device)
print(inputs)

# Forward pass without gradient tracking
with torch.no_grad():
    outputs = model(output_hidden_states=True, **inputs)
last_hidden_state = outputs.last_hidden_state

# Keep only the hidden state at the final position of each sequence
last_token_state = last_hidden_state[:, -1]
print(last_token_state.shape)

{'input_ids': tensor([[  2, 134],
        [  2, 134],
        [  2, 176]], device='cuda:0'), 'attention_mask': tensor([[1, 1],
        [1, 1],
        [1, 1]], device='cuda:0')}
torch.Size([3, 768])


In [27]:
# Embed every sample one at a time, keeping the final-position hidden state of the last layer
embeddings = []
print(len(dataset))
for idx in range(len(dataset)):
    text = dataset[idx]

    # Tokenize the single sample (truncated to at most 2048 tokens) and move it to `device`
    encoded = tokenizer(
        text,
        max_length=2048,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # Last entry of the returned hidden states, then the final position of the sequence
    last_hidden_state = outputs.hidden_states[-1]
    last_token_state = last_hidden_state[:, -1, :]
    embeddings.append(last_token_state)
print(len(embeddings))

2206
2206


In [46]:
from torch.nn.functional import normalize
from tqdm.auto import tqdm 
# Batch collation for the DataLoader
def collate_fn(examples):
    """Tokenize a batch of raw examples with the module-level ``tokenizer``.

    The batch is padded, truncated to at most 2048 tokens, and returned as
    PyTorch tensors (whatever object the tokenizer call produces).
    """
    return tokenizer(
        examples,
        max_length=2048,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )

# Memmap configuration: file locations, element type of the paths array, array dimensions
emb_memory_loc = "/home/guochuanzhe/data-process/SemDeDup/memory/emb_memory_loc.dat"
paths_memory_loc = "/home/guochuanzhe/data-process/SemDeDup/memory/paths_memory.dat"
path_str_type = int
emb_size = 768
dataset_size = len(dataset)  # number of samples in the starcoder dataset

# One sample per batch, in dataset order, tokenized by collate_fn
dataloader = DataLoader(dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

# Open the existing memmap files read-only.
# To (re)create them, open both with mode='w+' instead of mode='r'.
emd_memmap = np.memmap(emb_memory_loc, mode='r', dtype='float32', shape=(dataset_size, emb_size))
paths_memmap = np.memmap(paths_memory_loc, mode='r', dtype=path_str_type, shape=(dataset_size,))
def get_nl_embeddings_test(model, dataloader, emd_memmap, paths_memmap):
    """
    Compute one L2-normalized embedding per example and store it in ``emd_memmap``.

    For every batch the model is run with ``output_hidden_states=True``; the
    last-layer hidden state of the last non-padding token of each sequence
    (located through ``attention_mask``) is normalized along the feature
    dimension and written to consecutive rows of ``emd_memmap`` in dataloader
    order.

    If ``emd_memmap`` is not writeable (e.g. an ``np.memmap`` opened with
    ``mode='r'``), nothing is computed and the stored values are only read back.

    model: pretrained model, called as
        ``model(input_ids, attention_mask=..., output_hidden_states=True)``; its
        output must expose ``hidden_states``.
    dataloader: iterable of batches, each a mapping with ``'input_ids'`` and
        ``'attention_mask'`` tensors of shape (batch, seq_len).
    emd_memmap: numpy array / memmap with one row per example that receives the
        embeddings.
    paths_memmap: numpy memmap intended for per-example identifiers; currently
        not written by this function.

    Prints ``"memmap:"`` followed by the first row of ``emd_memmap``. Returns None.
    """

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    model.eval()

    if emd_memmap.flags.writeable:
        count = 0  # index of the next free row in emd_memmap
        with torch.no_grad():
            for data_batch in tqdm(dataloader):
                input_ids = data_batch['input_ids'].to(device)
                attention_mask = data_batch['attention_mask'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
                last_layer_states = outputs.hidden_states[-1]

                # Position of the last real (mask == 1) token of every sequence.
                # Flipping the mask and taking the first maximum works for both
                # left- and right-padded batches; plain `[:, -1]` would pick a
                # padding token whenever a shorter sequence is padded on the right.
                batch_size, seq_len = attention_mask.shape
                last_token_idx = seq_len - 1 - attention_mask.long().flip(dims=[1]).argmax(dim=1)
                batch_rows = torch.arange(batch_size, device=last_layer_states.device)
                last_token_state = last_layer_states[batch_rows, last_token_idx]

                # L2-normalize each embedding along the feature dimension
                normalized_last_token_state = normalize(last_token_state, dim=1)

                # Store the whole batch in the next `batch_size` rows
                emd_memmap[count:count + batch_size] = normalized_last_token_state.cpu().numpy()
                count += batch_size
                # paths_memmap[batch_indices] = paths_batch

        # Persist the written rows when the target is a file-backed memmap
        if hasattr(emd_memmap, "flush"):
            emd_memmap.flush()
    else:
        print("emd_memmap is read-only; skipping embedding computation")

    # Read back the stored data
    print("memmap:")
    print(emd_memmap[0])

# Invoke the embedding routine with the model, dataloader and memmaps defined earlier
get_nl_embeddings_test(
    model=model,
    dataloader=dataloader,
    emd_memmap=emd_memmap,
    paths_memmap=paths_memmap,
)

  0%|          | 0/2206 [00:00<?, ?it/s]

  0%|          | 0/2206 [00:00<?, ?it/s]

memmap:
[-2.54975986e-02 -7.21029937e-02  1.66285578e-02  8.26566145e-02
 -5.11781871e-02  1.25267431e-02  3.49115604e-03 -3.15127778e-04
  5.13186380e-02  1.80433935e-03 -1.13264211e-02  1.98029988e-02
 -7.19255861e-03 -2.59566959e-02  6.11639321e-02 -3.20849307e-02
  1.54433146e-04 -9.00619701e-02 -3.17193195e-02 -2.13452894e-02
  1.23969754e-02 -7.36208027e-03  2.46950127e-02 -2.74652755e-03
 -7.75805339e-02  4.47592773e-02 -1.97108909e-02 -3.05027538e-03
 -4.95513082e-02 -7.45921955e-02 -8.76889005e-03 -3.20358239e-02
 -2.04661377e-02 -1.65516473e-02 -1.01848440e-02  2.02654824e-02
 -6.47709817e-02 -5.34884334e-02 -2.21250504e-02 -7.78072421e-03
  4.10935581e-02 -7.42635690e-03 -1.40313804e-02 -2.53158156e-02
  3.33801913e-03 -1.22634396e-02  8.68368987e-03 -5.50216846e-02
  6.96195513e-02 -1.62526052e-02  2.96577718e-02  1.03403283e-02
 -1.44330561e-02 -1.09123765e-02 -4.12502699e-02 -2.13855077e-02
  1.64831914e-02  1.54955555e-02 -4.64866348e-02 -3.40397027e-03
 -2.63818894e-02 




In [30]:
# Batch collation for the DataLoader.
# NOTE: an identical collate_fn is also defined in the cell that builds the
# DataLoader; whichever of the two cells runs last provides the active definition.
def collate_fn(examples):
    """Tokenize ``examples`` with the module-level ``tokenizer``.

    Requests PyTorch tensors with padding enabled and truncation to at most
    2048 tokens, and returns the tokenizer's result unchanged.
    """
    tokenizer_kwargs = dict(return_tensors="pt", padding=True, truncation=True, max_length=2048)
    batch = tokenizer(examples, **tokenizer_kwargs)
    return batch

{'input_ids': tensor([[    2, 10431, 47813,  ...,     1,     1,     1],
        [    2, 41552,  4147,  ..., 47992, 24303, 50140]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])}