In [5]:
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
import torch

# Load the text-embedding model: the tokenizer and the encoder are both read
# from the same local checkpoint directory.
_checkpoint_dir = '/data1/dxw_data/llm/text2vec-large-chinese'
tokenizer = BertTokenizer.from_pretrained(_checkpoint_dir)
model = BertModel.from_pretrained(_checkpoint_dir)

def get_vector(text):
    """Embed ``text`` using the module-level ``tokenizer`` and ``model``.

    The input is tokenized (truncated to at most 128 tokens), passed through
    the model, and the last hidden states are mean-pooled over the token axis.

    Args:
        text: The text to embed. The notebook passes a single string; a list
            of strings is presumably also accepted by the tokenizer and is
            then embedded as one padded batch.

    Returns:
        numpy.ndarray: The pooled embedding with one row per input text
        (a single row for a single string).
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    # Inference only: skip autograd bookkeeping so no graph is built per call.
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state
    # Average over real tokens only. For a single string the mask is all ones,
    # so this matches a plain mean; for a padded batch it keeps padding
    # positions from diluting the embedding.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    # The clamp avoids a division by zero should a mask row ever be all zeros.
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return pooled.detach().numpy()

# Load the caption JSON; below it is used as a mapping of key -> text to embed.
# NOTE(review): this reads a relative path, while the later merge cell reads
# responses_caption.json from an absolute directory -- confirm both point at
# the same file.
with open('./responses_caption.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Embed every entry, in the order the keys appear in the loaded object.
keys = list(data.keys())
vectors = [get_vector(data[key]) for key in keys]
# np.vstack raises ValueError on an empty list, so an empty input is
# represented as a zero-row array instead of crashing here.
vectors = np.vstack(vectors) if vectors else np.empty((0, 0))

# Group *adjacent* entries: an entry joins the current batch when its cosine
# similarity to the immediately preceding entry reaches the threshold;
# otherwise the current batch is closed and a new one is started.
batches = []
current_batch = [keys[0]] if keys else []  # keys[0] would raise on empty input
threshold = 0.8  # similarity threshold; adjust as needed

for i in range(1, len(vectors)):
    sim = cosine_similarity(vectors[i-1].reshape(1, -1), vectors[i].reshape(1, -1))[0, 0]
    if sim >= threshold:
        current_batch.append(keys[i])
    else:
        batches.append(current_batch)
        current_batch = [keys[i]]

# Flush the trailing batch (it is empty only when the input had no entries).
if current_batch:
    batches.append(current_batch)

# Print the resulting batches.
for idx, batch in enumerate(batches):
    print(f"Batch {idx + 1}: {batch}")

# Persist the batches as JSON.
with open('/data1/dxw_data/llm/tiktok/batches.json', 'w', encoding='utf-8') as f:
    json.dump(batches, f, ensure_ascii=False, indent=4)


Batch 1: ['0', '1', '2']
Batch 2: ['3']
Batch 3: ['4', '5', '6']
Batch 4: ['7', '8', '9', '10']
Batch 5: ['11']
Batch 6: ['12']
Batch 7: ['13', '14']
Batch 8: ['15']
Batch 9: ['16']
Batch 10: ['17']
Batch 11: ['18', '19', '20', '21']
Batch 12: ['22']
Batch 13: ['23']
Batch 14: ['24', '25']
Batch 15: ['26']
Batch 16: ['27', '28', '29', '30', '31', '32', '33']
Batch 17: ['34', '35', '36', '37', '38', '39', '40', '41']
Batch 18: ['42']
Batch 19: ['43']
Batch 20: ['44']
Batch 21: ['45']
Batch 22: ['46']
Batch 23: ['47']
Batch 24: ['48']
Batch 25: ['49', '50', '51']
Batch 26: ['52']
Batch 27: ['53', '54']
Batch 28: ['55']


In [6]:
# 合并代码

In [9]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import json

# 定义读取和合并JSON文件的函数
def merge_json(batches_file, captions_file, comments_file):
    """Merge per-item captions and comments into one entry per batch.

    Args:
        batches_file: Path to a JSON file holding a list of batches, each
            batch being a list of item keys.
        captions_file: Path to a JSON object mapping item key -> caption text.
        comments_file: Path to a JSON object mapping item key -> comment text.

    Returns:
        dict: Keyed by the first item key of each non-empty batch. Each value
        is ``{"caption": ..., "comment": ...}`` where the texts of all items
        in the batch are joined with single spaces, in batch order.

    Raises:
        KeyError: If a batch references a key that is missing from the
            captions or the comments file.
    """
    def _load(path):
        # Read explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding; the notebook writes these files as
        # UTF-8 with non-ASCII text left unescaped.
        with open(path, 'r', encoding='utf-8') as fh:
            return json.load(fh)

    batches = _load(batches_file)
    captions = _load(captions_file)
    comments = _load(comments_file)

    merged_data = {}

    for batch in batches:
        if not batch:
            # An empty batch has no first key to file the entry under.
            continue
        merged_data[batch[0]] = {
            "caption": " ".join(captions[item] for item in batch),
            "comment": " ".join(comments[item] for item in batch),
        }

    return merged_data

# Input locations (saved batches, captions, comments) and the output location
# for the merged result.
batches_file_path = '/data1/dxw_data/llm/tiktok/batches.json'
captions_file_path = '/data1/dxw_data/llm/tiktok/data/output_txt/responses_caption.json'
comments_file_path = '/data1/dxw_data/llm/tiktok/data/output_txt/responses_comments.json'
merged_data_file_path = "/data1/dxw_data/llm/tiktok/merged_captions.json"

# Build one merged caption/comment entry per batch.
merged_data = merge_json(
    batches_file=batches_file_path,
    captions_file=captions_file_path,
    comments_file=comments_file_path,
)

# Write the merged mapping as indented JSON, keeping non-ASCII text unescaped.
with open(merged_data_file_path, 'w', encoding='utf-8') as merged_file:
    json.dump(merged_data, merged_file, ensure_ascii=False, indent=4)

print(f"Merged JSON file saved to: {merged_data_file_path}")


Merged JSON file saved to: /data1/dxw_data/llm/tiktok/merged_captions.json
