In [30]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# Kept for backward compatibility with later cells that still reference it;
# this cell itself now follows the model's actual placement (see below).
device = "cuda" # the device to load the model onto

# Local checkpoint directory shared by the model and its tokenizer, so the
# two can never drift apart.
MODEL_PATH = "/home/lyz/hf-models/Qwen/Qwen1.5-1.8B-Chat/"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
# Render the chat as a single prompt string ending with the assistant
# generation marker (tokenize=False returns text, not token ids).
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
# device_map="auto" decides where the model lives, so move the inputs to
# model.device rather than a hard-coded "cuda" (which fails on CPU-only or
# differently mapped setups).
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Unpack the whole encoding so the attention mask reaches generate() together
# with the input ids, instead of passing input_ids alone.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated continuation is decoded.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [31]:
# Bare expression: the notebook displays the decoded reply stored in `response`.
response

"\n\nLarge Language Models (LLMs) are artificial intelligence systems that have been developed to perform complex natural language processing tasks with unprecedented accuracy and scalability. They are designed to understand, generate, and generate human-like text based on input data, which can include structured or unstructured text such as sentences, paragraphs, or even entire documents.\n\nThe idea behind LLMs is to create a neural network architecture that allows them to learn from vast amounts of text data and adapt their responses to changing language patterns and contexts. These models are typically trained using supervised learning algorithms, which involve feeding the LLM large datasets of labeled text examples, allowing it to identify patterns and relationships between words and phrases in the training data.\n\nOnce trained, an LLM can be fine-tuned on specific domains or applications, such as question-answering, language translation, sentiment analysis, or even language gene

In [39]:
import pandas as pd
import string
import numpy as np

# Tab-separated file without a header row: columns get the integer
# labels 0, 1, ... (read_table uses '\t' as its default separator).
data = pd.read_table('dataset/test.txt', header=None)

In [40]:
# Bare expression: the notebook renders the loaded DataFrame (columns 0 and 1).
data

Unnamed: 0,0,1
0,在变电所和供电系统的设计和运行中，基于如下用途必须进行短路电流的计算：,In the design and operation of substations and...
1,The Prostate Testing for Cancer and Treatment ...,前列腺癌检测与治疗（ProtecT）研究比较了PSA检测出前列腺癌的男性患者的前列腺切除术与...
2,Particles with nonzero electric charge interac...,电电荷为非零的粒子通过交换光子（电磁力的载体）相互作用。
3,"中国的一个租船人,租了一条10万吨的美国船东的油轮,从上海装货去美国。",A Chinese charterer chartered a 100000 ton tan...
4,为了节省成本，运营商在5G建网初期都会选择NSA。,"In order to save costs, operators will choose ..."
...,...,...
495,在周五黄昏巨浪冲击海岸时，有数百人聚集在海滩庆祝节日，很多人被巨浪冲走，巨浪所到之处一切尽毁。,Hundreds of people gathered on the beach to ce...
496,我不确定。但在我们解决之前不要做任何事。,"Well, I'm not sure. But don't do anything unti..."
497,在本法规定的范围以外需要悬挂国微或者使用国徽图案的，由全国人民代表大会常务委员会办公厅或者国...,If it is necessary to hang the national emblem...
498,贸易的吹鼓手们对此负有一定的责任。,The trade drummers are responsible for this.


In [41]:
from sentence_transformers import SentenceTransformer
# Smoke test for the embedding model: encode two small batches of placeholder
# sentences and print their pairwise similarity matrix.
bge_model = SentenceTransformer('/home/lyz/hf-models/bge-small-zh-v1.5/')

sentences_1 = ["样例数据-1", "样例数据-2"]
sentences_2 = ["样例数据-3", "样例数据-4"]

# Both batches are encoded with normalize_embeddings=True, so the matrix
# product below is intended to act as a cosine-similarity table
# (rows: sentences_1, columns: sentences_2).
embeddings_1, embeddings_2 = (
    bge_model.encode(batch, normalize_embeddings=True)
    for batch in (sentences_1, sentences_2)
)

similarity = embeddings_1 @ embeddings_2.T
print(similarity)

[[0.86921406 0.8665448 ]
 [0.899487   0.8784326 ]]


In [46]:
def _cjk_count(text):
    """Return how many characters of ``text`` are CJK Unified Ideographs."""
    return sum('\u4e00' <= ch <= '\u9fff' for ch in text)


def _split_pair(col_a, col_b):
    """Order the two cells of a dataset row as ``(english, chinese)``.

    The dataset mixes both orientations, so the side holding more CJK
    characters is treated as the Chinese reference. This is sturdier than
    looking only at the first character of column 0, which misclassifies
    Chinese sentences that open with a Latin acronym and raises on an
    empty string.

    On a tie (e.g. neither side contains CJK) the first-character test is
    used as a fallback: a leading ASCII letter in ``col_a`` marks it as the
    English side.
    """
    cjk_a, cjk_b = _cjk_count(col_a), _cjk_count(col_b)
    if cjk_a != cjk_b:
        return (col_a, col_b) if cjk_a < cjk_b else (col_b, col_a)
    head = col_a[:1]
    if head and head in string.ascii_letters:
        return col_a, col_b
    return col_b, col_a


def _translate_to_chinese(english_text):
    """Ask the chat model for a Chinese translation and return the decoded reply."""
    messages = [
        {"role": "user", "content": f"将英文翻译为中文，不要有其他输出：{english_text}"},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # Follow the model's real placement (it was loaded with device_map="auto")
    # instead of assuming a fixed device string.
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Unpack the full encoding so the attention mask is passed along with
    # the input ids.
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512
    )
    # Drop the echoed prompt tokens; keep only the generated continuation.
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]


def _similarity_score(candidate, reference):
    """Return the embedding inner product of the two texts as an int (x100, truncated)."""
    embeddings = bge_model.encode([candidate, reference], normalize_embeddings=True)
    return int(np.dot(embeddings[0], embeddings[1]) * 100)


# Open the output once, in write mode: re-running the cell then replaces the
# scores instead of appending a second, misaligned copy to 'a.csv'.
with open('a.csv', 'w', encoding='utf-8') as up:
    # Plain (col0, col1) tuples; the index is not needed.
    for col_a, col_b in data[[0, 1]].itertuples(index=False, name=None):
        s1, s2 = _split_pair(col_a, col_b)
        response = _translate_to_chinese(s1)
        score = _similarity_score(response, s2)

        up.write(f'{score}\n')
        # Flush per row so partial results survive an interrupted run.
        up.flush()