In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

In [2]:
file_path = 'sql_input'

In [3]:
# 读取问题和答案文本文件
with open(f"{file_path}/question.txt", "r", encoding="utf-8") as f:
    questions = f.read().splitlines()
    
with open(f"{file_path}/answer.txt", "r", encoding="utf-8") as f:
    answers = f.read().splitlines()

# 将问题和答案拼接起来形成输入序列
data = [q + " " + a for q, a in zip(questions, answers)]

# 将拼接后的输入序列保存到文件
with open(f"{file_path}/data.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(data))

In [4]:
# import random

# # 读取问题和答案文本文件
# with open("question.txt", "r", encoding="utf-8") as f:
#     questions = f.read().splitlines()

# with open("answer.txt", "r", encoding="utf-8") as f:
#     answers = f.read().splitlines()

# # 生成更多的样例数据
# num_examples = 10000
# generated_data = []
# for i in range(num_examples):
#     q = random.choice(questions)
#     a = random.choice(answers)
#     generated_data.append(q + " " + a)

# # 将生成的样例数据保存到文件
# with open("generated_data.txt", "w", encoding="utf-8") as f:
#     f.write("\n".join(generated_data))

In [5]:
# 选择预训练模型
checkpoint = "bigscience/bloomz-560m"

# 加载分词器和模型
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

# 准备训练数据
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=f"{file_path}/data.txt", # 保存拼接后的输入序列到文件
    block_size=128
)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# 准备训练参数
training_args = TrainingArguments(
    output_dir="./results", # 训练结果保存目录
    overwrite_output_dir=True,
    num_train_epochs=3, # 训练轮数
    per_device_train_batch_size=16,
    save_steps=1, # 每训练多少步保存一次模型
    save_total_limit=2,
    learning_rate=1e-4, # 学习率
    warmup_steps=500, # 热身步数
)

# 训练模型
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()

***** Running training *****
  Num examples = 41
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 6
  Number of trainable parameters = 559214592


Step,Training Loss


Saving model checkpoint to ./results/checkpoint-1
Configuration saved in ./results/checkpoint-1/config.json
Configuration saved in ./results/checkpoint-1/generation_config.json
Model weights saved in ./results/checkpoint-1/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-2] due to args.save_total_limit
Saving model checkpoint to ./results/checkpoint-2
Configuration saved in ./results/checkpoint-2/config.json
Configuration saved in ./results/checkpoint-2/generation_config.json
Model weights saved in ./results/checkpoint-2/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-3] due to args.save_total_limit
Saving model checkpoint to ./results/checkpoint-3
Configuration saved in ./results/checkpoint-3/config.json
Configuration saved in ./results/checkpoint-3/generation_config.json
Model weights saved in ./results/checkpoint-3/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-1] due to args.save_total_limit
Saving model checkpoint to ./results/checkp

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.83 GiB (GPU 0; 23.69 GiB total capacity; 15.55 GiB already allocated; 1.65 GiB free; 20.91 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# 加载分词器和模型
checkpoint = "bigscience/bloomz-560m"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

model = AutoModelForCausalLM.from_pretrained("./results/checkpoint-3")

# 准备生成文本的输入
prompt = "查询 Invoice 表中发票总金额的最大值"
inputs = tokenizer.encode(prompt, return_tensors="pt")

# 生成文本
outputs = model.generate(inputs, max_length=120, do_sample=True)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(generated_text)

loading file tokenizer.json from cache at /home/nvidia/.cache/huggingface/hub/models--bigscience--bloomz-560m/snapshots/64a6f1765615d2c38a7fe4474b8bff5a6c0e14e6/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/nvidia/.cache/huggingface/hub/models--bigscience--bloomz-560m/snapshots/64a6f1765615d2c38a7fe4474b8bff5a6c0e14e6/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/nvidia/.cache/huggingface/hub/models--bigscience--bloomz-560m/snapshots/64a6f1765615d2c38a7fe4474b8bff5a6c0e14e6/tokenizer_config.json
loading configuration file ./results/checkpoint-3/config.json
Model config BloomConfig {
  "_name_or_path": "./results/checkpoint-3",
  "apply_residual_connection_post_layernorm": false,
  "architectures": [
    "BloomForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_softmax_in_fp32": true,
  "bias_dropout_fusion": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_dropou

查询 Invoice 表中发票总金额的最大值


In [7]:
generated_text

'查询 Invoice 表中发票总金额的最大值'