In [1]:
# Inspect the column names of the Arrow dataset stored on disk.
from datasets import load_from_disk

# Directory previously written by `datasets` (loaded back with load_from_disk).
dataset_path = r"D:/桌面/DL/Pro/poetry_foundation"
dataset = load_from_disk(dataset_path)
# Only the "train" split is inspected.
train_dataset = dataset["train"]

print(train_dataset.column_names)  # show the real column names
print(train_dataset[0])  # show the first record


['Unnamed: 0', 'Title', 'Poem', 'Poet', 'Tags']
{'Unnamed: 0': 0, 'Title': '\r\r\n                    Objects Used to Prop Open a Window\r\r\n                ', 'Poem': "\r\r\nDog bone, stapler,\r\r\ncribbage board, garlic press\r\r\n     because this window is loose—lacks\r\r\nsuction, lacks grip.\r\r\nBungee cord, bootstrap,\r\r\ndog leash, leather belt\r\r\n     because this window had sash cords.\r\r\nThey frayed. They broke.\r\r\nFeather duster, thatch of straw, empty\r\r\nbottle of Elmer's glue\r\r\n     because this window is loud—its hinges clack\r\r\nopen, clack shut.\r\r\nStuffed bear, baby blanket,\r\r\nsingle crib newel\r\r\n     because this window is split. It's dividing\r\r\nin two.\r\r\nVelvet moss, sagebrush,\r\r\nwillow branch, robin's wing\r\r\n     because this window, it's pane-less. It's only\r\r\na frame of air.\r\r\n", 'Poet': 'Michelle Menting', 'Tags': None}


In [31]:
from datasets import load_from_disk
import json
from pathlib import Path
from collections import Counter
import random

# -----------------------------
# Path configuration
# -----------------------------
dataset_path = r"D:/桌面/DL/Pro/poetry_foundation"
# Only the "train" split is used from here on; `dataset` is that split, not the DatasetDict.
dataset = load_from_disk(dataset_path)["train"]
# NOTE(review): later cells read "poetry_sft.jsonl", not this "poetry_sft_enhanced.jsonl"
# file -- confirm which file is meant to feed inspection and training.
output_path = Path("D:/桌面/DL/Pro/poetry_sft_enhanced.jsonl")

# -----------------------------
# Text-cleaning helper
# -----------------------------
def clean(text):
    """Normalise a raw dataset field.

    Carriage returns are removed everywhere and leading/trailing whitespace
    is stripped; line feeds inside the text are kept. ``None`` is mapped to
    the empty string.
    """
    return "" if text is None else text.replace("\r", "").strip()

# -----------------------------
# Count how many poems each author has
# -----------------------------
# Keys are the cleaned "Poet" values. The "Unknown" default only applies when
# the key is absent; a present-but-None value is cleaned to "".
author_counter = Counter(clean(item.get("Poet", "Unknown")) for item in dataset)

# -----------------------------
# Flexible user-prompt generation (enhanced)
# -----------------------------
def make_user_prompt(title, tags, poet, *, counts=None, min_poems=10, rng=None):
    """Build one randomly chosen instruction prompt for a poem.

    Candidate templates:
      * "style"       -- only the poet's style
      * "style+theme" -- theme plus the poet's style
      * "theme_only"  -- only the theme
    The theme is ``tags`` when non-empty, otherwise ``title``.

    Style templates are offered only when ``poet`` is non-empty and has at
    least ``min_poems`` poems according to ``counts``. Theme templates are
    offered only when a theme actually exists, so a prompt never contains an
    empty theme such as "theme: ." or "theme: , in the style of ...".

    Args:
        title: Cleaned poem title (may be empty).
        tags: Cleaned tag string (may be empty).
        poet: Cleaned poet name (may be empty).
        counts: Mapping poet -> number of poems. Defaults to the module-level
            ``author_counter``.
        min_poems: Minimum number of poems for style prompts (default 10).
        rng: Object exposing ``choice(seq)``; defaults to the ``random`` module.

    Returns:
        The prompt string. When neither a theme nor an eligible poet is
        available, the generic "Write a poem." is returned.
    """
    if counts is None:
        counts = author_counter  # looked up lazily so callers may inject their own mapping
    chooser = random if rng is None else rng

    theme = tags if tags else title
    style_allowed = bool(poet) and counts.get(poet, 0) >= min_poems

    # Same candidate order as before: style, style+theme, theme_only.
    prompt_types = []
    if style_allowed:
        prompt_types.append("style")
        if theme:
            prompt_types.append("style+theme")
    if theme:
        prompt_types.append("theme_only")

    if not prompt_types:
        # Nothing to condition on: emit a well-formed generic instruction.
        return "Write a poem."

    choice = chooser.choice(prompt_types)
    if choice == "style":
        return f"Write a poem in the style of {poet}."
    if choice == "style+theme":
        return f"Write a poem using the theme: {theme}, in the style of {poet}."
    return f"Write a poem using the theme: {theme}."

# -----------------------------
# Write the enhanced JSONL
# -----------------------------
# Seed the module-level RNG used by make_user_prompt so that re-running this
# cell regenerates exactly the same prompts.
random.seed(42)

written = 0
with open(output_path, "w", encoding="utf-8") as f:
    for item in dataset:
        poem = clean(item.get("Poem", ""))
        if not poem:
            # A sample whose assistant turn is empty teaches nothing; skip it.
            continue

        title = clean(item.get("Title", ""))
        poet = clean(item.get("Poet", ""))
        tags = clean(item.get("Tags"))  # clean() already maps None to ""

        user_prompt = make_user_prompt(title, tags, poet)

        # One chat-style record per poem: a user instruction and the poem as the reply.
        record = {
            "messages": [
                {"role": "user", "content": user_prompt},
                {"role": "assistant", "content": poem}
            ]
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
        written += 1

# Report the number of records actually written, not the dataset size.
print(f"转换完成！共 {written} 条，输出到 {output_path}")


转换完成！共 13854 条，输出到 D:\桌面\DL\Pro\poetry_sft_enhanced.jsonl


In [20]:
import json
from pathlib import Path

# -----------------------------
# Path configuration
# -----------------------------
# Input is read line by line as JSON objects; output is one JSON record per line.
input_path = Path("D:/桌面/DL/Pro/gutenberg-poetry-v001.ndjson")
# Note: this rebinds `output_path`, which the Poetry Foundation cell also assigns.
output_path = Path("D:/桌面/DL/Pro/gutenberg_poem_line_sft.jsonl")

# -----------------------------
# Step 1: filtering rule
# -----------------------------
def is_poem_line(line: str) -> bool:
    """Decide whether ``line`` should be kept as a verse line.

    A line is rejected when any of the following holds:
      * it contains (case-insensitively) one of the front-matter / annotation
        fragments listed below;
      * it is longer than 70 characters;
      * it has 15 or more whitespace-separated words.
    Everything else, including the empty string, is accepted.
    """
    banned_fragments = (
        # single keywords
        "copyright", "collected by", "chapter", "gutenberg", "introduction",
        # explanatory phrases
        "is based on", "legends and stories", "notes on the text",
    )

    lowered = line.lower()
    has_banned_fragment = any(fragment in lowered for fragment in banned_fragments)
    within_limits = len(line) <= 70 and len(line.split()) < 15

    return within_limits and not has_banned_fragment

# -----------------------------
# Step 2: read every input line and emit one SFT sample per kept verse
# -----------------------------
with open(input_path, "r", encoding="utf-8") as fin, \
     open(output_path, "w", encoding="utf-8") as fout:

    # NOTE(review): the instruction mentions "the following example", but no
    # example text is included in the user turn -- confirm this is intended.
    user_turn = {
        "role": "user",
        "content": "Write a poem in the same style as the following example."
    }

    for raw_line in fin:
        if not raw_line.strip():
            continue  # blank line in the input file

        # The verse text is read from the "s" field of each JSON object.
        verse = json.loads(raw_line).get("s", "").strip()
        if verse and is_poem_line(verse):
            sample = {"messages": [user_turn, {"role": "assistant", "content": verse}]}
            fout.write(json.dumps(sample, ensure_ascii=False) + "\n")

print(f"✔ 已生成每行诗句风格学习 SFT JSONL: {output_path}")


✔ 已生成每行诗句风格学习 SFT JSONL: D:\桌面\DL\Pro\gutenberg_poem_line_sft.jsonl


In [None]:
from datasets import load_from_disk
from collections import Counter

# -----------------------------
# Load the dataset
# -----------------------------
dataset_path = r"D:/桌面/DL/Pro/poetry_foundation"
dataset = load_from_disk(dataset_path)
train_dataset = dataset["train"]

# -----------------------------
# Author statistics
# -----------------------------
# NOTE(review): this rebinds `author_counter`, which make_user_prompt reads.
# Here the keys are the raw "Poet" values, whereas the conversion cell counts
# names passed through clean() -- confirm the two are not meant to differ.
author_counter = Counter(record.get("Poet", "Unknown") for record in train_dataset)
unique_authors = len(author_counter)
total_poems = len(train_dataset)

# -----------------------------
# Report
# -----------------------------
print(f"转换完成！共 {total_poems} 条记录")
print(f"唯一作者数: {unique_authors}")

print("\n前 10 位诗歌最多的作者:")
top_authors = author_counter.most_common(10)
for author, count in top_authors:
    print(f"{author}: {count} 首")


转换完成！共 13854 条记录
唯一作者数: 3128

前 10 位诗歌最多的作者:
William Shakespeare: 85 首
Alfred, Lord Tennyson: 73 首
Emily Dickinson: 51 首
William Wordsworth: 51 首
Rae Armantrout: 49 首
John Ashbery: 42 首
Yusef Komunyakaa: 42 首
William Butler Yeats: 41 首
John Donne: 38 首
Robert Browning: 35 首
Percy Bysshe Shelley: 35 首
Walt Whitman: 35 首
Algernon Charles Swinburne: 35 首
Kay Ryan: 34 首
John Milton: 33 首
William Blake: 33 首
Edmund Spenser: 33 首
Samuel Menashe: 33 首
W. S. Merwin: 32 首
Edward Thomas: 32 首
Thomas Hardy: 32 首
Henry Wadsworth Longfellow: 32 首
W. S. Di Piero: 32 首
Samuel Taylor Coleridge: 32 首
Edgar Lee Masters: 31 首
Edna St. Vincent Millay: 29 首
Robert Frost: 28 首
Wallace Stevens: 28 首
Dean Young: 28 首
Jane Hirshfield: 28 首
Billy Collins: 27 首
William Carlos Williams: 27 首
Kahlil Gibran: 27 首
Sir Philip Sidney: 27 首
George Herbert: 27 首
Amy Lowell: 27 首
Matthew Arnold: 27 首
Frank Stanford: 27 首
Elizabeth Barrett Browning: 26 首
Carl Sandburg: 26 首
Ben Jonson: 26 首
Naomi Shihab Nye: 25 首
Sir  Tho

In [36]:
import json
from pathlib import Path

# Path of the SFT JSONL file to preview
sft_path = Path("D:/桌面/DL/Pro/poetry_sft.jsonl")

print("=== 前 15 条 SFT 样本 ===\n")

with open(sft_path, "r", encoding="utf-8") as f:
    # Sample numbers are 1-based; stop once 15 samples have been shown.
    for number, raw in enumerate(f, start=1):
        if number > 15:
            break
        record = json.loads(raw.strip())

        print(f"--- SFT 样本 {number} ---")
        for msg in record.get("messages", []):
            print(f"{msg.get('role')}: {msg.get('content')}\n")
        print("------------------------------\n")


=== 前 15 条 SFT 样本 ===

--- SFT 样本 1 ---
user: Write a poem in the style of Michelle Menting.

assistant: Dog bone, stapler,
cribbage board, garlic press
     because this window is loose—lacks
suction, lacks grip.
Bungee cord, bootstrap,
dog leash, leather belt
     because this window had sash cords.
They frayed. They broke.
Feather duster, thatch of straw, empty
bottle of Elmer's glue
     because this window is loud—its hinges clack
open, clack shut.
Stuffed bear, baby blanket,
single crib newel
     because this window is split. It's dividing
in two.
Velvet moss, sagebrush,
willow branch, robin's wing
     because this window, it's pane-less. It's only
a frame of air.

------------------------------

--- SFT 样本 2 ---
user: Write a poem in the style of Lucia Cherciu.

assistant: The old cupola glinted above the clouds, shone
among fir trees, but it took him an hour
for the half mile all the way up the hill. As he trailed,
the village passed him by, greeted him,
asked about his healt

In [8]:
import json
import torch
from pathlib import Path
from tokenizers import Tokenizer

# ---------------------------
# Local tokenizer
# ---------------------------
tokenizer_path = r"D:\桌面\课题\大模型\qwen2.5-7b\tokenizer.json"
tokenizer = Tokenizer.from_file(tokenizer_path)

# ---------------------------
# Read the first sample of the JSONL file
# ---------------------------
sft_path = Path(r"D:/桌面/DL/Pro/poetry_sft.jsonl")
with open(sft_path, "r", encoding="utf-8") as f:
    first_line = f.readline()
sample_jsonl = json.loads(first_line.strip())

# ---------------------------
# Concatenate the user and assistant turns
# ---------------------------
# Each turn is stripped and terminated by a newline; turns of the same role
# are concatenated in file order.
messages = sample_jsonl.get("messages", [])
user_text = "".join(m["content"].strip() + "\n" for m in messages if m["role"] == "user")
assistant_text = "".join(m["content"].strip() + "\n" for m in messages if m["role"] == "assistant")

full_text = user_text + assistant_text

# ---------------------------
# Tokenization
# ---------------------------
# NOTE(review): assumes tokenizing `user_text` alone yields the same ids as the
# leading part of `full_text` -- TODO confirm for this tokenizer.
user_len = len(tokenizer.encode(user_text).ids)

input_ids = torch.tensor(tokenizer.encode(full_text).ids).unsqueeze(0)  # batch=1
labels = input_ids.clone()
labels[:, :user_len] = -100  # user tokens do not contribute to the loss

# ---------------------------
# Assume a local PyTorch LLM (Qwen2.5-7b) whose forward pass returns
# logits of shape (batch, seq_len, vocab_size).
# This demonstrates how the assistant-only loss is computed.
# ---------------------------

# Random logits stand in for the model output; use the real forward pass in practice.
batch, seq_len = input_ids.shape
vocab_size = tokenizer.get_vocab_size()
logits = torch.randn(batch, seq_len, vocab_size)  # simulated model output

# In a causal LM the logits at position t predict the token at position t+1,
# so logits and labels must be offset by one before taking the cross-entropy.
shift_logits = logits[:, :-1, :]
shift_labels = labels[:, 1:]

# Positions labelled -100 (the user part) are ignored by the loss.
loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
loss = loss_fct(
    shift_logits.reshape(-1, shift_logits.size(-1)),
    shift_labels.reshape(-1),
)

print("=== 第一条样本 assistant loss ===")
print(loss.item())


=== 第一条样本 assistant loss ===
12.38481330871582


In [None]:
# %%
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from peft import LoraConfig, get_peft_model
import sentencepiece as spm  # 如果 tokenizer 是 spm

# -------------------------
# Configuration
# -------------------------
CONFIG = {
    "model_path": r"D:/桌面/课题/大模型/qwen2.5-7b",
    # NOTE(review): an earlier cell loads "tokenizer.json" from this same
    # directory with the `tokenizers` library; confirm that a SentencePiece
    # "tokenizer.model" also exists there.
    "tokenizer_path": r"D:/桌面/课题/大模型/qwen2.5-7b/tokenizer.model",
    "data_path": r"D:/桌面/DL/Pro/poetry_sft.jsonl",
    "output_dir": r"D:/桌面/课题/大模型/qwen2.5-sft",
    "num_train_epochs": 3,
    "batch_size": 1,
    "learning_rate": 2e-5,
    # Maximum number of token ids per sample (longer samples are truncated,
    # shorter ones padded to this length by the dataset).
    "max_len": 1024,
    # Use the GPU when available, otherwise fall back to the CPU.
    "device": "cuda" if torch.cuda.is_available() else "cpu",
}

# -------------------------
# 1. Load the tokenizer (SentencePiece)
# -------------------------
sp = spm.SentencePieceProcessor()
sp.load(CONFIG["tokenizer_path"])

def tokenize(text, max_len=1024):
    """Encode ``text`` with the module-level SentencePiece processor.

    Returns the list of integer token ids, keeping at most the first
    ``max_len`` of them.
    """
    return sp.encode(text, out_type=int)[:max_len]

# -------------------------
# 2. Build the Dataset
# -------------------------
class PoetryDataset(Dataset):
    """Chat-formatted SFT samples read from a JSONL file.

    Each line of the file is a JSON object whose ``"messages"`` list holds the
    user turn at index 0 and the assistant turn at index 1.

    Args:
        jsonl_path: Path of the JSONL file.
        tokenizer: Callable ``tokenizer(text, max_len=...)`` returning a list
            of integer token ids no longer than ``max_len``.
        max_len: Fixed length every sample is truncated / padded to.
    """

    def __init__(self, jsonl_path, tokenizer, max_len=1024):
        self.data = []
        self.tokenizer = tokenizer
        self.max_len = max_len
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():  # tolerate blank lines instead of failing in json.loads
                    self.data.append(json.loads(line))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{"input_ids", "labels"}`` as 1-D long tensors of length ``max_len``.

        Labels equal the input ids on the assistant reply (including its
        closing ``<|im_end|>``) and -100 everywhere else: on the user turn, on
        the fixed assistant header and on the padding.
        """
        sample = self.data[idx]
        user_text = sample["messages"][0]["content"]
        assistant_text = sample["messages"][1]["content"]

        # Everything up to and including the assistant header is prompt, not target.
        prefix = f"<|im_start|>user\n{user_text}<|im_end|>\n<|im_start|>assistant\n"
        prompt = f"{prefix}{assistant_text}<|im_end|>"

        input_ids = list(self.tokenizer(prompt, max_len=self.max_len))
        # Clamp so the mask can never be longer than the (possibly truncated) sample.
        prefix_len = min(len(self.tokenizer(prefix, max_len=self.max_len)), len(input_ids))

        # A list slice cannot be assigned a scalar, so build the masked labels explicitly.
        labels = [-100] * prefix_len + input_ids[prefix_len:]

        # Right-pad to max_len; padded positions are excluded from the loss.
        pad_len = self.max_len - len(input_ids)
        if pad_len > 0:
            input_ids = input_ids + [0] * pad_len
            labels = labels + [-100] * pad_len

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long)
        }

# Build the training set from the SFT JSONL file; the `tokenize` function is
# passed as the tokenizer callable. Note: this rebinds `dataset`, which earlier
# cells use for the Hugging Face dataset.
dataset = PoetryDataset(CONFIG["data_path"], tokenize, CONFIG["max_len"])
# Samples are reshuffled every epoch.
loader = DataLoader(dataset, batch_size=CONFIG["batch_size"], shuffle=True)

# -------------------------
# 3. Load the Qwen2.5-7B model
# -------------------------
# This assumes a QwenForCausalLM class is available from a local `qwen_model` module.
from qwen_model import QwenForCausalLM

model = QwenForCausalLM()
# NOTE(review): CONFIG["model_path"] appears to be the directory that also holds
# the tokenizer files (see CONFIG["tokenizer_path"]), while torch.load expects
# a checkpoint file -- TODO confirm the path points at an actual state-dict file.
state_dict = torch.load(CONFIG["model_path"], map_location=CONFIG["device"])
model.load_state_dict(state_dict)
model.to(CONFIG["device"])
# Put the model in training mode before fine-tuning.
model.train()

# -------------------------
# 4. LoRA configuration
# -------------------------
# Rank-16 adapters with alpha 32 and 5% dropout.
# NOTE(review): presumably "q_proj", "v_proj", "k_proj" and "o_proj" are the
# attention projection module names inside QwenForCausalLM -- TODO confirm.
lora_cfg = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj","v_proj","k_proj","o_proj"],
    task_type="CAUSAL_LM"
)
# Wrap the base model with the LoRA configuration; `model` now refers to the wrapped model.
model = get_peft_model(model, lora_cfg)

# -------------------------
# 5. Optimizer + loss
# -------------------------
optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG["learning_rate"])
# Label -100 marks positions (prompt, padding) excluded from the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

# -------------------------
# 6. Training loop
# -------------------------
for epoch in range(CONFIG["num_train_epochs"]):
    total_loss = 0.0
    for batch in loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(CONFIG["device"])
        labels = batch["labels"].to(CONFIG["device"])

        outputs = model(input_ids=input_ids)
        logits = outputs.logits  # assumed (batch, seq_len, vocab_size)

        # Next-token objective: the logits at position t are scored against
        # the token at position t+1. Without this one-step shift the model
        # would be trained to reproduce its current input token.
        shift_logits = logits[:, :-1, :]
        shift_labels = labels[:, 1:]
        loss = loss_fn(
            shift_logits.reshape(-1, shift_logits.size(-1)),
            shift_labels.reshape(-1),
        )

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} 平均 loss: {total_loss/len(loader):.4f}")

# -------------------------
# 7. Save the LoRA model
# -------------------------
import os
import shutil

os.makedirs(CONFIG["output_dir"], exist_ok=True)
model.save_pretrained(CONFIG["output_dir"])

# Ship the tokenizer next to the adapter by copying the file it was loaded
# from. This replaces the former `sp.save(...)` call, which relies on a method
# that does not appear to exist on SentencePieceProcessor, and works whatever
# tokenizer implementation is used.
shutil.copyfile(
    CONFIG["tokenizer_path"],
    os.path.join(CONFIG["output_dir"], "tokenizer.model"),
)
print("🎉 LoRA 微调完成！")


ImportError: cannot import name 'list_repo_tree' from 'huggingface_hub' (c:\Users\DELL\AppData\Local\Programs\Python\Python313\Lib\site-packages\huggingface_hub\__init__.py)