# 安装依赖库

- 使用 Miniconda 创建并激活环境：

```bash
conda create -n traffic_forecast python=3.10
conda activate traffic_forecast
```

- 安装核心库：

```bash
pip install torch transformers datasets peft accelerate pandas numpy swanlab
```



# 生成模拟交通流量数据

In [None]:
import pandas as pd
import numpy as np

def generate_traffic_data(num_samples=1000, seq_len=24, pred_len=6, seed=None):
    """Generate a synthetic traffic-flow dataset for prompt/target fine-tuning.

    Each sample is one contiguous series of ``seq_len + pred_len`` hourly
    values: a random base level plus a sinusoidal component plus Gaussian
    noise, rounded to integers. The first ``seq_len`` values form the history
    that is written into the prompt; the remaining ``pred_len`` values form
    the target, so the target is a genuine continuation of the history.

    Args:
        num_samples: Number of rows to generate.
        seq_len: Number of historical values per sample.
        pred_len: Number of future values per sample.
        seed: Optional seed for the random generator. ``None`` (the default)
            draws fresh entropy, so results differ between calls.

    Returns:
        pandas.DataFrame with columns ``time`` (list of ``seq_len`` timestamp
        strings), ``road_id`` (str), ``traffic_flow`` (list of ``seq_len``
        ints), ``prompt`` (str) and ``target`` (str of comma-separated ints).
    """
    columns = ["time", "road_id", "traffic_flow", "prompt", "target"]
    rng = np.random.default_rng(seed)

    # Lowercase "h" is the non-deprecated hourly alias; format once up front so
    # the column holds plain strings that survive a CSV round trip.
    timestamps = pd.date_range(
        start="2024-01-01", periods=num_samples + seq_len + pred_len, freq="h"
    )
    stamp_strings = timestamps.strftime("%Y-%m-%d %H:%M:%S").tolist()

    # Same phase step as np.linspace(0, 2*pi, seq_len) for the history, then
    # simply continued for the forecast horizon.
    phase_step = 2 * np.pi / max(seq_len - 1, 1)
    phases = phase_step * np.arange(seq_len + pred_len)

    records = []
    for i in range(num_samples):
        road_id = str(rng.choice(["A1", "B2", "C3"]))
        base_flow = int(rng.integers(50, 200))

        series = base_flow + 20 * np.sin(phases) + rng.normal(0, 5, seq_len + pred_len)
        # .tolist() yields built-in ints, so the f-string below renders
        # "[160, 167, ...]" rather than numpy scalar reprs.
        series = series.round().astype(int).tolist()
        historical, future = series[:seq_len], series[seq_len:]

        prompt = f"已知过去{seq_len}小时交通流量为：{historical}，预测未来{pred_len}小时流量为："
        target = ", ".join(map(str, future))

        records.append(
            {
                "time": stamp_strings[i:i + seq_len],
                "road_id": road_id,
                "traffic_flow": historical,
                "prompt": prompt,
                "target": target,
            }
        )

    # Passing `columns` keeps the five-column schema even when num_samples == 0.
    return pd.DataFrame(records, columns=columns)

# Generate the dataset and persist it as CSV.
import os

traffic_df = generate_traffic_data()
# to_csv does not create missing parent directories, so make sure ./data exists.
os.makedirs("./data", exist_ok=True)
traffic_df.to_csv("./data/traffic_dataset.csv", index=False)

# 读取数据并转换为 Dataset

In [None]:
# Load the CSV and convert it into a Hugging Face Dataset with a train/test split.

import re

from datasets import Dataset
import pandas as pd


traffic_df = pd.read_csv("./data/traffic_dataset.csv")

# After the CSV round trip every 'time' cell is a single string (the repr of the
# original sequence), so iterating over it would yield individual characters.
# Extract the timestamps from that string instead.
_timestamp_pattern = re.compile(r"\d{4}-\d{2}-\d{2}(?: \d{2}:\d{2}:\d{2})?")
traffic_df['time'] = traffic_df['time'].apply(lambda cell: _timestamp_pattern.findall(str(cell)))

# Create the Dataset and split it in half; the fixed seed makes the split
# reproducible across kernel restarts.
dataset = Dataset.from_pandas(traffic_df)
dataset = dataset.train_test_split(test_size=0.5, seed=42)

# 模型准备与LoRA配置  
下载 Qwen2-0.5B 基座模型。  
首次运行需要联网下载模型权重，耗时较长；之后会直接使用本地缓存。


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# NOTE: if the hub requires authentication, uncomment the two lines below.
# from huggingface_hub import login
# login()

# Hub identifier of the base checkpoint; tokenizer and model are both loaded from it.
model_name = "Qwen/Qwen2-0.5B"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_name)

In [None]:
# LoRA adapter configuration.

from peft import LoraConfig, get_peft_model

# Hyper-parameters gathered in one mapping so they can be inspected or tweaked
# in a single place before the config object is built.
_lora_settings = {
    "r": 8,                                            # adapter rank
    "lora_alpha": 32,                                  # scaling factor
    "target_modules": ["q_proj", "k_proj", "v_proj"],  # attention projections to adapt
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
}
lora_config = LoraConfig(**_lora_settings)

# Wrap the base model with the adapters and report how many parameters are trainable
# (original note: expected to be roughly 0.1% of the total).
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


# 模型微调

In [None]:
# Tokenization / label construction for causal-LM fine-tuning.

def preprocess_function(examples, max_length=640):
    """Tokenize a batch of prompt+target pairs and build training labels.

    Args:
        examples: Batch mapping with "prompt" and "target" lists of strings
            (the form `Dataset.map(..., batched=True)` passes in).
        max_length: Fixed sequence length every example is padded/truncated to.
            It must be large enough to hold the whole prompt *and* the target;
            the previous value of 128 cut the target off because a 24-value
            prompt alone is longer than that. NOTE(review): 640 is an estimate
            for the float-valued prompts; measure real lengths with
            `len(tokenizer(text).input_ids)` and lower it if they are shorter.

    Returns:
        The tokenizer output with an added "labels" tensor: a copy of
        "input_ids" in which padding positions are set to -100 so that they
        are ignored by the loss.
    """
    # Terminate every example with EOS so the model can learn where the
    # forecast ends.
    eos = tokenizer.eos_token or ""
    texts = [p + t + eos for p, t in zip(examples["prompt"], examples["target"])]
    tokenized = tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    labels = tokenized["input_ids"].clone()
    # Mask by attention_mask rather than by pad token id: the pad token may be
    # the same token as EOS, and the real EOS must stay in the labels.
    labels[tokenized["attention_mask"] == 0] = -100
    tokenized["labels"] = labels
    return tokenized

tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
import swanlab

# Start a SwanLab run for experiment tracking / visualisation.
swanlab.init(
    project="Traffic-Forecast",
    experiment_name="Qwen2-LoRA",
)

In [None]:
# Training configuration and launch.
import torch
from transformers import TrainingArguments, Trainer

# Requesting bf16 on a GPU without bf16 support makes TrainingArguments raise,
# so enable it only when the current CUDA device reports support and fall back
# to full precision otherwise.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,             # small number of epochs
    per_device_train_batch_size=2,  # small batch size
    gradient_accumulation_steps=1,  # no gradient accumulation
    learning_rate=5e-4,
    fp16=False,
    bf16=use_bf16,
    logging_steps=200,              # log every 200 optimizer steps
    report_to="swanlab"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"]
)

# Start training.
trainer.train()

# 模型调用与预测

In [None]:
# Reload the base model, attach the fine-tuned LoRA adapter and merge it.
import os

from peft import PeftModel
from transformers import AutoModelForCausalLM
from transformers.trainer_utils import get_last_checkpoint

# Same hub id (including casing) as the one used for training.
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B")

# The checkpoint step number depends on dataset size, batch size, epochs and
# device count, so locate the most recent checkpoint instead of hard-coding
# "checkpoint-500".
results_dir = "./results"
adapter_dir = get_last_checkpoint(results_dir) if os.path.isdir(results_dir) else None
if adapter_dir is None:
    raise FileNotFoundError(
        f"No checkpoint-* directory found under {results_dir!r}; run training first."
    )

# PeftModel reads adapter_config.json from the checkpoint directory.
model = PeftModel.from_pretrained(model, adapter_dir)
model = model.merge_and_unload()  # fold the LoRA weights into the base model

In [None]:
# Inference helper.
def predict_traffic(prompt, max_new_tokens=50, return_full_text=False):
    """Generate a forecast for `prompt` with greedy decoding.

    Args:
        prompt: Prompt text in the same format used for training.
        max_new_tokens: Upper bound on the number of generated tokens.
        return_full_text: When True, return the prompt followed by the
            completion; by default only the newly generated text is returned.

    Returns:
        The decoded text (special tokens removed, surrounding whitespace
        stripped).
    """
    # Keep the inputs on the model's device and pass the attention mask along
    # with the input ids.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        # Greedy decoding; `temperature` has no effect without sampling, so it
        # is not passed.
        do_sample=False
    )
    sequence = outputs[0]
    if not return_full_text:
        # generate() returns prompt + completion; drop the prompt tokens.
        sequence = sequence[inputs["input_ids"].shape[1]:]
    return tokenizer.decode(sequence, skip_special_tokens=True).strip()

# Example: run one forecast on a hand-written 24-hour history and show the result.
test_prompt = "已知过去24小时交通流量为：[160, 167, 164, 176, 175, 179, 189, 172, 163, 171, 161, 157, 160, 140, 145, 139, 133, 137, 145, 146, 143, 148, 141, 153]，预测未来6小时流量为："
prediction = predict_traffic(prompt=test_prompt)
# Original note: the forecast is expected to look like "132, 128, 140, 145, 138, 130".
print(prediction)