In [1]:
import requests
from bs4 import BeautifulSoup
from akshare.utils import demjson
from akshare.utils.tqdm import get_tqdm
import pandas as pd
from datetime import datetime, timedelta

In [None]:
# Scrape board (concept / industry) listings from 10jqka and download the
# per-year K-line history of each board.
#
# Results (module-level, consumed by later cells):
#   link_code_ls  - (href, board name) for every link found on the listing pages
#   today_data_ls - (name, code, open, high, low, close, volume, amount) text
#                   values read from each board's detail page
#   bk_data_ls    - (name, DataFrame[date, open, high, low, close, volume, amount])
start_year = "2020"
REQUEST_TIMEOUT = 10  # seconds; keeps a stalled connection from hanging the cell

headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/89.0.4389.90 Safari/537.36",
    }
headers_bk = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/89.0.4389.90 Safari/537.36",
            "Referer": "https://q.10jqka.com.cn",
            "Host": "d.10jqka.com.cn",
        }

link_code_ls = []
bk_data_ls = []
today_data_ls = []

# Step 1: collect every board link from the two listing pages.
for suffix in ["gn/", "thshy/"]:
    url = f"https://q.10jqka.com.cn/{suffix}"
    r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(r.text, features="lxml")
    divs = soup.find_all(name="div", attrs={"class": "cate_items"})
    for div in divs:
        # Every <a> inside a "cate_items" div contributes (href, link text).
        for link in div.find_all("a"):
            link_code_ls.append((link.get("href"), link.text))

print(len(link_code_ls))

# Loop-invariant setup hoisted out of the per-board loop.
tqdm = get_tqdm()
current_year = datetime.now().year
kline_columns = ["date", "open", "high", "low", "close", "volume", "amount"]

# Step 2: visit each board page. Only the first two boards are processed here.
for symbol_url, name in link_code_ls[:2]:
    r = requests.get(symbol_url, headers=headers, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(r.text, features="lxml")
    symbol_code = soup.find(name="div", attrs={"class": "board-hq"}).find("span").text

    dd_texts = [dd.text for dd in soup.find_all("dd")]
    # Match on "board-xj" alone: the second class ("arr-fall" in the original
    # lookup) varies, so requiring it made the lookup return None - and crash -
    # whenever the span carried a different second class.
    close_tag = soup.find("span", class_="board-xj")
    close = close_tag.text if close_tag is not None else None
    # NOTE(review): the positional mapping of <dd> entries below mirrors the
    # original code; confirm it against the current page layout.
    open_, low, high, volume, amount = dd_texts[0], dd_texts[2], dd_texts[3], dd_texts[4], dd_texts[9]
    today_data_ls.append((name, symbol_code, open_, high, low, close, volume, amount))

    # Step 3: download one K-line file per year and split it into columns.
    yearly_frames = []
    for year in tqdm(range(int(start_year), current_year + 1), leave=False):
        url = f"https://d.10jqka.com.cn/v4/line/bk_{symbol_code}/01/{year}.js"
        r = requests.get(url, headers=headers_bk, timeout=REQUEST_TIMEOUT)
        data_text = r.text
        try:
            # The payload is the text from the first "{" up to (excluding) the
            # last character. Decode once and reuse the result.
            payload = demjson.decode(data_text[data_text.find("{") : -1])
        except Exception:
            # Years whose response cannot be decoded are skipped.
            continue
        rows = pd.Series(payload["data"].split(";"))
        yearly_frames.append(rows.str.split(",", expand=True))

    if not yearly_frames:
        # Nothing decoded for this board; it gets no bk_data_ls entry, so match
        # today_data_ls and bk_data_ls by name rather than by position.
        continue

    big_df = pd.concat(objs=yearly_frames, ignore_index=True)
    if big_df.shape[1] < len(kline_columns):
        continue
    # Only the first seven fields are used; trailing fields (the original code
    # handled 11- and 12-column responses) are dropped. .copy() avoids
    # assigning into a view of the concatenated frame.
    big_df = big_df.iloc[:, : len(kline_columns)].copy()
    big_df.columns = kline_columns
    big_df["date"] = pd.to_datetime(big_df["date"], errors="coerce").dt.date
    for column in kline_columns[1:]:
        big_df[column] = pd.to_numeric(big_df[column], errors="coerce")
    bk_data_ls.append((name, big_df))

442


In [6]:
# Preview the first scraped board: its name, then the last five K-line rows.
first_board_name, first_board_df = bk_data_ls[0]
print(first_board_name)
print(first_board_df.tail())

阿尔茨海默概念
           date      open      high       low     close     volume  \
472  2025-09-02  1353.379  1353.603  1320.547  1330.656  982424350   
473  2025-09-03  1333.546  1343.817  1317.655  1322.170  813860100   
474  2025-09-04  1326.883  1337.293  1294.136  1313.114  837025920   
475  2025-09-05  1312.889  1335.951  1290.293  1335.951  693464640   
476  2025-09-08  1336.437  1356.452  1335.287  1353.549  705241630   

           amount  
472  1.789247e+10  
473  1.646862e+10  
474  1.476401e+10  
475  1.385096e+10  
476  1.465946e+10  


In [None]:
import torch
import torch.nn  as nn
import numpy as np 
import pandas as pd
from sklearn.preprocessing  import MinMaxScaler
from torch.utils.data  import Dataset, DataLoader
 
# ===== 1. 数据预处理 ===== 
import numpy as np
import torch
from torch.utils.data  import Dataset
from sklearn.preprocessing  import MinMaxScaler
 
class StockDataset(Dataset):
    """Sliding-window dataset built from several price series.

    Each series is scaled to [-1, 1] with its own MinMaxScaler. One sample is a
    window of ``seq_length`` consecutive scaled rows; its label is the scaled
    "close" value ``forecast_gap`` rows after the last row of the window.
    """

    # Feature columns in the order the samples store them; "close" is index 3.
    FEATURE_COLUMNS = ("open", "high", "low", "close", "volume", "amount")
    CLOSE_INDEX = 3

    def __init__(self, bk_data_ls, seq_length=16, forecast_gap=2):
        """
        bk_data_ls: list of (code, data) tuples, where
            - code: identifier of the series (str), used as the scaler key
            - data: DataFrame or 2-D array. A DataFrame that contains all of
              FEATURE_COLUMNS is reduced to exactly those columns in that
              order (so extra columns such as "date" are ignored); anything
              else is taken positionally and must already be laid out as
              [open, high, low, close, volume, amount].
        seq_length: number of rows in each input window
        forecast_gap: how many rows after the window's last row the label is
            taken from (e.g. 2 = the day after tomorrow)
        """
        self.seq_length  = seq_length
        self.forecast_gap  = forecast_gap
        self.scalers  = dict()  # one fitted scaler per series code

        all_X = []
        all_y = []

        for code, data in bk_data_ls:
            features = self._to_feature_array(data)
            if len(features) == 0:
                # An empty series has nothing to fit a scaler on; skip it.
                continue

            scaler = MinMaxScaler(feature_range=(-1, 1))
            scaled_data = scaler.fit_transform(features)
            self.scalers[code]  = scaler  # kept for inverse transforms / updates

            # The label of window i sits at i + seq_length + forecast_gap - 1,
            # so the last usable start is len - seq_length - forecast_gap and
            # the window count is that value plus one. (The original loop
            # stopped one window early and never used the final row.)
            n_windows = len(scaled_data) - seq_length - forecast_gap + 1
            for i in range(max(n_windows, 0)):
                target_idx = i + seq_length + forecast_gap - 1
                all_X.append(scaled_data[i:i + seq_length])
                all_y.append(scaled_data[target_idx, self.CLOSE_INDEX])

        # Stack everything into tensors: X is (samples, seq_length, features),
        # y is (samples, 1).
        self.X = torch.tensor(np.array(all_X),  dtype=torch.float32)
        self.y = torch.tensor(np.array(all_y),  dtype=torch.float32).view(-1,  1)

    @classmethod
    def _to_feature_array(cls, data):
        """Return ``data`` as a float array, selecting FEATURE_COLUMNS by name when possible."""
        columns = getattr(data, "columns", None)
        if columns is not None and all(col in columns for col in cls.FEATURE_COLUMNS):
            data = data[list(cls.FEATURE_COLUMNS)]
        return np.asarray(data, dtype=np.float64)

    def __len__(self):
        """Number of (window, label) samples across all series."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the (window, label) pair at position ``idx``."""
        return self.X[idx], self.y[idx]

    def get_scaler(self, code):
        """
        Return the scaler fitted for ``code``, or None if the code is unknown.
        """
        return self.scalers.get(code,  None)

    def update_scaler(self, new_data_ls):
        """Incrementally update the stored scalers with new data.

        new_data_ls: list of (code, data) tuples in the same format accepted by
        the constructor. Codes without a stored scaler are ignored.

        Only the scalers change: self.X / self.y were scaled in __init__ and
        are not recomputed here.
        """
        for code, new_data in new_data_ls:
            if code in self.scalers:
                scaler = self.scalers[code]
                # partial_fit folds the new rows into the scaler's fitted range.
                scaler.partial_fit(self._to_feature_array(new_data))


class MultiScaleAttentionLSTM(nn.Module):
    """Two LSTMs (full window + recent tail) fused by self-attention, then a linear head.

    The long LSTM reads the whole input window; the short LSTM reads only the
    last ``seq_length_short`` steps. Their output sequences are concatenated
    along the time axis, passed through multi-head self-attention, and the last
    position of the attention output is regressed to a single value.
    """

    def __init__(self, input_size, hidden_size = 64, num_layers_long = 3, num_layers_short = 2, num_heads = 4):
        """
        input_size: number of features per time step
        hidden_size: LSTM hidden size and attention embedding size; must be
            divisible by ``num_heads``
        num_layers_long / num_layers_short: stacked layers of each LSTM
            (inter-layer dropout of 0.2 is enabled only when there is more
            than one layer)
        num_heads: number of attention heads (default 4, as before)
        """
        super().__init__()

        self.lstm_long  = nn.LSTM(input_size, hidden_size, num_layers=num_layers_long, batch_first=True, dropout=0.2 if num_layers_long > 1 else 0)
        self.lstm_short  = nn.LSTM(input_size, hidden_size, num_layers=num_layers_short, batch_first=True, dropout=0.2 if num_layers_short > 1 else 0)
        # batch_first is a constructor option of MultiheadAttention, not a
        # forward() argument; it makes the layer accept [batch, seq, hidden].
        self.attn  = nn.MultiheadAttention(hidden_size, num_heads=num_heads, batch_first=True)

        # NOTE(review): there is no non-linearity between the two Linear layers,
        # so the head is equivalent to a single linear map. Left unchanged to
        # keep the parameter layout.
        self.regressor  = nn.Sequential(
            nn.Linear(hidden_size, 32),
            nn.Linear(32, 1)
        )

    def forward(self, x1, seq_length_short=7):
        """Map a batch of windows [batch, seq_len, input_size] to predictions [batch, 1].

        seq_length_short: number of trailing time steps fed to the short LSTM.
        """
        out1, _ = self.lstm_long(x1)
        out2, _ = self.lstm_short(x1[:, -seq_length_short:, :])  # each output: [batch, steps, hidden]
        lstm_out = torch.cat((out1,  out2), dim=1)
        # forward() returns (attn_output, attn_weights); only the output is needed.
        attn_out, _ = self.attn(lstm_out,  lstm_out, lstm_out, need_weights=False)
        out = self.regressor(attn_out[:, -1, :])
        return out

 
# ===== 3. Training configuration =====
if __name__ == "__main__":
    # Load data (example). read_csv's usecols does not control column order,
    # so re-index explicitly: the dataset and the inverse transform below both
    # rely on "close" being column 3.
    feature_cols = ['open', 'high', 'low', 'close', 'volume', 'amount']
    data = pd.read_csv('stock.csv',  usecols=feature_cols)[feature_cols]

    # Build the dataset. StockDataset expects a list of (code, data) pairs,
    # not a bare DataFrame; the code is the key of the fitted scaler.
    stock_code = 'stock'
    dataset = StockDataset([(stock_code, data)], seq_length=10, forecast_gap=2)
    train_size = int(0.8 * len(dataset))
    # NOTE(review): consecutive windows overlap, so a random split puts nearly
    # identical windows in both subsets; the reported test loss is optimistic.
    train_set, test_set = torch.utils.data.random_split(
        dataset, [train_size, len(dataset) - train_size]
    )

    # Data loaders
    train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_set, batch_size=32)

    # Model. The constructor has no `num_layers` parameter; the depth of each
    # LSTM is set separately.
    device = torch.device('cuda'  if torch.cuda.is_available()  else 'cpu')
    model = MultiScaleAttentionLSTM(
        input_size=6,  # six features
        hidden_size=64,
        num_layers_long=2,
        num_layers_short=2
    ).to(device)

    # Loss, optimizer and learning-rate schedule
    criterion = nn.MSELoss()

    optimizer = torch.optim.Adam(model.parameters(),  lr=0.001)
    # `verbose` is deprecated on PyTorch schedulers and is therefore omitted.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',       # monitor the validation loss
        factor=0.5,       # multiply the learning rate by this on a plateau
        patience=5        # tolerate 5 epochs without improvement
    )

    # ===== 4. Training loop =====
    for epoch in range(100):
        model.train()
        train_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device),  y_batch.to(device)

            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(),  1.0)  # gradient clipping
            optimizer.step()
            train_loss += loss.item()

        # Validation
        model.eval()
        test_loss = 0
        with torch.no_grad():
            for X_test, y_test in test_loader:
                X_test, y_test = X_test.to(device),  y_test.to(device)
                preds = model(X_test)
                test_loss += criterion(preds, y_test).item()

        scheduler.step(test_loss)
        print(f'Epoch {epoch} | Train Loss: {train_loss/len(train_loader):.6f} | Test Loss: {test_loss/len(test_loader):.6f}')

    # ===== 5. Prediction example =====
    model.eval()
    with torch.no_grad():
        sample = test_set[0][0].unsqueeze(0).to(device)
        prediction = model(sample)
        scaled_pred = prediction.cpu().numpy()

        # Undo the scaling of the close price. The scaler works on all six
        # columns, so place the prediction in the close column of a dummy row.
        dummy = np.zeros((1,  6))
        dummy[:, 3] = scaled_pred.item()
        # The dataset keeps one scaler per code; it has no `scaler_close`.
        real_pred = dataset.get_scaler(stock_code).inverse_transform(dummy)[0,  3]
        print(f'预测的后天收盘价: {real_pred:.2f}')


# Assumes a trained `model`, the `dataset` it was trained on, and the frames
# df_new_000001 / df_new_600000 already exist in the session.
new_data = [('000001.SZ', df_new_000001), ('600000.SH', df_new_600000)]

# Incrementally update the stored scalers with the new rows.
dataset.update_scaler(new_data)


def _build_finetune_set(dataset, new_data_ls):
    """Window the new rows into (X, y) samples using the dataset's stored scalers.

    Follows the dataset's windowing: each sample is ``dataset.seq_length``
    scaled rows, labelled with the scaled close (column 3)
    ``dataset.forecast_gap`` rows after the window. Codes without a stored
    scaler are skipped.
    """
    feature_cols = ['open', 'high', 'low', 'close', 'volume', 'amount']
    windows, targets = [], []
    for code, frame in new_data_ls:
        scaler = dataset.get_scaler(code)
        if scaler is None:
            continue
        columns = getattr(frame, 'columns', None)
        if columns is not None and all(col in columns for col in feature_cols):
            frame = frame[feature_cols]
        values = np.asarray(frame, dtype=np.float64)
        if len(values) == 0:
            continue
        scaled = scaler.transform(values)
        n_windows = len(scaled) - dataset.seq_length - dataset.forecast_gap + 1
        for start in range(max(n_windows, 0)):
            target_idx = start + dataset.seq_length + dataset.forecast_gap - 1
            windows.append(scaled[start:start + dataset.seq_length])
            targets.append(scaled[target_idx, 3])
    X = torch.tensor(np.array(windows), dtype=torch.float32)
    y = torch.tensor(np.array(targets), dtype=torch.float32).view(-1, 1)
    return torch.utils.data.TensorDataset(X, y)


# Transform the new data with the updated scalers. (StockDataset has no
# transform_new_data method, so the windows are built by the helper above.)
new_X = _build_finetune_set(dataset, new_data)

# Fine-tune the existing model rather than retraining from scratch.
# NOTE(review): the original comment asked for a smaller learning rate, but
# 0.001 equals the rate used for Adam in the training block; lower it if a
# smaller step was intended.
optimizer = torch.optim.SGD(model.parameters(),  lr=0.001)
criterion = nn.MSELoss()
device = next(model.parameters()).device  # keep batches on the model's device
model.train()
for epoch in range(5):
    for X_batch, y_batch in DataLoader(new_X, batch_size=32):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        # The model returns predictions; the loss comes from the criterion.
        # Passing y_batch to the model would bind it to `seq_length_short`.
        loss = criterion(model(X_batch), y_batch)
        loss.backward()
        optimizer.step()