# 4.1 建模前准备（数据清洗 + 时间切分 + 缺失填补）

本 notebook 固定执行以下步骤：

1. 读取宽表 `call_features_wide`
2. 删除 `2015Q1-Q3` 以及其他范围外或无法解析的季度（仅保留 `2015Q4-2025Q4`）
3. 删除 `eps_beat` 缺失的整行
4. 时间切分：
   - 训练集：`2015Q4-2022Q4`
   - 样本外：`2023Q1-2025Q4`
5. 极少数 feature 缺失：用**训练集均值**填充（并应用到训练/样本外）
6. 导出建模数据与填补参数

In [1]:
# ========== 配置 ==========

import json
import sqlite3
from pathlib import Path

import numpy as np
import pandas as pd

# Paths are anchored on the parent of the current working directory.
PROJECT_ROOT = Path("..").resolve()
WIDE_DB = PROJECT_ROOT.joinpath("data", "earnings_calls_features_wide.db")
WIDE_TABLE = "call_features_wide"

# Output folder for the modeling artefacts; created up front if absent.
OUT_DIR = PROJECT_ROOT.joinpath("data", "modeling")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Inclusive quarter bounds ('YYYY-Qn') for the training and out-of-sample windows.
TRAIN_START, TRAIN_END = "2015-Q4", "2022-Q4"
OOS_START, OOS_END = "2023-Q1", "2025-Q4"

# Label column plus the non-feature columns that travel with each row.
LABEL_COL = "eps_beat"
META_COLS = [
    "ticker",
    "quarter",
    "next_quarter_estimated",
    "next_quarter_actual",
    LABEL_COL,
]

print(f"WIDE_DB: {WIDE_DB}")
print(f"WIDE_TABLE: {WIDE_TABLE}")
print(f"OUT_DIR: {OUT_DIR}")

WIDE_DB: /Users/xinyuewang/Desktop/1.27/data/earnings_calls_features_wide.db
WIDE_TABLE: call_features_wide
OUT_DIR: /Users/xinyuewang/Desktop/1.27/data/modeling


In [2]:
# ========== 1) Load the wide feature table ==========

# sqlite3.connect() silently creates an empty database file when the path does
# not exist, which would only surface later as a confusing "no such table"
# error. Fail fast with a clear message instead.
if not WIDE_DB.exists():
    raise FileNotFoundError(f"找不到宽表数据库: {WIDE_DB}")

conn = sqlite3.connect(WIDE_DB)
try:
    # WIDE_TABLE is a constant from the config cell (not external input), so
    # interpolating it into the statement is acceptable here.
    df = pd.read_sql_query(f"SELECT * FROM {WIDE_TABLE}", conn)
finally:
    # Release the connection even when the query raises.
    conn.close()

print(f"原始行数: {len(df)}")
print(f"原始列数: {len(df.columns)}")
print(df[["ticker", "quarter", LABEL_COL]].head().to_string(index=False))

原始行数: 1201
原始列数: 74
ticker quarter  eps_beat
  AAPL 2015-Q1       1.0
  AAPL 2015-Q3       1.0
  AAPL 2015-Q4       1.0
  AAPL 2016-Q1       0.0
  AAPL 2016-Q2       1.0


In [3]:
# ========== 2) quarter 解析 + 删除 2015Q1-Q3 ==========

def quarter_to_index(q):
    """Convert a 'YYYY-Qn' quarter label into a comparable integer index.

    The index is ``year * 4 + n``, so consecutive quarters map to consecutive
    integers and labels can be compared or range-filtered numerically.

    Args:
        q: Quarter label such as ``"2015-Q4"``. Surrounding whitespace is
            ignored; missing values (``None`` / ``NaN``) are accepted.

    Returns:
        The integer index for a well-formed label, otherwise ``np.nan``
        (missing input, wrong shape, non-numeric parts, or a quarter number
        outside 1-4).
    """
    if pd.isna(q):
        return np.nan
    try:
        year_part, quarter_part = str(q).strip().split("-")
        year = int(year_part)
        quarter_num = int(quarter_part.replace("Q", ""))
    except ValueError:
        # Raised both when the split does not yield exactly two parts and when
        # int() meets non-numeric text; either way the label is malformed.
        return np.nan
    # A quarter outside 1-4 would alias a different period (e.g. "2015-Q5"
    # would equal "2016-Q1"), so reject it rather than mis-index the row.
    if not 1 <= quarter_num <= 4:
        return np.nan
    return year * 4 + quarter_num

def qidx(qstr):
    """Shorthand wrapper: return the comparable time index for quarter label ``qstr``."""
    index = quarter_to_index(qstr)
    return index

# Comparable integer index per row; labels that cannot be parsed become NaN.
df["_qidx"] = df["quarter"].apply(quarter_to_index)

# Derive the keep-window from the config constants rather than repeating the
# literals, so editing TRAIN_START / OOS_END cannot desynchronise this filter
# from the train / out-of-sample split that uses the same constants.
min_keep = qidx(TRAIN_START)
max_keep = qidx(OOS_END)

# A malformed bound parses to NaN, which would make between() reject every row
# and silently produce an empty frame; surface the config error instead.
if pd.isna(min_keep) or pd.isna(max_keep):
    raise ValueError(
        f"无法解析季度配置: TRAIN_START={TRAIN_START!r}, OOS_END={OOS_END!r}"
    )

before = len(df)
# NaN indices never satisfy between(), so rows with an unparseable quarter are
# dropped together with the out-of-range ones.
df = df[df["_qidx"].between(min_keep, max_keep, inclusive="both")].copy()
print(f"删除 2015Q1-Q3 和范围外季度: {before} -> {len(df)}")

删除 2015Q1-Q3 和范围外季度: 1201 -> 1126


In [4]:
# ========== 3) Drop every row whose label is missing ==========

# The label column must exist before rows can be filtered on it.
label_present = LABEL_COL in df.columns
if not label_present:
    raise ValueError(f"缺少标签列: {LABEL_COL}")

before = len(df)
has_label = df[LABEL_COL].notna()
df = df.loc[has_label].copy()

after = len(df)
beat_ratio = df[LABEL_COL].mean()
print(f"删除 label 缺失行: {before} -> {after}")
print(f"当前 Beat 比例: {beat_ratio:.2%}")

删除 label 缺失行: 1126 -> 1074
当前 Beat 比例: 79.70%


In [5]:
# ========== 4) Time split: training / out-of-sample ==========

# Inclusive index bounds for each window, taken from the config constants.
train_lo, train_hi = qidx(TRAIN_START), qidx(TRAIN_END)
oos_lo, oos_hi = qidx(OOS_START), qidx(OOS_END)

train_mask = df["_qidx"].between(train_lo, train_hi, inclusive="both")
oos_mask = df["_qidx"].between(oos_lo, oos_hi, inclusive="both")

# Independent copies so later in-place fills do not touch `df`.
df_train = df.loc[train_mask].copy()
df_oos = df.loc[oos_mask].copy()

print(f"训练集区间: {TRAIN_START} ~ {TRAIN_END}, 行数={len(df_train)}")
print(f"样本外区间: {OOS_START} ~ {OOS_END}, 行数={len(df_oos)}")

# Report the quarter labels actually present in each split.
train_quarters = df_train["quarter"]
oos_quarters = df_oos["quarter"]
print(f"\n训练集季度范围: {train_quarters.min()} -> {train_quarters.max()}")
print(f"样本外季度范围: {oos_quarters.min()} -> {oos_quarters.max()}")

训练集区间: 2015-Q4 ~ 2022-Q4, 行数=744
样本外区间: 2023-Q1 ~ 2025-Q4, 行数=330

训练集季度范围: 2015-Q4 -> 2022-Q4
样本外季度范围: 2023-Q1 -> 2025-Q4


In [6]:
# ========== 5) Impute missing features using training-set means only ==========

# Feature columns = numeric columns of the training frame, excluding the
# metadata / label columns and the helper index.
meta_cols_exist = [c for c in (*META_COLS, "_qidx") if c in df_train.columns]
num_cols = df_train.select_dtypes(include=[np.number]).columns.tolist()
feature_cols = [c for c in num_cols if c not in meta_cols_exist]

print(f"数值特征列数: {len(feature_cols)}")

# Per-feature missing counts before filling, largest first.
miss_train_before = df_train[feature_cols].isna().sum().sort_values(ascending=False)
miss_oos_before = df_oos[feature_cols].isna().sum().sort_values(ascending=False)

n_train_feats_missing = int((miss_train_before > 0).sum())
n_oos_feats_missing = int((miss_oos_before > 0).sum())
print("训练集缺失>0的特征数:", n_train_feats_missing)
print("样本外缺失>0的特征数:", n_oos_feats_missing)

# Means come from the training split only and are reused for both splits.
train_means = df_train[feature_cols].mean()

for frame in (df_train, df_oos):
    frame[feature_cols] = frame[feature_cols].fillna(train_means)

# Total remaining missing cells after the fill.
train_missing_by_col = df_train[feature_cols].isna().sum()
oos_missing_by_col = df_oos[feature_cols].isna().sum()
miss_train_after = int(train_missing_by_col.sum())
miss_oos_after = int(oos_missing_by_col.sum())
print(f"填补后训练集缺失总数: {miss_train_after}")
print(f"填补后样本外缺失总数: {miss_oos_after}")

数值特征列数: 65
训练集缺失>0的特征数: 41
样本外缺失>0的特征数: 32
填补后训练集缺失总数: 0
填补后样本外缺失总数: 0


In [7]:
# ========== 6) Export datasets and imputation parameters ==========

# Strip the helper index column before writing.
df_train_out = df_train.drop(columns=["_qidx"], errors="ignore")
df_oos_out = df_oos.drop(columns=["_qidx"], errors="ignore")

# Output file locations under OUT_DIR.
train_csv = OUT_DIR / "train_2015Q4_2022Q4.csv"
oos_csv = OUT_DIR / "oos_2023Q1_2025Q4.csv"
means_json = OUT_DIR / "impute_means_from_train.json"
feature_list_txt = OUT_DIR / "feature_columns.txt"

df_train_out.to_csv(train_csv, index=False)
df_oos_out.to_csv(oos_csv, index=False)

# NaN means are stored as null; everything else as a plain float.
means_payload = {}
for k, v in train_means.items():
    means_payload[k] = None if pd.isna(v) else float(v)

with open(means_json, "w", encoding="utf-8") as f:
    json.dump(means_payload, f, ensure_ascii=False, indent=2)

# One feature name per line, in the order used for imputation.
with open(feature_list_txt, "w", encoding="utf-8") as f:
    f.write("".join(c + "\n" for c in feature_cols))

print("已导出:")
for exported_path in (train_csv, oos_csv, means_json, feature_list_txt):
    print(" -", exported_path)

print("\n训练集形状:", df_train_out.shape)
print("样本外形状:", df_oos_out.shape)

已导出:
 - /Users/xinyuewang/Desktop/1.27/data/modeling/train_2015Q4_2022Q4.csv
 - /Users/xinyuewang/Desktop/1.27/data/modeling/oos_2023Q1_2025Q4.csv
 - /Users/xinyuewang/Desktop/1.27/data/modeling/impute_means_from_train.json
 - /Users/xinyuewang/Desktop/1.27/data/modeling/feature_columns.txt

训练集形状: (744, 74)
样本外形状: (330, 74)


In [8]:
# ========== 7) Quick sanity check ==========

# Label distribution in each split (missing values counted as well).
train_label_counts = df_train_out[LABEL_COL].value_counts(dropna=False)
oos_label_counts = df_oos_out[LABEL_COL].value_counts(dropna=False)

print("训练集 Beat/Miss:")
print(train_label_counts.to_string())

print("\n样本外 Beat/Miss:")
print(oos_label_counts.to_string())

# Preview a few identifying columns and features, keeping only those present.
preview_candidates = [
    "ticker",
    "quarter",
    LABEL_COL,
    "smog_index_pr",
    "smog_index_qa",
    "lm_net_sentiment_pr",
    "lm_net_sentiment_qa",
]
display_cols = [c for c in preview_candidates if c in df_train_out.columns]
df_train_out[display_cols].head(10)

训练集 Beat/Miss:
eps_beat
1.0    573
0.0    171

样本外 Beat/Miss:
eps_beat
1.0    283
0.0     47


Unnamed: 0,ticker,quarter,eps_beat,smog_index_pr,smog_index_qa,lm_net_sentiment_pr,lm_net_sentiment_qa
2,AAPL,2015-Q4,1.0,12.597885,10.476445,0.016568,0.002249
3,AAPL,2016-Q1,0.0,12.702494,10.332215,0.015711,-0.003617
4,AAPL,2016-Q2,1.0,12.55112,11.336766,0.00781,-0.000191
5,AAPL,2016-Q3,1.0,12.166066,10.27047,0.021209,0.001949
6,AAPL,2016-Q4,1.0,12.249069,10.184801,0.021081,0.004211
7,AAPL,2017-Q1,1.0,12.079253,10.222267,0.022028,0.007129
8,AAPL,2017-Q2,1.0,12.119037,10.104827,0.023824,0.002057
9,AAPL,2017-Q3,1.0,11.858489,10.304359,0.019608,0.003112
10,AAPL,2017-Q4,1.0,12.523193,9.648094,0.018304,0.007829
11,AAPL,2018-Q1,1.0,12.624789,10.457292,0.018686,0.001593
