In [None]:
import pandas as pd
import numpy as np
import datetime


In [None]:
# Load the stock price table from CSV.
# NOTE(review): the preview of the prepared frame further down shows StockCode as 1
# and an "Unnamed: 0" column -- presumably leading zeros were dropped and a saved
# index was read back as data; confirm whether dtype=/index_col= should be passed here.
STOCK_CSV_PATH = '/content/all_stock_data_sorted.csv'
stock = pd.read_csv(STOCK_CSV_PATH)

In [None]:
def parse_date_flexible(date_str):
    """Parse a date written as ``YYYY-MM-DD`` or ``DD/MM/YYYY``.

    The two formats are tried in that order and the first successful parse
    is returned. ``pd.NaT`` is returned when neither format matches.
    """
    for candidate in ('%Y-%m-%d', '%d/%m/%Y'):
        try:
            parsed = pd.to_datetime(date_str, format=candidate)
        except ValueError:
            # This layout does not fit; fall through to the next one.
            continue
        else:
            return parsed
    return pd.NaT


# Convert every raw date value to a Timestamp (NaT when neither format matches).
stock['日期'] = stock['日期'].map(parse_date_flexible)

In [None]:
# -----------------------------
# Core per-Series building blocks
# -----------------------------

def _log_return(close: pd.Series) -> pd.Series:
    """ln(1 + pct_change) = ln(C_t / C_{t-1})"""
    return np.log(close / close.shift(1))

def _amplitude(high: pd.Series, low: pd.Series, close: pd.Series) -> pd.Series:
    """Intraday range by close."""
    return (high - low) / close

def _rsi(close: pd.Series, window: int = 14) -> pd.Series:
    """Wilder's RSI with EMA smoothing (14 days)"""
    delta = close.diff()
    gain  = delta.clip(lower=0)
    loss  = -delta.clip(upper=0)
    avg_gain = gain.ewm(alpha=1/window, adjust=False, min_periods=window).mean()
    avg_loss = loss.ewm(alpha=1/window, adjust=False, min_periods=window).mean()
    rs  = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

def _macd(close: pd.Series, fast: int = 12, slow: int = 26, signal: int = 9) -> pd.DataFrame:
    """MACD line, signal, and histogram."""
    ema_fast = close.ewm(span=fast, adjust=False).mean()
    ema_slow = close.ewm(span=slow, adjust=False).mean()
    macd_line = ema_fast - ema_slow # main signal

    # Other MACD features:
    sig_line  = macd_line.ewm(span=signal, adjust=False).mean()
    hist      = macd_line - sig_line
    return pd.DataFrame({"macd": macd_line, "macd_signal": sig_line, "macd_hist": hist})

def _bollinger_bandwidth(close: pd.Series, window: int = 20, k: float = 2.0) -> pd.Series:
    """(Upper - Lower) / Middle; Middle = rolling mean."""
    mid = close.rolling(window, min_periods=window).mean()
    sd  = close.rolling(window, min_periods=window).std(ddof=0)
    upper = mid + k * sd
    lower = mid - k * sd
    width = (upper - lower) / mid
    return width

def _obv(close: pd.Series, volume: pd.Series) -> pd.Series:
    """
    On-Balance Volume: cumulative sum of signed volume.
    If today's close > yesterday's close -> +Volume,
       < -> -Volume,
       == -> 0.
    """
    # Handle missing volume gracefully
    vol = volume.fillna(0)
    up   = (close > close.shift(1)).astype(int)
    down = (close < close.shift(1)).astype(int)
    signed = vol.where(up.eq(1), 0) - vol.where(down.eq(1), 0)
    return signed.cumsum()

In [None]:
# Map Columns:
# Chinese source headers -> English names used by the rest of the notebook.
# prepare_ohlcv() requires Date, StockCode, Open, High, Low, Close and Volume
# to exist after this rename; the remaining entries are only renamed.

column_map = {
    '日期': 'Date',
    '股票代码': 'StockCode',
    '开盘': 'Open',
    '收盘': 'Close',
    '最高': 'High',
    '最低': 'Low',
    '成交量': 'Volume',
    '成交额': 'Turnover',
    '振幅': 'Amplitude',
    '涨跌幅': 'PctChange',
    '涨跌额': 'Change',
    '换手率': 'TurnoverRate'
}


def prepare_ohlcv(
    df: pd.DataFrame,
    column_map: dict | None = None,
    date_col: str = "Date",
    ticker_col: str = "StockCode",
) -> pd.DataFrame:
    """
    - Renames to English columns (using your column_map),
    - Ensures Date is datetime,
    - Sorts by ticker/date,
    - Casts numeric columns.
    """
    out = df.copy()
    if column_map:
        out = out.rename(columns=column_map)

    needed = {date_col, ticker_col, "Open", "High", "Low", "Close", "Volume"}
    missing = needed - set(out.columns)

    if missing:
        raise ValueError(f"Missing required columns after rename: {sorted(missing)}")

    # Date handling + sort
    out[date_col] = out[date_col]

    # Make sure numeric cols are numeric
    for c in ["Open", "High", "Low", "Close", "Volume"]:
        out[c] = pd.to_numeric(out[c], errors="coerce")

    return out


# Rename the raw columns to English and coerce the OHLCV columns to numeric.
sample_stock = prepare_ohlcv(stock, column_map=column_map)

In [None]:
# Preview the prepared frame (the notebook display elides the middle rows).
sample_stock

Unnamed: 0,Date,StockCode,Open,Close,High,Low,Volume,Turnover,Amplitude,PctChange,Change
0,2023-12-01,1,9.65,9.66,9.68,9.58,778057,7.496684e+08,1.03,-0.21,-0.02
1,2023-12-04,1,9.67,9.63,9.68,9.61,605438,5.836643e+08,0.72,-0.31,-0.03
2,2023-12-05,1,9.61,9.48,9.62,9.47,787378,7.509612e+08,1.56,-1.56,-0.15
3,2023-12-06,1,9.44,9.50,9.57,9.40,753056,7.140295e+08,1.79,0.21,0.02
4,2023-12-07,1,9.49,9.44,9.49,9.38,585380,5.522248e+08,1.16,-0.63,-0.06
...,...,...,...,...,...,...,...,...,...,...,...
60436,2024-12-25,688981,96.48,97.99,99.80,96.38,985329,9.679610e+09,3.53,1.17,1.13
60437,2024-12-26,688981,98.00,96.73,98.88,96.15,711046,6.914605e+09,2.79,-1.29,-1.26
60438,2024-12-27,688981,96.78,97.51,102.37,96.49,1144716,1.137894e+10,6.08,0.81,0.78
60439,2024-12-30,688981,96.60,99.29,100.53,96.00,906573,8.950495e+09,4.65,1.83,1.78


# Computing all features

In [None]:
# ===== 3) Master “apply-all” function (next-day aligned) =====

def compute_all_features(
    df: pd.DataFrame,
    date_col: str = "Date",
    ticker_col: str = "StockCode",
    align_to_next_day: bool = True,
    rsi_window: int = 14,
    macd_fast: int = 12,
    macd_slow: int = 26,
    macd_signal: int = 9,
    bb_window: int = 20,
    bb_k: float = 2.0,
) -> pd.DataFrame:
    """
    Compute technical-indicator features independently for every ticker.

    The input is sorted by ``ticker_col`` then ``date_col`` and re-indexed from 0;
    the input frame itself is not modified. It must contain the columns
    Close, High, Low and Volume.

    Adds:
      - log_return
      - amplitude
      - rsi_{rsi_window}
      - macd, macd_signal, macd_hist
      - bb_width_{bb_window}
      - obv

    If ``align_to_next_day`` is True, every feature column is shifted down one
    row within its ticker, so the values on a row are computed only from
    earlier rows and the first row of each ticker is NaN.

    Rows whose ticker is missing are dropped (groupby default). When no group
    remains, an empty frame with the input columns is returned.
    """
    # sort_values already returns a new frame, so no explicit copy is needed.
    out = df.sort_values([ticker_col, date_col]).reset_index(drop=True)

    def per_ticker(g: pd.DataFrame) -> pd.DataFrame:
        close, high, low, vol = g["Close"], g["High"], g["Low"], g["Volume"]

        lr   = _log_return(close).rename("log_return")
        amp  = _amplitude(high, low, close).rename("amplitude")
        rsi  = _rsi(close, rsi_window).rename(f"rsi_{rsi_window}")
        macd = _macd(close, macd_fast, macd_slow, macd_signal)
        bbw  = _bollinger_bandwidth(close, bb_window, bb_k).rename(f"bb_width_{bb_window}")
        obv  = _obv(close, vol).rename("obv")

        feat = pd.concat([lr, amp, rsi, macd, bbw, obv], axis=1)
        if align_to_next_day:
            # Lag inside the ticker only, so no value leaks across tickers.
            feat = feat.shift(1)

        return pd.concat([g, feat], axis=1)

    # Iterate the groups explicitly instead of DataFrameGroupBy.apply: apply()
    # operating on the grouping column is deprecated in recent pandas, and
    # excluding that column would drop the ticker from the result.
    frames = [per_ticker(group) for _, group in out.groupby(ticker_col)]
    if not frames:
        # pd.concat raises on an empty list; return an empty frame instead.
        return out.iloc[0:0]
    return pd.concat(frames)


In [None]:
# Build the per-ticker indicator columns; features are lagged one row (the default).
final_stock = compute_all_features(sample_stock, align_to_next_day=True)

  return out.groupby(ticker_col, group_keys=False).apply(per_ticker)


In [None]:
# Keep the identifier columns plus the engineered features selected for the encoder.
encoder_columns = [
    'Date',
    'StockCode',
    'log_return',
    'amplitude',
    'rsi_14',
    'macd',
    'bb_width_20',
    'obv',
]
stock_encoder = final_stock[encoder_columns]

In [None]:
# Number of rows in the encoder input.
len(stock_encoder)

60441