### 特徵提取：針對多支股票萃取 VaR、MDD 等風險指標，為後續的模型準備資料

In [None]:
import logging
import math

import numpy as np
import pandas as pd
import yfinance as yf

# Compute log returns from closing prices
def log_returns(stock_data):
    """Return the log returns of the ``'Close'`` column of ``stock_data``.

    Each value is ``log(1 + r)`` where ``r`` is the percentage change between
    consecutive closing prices. The first element, which has no preceding
    price to compare against, is dropped.
    """
    simple_returns = stock_data['Close'].pct_change()
    return np.log(simple_returns + 1).iloc[1:]

# Monte Carlo simulation of price paths
def monte_carlo_simulation(S0, mu, sigma, T=1, I=1000, M=252, seed=None):
    """Simulate price paths that follow geometric Brownian motion.

    Parameters
    ----------
    S0 : float
        Starting price shared by every path.
    mu : float
        Drift per unit of ``T``.
    sigma : float
        Volatility per unit of ``T``.
    T : float, default 1
        Length of the simulated horizon.
    I : int, default 1000
        Number of simulated paths.
    M : int, default 252
        Number of time steps the horizon is divided into.
    seed : int, numpy.random.Generator or None, default None
        When ``None`` the shocks are drawn from NumPy's global random state,
        so ``np.random.seed`` keeps controlling the output. Any other value
        is handed to ``np.random.default_rng`` to make this single call
        reproducible on its own.

    Returns
    -------
    pandas.DataFrame
        Frame of shape ``(M + 1, I)``; row 0 holds ``S0`` and every column is
        one simulated path.
    """
    dt = T / M
    if seed is None:
        shocks = np.random.standard_normal((M, I))
    else:
        shocks = np.random.default_rng(seed).standard_normal((M, I))

    # Per-step log increments; accumulating them along the time axis gives
    # the log of each path relative to S0 without a Python-level loop.
    log_steps = (mu - 0.5 * sigma ** 2) * dt + sigma * math.sqrt(dt) * shocks

    S = np.empty((M + 1, I))
    S[0] = S0
    S[1:] = S0 * np.exp(np.cumsum(log_steps, axis=0))
    return pd.DataFrame(S)

# Compute Value at Risk (VaR)
def compute_var(final_prices, W0, confidence_level=0.95):
    """Return the absolute VaR of ``final_prices`` relative to ``W0``.

    The return of every final price over the initial value ``W0`` is
    computed, the ``100 * (1 - confidence_level)`` percentile of those
    returns is taken, and that percentile is reported as an amount by
    flipping its sign and scaling it by ``W0``.
    """
    tail_pct = 100 * (1 - confidence_level)
    worst_return = np.percentile((final_prices - W0) / W0, tail_pct)
    return -W0 * worst_return

# Compute maximum drawdown
def compute_max_drawdown(price_series):
    """Return the deepest relative decline from a running peak.

    Each price is compared with the highest price seen so far; the minimum
    of ``(price - peak) / peak`` is returned, so for positive prices the
    result is zero or negative (e.g. ``-0.5`` for a 50% drawdown).
    """
    peaks = price_series.cummax()
    return price_series.sub(peaks).div(peaks).min()

# Compute risk features for a single stock
def extract_features(ticker, start, end):
    """Download price history for ``ticker`` and derive its risk features.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yfinance.Ticker``.
    start, end : str
        Date range forwarded to ``Ticker.history``.

    Returns
    -------
    dict
        Keys ``'Ticker'``, ``'mu'``, ``'sigma'``, ``'VaR_95'``,
        ``'VaR_95_pct'`` and ``'MDD'``.

    Raises
    ------
    ValueError
        If the download yields fewer than two valid closing prices, because
        no return (and therefore no drift or volatility) can be estimated.
    """
    data = yf.Ticker(ticker).history(start=start, end=end)
    # Fail loudly instead of hitting an opaque IndexError on an empty frame
    # or silently producing all-NaN features from a single observation.
    if data.empty or data['Close'].count() < 2:
        raise ValueError(
            f"{ticker}: not enough price data between {start} and {end}"
        )
    log_ret = log_returns(data)

    # Scale the per-period statistics by 252 periods (trading days per year
    # assumed) to obtain the drift and volatility used by the simulation.
    mu = np.mean(log_ret) * 252
    sigma = np.std(log_ret) * math.sqrt(252)
    W0 = data['Close'].iloc[-1]  # most recent close is the simulation start

    sim_df = monte_carlo_simulation(S0=W0, mu=mu, sigma=sigma)
    final_prices = sim_df.iloc[-1]

    var_95 = compute_var(final_prices, W0, confidence_level=0.95)
    var_95_pct = var_95 / W0  # VaR as a fraction of the starting price

    # NOTE(review): the drawdown is measured on one simulated path only
    # (column 0), so this feature changes from run to run unless the random
    # state is fixed. Confirm whether a historical or cross-path drawdown
    # was intended.
    sample_path = sim_df.iloc[:, 0]
    mdd = compute_max_drawdown(sample_path)

    return {
        'Ticker': ticker,
        'mu': mu,
        'sigma': sigma,
        'VaR_95': var_95,
        'VaR_95_pct': var_95_pct,
        'MDD': mdd
    }

# Batch processing for multiple tickers
def extract_features_for_tickers(ticker_list, start, end):
    """Build a feature table with one row per successfully processed ticker.

    Parameters
    ----------
    ticker_list : iterable of str
        Symbols to process.
    start, end : str
        Date range forwarded to ``extract_features``.

    Returns
    -------
    pandas.DataFrame
        One row per ticker whose features could be computed. Tickers that
        raise are skipped, so the frame can have fewer rows than
        ``ticker_list`` (and is empty when every ticker fails).
    """
    logger = logging.getLogger(__name__)
    features = []
    for ticker in ticker_list:
        try:
            feat = extract_features(ticker, start, end)
        except Exception as exc:
            # Best-effort batch: one bad ticker must not abort the run, but
            # the failure is reported so missing rows can be explained.
            logger.warning("%s failed: %s", ticker, exc)
            continue
        features.append(feat)
    return pd.DataFrame(features)

def add_risk_label(df, var_col='VaR_95_pct', mdd_col='MDD'):
    """Add a ``'Risk_Label'`` column of ``'High'``/``'Medium'``/``'Low'``.

    A row is ``'High'`` when its VaR fraction exceeds 0.30 or the absolute
    drawdown exceeds 0.35; otherwise ``'Medium'`` when the VaR fraction
    exceeds 0.15 or the absolute drawdown exceeds 0.2; otherwise ``'Low'``.
    All comparisons are strict, and a NaN value satisfies none of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; it is modified in place.
    var_col : str, default 'VaR_95_pct'
        Column holding VaR as a fraction of the starting value.
    mdd_col : str, default 'MDD'
        Column holding the maximum drawdown (its sign is ignored).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new column attached.
    """
    var_pct = df[var_col]
    drawdown = df[mdd_col].abs()

    is_high = ((var_pct > 0.30) | (drawdown > 0.35)).to_numpy()
    is_medium = ((var_pct > 0.15) | (drawdown > 0.2)).to_numpy()

    # np.select takes the first matching condition, mirroring the original
    # if/elif order, and also copes with an empty frame.
    df['Risk_Label'] = np.select(
        [is_high, is_medium], ['High', 'Medium'], default='Low'
    )
    return df

if __name__ == '__main__':
    # Date window of the price history to download
    start_date = '2020-05-30'
    end_date = '2025-06-01'

    # Ticker universe: every symbol listed in NASDAQ10B.csv followed by a
    # handful of ETFs. Symbols from NYSE10B.csv are currently not included.
    etf_tickers = ['VOO', 'QQQ', 'SPY', 'IVV', 'DIA', 'VTI']
    tickers = pd.read_csv('NASDAQ10B.csv')['Symbol'].tolist() + etf_tickers

    # TODO: decide how many tickers are actually needed; settle on a concrete
    # number and document why that number was chosen.
    df = add_risk_label(
        extract_features_for_tickers(tickers, start_date, end_date)
    )

df.round(3)

$SLDE: possibly delisted; no price data found  (1d 2020-05-30 -> 2025-06-01)


Unnamed: 0,Ticker,mu,sigma,VaR_95,VaR_95_pct,MDD,Risk_Label
0,AAL,0.005,0.539,7.465,0.654,-0.325,High
1,AAON,0.202,0.381,35.183,0.366,-0.384,High
2,AAPL,0.189,0.299,62.968,0.314,-0.222,High
3,ABNB,-0.026,0.489,79.929,0.620,-0.329,High
4,ACAD,-0.164,0.639,16.173,0.750,-0.658,High
...,...,...,...,...,...,...,...
618,QQQ,0.166,0.236,122.250,0.235,-0.138,Medium
619,SPY,0.146,0.179,88.557,0.150,-0.130,Medium
620,IVV,0.147,0.177,86.589,0.147,-0.120,Low
621,DIA,0.119,0.158,63.349,0.150,-0.061,Low


### 建立風險分類器

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics

# Select the feature columns and the target label.
# NOTE(review): Risk_Label is produced by add_risk_label from fixed
# thresholds on VaR_95_pct and MDD, and both columns are also used as
# features here, so the classifier can recover the labelling rule directly.
# The scores below mostly measure that; confirm this is intended.
features = ['mu', 'sigma', 'VaR_95_pct', 'MDD']
X = df[features]
y = df['Risk_Label']

# Label encoding: LabelEncoder numbers the classes in sorted order, so the
# codes are 'High' -> 0, 'Low' -> 1, 'Medium' -> 2 (see le.classes_).
le = LabelEncoder()
y_encoded = le.fit_transform(y)
class_ids = list(range(len(le.classes_)))

# Train/test split. Stratify on the label so the rare classes keep the same
# share in both sets instead of depending on the luck of the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Build and fit the model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict on the held-out set
y_pred = clf.predict(X_test)

# Evaluation. Passing the full list of class ids keeps the report rows and
# the confusion-matrix axes aligned with le.classes_ even if a class is
# missing from the test predictions.
print("正確率為", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, labels=class_ids, target_names=le.classes_))
print(confusion_matrix(y_test, y_pred, labels=class_ids))

正確率為 0.976
              precision    recall  f1-score   support

        High       0.99      1.00      0.99        98
         Low       0.80      0.80      0.80         5
      Medium       0.95      0.91      0.93        22

    accuracy                           0.98       125
   macro avg       0.91      0.90      0.91       125
weighted avg       0.98      0.98      0.98       125

[[98  0  0]
 [ 0  4  1]
 [ 1  1 20]]
