In [21]:
import yfinance as yf
import pandas as pd

df = yf.download("AAPL", start="2020-01-01", end="2023-01-01", group_by="ticker")

df.columns = [col if isinstance(col, str) else col[1] for col in df.columns]

df = df.reset_index()

  df = yf.download("AAPL", start="2020-01-01", end="2023-01-01", group_by="ticker")
[*********************100%***********************]  1 of 1 completed


In [10]:
df 

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2020-01-02,71.545875,72.598876,71.292289,72.538498,135480400
1,2020-01-03,71.765674,72.594063,71.608692,71.833298,146322800
2,2020-01-06,70.954203,72.444336,70.703027,72.405693,118387200
3,2020-01-07,72.415322,72.671325,71.845354,72.065132,108872000
4,2020-01-08,71.768094,73.526310,71.768094,73.224419,132079200
...,...,...,...,...,...,...
751,2022-12-23,129.099285,130.578424,127.837087,130.026215,63814900
752,2022-12-27,129.552896,129.582478,126.929885,128.221664,69007800
753,2022-12-28,127.866686,129.207773,124.119536,124.287170,85438400
754,2022-12-29,126.210030,128.665399,125.953651,127.807503,75703700


In [11]:
df.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume'], dtype='object')

In [12]:
import numpy as np 

df["Log_Returns"] = np.log(df["Close"] / df["Close"].shift(1))

# ===============================
# 2. Rolling Volatility
# ===============================
df["Volatility_20d"] = df["Log_Returns"].rolling(window=20).std()

# ===============================
# 3. Moving Averages
# ===============================
df["SMA_20"] = df["Close"].rolling(window=20).mean()
df["SMA_50"] = df["Close"].rolling(window=50).mean()
df["SMA_200"] = df["Close"].rolling(window=200).mean()

# ===============================
# 4. Relative Strength Index (RSI)
# ===============================
def compute_RSI(series, window=14):
    delta = series.diff()
    gain = np.where(delta > 0, delta, 0)
    loss = np.where(delta < 0, -delta, 0)
    avg_gain = pd.Series(gain).rolling(window=window).mean()
    avg_loss = pd.Series(loss).rolling(window=window).mean()
    rs = avg_gain / avg_loss
    return 100 - (100 / (1 + rs))

df["RSI_14"] = compute_RSI(df["Close"], window=14)

# ===============================
# 5. MACD (Moving Average Convergence Divergence)
# ===============================
exp1 = df["Close"].ewm(span=12, adjust=False).mean()
exp2 = df["Close"].ewm(span=26, adjust=False).mean()
df["MACD"] = exp1 - exp2
df["Signal_Line"] = df["MACD"].ewm(span=9, adjust=False).mean()


In [13]:
df = df.fillna(0)

In [14]:
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Log_Returns,Volatility_20d,SMA_20,SMA_50,SMA_200,RSI_14,MACD,Signal_Line
0,2020-01-02,71.545875,72.598876,71.292289,72.538498,135480400,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2020-01-03,71.765674,72.594063,71.608692,71.833298,146322800,-0.009769,0.000000,0.000000,0.000000,0.000000,0.000000,-0.056255,-0.011251
2,2020-01-06,70.954203,72.444336,70.703027,72.405693,118387200,0.007937,0.000000,0.000000,0.000000,0.000000,0.000000,-0.054028,-0.019806
3,2020-01-07,72.415322,72.671325,71.845354,72.065132,108872000,-0.004715,0.000000,0.000000,0.000000,0.000000,0.000000,-0.078834,-0.031612
4,2020-01-08,71.768094,73.526310,71.768094,73.224419,132079200,0.015959,0.000000,0.000000,0.000000,0.000000,0.000000,-0.004892,-0.026268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
751,2022-12-23,129.099285,130.578424,127.837087,130.026215,63814900,-0.002802,0.020904,138.704820,142.055543,149.656975,26.261691,-3.603971,-2.512672
752,2022-12-27,129.552896,129.582478,126.929885,128.221664,69007800,-0.013976,0.020428,138.005187,141.895385,149.538628,27.960331,-3.866920,-2.783522
753,2022-12-28,127.866686,129.207773,124.119536,124.287170,85438400,-0.031166,0.020947,137.259209,141.577189,149.420782,26.152449,-4.342730,-3.095363
754,2022-12-29,126.210030,128.665399,125.953651,127.807503,75703700,0.027930,0.018673,136.351017,141.303017,149.298597,30.302147,-4.385201,-3.353331


In [15]:
df = df.sort_values("Date")

# Daily returns
df["Returns"] = df["Close"].pct_change()

# Log returns
df["Log_Returns"] = np.log(df["Close"] / df["Close"].shift(1))

# Rolling volatility (20-day)
df["Volatility_20d"] = df["Returns"].rolling(window=20).std()

# RSI (14-day)
window = 14
delta = df["Close"].diff()
gain = np.where(delta > 0, delta, 0)
loss = np.where(delta < 0, -delta, 0)
avg_gain = pd.Series(gain).rolling(window).mean()
avg_loss = pd.Series(loss).rolling(window).mean()
rs = avg_gain / avg_loss
df["RSI_14"] = 100 - (100 / (1 + rs))

# MACD and Signal Line
ema12 = df["Close"].ewm(span=12, adjust=False).mean()
ema26 = df["Close"].ewm(span=26, adjust=False).mean()
df["MACD"] = ema12 - ema26
df["Signal_Line"] = df["MACD"].ewm(span=9, adjust=False).mean()

# Bollinger Bands (20-day)
df["MA_20"] = df["Close"].rolling(window=20).mean()
df["BB_Upper"] = df["MA_20"] + 2*df["Close"].rolling(window=20).std()
df["BB_Lower"] = df["MA_20"] - 2*df["Close"].rolling(window=20).std()

# On-Balance Volume (OBV)
df["OBV"] = (np.sign(df["Returns"]) * df["Volume"]).fillna(0).cumsum()

In [16]:
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Log_Returns,Volatility_20d,SMA_20,SMA_50,SMA_200,RSI_14,MACD,Signal_Line,Returns,MA_20,BB_Upper,BB_Lower,OBV
0,2020-01-02,71.545875,72.598876,71.292289,72.538498,135480400,,,0.000000,0.000000,0.000000,,0.000000,0.000000,,,,,0.0
1,2020-01-03,71.765674,72.594063,71.608692,71.833298,146322800,-0.009769,,0.000000,0.000000,0.000000,,-0.056255,-0.011251,-0.009722,,,,-146322800.0
2,2020-01-06,70.954203,72.444336,70.703027,72.405693,118387200,0.007937,,0.000000,0.000000,0.000000,,-0.054028,-0.019806,0.007968,,,,-27935600.0
3,2020-01-07,72.415322,72.671325,71.845354,72.065132,108872000,-0.004715,,0.000000,0.000000,0.000000,,-0.078834,-0.031612,-0.004704,,,,-136807600.0
4,2020-01-08,71.768094,73.526310,71.768094,73.224419,132079200,0.015959,,0.000000,0.000000,0.000000,,-0.004892,-0.026268,0.016087,,,,-4728400.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
751,2022-12-23,129.099285,130.578424,127.837087,130.026215,63814900,-0.002802,0.020904,138.704820,142.055543,149.656975,26.261691,-3.603971,-2.512672,-0.002798,138.704820,150.129417,127.280223,107519800.0
752,2022-12-27,129.552896,129.582478,126.929885,128.221664,69007800,-0.013976,0.020438,138.005187,141.895385,149.538628,27.960331,-3.866920,-2.783522,-0.013878,138.005187,150.211889,125.798485,38512000.0
753,2022-12-28,127.866686,129.207773,124.119536,124.287170,85438400,-0.031166,0.020940,137.259209,141.577189,149.420782,26.152449,-4.342730,-3.095363,-0.030685,137.259209,150.896445,123.621972,-46926400.0
754,2022-12-29,126.210030,128.665399,125.953651,127.807503,75703700,0.027930,0.018540,136.351017,141.303017,149.298597,30.302147,-4.385201,-3.353331,0.028324,136.351017,149.964594,122.737440,28777300.0


In [17]:
df = df.fillna(0)

In [18]:
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Log_Returns,Volatility_20d,SMA_20,SMA_50,SMA_200,RSI_14,MACD,Signal_Line,Returns,MA_20,BB_Upper,BB_Lower,OBV
0,2020-01-02,71.545875,72.598876,71.292289,72.538498,135480400,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
1,2020-01-03,71.765674,72.594063,71.608692,71.833298,146322800,-0.009769,0.000000,0.000000,0.000000,0.000000,0.000000,-0.056255,-0.011251,-0.009722,0.000000,0.000000,0.000000,-146322800.0
2,2020-01-06,70.954203,72.444336,70.703027,72.405693,118387200,0.007937,0.000000,0.000000,0.000000,0.000000,0.000000,-0.054028,-0.019806,0.007968,0.000000,0.000000,0.000000,-27935600.0
3,2020-01-07,72.415322,72.671325,71.845354,72.065132,108872000,-0.004715,0.000000,0.000000,0.000000,0.000000,0.000000,-0.078834,-0.031612,-0.004704,0.000000,0.000000,0.000000,-136807600.0
4,2020-01-08,71.768094,73.526310,71.768094,73.224419,132079200,0.015959,0.000000,0.000000,0.000000,0.000000,0.000000,-0.004892,-0.026268,0.016087,0.000000,0.000000,0.000000,-4728400.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
751,2022-12-23,129.099285,130.578424,127.837087,130.026215,63814900,-0.002802,0.020904,138.704820,142.055543,149.656975,26.261691,-3.603971,-2.512672,-0.002798,138.704820,150.129417,127.280223,107519800.0
752,2022-12-27,129.552896,129.582478,126.929885,128.221664,69007800,-0.013976,0.020438,138.005187,141.895385,149.538628,27.960331,-3.866920,-2.783522,-0.013878,138.005187,150.211889,125.798485,38512000.0
753,2022-12-28,127.866686,129.207773,124.119536,124.287170,85438400,-0.031166,0.020940,137.259209,141.577189,149.420782,26.152449,-4.342730,-3.095363,-0.030685,137.259209,150.896445,123.621972,-46926400.0
754,2022-12-29,126.210030,128.665399,125.953651,127.807503,75703700,0.027930,0.018540,136.351017,141.303017,149.298597,30.302147,-4.385201,-3.353331,0.028324,136.351017,149.964594,122.737440,28777300.0


In [19]:
df.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Log_Returns',
       'Volatility_20d', 'SMA_20', 'SMA_50', 'SMA_200', 'RSI_14', 'MACD',
       'Signal_Line', 'Returns', 'MA_20', 'BB_Upper', 'BB_Lower', 'OBV'],
      dtype='object')

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score, brier_score_loss

# ==========================================
# 1. Extra Risk Features
# ==========================================
df = df.copy()
df["Cumulative_Returns"] = (1 + df["Returns"]).cumprod()
df["Downside_Returns"] = np.where(df["Returns"] < 0, df["Returns"], 0)
df["Downside_Deviation"] = df["Downside_Returns"].rolling(20).std()
df["Sharpe_Ratio_20d"] = df["Returns"].rolling(20).mean() / df["Returns"].rolling(20).std()
df = df.fillna(0)

# ==========================================
# 2. Target: High Risk vs Low Risk
# ==========================================
df["Risk_Flag"] = np.where(
    (df["Volatility_20d"] > df["Volatility_20d"].median()) & (df["Returns"] < 0), 
    1, 0
)

# ==========================================
# 3. Features & Target
# ==========================================
features = [
    'Log_Returns','Volatility_20d','SMA_20','SMA_50','SMA_200',
    'RSI_14','MACD','Signal_Line','Returns','MA_20',
    'BB_Upper','BB_Lower','OBV',
    'Cumulative_Returns','Downside_Deviation','Sharpe_Ratio_20d'
]
X = df[features]
y = df["Risk_Flag"]

# ==========================================
# 4. Train-Test Split (time-series aware)
# ==========================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ==========================================
# 5. Stronger Model: Gradient Boosting
# ==========================================
base_model = HistGradientBoostingClassifier(
    max_iter=500,
    max_depth=6,
    learning_rate=0.05,
    random_state=42
)

# calibrated classifier for probability estimates
model = CalibratedClassifierCV(base_model, method="sigmoid", cv=3)
model.fit(X_train_scaled, y_train)

# ==========================================
# 6. Evaluation
# ==========================================
y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:,1]

print(classification_report(y_test, y_pred, digits=3))
print(f"AUC: {roc_auc_score(y_test, y_prob):.4f}")
print(f"PR-AUC: {average_precision_score(y_test, y_prob):.4f}")
print(f"Brier Score: {brier_score_loss(y_test, y_prob):.4f}")

# ==========================================
# 7. Credit Risk Score (0-100)
# ==========================================
df.loc[X_test.index, "Risk_Score"] = (1 - y_prob) * 100  # Higher = safer

print(df[["Date", "Risk_Flag", "Risk_Score"]].tail(10))


              precision    recall  f1-score   support

           0      0.990     1.000     0.995        99
           1      1.000     0.981     0.990        53

    accuracy                          0.993       152
   macro avg      0.995     0.991     0.993       152
weighted avg      0.993     0.993     0.993       152

AUC: 1.0000
PR-AUC: 1.0000
Brier Score: 0.0032
          Date  Risk_Flag  Risk_Score
746 2022-12-16          1    2.582872
747 2022-12-19          1    2.582872
748 2022-12-20          1   67.138294
749 2022-12-21          0   99.052158
750 2022-12-22          1    2.467858
751 2022-12-23          1    3.302005
752 2022-12-27          1    2.682877
753 2022-12-28          1    2.480333
754 2022-12-29          0   99.382615
755 2022-12-30          0   99.382615
