In [None]:
import yfinance as yf
import ta
import pandas as pd
from datetime import datetime, timedelta

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, classification_report
import joblib

import xgboost as xgb


def build_training_data(ticker: str, lookahead_days: int = 5):
    end = datetime.today()
    start = end - timedelta(days=365*3)  # 3 years
    df = yf.download(
        ticker,
        start=start.strftime("%Y-%m-%d"),
        end=end.strftime("%Y-%m-%d"),
        progress=False,
        auto_adjust=True   # avoid yfinance warning, adjusted prices
    )

    if df.empty:
        return pd.DataFrame()

    close = df_raw["Close"][ticker]
    volume = df_raw["Volume"][ticker]

    features = pd.DataFrame(index=df.index)

    # Indicators
    features["RSI"] = ta.momentum.RSIIndicator(close).rsi()

    macd = ta.trend.MACD(close)
    features["MACD_diff"] = macd.macd_diff()

    sma50 = close.rolling(50).mean()
    sma200 = close.rolling(200).mean()
    features["SMA50_gt_SMA200"] = (sma50 > sma200).astype(int)

    high20 = close.rolling(20).max()
    features["BreakoutUp"] = (close > high20).astype(int)

    low20 = close.rolling(20).min()
    features["BreakoutDown"] = (close < low20).astype(int)

    avg_vol = volume.rolling(20).mean()
    std_vol = volume.rolling(20).std()
    features["HighVolume"] = (volume > avg_vol + 2 * std_vol).astype(int)

    # Target
    future_price = close.shift(-lookahead_days)
    features["Target"] = (future_price > close).astype(int)

    return features.dropna()


def train_model_cv(ticker, lookahead_days=5, n_splits=5, model_type="logistic", verbose=False):
    data = build_training_data(ticker, lookahead_days)
    if data.empty:
        raise ValueError("No training data available.")

    X = data.drop(columns=["Target"])
    y = data["Target"]

    if model_type == "logistic":
        model = Pipeline([
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(max_iter=500))
        ])
    elif model_type == "xgboost":
        model = xgb.XGBClassifier(
            n_estimators=200,
            max_depth=4,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
        )
    else:
        raise ValueError("model_type must be 'logistic' or 'xgboost'")
    
    # TimeSeries CV
    tscv = TimeSeriesSplit(n_splits=n_splits)
    scores = []

    if verbose:
        print(f"\n {model_type}: TimeSeries Cross-Validation ({n_splits} folds)")
        print("=" * 50)

    for fold, (train_idx, test_idx) in enumerate(tscv.split(X), 1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds)
        scores.append(acc)

        if verbose:
            print(f"Fold {fold}: Accuracy = {acc:.4f}")
            print(classification_report(y_test, preds, zero_division=0))

    avg_acc = sum(scores) / len(scores)
    if verbose:
        print("=" * 50)
        print(f"Average Accuracy across folds: {avg_acc:.4f}")

    # Retrain on all data before saving
    model.fit(X, y)
    joblib.dump((model, list(X.columns)), f"./predict/models/{model_type}_{ticker}_model.pkl")

    return model, X.columns


In [None]:
def get_model_decision(ticker: str, model_path=None):
    # Load trained model
    if model_path is None:
        model_path = f"{ticker}_model.pkl"
    model, feature_names = joblib.load(model_path)

    # Fetch latest data
    end = datetime.today()
    start = end - timedelta(days=365)
    df = yf.download(ticker, start=start.strftime("%Y-%m-%d"), end=end.strftime("%Y-%m-%d"), progress=False, auto_adjust=True)

    if df.empty:
        return "NOT NOW", {}

    close = df_raw["Close"][ticker]
    volume = df_raw["Volume"][ticker]

    # Build feature row (same as training features)
    features = {}

    features["RSI"] = ta.momentum.RSIIndicator(close).rsi().iloc[-1]

    macd = ta.trend.MACD(close)
    features["MACD_diff"] = macd.macd_diff().iloc[-1]

    sma50 = close.rolling(50).mean().iloc[-1]
    sma200 = close.rolling(200).mean().iloc[-1]
    features["SMA50_gt_SMA200"] = int(sma50 > sma200)

    last_price = close.iloc[-1]
    high20 = close.rolling(20).max().iloc[-1]
    features["BreakoutUp"] = int(last_price > high20)

    low20 = close.rolling(20).min().iloc[-1]
    features["BreakoutDown"] = int(last_price < low20)

    last_vol = volume.iloc[-1]
    avg_vol = volume.rolling(20).mean().iloc[-1]
    std_vol = volume.rolling(20).std().iloc[-1]
    features["HighVolume"] = int(last_vol > avg_vol + 2 * std_vol)

    # Convert to DataFrame in correct column order
    X_live = pd.DataFrame([features])[feature_names]

    # Predict
    prob = model.predict_proba(X_live)[0, 1]  # probability of "up"
    decision = "BUY" if prob > 0.5 else "NOT BUY"

    return decision, features, prob

In [23]:
# Logistic Regression
log_model, log_features = train_model_cv("AAPL", lookahead_days=5, model_type="logistic")

# XGBoost
xgb_model, xgb_features = train_model_cv("AAPL", lookahead_days=5, model_type="xgboost")

In [16]:
# Train once
ticker = "AAPL"
model, features = train_model_cv(ticker, lookahead_days=5, model_type="logistic")

# Predict live decision
decision, feat_values, prob = get_model_decision(ticker)
print("Decision:", decision)
print("Probability of going up:", prob)
print("Features:", feat_values)


 logistic: TimeSeries Cross-Validation (5 folds)
Fold 1: Accuracy = 0.6134
              precision    recall  f1-score   support

           0       0.62      0.10      0.18        48
           1       0.61      0.96      0.75        71

    accuracy                           0.61       119
   macro avg       0.62      0.53      0.46       119
weighted avg       0.62      0.61      0.52       119

Fold 2: Accuracy = 0.4286
              precision    recall  f1-score   support

           0       0.42      0.41      0.41        59
           1       0.44      0.45      0.44        60

    accuracy                           0.43       119
   macro avg       0.43      0.43      0.43       119
weighted avg       0.43      0.43      0.43       119

Fold 3: Accuracy = 0.6218
              precision    recall  f1-score   support

           0       0.60      0.07      0.12        46
           1       0.62      0.97      0.76        73

    accuracy                           0.62       119


In [34]:
info.get("trailingPE", None)

34.613983

In [15]:
import pandas as pd

url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
tables = pd.read_html(url, header=0)
tickers = tables[0]['Symbol'].tolist()[:10]

In [125]:
def get_fundamentals(ticker, get_train, lookahead_days=90):
    t = yf.Ticker(ticker)
    inc = t.quarterly_financials.T
    bal = t.quarterly_balance_sheet.T
    cf  = t.quarterly_cashflow.T

    df_raw = pd.concat([inc, bal, cf], axis=1)

    # Profitability Ratios
    df_extract = pd.DataFrame()
    df_extract['Gross_Margin'] = df_raw['Gross Profit'] / df_raw['Total Revenue']
    df_extract['Operating_Margin'] = df_raw['Operating Income'] / df_raw['Total Revenue']
    df_extract['Net_Margin'] = df_raw['Net Income'] / df_raw['Total Revenue']
    df_extract['EBITDA_Margin'] = df_raw['EBITDA'] / df_raw['Total Revenue']

    # Growth Ratios (YoY)
    df_extract['Revenue_Growth'] = df_raw['Total Revenue'].pct_change(fill_method=None)
    df_extract['Net_Income_Growth'] = df_raw['Net Income'].pct_change(fill_method=None)
    df_extract['EPS_Growth'] = df_raw['Diluted EPS'].pct_change(fill_method=None)

    # Return Ratios
    df_extract['ROE'] = df_raw['Net Income'] / df_raw['Stockholders Equity']
    df_extract['ROA'] = df_raw['Net Income'] / df_raw['Total Assets']
    df_extract['ROIC'] = df_raw['EBIT'] / (df_raw['Total Debt'] + df_raw['Stockholders Equity'] - df_raw['Cash And Cash Equivalents'])

    # Leverage Ratios
    df_extract['Debt_to_Equity'] = df_raw['Total Debt'] / df_raw['Stockholders Equity']
    df_extract['Net_Debt_to_EBITDA'] = df_raw['Net Debt'] / df_raw['EBITDA']

    # Liquidity Ratios
    df_extract['Current_Ratio'] = df_raw['Current Assets'] / df_raw['Current Liabilities']
    df_extract['Quick_Ratio'] = (df_raw['Current Assets'] - df_raw['Inventory']) / df_raw['Current Liabilities']

    # Cash Flow Metrics
    df_extract['FCF_to_Sales'] = df_raw['Free Cash Flow'] / df_raw['Total Revenue']
    df_extract['FCF_yield'] = df_raw['Free Cash Flow'] / df_raw['Stockholders Equity']

    df_extract['Report_Date'] = df_extract.index

    start_date = df_extract['Report_Date'].min()
    end_date = df_extract['Report_Date'].max() + pd.Timedelta(days=lookahead_days+30)
    prices = yf.download(ticker, start=start_date, end=end_date, progress=False, auto_adjust=True)["Close"][ticker]

    if get_train:
        df_extract['Price_Ahead'] = df_extract['Report_Date'].apply(
            lambda x: prices.get(x + pd.Timedelta(days=lookahead_days))
        )
        df_extract['Price_Today'] = df_extract['Report_Date'].apply(
            lambda x: prices.get(x)
        )
        df_extract = df_extract.dropna()

        df_extract['Price_Increase'] = (df_extract['Price_Ahead'] > df_extract['Price_Today']).astype(int)

        df_extract = df_extract.drop(columns=['Report_Date', 'Price_Today', 'Price_Ahead'])
    
    else:
        df_extract = df_extract.drop(columns=['Report_Date'])
        df_extract = df_extract.dropna()
    
    return df_extract

In [121]:
df_train = pd.DataFrame()
for ticker in tickers:
    try:
        df_fund = get_fundamentals(ticker, lookahead_days=90)
        df_train = pd.concat([df_train, df_fund], ignore_index=True)
    except Exception as e:
        pass
        # print(f"Error processing {ticker}: {e}")

            Price_Today  Price_Ahead  Price_Increase
2024-12-31   127.830276   146.144226               1
Empty DataFrame
Columns: [Price_Today, Price_Ahead, Price_Increase]
Index: []
            Price_Today  Price_Ahead  Price_Increase
2024-12-31   111.500229   131.448318               1
            Price_Today  Price_Ahead  Price_Increase
2024-12-31   172.934296   205.823746               1
            Price_Today  Price_Ahead  Price_Increase
2024-12-31     12.27946    12.038103               0
            Price_Today  Price_Ahead  Price_Increase
2024-07-31   140.320526   130.435501               0
2024-10-31   129.521072   146.469711               1
2025-01-31   150.881027   106.236267               0
2025-04-30   107.373871   119.839996               1
            Price_Today  Price_Ahead  Price_Increase
2024-12-31   284.702057   291.269775               1
            Price_Today  Price_Ahead  Price_Increase
2024-12-31    85.043625    71.553162               0
            Price_Tod

In [None]:
def train_fundamentals_model():

    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    tables = pd.read_html(url, header=0)
    tickers = tables[0]['Symbol'].tolist()

    df_train = pd.DataFrame()
    for ticker in tickers:
        try:
            df_fund = get_fundamentals(ticker, lookahead_days=90)
            df_train = pd.concat([df_train, df_fund], ignore_index=True)
        except Exception as e:
            pass
            # print(f"Error processing {ticker}: {e}")

    X = df_train.drop(columns=['Price_Increase'])

    X.replace([np.inf, -np.inf], np.nan, inplace=True)
    X.fillna(X.median(), inplace=True)

    y = df_train['Price_Increase']

    # Define pipeline: scaler + classifier
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'))
    ])

    # K-Fold Cross Validation
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    accuracies = []

    for fold, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        # Train pipeline
        pipeline.fit(X_train, y_train)
        
        # Predict
        y_pred = pipeline.predict(X_test)
        
        # Evaluate
        acc = accuracy_score(y_test, y_pred)
        accuracies.append(acc)
        
        print(f"Fold {fold} Accuracy: {acc:.4f}")
        print(classification_report(y_test, y_pred, zero_division=0))

    print(f"\nAverage Accuracy across folds: {sum(accuracies)/len(accuracies):.4f}")
    
    pipeline.fit(X, y)
    joblib.dump((model, list(X.columns)), f"./machine_learning/models/fundamentals_model.pkl")

In [123]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Features and target
X = df_train.drop(columns=['Price_Increase'])

X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.fillna(X.median(), inplace=True)

y = df_train['Price_Increase']

# Define pipeline: scaler + classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'))
])

# K-Fold Cross Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []

for fold, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # Train pipeline
    pipeline.fit(X_train, y_train)
    
    # Predict
    y_pred = pipeline.predict(X_test)
    
    # Evaluate
    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)
    
    print(f"Fold {fold} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred, zero_division=0))

pipeline.fit(X, y)
joblib.dump((model, list(X.columns)), f"./machine_learning/models/fundamentals_model.pkl")

Fold 1 Accuracy: 0.5686
              precision    recall  f1-score   support

           0       0.57      0.52      0.54        25
           1       0.57      0.62      0.59        26

    accuracy                           0.57        51
   macro avg       0.57      0.57      0.57        51
weighted avg       0.57      0.57      0.57        51

Fold 2 Accuracy: 0.6078
              precision    recall  f1-score   support

           0       0.57      0.84      0.68        25
           1       0.71      0.38      0.50        26

    accuracy                           0.61        51
   macro avg       0.64      0.61      0.59        51
weighted avg       0.64      0.61      0.59        51

Fold 3 Accuracy: 0.5882
              precision    recall  f1-score   support

           0       0.58      0.60      0.59        25
           1       0.60      0.58      0.59        26

    accuracy                           0.59        51
   macro avg       0.59      0.59      0.59        51
we

In [141]:
X_test = get_fundamentals("MSFT", get_train=False).iloc[[-1]]
prob = pipeline.predict_proba(X_test)[:, 1]
label = pipeline.predict(X_test)


In [142]:
prob, label

(array([0.62687539]), array([1]))

In [None]:

# result = df_raw[['Report_Date']].copy()
# result['Pred_Prob'] = prob
# result['Pred_Label'] = label

In [133]:
X_test

Unnamed: 0,Gross_Margin,Operating_Margin,Net_Margin,EBITDA_Margin,Revenue_Growth,Net_Income_Growth,EPS_Growth,ROE,ROA,ROIC,Debt_to_Equity,Net_Debt_to_EBITDA,Current_Ratio,Quick_Ratio,FCF_to_Sales,FCF_yield
7,0.519185,0.179856,0.128897,0.217026,-0.007733,-0.323899,-0.324324,0.035039,0.017684,0.035482,0.569589,5.549724,2.091451,1.598907,0.064149,0.017438
11,0.465164,0.226434,0.153689,0.472336,-0.097132,-0.491525,-0.495652,0.021416,0.006604,0.015004,1.485722,22.396963,0.436096,0.32744,0.260246,0.036265
15,0.366002,0.266224,0.219832,0.31916,0.031042,0.138184,0.136054,0.040108,0.026468,0.040852,0.221066,3.033835,1.240374,0.753516,0.282862,0.051608
19,0.59026,0.229121,0.161489,0.423594,-0.008199,-0.181471,-0.1875,0.011158,0.008158,0.01268,0.21576,5.082425,1.929681,1.433331,0.403533,0.027882
22,0.468825,0.344586,0.292277,0.369364,0.309386,1.465391,1.474227,0.544204,0.105584,0.321422,1.449999,1.448423,0.922938,0.875067,0.217176,0.404371
28,0.278721,0.12806,0.079608,0.159583,0.001053,-0.042378,-0.04,0.07525,0.020704,0.052693,1.363231,8.09375,1.075491,0.733808,0.124811,0.117978
40,0.640529,0.351857,0.30767,0.518714,0.075122,-3.306133,-3.25,0.063891,0.026104,0.037166,0.998345,7.980658,1.173564,1.068156,0.390067,0.081001
46,0.294374,0.112516,-0.001308,0.083297,-0.032081,-1.022556,-1.022727,-0.000791,-0.000197,0.006947,1.987876,36.675393,0.612416,0.224832,0.064108,0.038745
47,0.312771,0.142857,0.078644,0.17316,0.208897,-73.666667,-73.0,0.056741,0.01353,0.033878,2.060645,14.810417,0.905339,0.497547,0.041486,0.029932
51,0.351807,0.093287,-0.013425,0.165232,-0.021556,-1.257426,-1.26087,-0.008494,-0.001607,0.008329,3.091691,27.276042,0.257126,0.212386,0.054733,0.034629


In [108]:
def prepare_financial_ratios(df_raw):
    """
    Input: df_raw containing raw financial data for a ticker
    Output: DataFrame of features ready for the model
    """
    df = df_raw.copy()
    
    # Calculate ratios (must match training features)
    df['Gross_Margin'] = df['Gross Profit'] / df['Total Revenue']
    df['Operating_Margin'] = df['Operating Income'] / df['Total Revenue']
    df['Net_Margin'] = df['Net Income'] / df['Total Revenue']
    df['EBITDA_Margin'] = df['EBITDA'] / df['Total Revenue']
    
    df['Revenue_Growth'] = df['Total Revenue'].pct_change().fillna(0)
    df['Net_Income_Growth'] = df['Net Income'].pct_change().fillna(0)
    df['EPS_Growth'] = df['Diluted EPS'].pct_change(fill_method=None).fillna(0)
    
    df['ROE'] = df['Net Income'] / df['Stockholders Equity']
    df['ROA'] = df['Net Income'] / df['Total Assets']
    df['ROIC'] = df['EBIT'] / (df['Total Debt'] + df['Stockholders Equity'] - df['Cash And Cash Equivalents'])
    
    df['Debt_to_Equity'] = df['Total Debt'] / df['Stockholders Equity']
    df['Net_Debt_to_EBITDA'] = df['Net Debt'] / df['EBITDA']
    
    df['Current_Ratio'] = df['Current Assets'] / df['Current Liabilities']
    df['Quick_Ratio'] = (df['Current Assets'] - df['Inventory']) / df['Current Liabilities']
    
    df['FCF_to_Sales'] = df['Free Cash Flow'] / df['Total Revenue']
    df['FCF_yield'] = df['Free Cash Flow'] / df['Stockholders Equity']
    
    # Keep only features used in training
    feature_cols = ['Gross_Margin','Operating_Margin','Net_Margin','EBITDA_Margin',
                    'Revenue_Growth','Net_Income_Growth','EPS_Growth','ROE','ROA','ROIC',
                    'Debt_to_Equity','Net_Debt_to_EBITDA','Current_Ratio','Quick_Ratio',
                    'FCF_to_Sales','FCF_yield']
    
    X = df[feature_cols]
    
    # Handle inf/-inf and NaN
    X.replace([np.inf, -np.inf], np.nan, inplace=True)
    X.fillna(X.median(), inplace=True)
    
    return X

def predict_ticker(df_raw, pipeline):
    """
    Input: raw financial data for a ticker, trained pipeline
    Output: DataFrame with prediction probability and label
    """
    X_test = prepare_financial_ratios(df_raw)
    
    # Predict probability of price increase
    prob = pipeline.predict_proba(X_test)[:, 1]
    label = pipeline.predict(X_test)
    
    result = df_raw[['Report_Date']].copy()
    result['Pred_Prob'] = prob
    result['Pred_Label'] = label
    
    return result


In [None]:
# replace with your financial data extraction
# predictions = predict_ticker(new_ticker_df, pipeline)
# print(predictions)

In [112]:
new_ticker_df

Unnamed: 0,Gross_Margin,Operating_Margin,Net_Margin,EBITDA_Margin,Revenue_Growth,Net_Income_Growth,EPS_Growth,ROE,ROA,ROIC,Debt_to_Equity,Net_Debt_to_EBITDA,Current_Ratio,Quick_Ratio,FCF_to_Sales,FCF_yield,Price_Increase
2024-09-30,0.693543,0.465838,0.376107,0.582969,0.013256,0.119396,0.118644,0.085732,0.047163,0.093955,0.213671,0.634958,1.301441,1.287326,0.293619,0.066929,0
2024-12-31,0.68694,0.454575,0.34622,0.528292,0.061706,-0.022662,-0.021212,0.079645,0.045155,0.086229,0.205567,0.747241,1.35082,1.342472,0.093161,0.021431,0
2025-03-31,0.687166,0.456712,0.368567,0.581038,0.006233,0.07118,0.071207,0.080226,0.045899,0.090408,0.18816,0.345189,1.371592,1.364167,0.289713,0.063062,0
2025-06-30,0.685849,0.449013,0.356262,0.581285,0.090986,0.054562,0.054913,0.079286,0.043995,0.088895,0.176395,0.290521,1.353446,1.346804,0.33448,0.074438,0


In [80]:
get_fundamentals("AAPL", lookahead_days=30)

Unnamed: 0,Gross_Margin,Operating_Margin,Net_Margin,EBITDA_Margin,Revenue_Growth,Net_Income_Growth,EPS_Growth,ROE,ROA,ROIC,Debt_to_Equity,Net_Debt_to_EBITDA,Current_Ratio,Quick_Ratio,FCF_to_Sales,FCF_yield,Report_Date,Price_Ahead
2024-09-30,0.462225,0.311714,0.15523,0.342379,0.106707,-0.312943,-0.307143,0.258753,0.040375,0.22143,1.872327,2.359424,0.867313,0.826007,0.251796,0.419719,2024-09-30,229.034073
2024-12-31,0.468825,0.344586,0.292277,0.369364,0.309386,1.465391,1.474227,0.544204,0.105584,0.321422,1.449999,1.448423,0.922938,0.875067,0.217176,0.404371,2024-12-31,236.749557
2025-03-31,0.470506,0.310291,0.25986,0.338196,-0.232832,-0.317919,-0.3125,0.37098,0.074811,0.216262,1.469938,2.171287,0.82087,0.777507,0.218973,0.312609,2025-03-31,211.981125
2025-06-30,0.464907,0.299906,0.249202,0.330001,-0.013874,-0.054318,-0.048485,0.355978,0.070692,0.214858,1.544858,2.108436,0.867992,0.826006,0.259528,0.370728,2025-06-30,208.813019


In [None]:
# Profitability Ratios
df_extract = pd.DataFrame()
df_extract['Gross_Margin'] = df_raw['Gross Profit'] / df_raw['Total Revenue']
df_extract['Operating_Margin'] = df_raw['Operating Income'] / df_raw['Total Revenue']
df_extract['Net_Margin'] = df_raw['Net Income'] / df_raw['Total Revenue']
df_extract['EBITDA_Margin'] = df_raw['EBITDA'] / df_raw['Total Revenue']

# Growth Ratios (YoY)
df_extract['Revenue_Growth'] = df_raw['Total Revenue'].pct_change()
df_extract['Net_Income_Growth'] = df_raw['Net Income'].pct_change()
df_extract['EPS_Growth'] = df_raw['Diluted EPS'].pct_change()

# Return Ratios
df_extract['ROE'] = df_raw['Net Income'] / df_raw['Stockholders Equity']
df_extract['ROA'] = df_raw['Net Income'] / df_raw['Total Assets']
df_extract['ROIC'] = df_raw['EBIT'] / (df_raw['Total Debt'] + df_raw['Stockholders Equity'] - df_raw['Cash And Cash Equivalents'])

# Leverage Ratios
df_extract['Debt_to_Equity'] = df_raw['Total Debt'] / df_raw['Stockholders Equity']
df_extract['Net_Debt_to_EBITDA'] = df_raw['Net Debt'] / df_raw['EBITDA']

# Liquidity Ratios
df_extract['Current_Ratio'] = df_raw['Current Assets'] / df_raw['Current Liabilities']
df_extract['Quick_Ratio'] = (df_raw['Current Assets'] - df_raw['Inventory']) / df_raw['Current Liabilities']

# Cash Flow Metrics
df_extract['FCF_to_Sales'] = df_raw['Free Cash Flow'] / df_raw['Total Revenue']
df_extract['FCF_yield'] = df_raw['Free Cash Flow'] / df_raw['Stockholders Equity']

df_extract = df_extract.dropna()

In [45]:
df_extract.index

DatetimeIndex(['2024-09-30', '2024-12-31', '2025-03-31', '2025-06-30'], dtype='datetime64[ns]', freq='QE-DEC')

In [None]:
df_extract['Report_Date'] = df_extract.index

In [None]:
N = 1

start_date = df_extract['Report_Date'].min()
end_date = df_extract['Report_Date'].max() + pd.Timedelta(days=N+30)  # buffer
prices = yf.download(ticker, start=start_date, end=end_date, progress=False, auto_adjust=True)["Close"][ticker]

df_extract['Price_Ahead'] = df_extract['Report_Date'].apply(
    lambda x: prices.get(x + pd.Timedelta(days=N))
)

df_extract = df_extract.dropna()

In [77]:
df_extract

Unnamed: 0,Gross_Margin,Operating_Margin,Net_Margin,EBITDA_Margin,Revenue_Growth,Net_Income_Growth,EPS_Growth,ROE,ROA,ROIC,Debt_to_Equity,Net_Debt_to_EBITDA,Current_Ratio,Quick_Ratio,FCF_to_Sales,FCF_yield,Report_Date,Price_N_Days_Ahead,Price_Ahead
2024-09-30,0.462225,0.311714,0.15523,0.342379,0.106707,-0.312943,-0.307143,0.258753,0.040375,0.22143,1.872327,2.359424,0.867313,0.826007,0.251796,0.419719,2024-09-30,225.162094,225.162094
2025-03-31,0.470506,0.310291,0.25986,0.338196,-0.232832,-0.317919,-0.3125,0.37098,0.074811,0.216262,1.469938,2.171287,0.82087,0.777507,0.218973,0.312609,2025-03-31,222.64502,222.64502
2025-06-30,0.464907,0.299906,0.249202,0.330001,-0.013874,-0.054318,-0.048485,0.355978,0.070692,0.214858,1.544858,2.108436,0.867992,0.826006,0.259528,0.370728,2025-06-30,207.584412,207.584412


In [75]:
prices.get("2024-10-01")

np.float64(225.16209411621094)

In [61]:
df_extract

Unnamed: 0,Gross_Margin,Operating_Margin,Net_Margin,EBITDA_Margin,Revenue_Growth,Net_Income_Growth,EPS_Growth,ROE,ROA,ROIC,Debt_to_Equity,Net_Debt_to_EBITDA,Current_Ratio,Quick_Ratio,FCF_to_Sales,FCF_yield,Report_Date,Price_N_Days_Ahead
2024-09-30,0.462225,0.311714,0.15523,0.342379,0.106707,-0.312943,-0.307143,0.258753,0.040375,0.22143,1.872327,2.359424,0.867313,0.826007,0.251796,0.419719,2024-09-30,
2024-12-31,0.468825,0.344586,0.292277,0.369364,0.309386,1.465391,1.474227,0.544204,0.105584,0.321422,1.449999,1.448423,0.922938,0.875067,0.217176,0.404371,2024-12-31,221.587616
2025-03-31,0.470506,0.310291,0.25986,0.338196,-0.232832,-0.317919,-0.3125,0.37098,0.074811,0.216262,1.469938,2.171287,0.82087,0.777507,0.218973,0.312609,2025-03-31,
2025-06-30,0.464907,0.299906,0.249202,0.330001,-0.013874,-0.054318,-0.048485,0.355978,0.070692,0.214858,1.544858,2.108436,0.867992,0.826006,0.259528,0.370728,2025-06-30,


In [None]:
df_extract['Price_N_Days_Ahead'] = df_raw['Report_Date'].apply(
    lambda x: prices.get(x + pd.Timedelta(days=N), pd.NA)
)

In [39]:
prices

Ticker,AAPL
Date,Unnamed: 1_level_1
2024-09-30,231.920639
2024-10-01,225.162094
2024-10-02,225.729462
2024-10-03,224.624588
2024-10-04,225.749359
...,...
2025-08-18,230.889999
2025-08-19,230.559998
2025-08-20,226.009995
2025-08-21,224.899994


In [16]:
all_feats = []
for ticker in tickers:
    feats = get_fundamentals(ticker)
    all_feats.append(feats)
all_feats = pd.concat(all_feats, ignore_index=True)

# all_feats["target"] = all_feats.apply(lambda r: make_target(r["ticker"], r["date"]), axis=1)


Tax Effect Of Unusual Items               NaN
Tax Rate For Calcs                        NaN
Normalized EBITDA                         NaN
Total Unusual Items                       NaN
Total Unusual Items Excluding Goodwill    NaN
                                         ... 
Depreciation And Amortization             NaN
Operating Gains Losses                    NaN
Pension And Employee Benefit Expense      NaN
Gain Loss On Sale Of Business             0.0
Net Income From Continuing Operations     NaN
Name: 2023-12-31 00:00:00, Length: 194, dtype: float64
Tax Effect Of Unusual Items              NaN
Tax Rate For Calcs                       NaN
Normalized EBITDA                        NaN
Total Unusual Items                      NaN
Total Unusual Items Excluding Goodwill   NaN
                                          ..
Depreciation And Amortization            NaN
Operating Gains Losses                   NaN
Pension And Employee Benefit Expense     NaN
Gain Loss On Sale Of Business     

In [10]:
type(all_feats)

pandas.core.frame.DataFrame

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report

# Features & target
X = all_feats.drop(columns=["date", "ticker", "target"]).fillna(0)
y = all_feats["target"]

# Build pipeline: scaling -> PCA -> classifier
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=0.95)),  # keep enough components to explain 95% variance
    ("clf", RandomForestClassifier(n_estimators=200, random_state=42))
])

# K-fold CV
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []

for train_idx, test_idx in kf.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    acc = accuracy_score(y_test, preds)
    scores.append(acc)

print("Cross-validated Accuracy:", np.mean(scores))


In [171]:
import joblib
import yfinance as yf
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report

def safe_div(df, num, den):
    if num in df.columns and den in df.columns:
        return df[num] / df[den]
    else:
        return pd.Series([np.nan] * len(df), index=df.index)
    
def get_fundamentals(ticker, get_train, lookahead_days=90):
    t = yf.Ticker(ticker)
    inc = t.quarterly_financials.T
    bal = t.quarterly_balance_sheet.T
    cf  = t.quarterly_cashflow.T

    df_raw = pd.concat([inc, bal, cf], axis=1)

    # Profitability Ratios
    df_extract = pd.DataFrame()

    # Profitability Ratios
    df_extract['Gross_Margin']      = safe_div(df_raw, 'Gross Profit', 'Total Revenue')
    df_extract['Operating_Margin']  = safe_div(df_raw, 'Operating Income', 'Total Revenue')
    df_extract['Net_Margin']        = safe_div(df_raw, 'Net Income', 'Total Revenue')
    df_extract['EBITDA_Margin']     = safe_div(df_raw, 'EBITDA', 'Total Revenue')

    # Growth Ratios (YoY)
    df_extract['Revenue_Growth']    = df_raw['Total Revenue'].pct_change(fill_method=None) if 'Total Revenue' in df_raw else np.nan
    df_extract['Net_Income_Growth'] = df_raw['Net Income'].pct_change(fill_method=None) if 'Net Income' in df_raw else np.nan
    df_extract['EPS_Growth']        = df_raw['Diluted EPS'].pct_change(fill_method=None) if 'Diluted EPS' in df_raw else np.nan

    # Return Ratios
    df_extract['ROE']   = safe_div(df_raw, 'Net Income', 'Stockholders Equity')
    df_extract['ROA']   = safe_div(df_raw, 'Net Income', 'Total Assets')
    df_extract['ROIC']  = safe_div(df_raw, 'EBIT', 'Total Debt') + safe_div(df_raw, 'EBIT', 'Stockholders Equity') - safe_div(df_raw, 'EBIT', 'Cash And Cash Equivalents')

    # Leverage Ratios
    df_extract['Debt_to_Equity']      = safe_div(df_raw, 'Total Debt', 'Stockholders Equity')
    df_extract['Net_Debt_to_EBITDA']  = safe_div(df_raw, 'Net Debt', 'EBITDA')

    # Liquidity Ratios
    df_extract['Current_Ratio'] = safe_div(df_raw, 'Current Assets', 'Current Liabilities')
    df_extract['Quick_Ratio']   = safe_div(df_raw, 'Current Assets', 'Current Liabilities') - safe_div(df_raw, 'Inventory', 'Current Liabilities')

    # Cash Flow Metrics
    df_extract['FCF_to_Sales']  = safe_div(df_raw, 'Free Cash Flow', 'Total Revenue')
    df_extract['FCF_yield']     = safe_div(df_raw, 'Free Cash Flow', 'Stockholders Equity')

    df_extract['Report_Date'] = df_extract.index

    for col in df_extract.columns:
        if df_extract[col].isna().all():
            df_extract[col] = df_extract[col].fillna(0)
        else:
            df_extract[col] = df_extract[col].fillna(df_extract[col].mean())

    start_date = df_extract['Report_Date'].min()
    end_date = df_extract['Report_Date'].max() + pd.Timedelta(days=lookahead_days+30)
    prices = yf.download(ticker, start=start_date, end=end_date, progress=False, auto_adjust=True)["Close"][ticker]

    if get_train:
        df_extract['Price_Ahead'] = df_extract['Report_Date'].apply(
            lambda x: prices.get(x + pd.Timedelta(days=lookahead_days))
        )
        df_extract['Price_Today'] = df_extract['Report_Date'].apply(
            lambda x: prices.get(x)
        )
        df_extract = df_extract.dropna()
        df_extract['Price_Increase'] = (df_extract['Price_Ahead'] > df_extract['Price_Today']).astype(int)
        df_extract = df_extract.drop(columns=['Report_Date', 'Price_Today', 'Price_Ahead'])
    
    else:
        df_extract = df_extract.drop(columns=['Report_Date'])
        df_extract = df_extract.dropna()
    
    return df_extract

def train_fundamentals_model():

    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    tables = pd.read_html(url, header=0)
    tickers = tables[0]['Symbol'].tolist()

    df_train = pd.DataFrame()
    for ticker in tickers:
        try:
            df_fund = get_fundamentals(ticker, get_train=True, lookahead_days=90)
            if df_fund:
                df_train = pd.concat([df_train, df_fund], ignore_index=True)
        except Exception as e:
            pass
            # print(f"Error processing {ticker}: {e}")

    X = df_train.drop(columns=['Price_Increase'])

    X.replace([np.inf, -np.inf], np.nan, inplace=True)
    X.fillna(X.median(), inplace=True)

    y = df_train['Price_Increase']

    # Define pipeline: scaler + classifier
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'))
    ])

    # K-Fold Cross Validation
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    accuracies = []

    for fold, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        # Train pipeline
        pipeline.fit(X_train, y_train)
        
        # Predict
        y_pred = pipeline.predict(X_test)
        
        # Evaluate
        acc = accuracy_score(y_test, y_pred)
        accuracies.append(acc)
        
        print(f"Fold {fold} Accuracy: {acc:.4f}")
        print(classification_report(y_test, y_pred, zero_division=0))

    print(f"\nAverage Accuracy across folds: {sum(accuracies)/len(accuracies):.4f}")
    
    pipeline.fit(X, y)
    joblib.dump((pipeline, list(X.columns)), f"./machine_learning/models/fundamentals_model.pkl")

def get_fundamentals_prediction(ticker):
    try:
        model, feature_cols = joblib.load(f"./machine_learning/models/fundamentals_model.pkl")
    except:
        train_fundamentals_model()
        model, feature_cols = joblib.load(f"./machine_learning/models/fundamentals_model.pkl")

    X_test = get_fundamentals(ticker, get_train=False).iloc[[-1]]
    prob = model.predict_proba(X_test)[0, 1]
    decision = "BUY" if prob > 0.5 else "DON'T BUY"

    return decision, prob


In [174]:
ticker = "TIGR"
# X_test = get_fundamentals(ticker, get_train=False)
decision, prob = get_fundamentals_prediction(ticker)

In [175]:
decision, prob

('BUY', np.float64(0.814426148322173))

In [173]:
X_test

Unnamed: 0,Gross_Margin,Operating_Margin,Net_Margin,EBITDA_Margin,Revenue_Growth,Net_Income_Growth,EPS_Growth,ROE,ROA,ROIC,Debt_to_Equity,Net_Debt_to_EBITDA,Current_Ratio,Quick_Ratio,FCF_to_Sales,FCF_yield
2025-03-31,0.680738,0.452799,0.248097,0.459399,-0.101345,0.613973,-0.781818,0.043618,0.004158,0.258538,0.246626,0.0,1.12413,0.0,0.0,0.0
2024-12-31,0.65143,0.410772,0.226026,0.455949,0.012168,-0.077874,-0.781818,0.04281,0.004389,0.265859,0.25878,0.0,1.137769,0.0,0.0,0.0
2024-09-30,0.680482,0.412809,0.175691,0.382852,-0.185717,-0.367054,-0.363636,0.033034,0.002785,0.197297,0.317144,0.0,1.113821,0.0,0.0,0.0
2024-06-30,0.640287,0.210527,0.029658,0.250723,-0.13475,-0.853938,-0.781818,0.005174,0.000541,0.105077,0.340703,0.0,1.150657,0.0,0.0,0.0
2024-03-31,0.619776,0.35677,0.15618,0.429724,-0.097082,3.75476,-0.781818,0.024715,0.003083,0.160957,0.342842,0.0,1.184993,0.0,0.0,0.0
2023-12-31,0.654543,0.368735,0.16713,0.39573,-0.101345,0.613973,-1.2,0.02987,0.002991,0.197545,0.301219,0.0,1.142274,0.0,0.0,0.0


In [166]:
t = yf.Ticker(ticker)
inc = t.quarterly_financials.T
bal = t.quarterly_balance_sheet.T
cf  = t.quarterly_cashflow.T

df_raw = pd.concat([inc, bal, cf], axis=1)

# Profitability Ratios
df_extract = pd.DataFrame()

# Profitability Ratios
df_extract['Gross_Margin']      = safe_div(df_raw, 'Gross Profit', 'Total Revenue')
df_extract['Operating_Margin']  = safe_div(df_raw, 'Operating Income', 'Total Revenue')
df_extract['Net_Margin']        = safe_div(df_raw, 'Net Income', 'Total Revenue')
df_extract['EBITDA_Margin']     = safe_div(df_raw, 'EBITDA', 'Total Revenue')

# Growth Ratios (YoY)
df_extract['Revenue_Growth']    = df_raw['Total Revenue'].pct_change(fill_method=None) if 'Total Revenue' in df_raw else np.nan
df_extract['Net_Income_Growth'] = df_raw['Net Income'].pct_change(fill_method=None) if 'Net Income' in df_raw else np.nan
df_extract['EPS_Growth']        = df_raw['Diluted EPS'].pct_change(fill_method=None) if 'Diluted EPS' in df_raw else np.nan

# Return Ratios
df_extract['ROE']   = safe_div(df_raw, 'Net Income', 'Stockholders Equity')
df_extract['ROA']   = safe_div(df_raw, 'Net Income', 'Total Assets')
df_extract['ROIC']  = safe_div(df_raw, 'EBIT', 'Total Debt') + safe_div(df_raw, 'EBIT', 'Stockholders Equity') - safe_div(df_raw, 'EBIT', 'Cash And Cash Equivalents')

# Leverage Ratios
df_extract['Debt_to_Equity']      = safe_div(df_raw, 'Total Debt', 'Stockholders Equity')
df_extract['Net_Debt_to_EBITDA']  = safe_div(df_raw, 'Net Debt', 'EBITDA')

# Liquidity Ratios
df_extract['Current_Ratio'] = safe_div(df_raw, 'Current Assets', 'Current Liabilities')
df_extract['Quick_Ratio']   = safe_div(df_raw, 'Current Assets', 'Current Liabilities') - safe_div(df_raw, 'Inventory', 'Current Liabilities')

# Cash Flow Metrics
df_extract['FCF_to_Sales']  = safe_div(df_raw, 'Free Cash Flow', 'Total Revenue')
df_extract['FCF_yield']     = safe_div(df_raw, 'Free Cash Flow', 'Stockholders Equity')

df_extract['Report_Date'] = df_extract.index


In [167]:
df_extract

Unnamed: 0,Gross_Margin,Operating_Margin,Net_Margin,EBITDA_Margin,Revenue_Growth,Net_Income_Growth,EPS_Growth,ROE,ROA,ROIC,Debt_to_Equity,Net_Debt_to_EBITDA,Current_Ratio,Quick_Ratio,FCF_to_Sales,FCF_yield,Report_Date
2025-03-31,0.680738,0.452799,0.248097,0.459399,,,,0.043618,0.004158,0.258538,0.246626,,1.12413,,,,2025-03-31
2024-12-31,0.65143,0.410772,0.226026,0.455949,0.012168,-0.077874,,0.04281,0.004389,0.265859,0.25878,,1.137769,,,,2024-12-31
2024-09-30,0.680482,0.412809,0.175691,0.382852,-0.185717,-0.367054,-0.363636,0.033034,0.002785,0.197297,0.317144,,1.113821,,,,2024-09-30
2024-06-30,0.640287,0.210527,0.029658,0.250723,-0.13475,-0.853938,,0.005174,0.000541,0.105077,0.340703,,1.150657,,,,2024-06-30
2024-03-31,0.619776,0.35677,0.15618,0.429724,-0.097082,3.75476,,0.024715,0.003083,0.160957,0.342842,,1.184993,,,,2024-03-31
2023-12-31,,,,,,,-1.2,,,,,,,,,,2023-12-31


In [170]:
df_extract

Unnamed: 0,Gross_Margin,Operating_Margin,Net_Margin,EBITDA_Margin,Revenue_Growth,Net_Income_Growth,EPS_Growth,ROE,ROA,ROIC,Debt_to_Equity,Net_Debt_to_EBITDA,Current_Ratio,Quick_Ratio,FCF_to_Sales,FCF_yield,Report_Date
2025-03-31,0.680738,0.452799,0.248097,0.459399,-0.101345,0.613973,-0.781818,0.043618,0.004158,0.258538,0.246626,0.0,1.12413,0.0,0.0,0.0,2025-03-31
2024-12-31,0.65143,0.410772,0.226026,0.455949,0.012168,-0.077874,-0.781818,0.04281,0.004389,0.265859,0.25878,0.0,1.137769,0.0,0.0,0.0,2024-12-31
2024-09-30,0.680482,0.412809,0.175691,0.382852,-0.185717,-0.367054,-0.363636,0.033034,0.002785,0.197297,0.317144,0.0,1.113821,0.0,0.0,0.0,2024-09-30
2024-06-30,0.640287,0.210527,0.029658,0.250723,-0.13475,-0.853938,-0.781818,0.005174,0.000541,0.105077,0.340703,0.0,1.150657,0.0,0.0,0.0,2024-06-30
2024-03-31,0.619776,0.35677,0.15618,0.429724,-0.097082,3.75476,-0.781818,0.024715,0.003083,0.160957,0.342842,0.0,1.184993,0.0,0.0,0.0,2024-03-31
2023-12-31,0.654543,0.368735,0.16713,0.39573,-0.101345,0.613973,-1.2,0.02987,0.002991,0.197545,0.301219,0.0,1.142274,0.0,0.0,0.0,2023-12-31
