In [1]:
!jupyter kernelspec list

Available kernels:
  ml-unified    /workspaces/smart_dev/ml-unified/.venv/share/jupyter/kernels/ml-unified
  python3       /workspaces/smart_dev/ml-unified/.venv/share/jupyter/kernels/python3


In [3]:
!python -c "import tensorflow as tf; print(f'TensorFlow GPU devices: {len(tf.config.list_physical_devices(\"GPU\"))}')"

2025-08-02 17:12:04.624725: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754154724.822492   48686 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754154724.878876   48686 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1754154725.287856   48686 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754154725.287975   48686 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754154725.287985   48686 computation_placer.cc:177] computation placer alr

In [2]:
!python -c "import torch; print(f'PyTorch CUDA available: {torch.cuda.is_available()}')"

PyTorch CUDA available: True


In [1]:
import torch

print('--- PyTorch GPU Test ---')
is_available = torch.cuda.is_available()
print(f'CUDA Available: {is_available}')

if is_available:
    device_count = torch.cuda.device_count()
    print(f'Device Count: {device_count}')
    device_name = torch.cuda.get_device_name(0)
    print(f'Device Name: {device_name}')
    
    try:
        tensor = torch.tensor([1.0, 2.0]).to('cuda')
        print(f'Tensor on GPU: {tensor}')
        print('SUCCESS: PyTorch is utilizing the GPU.')
    except Exception as e:
        print(f'ERROR: PyTorch failed a CUDA operation: {e}')

--- PyTorch GPU Test ---
CUDA Available: True
Device Count: 1
Device Name: NVIDIA GeForce GTX 1050 Ti
Tensor on GPU: tensor([1., 2.], device='cuda:0')
SUCCESS: PyTorch is utilizing the GPU.


In [96]:
#SIMPLE DATA PULL
def get_stock_data(ticker: str) -> pd.DataFrame:
    with duckdb.connect() as con:
        if Path(f'data/{ticker}.parquet').exists():
            pass
            print(f"Loading data for {ticker} from local Parquet file.")
            return con.execute(f"SELECT * FROM read_parquet('data/{ticker}.parquet')").df()
        else:
            print(f"Downloading data for {ticker} from Yahoo Finance.")
            df = yf.download(ticker, start=start_date, end=end_date, progress=False)
            df = df[['Open', 'High', 'Low', 'Close', 'Volume']].dropna()
            df.to_parquet(f'data/{ticker}.parquet')
            return df
df = get_stock_data(symbol)  # Example usage to ensure data is downloaded

#type(df)

Loading data for RYCEY from local Parquet file.


In [None]:
def clean_column_names(df, ticker):
    """Flatten multi-index columns and remove ticker from column names"""
    # Convert columns to list if they're in the problematic format
    if any(isinstance(col, tuple) for col in df.columns):
        new_columns = []
        for col in df.columns:
            if isinstance(col, tuple):
                # Keep just the metric name (e.g., 'Close') and drop the ticker
                new_columns.append(col[0])
            else:
                new_columns.append(col)
        df.columns = new_columns
    return df

def get_stock_data(ticker: str, force_download: bool = False) -> duckdb.DuckDBPyRelation:
    """Fetch stock data from cache or Yahoo Finance, storing in DuckDB."""
    Path('data').mkdir(exist_ok=True)
    parquet_path = f'data/{ticker}.parquet'
    
    if not force_download and Path(parquet_path).exists():
        print(f"Loading cached data for {ticker}")
        con.execute(f"CREATE OR REPLACE TABLE {ticker}_data AS SELECT * FROM read_parquet('{parquet_path}')").df()
    else:
        print(f"Downloading fresh data for {ticker}")
        df = yf.download(
            ticker, 
            start=(datetime.today() - timedelta(days=365*10)).strftime('%Y-%m-%d'),
            end=datetime.today().strftime('%Y-%m-%d'),
            progress=False
        )
        
        # Clean the column names
        df = clean_column_names(df, ticker)
        
        # Select our standard columns and reset index
        df = df[['Open', 'High', 'Low', 'Close', 'Volume']].dropna()
        df = df[['Open', 'High', 'Low', 'Close', 'Volume']].reset_index()
        
        # Save to Parquet
        df.to_parquet(parquet_path, index=False)
        
        # Register with DuckDB
        con.execute(f"CREATE OR REPLACE TABLE {ticker}_data AS SELECT * FROM df")
    
    return con.table(f'{ticker}_data').df()

# Usage
if __name__ == "__main__":
    # Get data
    df = get_stock_data("RYCEY")
    
    # Show schema to verify clean column names
    print("Table Schema:")
    print(con.execute("DESCRIBE rycey_data").df())
    
    # Query recent data
    recent_data = con.execute("""
        SELECT 
            MIN(Date) as start_date
            ,MAX(Date) as end_date
            ,COUNT(*) as row_count
            ,AVG(Close) as avg_close
        FROM df
    """).df()
    
    print("\nRecent Data:")
    print(recent_data)
    print("\nColumn Names:", recent_data.columns.tolist())

Loading cached data for RYCEY
Table Schema:
  column_name   column_type null   key default extra
0        Date  TIMESTAMP_NS  YES  None    None  None
1        Open        DOUBLE  YES  None    None  None
2        High        DOUBLE  YES  None    None  None
3         Low        DOUBLE  YES  None    None  None
4       Close        DOUBLE  YES  None    None  None
5      Volume        BIGINT  YES  None    None  None

Recent Data:
  start_date   end_date  row_count  avg_close
0 2015-06-05 2025-05-30       2512   6.510762

Column Names: ['start_date', 'end_date', 'row_count', 'avg_close']


In [4]:
# Technical Indicators & Advanced Metrics

# 2.1 Relative Strength Index (RSI) - 14 day
delta = df['Close'].diff()
gain = delta.clip(lower=0)
loss = -delta.clip(upper=0)
avg_gain = gain.ewm(alpha=1/14, adjust=False).mean()
avg_loss = loss.ewm(alpha=1/14, adjust=False).mean()
rs = avg_gain / avg_loss
df['RSI14'] = 100 - (100 / (1 + rs))

# 2.2 Moving Average Convergence/Divergence (MACD: 12,26 with Signal 9)
ema12 = df['Close'].ewm(span=12, adjust=False).mean()
ema26 = df['Close'].ewm(span=26, adjust=False).mean()
df['MACD_line'] = ema12 - ema26
df['MACD_signal'] = df['MACD_line'].ewm(span=9, adjust=False).mean()
df['MACD_hist'] = df['MACD_line'] - df['MACD_signal']

# 2.3 Average True Range (ATR) - 14 day
high_low = df['High'] - df['Low']
high_prev_close = (df['High'] - df['Close'].shift(1)).abs()
low_prev_close  = (df['Low'] - df['Close'].shift(1)).abs()
true_range = pd.DataFrame({'hl': high_low, 'hc': high_prev_close, 'lc': low_prev_close}).max(axis=1)
df['ATR14'] = true_range.rolling(window=14).mean()

# 2.4 Bollinger Bands (20-day SMA ± 2 STD)
rolling_mean20 = df['Close'].rolling(window=20).mean()
rolling_std20  = df['Close'].rolling(window=20).std()
df['BB_mid']   = rolling_mean20
df['BB_upper'] = rolling_mean20 + 2 * rolling_std20
df['BB_lower'] = rolling_mean20 - 2 * rolling_std20

# 2.5 On-Balance Volume (OBV)
direction = np.sign(df['Close'].diff().fillna(0))
df['OBV'] = (direction * df['Volume']).cumsum()

# 2.6 Stochastic Oscillator %K and %D (14-day)
window = 14
lowest_low  = df['Low'].rolling(window).min()
highest_high = df['High'].rolling(window).max()
df['%K'] = (df['Close'] - lowest_low) / (highest_high - lowest_low + 1e-9) * 100  # +1e-9 to avoid zero division
df['%D'] = df['%K'].rolling(3).mean()

# 2.7 Money Flow Index (MFI) - 14 day
typical_price = (df['High'] + df['Low'] + df['Close']) / 3.0
mf = typical_price * df['Volume']
tp_diff = typical_price.diff()
pos_mf = mf.where(tp_diff > 0, 0.0).rolling(window).sum()
neg_mf = mf.where(tp_diff < 0, 0.0).rolling(window).sum()
df['MFI14'] = 100 - 100 / (1 + pos_mf / (neg_mf + 1e-9))

# 2.8 Commodity Channel Index (CCI) - 20 day
TP20 = typical_price.rolling(20).mean()
MD20 = (typical_price - TP20).abs().rolling(20).mean()  # Mean deviation
df['CCI20'] = (typical_price - TP20) / (0.015 * MD20)

# 2.9 Williams %R (14-day)
df['Williams_%R'] = (highest_high - df['Close']) / (highest_high - lowest_low + 1e-9) * -100

# 2.10 Rate of Change (ROC) - 10 day
df['ROC10'] = df['Close'].pct_change(periods=10) * 100

# 2.11 GARCH(1,1) Volatility Estimate (daily)
returns = df['Close'].pct_change().fillna(0)
# Initialize GARCH parameters (omega, alpha, beta) and variance
var0 = returns.var()
alpha, beta = 0.1, 0.85
omega = var0 * max(0, (1 - alpha - beta))
garch_vars = [var0]
for r in returns.iloc[1:]:
    new_var = omega + alpha * (r**2) + beta * garch_vars[-1]
    garch_vars.append(new_var)
df['GARCH_vol'] = np.sqrt(garch_vars)

# 2.12 Fourier Transform Dominant Period
# Compute the Fourier Transform dominant period for each row using a rolling window
df = calculate_dominant_periods(df, window_size=60)

# 2.13 One-day Return (%) as feature
df['Return1'] = returns * 100

# Drop initial rows with NaN values from rolling calculations
df.dropna(inplace=True)
print(f"After feature engineering: {len(df)} data points, {df.shape[1]} columns (incl. features).")
print(f"{df.Dominant_Period.describe()}\n{df.Close.describe()}")
df.sample(10)  # display last 5 rows of the last 10 feature columns

After feature engineering: 2453 data points, 24 columns (incl. features).
count    2453.000000
mean        4.803999
std         1.820771
min         2.142857
25%         3.537582
50%         4.615385
75%         5.555556
max        16.111111
Name: Dominant_Period, dtype: float64
count    2453.000000
mean        6.387663
std         3.950234
min         0.744062
25%         1.865117
50%         7.301733
75%         9.914142
max        13.825971
Name: Close, dtype: float64


Unnamed: 0,Date,Open,High,Low,Close,Volume,RSI14,MACD_line,MACD_signal,MACD_hist,...,OBV,%K,%D,MFI14,CCI20,Williams_%R,ROC10,GARCH_vol,Dominant_Period,Return1
1834,2022-09-16,0.892875,0.892875,0.863112,0.873033,3351900,35.071058,-0.040442,-0.040748,0.000307,...,-252834700.0,31.24998,35.906841,44.777387,-48.016808,-68.75002,1.149419,0.028509,2.222222,-2.222223
586,2017-10-02,11.15901,11.196551,11.121469,11.187165,15400,52.011253,0.005713,0.014286,-0.008573,...,-2786300.0,60.86966,55.555698,54.848986,17.402721,-39.13034,-0.334453,0.020977,8.888889,0.421234
1905,2022-12-28,1.091292,1.091292,1.061529,1.061529,2766900,53.072743,0.011843,0.018514,-0.006671,...,-231514900.0,36.363692,36.363692,43.815725,-0.382888,-63.636307,-2.727273,0.022624,4.984127,0.0
1943,2023-02-23,1.557571,1.577412,1.517887,1.54765,13195400,76.210061,0.039499,0.030767,0.008733,...,-196738400.0,90.624973,56.134222,62.838888,391.774194,-9.375027,19.999996,0.066844,5.166667,19.999996
789,2018-07-24,12.561286,12.608831,12.52325,12.570795,21300,59.937571,0.23396,0.293385,-0.059425,...,-1890300.0,41.304717,36.047365,58.464115,7.490571,-58.695283,-1.636889,0.019789,4.74359,0.685465
663,2018-01-23,11.476885,11.58062,11.467454,11.58062,23200,61.401362,0.187983,0.103324,0.084659,...,-4298000.0,80.291914,81.751782,44.054249,110.098074,-19.708086,4.067795,0.026592,3.157895,0.244892
854,2018-10-24,10.507358,10.621465,10.212581,10.241109,139100,25.391933,-0.461111,-0.366066,-0.095045,...,-2879800.0,1.515187,4.745237,13.482042,-115.813111,-98.484813,-7.869966,0.025777,2.749907,-2.97297
473,2017-04-21,9.614589,9.679741,9.586667,9.679741,27700,67.278363,0.235941,0.197268,0.038672,...,-6289600.0,92.592719,88.271721,92.166698,101.325597,-7.407281,6.122475,0.021006,5.333333,0.192667
2431,2025-02-04,7.281892,7.321575,7.26205,7.271971,1570500,52.78459,0.040722,0.029939,0.010783,...,31299000.0,67.94873,69.658133,71.542298,46.554276,-32.05127,0.273598,0.022011,10.0,0.68681
139,2015-12-22,7.660381,7.761055,7.632925,7.761055,167900,40.156253,-0.232294,-0.261173,0.028879,...,-6169700.0,29.464223,35.416557,20.01945,-100.147786,-70.535777,-4.611942,0.026134,5.538462,-0.586169


In [5]:
# Define target: 1 if next day's Close is higher than today's, else 0
df['UpNext'] = (df['Close'].shift(-1) > df['Close']).astype(int)
df.dropna(inplace=True)  # drop last row (no target for it)
print("Target 'UpNext':", df['UpNext'].value_counts().to_dict())  # distribution of up/down


Target 'UpNext': {0: 1291, 1: 1162}


In [None]:
import pandas as pd
import requests
import time
from datetime import datetime

def get_movers(url, cache_time=300):
    """Cache results to avoid repeated requests"""
    now = time.time()
    if not hasattr(get_movers, 'cache'):
        get_movers.cache = {'time': 0, 'data': None}
    
    # Return cached data if recent enough
    if now - get_movers.cache['time'] < cache_time:
        return get_movers.cache['data']
    
    try:
        r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        r.raise_for_status()
        # Parse the HTML tables from the page
        tables = pd.read_html(r.content)
        # The first table is usually the desired one
        data = tables[0]
        # Cache the result
        get_movers.cache = {'time': now, 'data': data}
        return data
    except Exception as e:
        print(f"Error fetching data from {url}: {e}")
        return None

# Fetch gainers and losers
gainers = get_movers("https://finance.yahoo.com/gainers")
losers = get_movers("https://finance.yahoo.com/losers")

# Check if data was fetched successfully
if gainers is not None and not gainers.empty and losers is not None and not losers.empty:
    print("Top Gainers:")
    display(gainers.head(10))
    print("\nTop Losers:")
    display(losers.head(10))
else:
    print("Couldn't fetch data. Please wait and try again later.")

Top Gainers:


Unnamed: 0,Symbol,Name,Unnamed: 2,Price,Change,Change %,Volume,Avg Vol (3M),Market Cap,P/E Ratio (TTM),52 Wk Change %,52 Wk Range
0,FIX,"Comfort Systems USA, Inc.",,688.74 +125.91 (+22.37%),125.91,+22.37%,1.254M,367312,24.3B,35.47,+119.34%,
1,LYEL,"Lyell Immunopharma, Inc.",,12.42 +2.01 (+19.31%),2.01,+19.31%,461610,138811,230.821M,--,-60.19%,
2,GNTX,Gentex Corporation,,27.42 +3.82 (+16.19%),3.82,+16.19%,6.68M,2.36M,6.164B,15.49,-12.54%,
3,NEGG,"Newegg Commerce, Inc.",,34.31 +3.60 (+11.72%),3.6,+11.72%,805808,793614,668.29M,--,+94.94%,
4,DECK,Deckers Outdoor Corporation,,116.85 +11.91 (+11.35%),11.91,+11.35%,14.733M,3.784M,17.462B,17.95,-23.89%,
5,TNXP,Tonix Pharmaceuticals Holding Corp.,,49.72 +5.01 (+11.19%),5.01,+11.19%,1.927M,1.041M,365.837M,0.14,-12.75%,
6,MTX,Minerals Technologies Inc.,,65.45 +6.25 (+10.56%),6.25,+10.56%,607399,290172,2.077B,6545.00,-16.76%,
7,EME,"EMCOR Group, Inc.",,635.06 +56.26 (+9.72%),56.26,+9.72%,648391,448400,28.424B,28.09,+75.01%,
8,RNA,"Avidity Biosciences, Inc.",,36.27 +3.06 (+9.21%),3.06,+9.21%,5.972M,1.701M,4.371B,--,-21.88%,
9,GNW,"Genworth Financial, Inc.",,8.00 +0.66 (+8.99%),0.66,+8.99%,10.084M,7.22M,3.316B,15.09,+18.69%,



Top Losers:


Unnamed: 0,Symbol,Name,Unnamed: 2,Price,Change,Change %,Volume,Avg Vol (3M),Market Cap,P/E Ratio (TTM),52 Wk Change %,52 Wk Range
0,FIX,"Comfort Systems USA, Inc.",,688.74 +125.91 (+22.37%),125.91,+22.37%,1.254M,367312,24.3B,35.47,+119.34%,
1,LYEL,"Lyell Immunopharma, Inc.",,12.42 +2.01 (+19.31%),2.01,+19.31%,461610,138811,230.821M,--,-60.19%,
2,GNTX,Gentex Corporation,,27.42 +3.82 (+16.19%),3.82,+16.19%,6.68M,2.36M,6.164B,15.49,-12.54%,
3,NEGG,"Newegg Commerce, Inc.",,34.31 +3.60 (+11.72%),3.6,+11.72%,805808,793614,668.29M,--,+94.94%,
4,DECK,Deckers Outdoor Corporation,,116.85 +11.91 (+11.35%),11.91,+11.35%,14.733M,3.784M,17.462B,17.95,-23.89%,
5,TNXP,Tonix Pharmaceuticals Holding Corp.,,49.72 +5.01 (+11.19%),5.01,+11.19%,1.927M,1.041M,365.837M,0.14,-12.75%,
6,MTX,Minerals Technologies Inc.,,65.45 +6.25 (+10.56%),6.25,+10.56%,607399,290172,2.077B,6545.00,-16.76%,
7,EME,"EMCOR Group, Inc.",,635.06 +56.26 (+9.72%),56.26,+9.72%,648391,448400,28.424B,28.09,+75.01%,
8,RNA,"Avidity Biosciences, Inc.",,36.27 +3.06 (+9.21%),3.06,+9.21%,5.972M,1.701M,4.371B,--,-21.88%,
9,GNW,"Genworth Financial, Inc.",,8.00 +0.66 (+8.99%),0.66,+8.99%,10.084M,7.22M,3.316B,15.09,+18.69%,


In [None]:
import os
import requests
from rich import print
from dotenv import load_dotenv
load_dotenv(dotenv_path='/workspaces/smart_dev/ml-unified/vars.env')

AV_KEY   = os.getenv('ALPHA_VANTAGE_KEY')

# replace the "demo" apikey below with your own key from https://www.alphavantage.co/support/#api-key
url = f'https://www.alphavantage.co/query?function=NEWS_SENTIMENT&tickers=TSLA&apikey={AV_KEY}'
r = requests.get(url)
data = r.json()

print(data)

In [2]:
print(AV_KEY)

In [8]:
# Train-test split (80% train, 20% test by date order)
train_size = int(0.8 * len(df))
train_df = df.iloc[:train_size].copy()
test_df  = df.iloc[train_size:].copy()
train_start, train_end = get_date_range(train_df, 'Date')
test_start, test_end = get_date_range(test_df, 'Date')

print(f"Training: {train_start} to {train_end} ({len(train_df)} samples)")
print(f"Testing: {test_start} to {test_end} ({len(test_df)} samples)")

# Features for modeling (20+ features we engineered)
feature_cols = [
    'RSI14','MACD_line','MACD_signal','MACD_hist','ATR14',
    'BB_mid','BB_upper','BB_lower','OBV','%K','%D',
    'MFI14','CCI20','Williams_%R','ROC10','GARCH_vol',
    'Dominant_Period','Return1','Close','Volume'
]
X_train = train_df[feature_cols]
y_train = train_df['UpNext'].astype(int)
X_test  = test_df[feature_cols]
y_test  = test_df['UpNext'].astype(int)


Training: 2015-08-28 to 2023-06-14 (1962 samples)
Testing: 2023-06-15 to 2025-05-30 (491 samples)


In [9]:
# 4.1 Train XGBoost model
xgb_model = XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
accuracy = (y_pred == y_test).mean()
print(f"XGBoost Test Accuracy: {accuracy:.2%}")

# 4.2 SHAP feature importance for XGBoost
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_test)
# If XGBoost classifier returns a list (one per class), take the second element (positive class)
if isinstance(shap_values, list):
    shap_values = shap_values[1]  # shap values for class "1"
# Calculate mean absolute SHAP value for each feature
importance = np.mean(np.abs(shap_values), axis=0)
feature_importance = pd.Series(importance, index=X_test.columns).sort_values(ascending=False)
print("Top 5 features by SHAP importance:")
print(feature_importance.head(5))

# 5.1 Prepare sequence data for Transformer
window_size = 60  # sequence length (days)
X_values = df[feature_cols].values
y_values = df['UpNext'].values.astype(int)

X_seq, y_seq = [], []
for i in range(window_size, len(X_values)):
    # Sequence of features for days [i-window_size ... i-1]
    X_seq.append(X_values[i-window_size:i])
    # Label for sequence = UpNext of day i-1 (predicting day i relative to i-1)
    y_seq.append(y_values[i-1])
X_seq = np.array(X_seq, dtype=np.float32)
y_seq = np.array(y_seq, dtype=np.int64)

# Split sequence data into train and test sets corresponding to original split
# A sequence ending at index j (label index j) belongs to train if j < train_size, else test
train_seq_idx = np.where((np.arange(window_size, len(X_values)) - 1) < train_size)[0]
test_seq_idx  = np.where((np.arange(window_size, len(X_values)) - 1) >= train_size)[0]
X_seq_train = X_seq[train_seq_idx]
y_seq_train = y_seq[train_seq_idx]
X_seq_test  = X_seq[test_seq_idx]
y_seq_test  = y_seq[test_seq_idx]
print(f"Sequences: {X_seq_train.shape[0]} train sequences, {X_seq_test.shape[0]} test sequences.")

# Convert to torch tensors
X_seq_train_t = torch.tensor(X_seq_train)
y_seq_train_t = torch.tensor(y_seq_train)
X_seq_test_t  = torch.tensor(X_seq_test)
y_seq_test_t  = torch.tensor(y_seq_test)

# 5.2 Define Transformer model (encoder) for binary classification
class StockTransformer(nn.Module):
    def __init__(self, input_features, d_model=64, nhead=4, num_layers=2, num_classes=2):
        super(StockTransformer, self).__init__()
        self.input_features = input_features
        self.d_model = d_model
        # Feature embedding layer: project input features to d_model dimensions
        self.feature_embed = nn.Linear(input_features, d_model)
        # Transformer Encoder layers
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        # Final output layer
        self.fc_out = nn.Linear(d_model, num_classes)
    def forward(self, x):
        # x shape: (batch, seq_len, input_features)
        x = x.permute(1, 0, 2)               # -> (seq_len, batch, input_features)
        x = self.feature_embed(x)            # -> (seq_len, batch, d_model)
        x = self.transformer_encoder(x)      # -> (seq_len, batch, d_model)
        out = x[-1, :, :]                    # take the last time step's output: (batch, d_model)
        out = self.fc_out(out)               # -> (batch, num_classes)
        return out

# Initialize model, loss, optimizer
model = StockTransformer(input_features=X_seq_train.shape[2], d_model=64, nhead=4, num_layers=2, num_classes=2)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 5.3 Train the Transformer model
epochs = 5
batch_size = 32
train_dataset = TensorDataset(X_seq_train_t, y_seq_train_t)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

for epoch in range(1, epochs+1):
    model.train()
    total_loss = 0.0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * X_batch.size(0)
    avg_loss = total_loss / len(train_dataset)
    print(f"Epoch {epoch}/{epochs} - Training Loss: {avg_loss:.4f}")

# 5.4 Evaluate on test sequences
model.eval()
with torch.no_grad():
    test_outputs = model(X_seq_test_t)
    test_preds = test_outputs.argmax(dim=1).numpy()
test_accuracy = (test_preds == y_seq_test).mean()
print(f"Transformer Test Accuracy: {test_accuracy:.2%}")


XGBoost Test Accuracy: 50.10%
Top 5 features by SHAP importance:
RSI14      0.386637
OBV        0.372639
Return1    0.300512
MFI14      0.245748
%D         0.245400
dtype: float32


In [None]:
# 6.1 Ensemble probabilities (average of XGBoost and Transformer)
xgb_proba = xgb_model.predict_proba(X_test)[:, 1]            # probability of class 1 from XGBoost
trans_proba = F.softmax(test_outputs, dim=1).numpy()[:, 1]   # probability of class 1 from Transformer
# Ensure we align lengths (Transformer test may have one fewer if sequence window covers until second-last day)
min_len = min(len(xgb_proba), len(trans_proba))
ensemble_proba = (xgb_proba[:min_len] + trans_proba[:min_len]) / 2

# 6.2 Compute Kelly fraction for the latest day in test
latest_p = ensemble_proba[-1]   # ensemble probability of up for the most recent day in test set
kelly_fraction = 2 * latest_p - 1
print(f"Latest ensemble 'Up' probability: {latest_p:.2%}")
print(f"Kelly fraction: {kelly_fraction:.2f}")

# 6.3 Fuzzy logic verdict based on Kelly fraction
if kelly_fraction > 0.5:
    verdict = "Strong Buy"
elif kelly_fraction > 0.1:
    verdict = "Buy"
elif kelly_fraction < -0.5:
    verdict = "Strong Sell"
elif kelly_fraction < -0.1:
    verdict = "Sell"
else:
    verdict = "Hold/Neutral"

print("Fuzzy Verdict for the latest day:", verdict)


Latest ensemble 'Up' probability: 25.36%
Kelly fraction: -0.49
Fuzzy Verdict for the latest day: Sell


In [115]:
df.index = pd.to_datetime(df['Date'])

To work with FRED economic indicators using `pandas-datareader`, follow this guide:

### 1. **Finding FRED Indicators**
There's no direct API to list *all* FRED indicators, but here's how to discover them:
```python
import pandas_datareader as pdr
from pandas_datareader.fred import FredReader

# Search FRED website directly
# https://fred.stlouisfed.org/search?st=economic+indicators
```

### 2. **Top 10 Indicators for Stock Analysis**
Here are key economic indicators with their FRED codes:

| Indicator | FRED Code | Frequency | Description |
|-----------|-----------|-----------|-------------|
| **CPI** | `CPIAUCSL` | Monthly | Consumer Price Index |
| **Treasury Spread** | `T10Y2Y` | Daily | 10-Year vs 2-Year Treasury Spread |
| **Unemployment** | `UNRATE` | Monthly | Unemployment Rate |
| **GDP** | `GDP` | Quarterly | Gross Domestic Product |
| **Fed Funds Rate** | `FEDFUNDS` | Daily | Federal Funds Rate |
| **Mortgage Rates** | `MORTGAGE30US` | Weekly | 30-Year Mortgage Rate |
| **Industrial Production** | `INDPRO` | Monthly | Industrial Production Index |
| **Retail Sales** | `RSXFS` | Monthly | Retail Sales |
| **Housing Starts** | `HOUST` | Monthly | Housing Starts |
| **NASDAQ** | `NASDAQCOM` | Daily | NASDAQ Index |

### 5. **Handling Frequencies**
Different indicators have different frequencies (daily, monthly, quarterly). Resample for alignment:
```python
# Convert monthly data to quarterly
data_daily = data.resample('D').last()
```

### 6. **Additional Useful Indicators**
1. `VIXCLS` - CBOE Volatility Index
2. `UMCSENT` - Consumer Sentiment
3. `PPIACO` - Producer Price Index
4. `A191RL1Q225SBEA` - GDP Growth Rate


In [None]:
import os
import asyncio
import uvloop
import aiohttp
import pandas as pd
import numpy as np
from dotenv import load_dotenv

# ─── Setup ───────────────────────────────────────────────────────────────────
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
load_dotenv(dotenv_path='/workspaces/smart_dev/ml-unified/vars.env')

API_KEY    = os.getenv('FRED_API_KEY')
OBS_START  = '2000-01-01'
OBS_URL    = 'https://api.stlouisfed.org/fred/series/observations'

if not API_KEY:
    raise RuntimeError("FRED_API_KEY not set in vars.env")

# ─── Indicators & FRED codes ─────────────────────────────────────────────────
INDICATORS = {
    'CPI':             'CPIAUCSL',
    'TreasurySpread':  'T10Y2Y',
    'Unemployment':    'UNRATE',
    'GDP':             'GDP',
    'FedFunds':        'FEDFUNDS',
    'Mortgage30Yr':    'MORTGAGE30US',
    'IndustrialProd':  'INDPRO',
    'RetailSales':     'RSXFS',
    'HousingStarts':   'HOUST',
    'NASDAQ':          'NASDAQCOM',
    'ConsumerSenti':   'UMCSENT',
    'ProdPriceIdx':    'PPIACO',
}

# ─── Fetch one series of observations ────────────────────────────────────────
async def fetch_series(session, name, code):
    params = {
        'series_id':         code,
        'api_key':           API_KEY,
        'file_type':         'json',
        'observation_start': OBS_START
    }
    async with session.get(OBS_URL, params=params) as resp:
        resp.raise_for_status()
        payload = await resp.json()

    dates  = [obs['date'] for obs in payload['observations']]
    values = [
        np.nan if obs['value'] == '.' else float(obs['value'])
        for obs in payload['observations']
    ]
    return pd.Series(values,
                     index=pd.to_datetime(dates),
                     name=name)

# ─── Main: fetch all, check, merge, reindex, flag, save ─────────────────────
async def main():
    connector = aiohttp.TCPConnector(limit=0)
    timeout   = aiohttp.ClientTimeout(total=15)

    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
        tasks = [fetch_series(session, nm, cd) for nm, cd in INDICATORS.items()]
        series_list = await asyncio.gather(*tasks)

    # 1) Concat & sort
    df = pd.concat(series_list, axis=1).sort_index()
    print("\nRaw data sample (last 5 rows before reindex):")
    print(df.tail())

    # 2) Determine full daily index
    full_idx = pd.date_range(df.index.min(), df.index.max(), freq='D')

    # 3) Reindex + forward-fill
    df_daily = df.reindex(full_idx).ffill()

    print("\nAfter reindex+ffill (last 10 rows):")
    print(df_daily.tail(10)[['CPI','Unemployment','GDP','NASDAQ','TreasurySpread']])

    # 4) Debug: verify forward-fill at a known date
    sample = pd.Timestamp('2025-07-02')
    print(f"\nValue on {sample.date()}:")
    for col in ['CPI','Unemployment','GDP']:
        print(f"  {col}: raw last obs @ {df[col].last_valid_index()} -> "
              f"daily[{sample.date()}]={df_daily.at[sample, col]}")

    # 5) Build flags
    obs_dates = {s.name: set(s.index.date) for s in series_list}
    idx_dates = pd.Series(df_daily.index.date, index=df_daily.index)
    flags = {
        f"{name}_is_update": idx_dates.isin(obs_dates[name])
        for name in INDICATORS
    }
    df_flags = pd.DataFrame(flags, index=df_daily.index)

    # 6) Merge & save
    df_final = pd.concat([df_daily, df_flags], axis=1)
    df_final.to_parquet('macro_indicators_daily.parquet')

    print(f"\nSaved {len(df_daily)} days × {df_final.shape[1]} columns "
          "to macro_indicators_daily.parquet")

if __name__ == '__main__':
    # asyncio.run(main())
    await main()  # For Jupyter compatibility, use await directly

RuntimeError: asyncio.run() cannot be called from a running event loop

In [22]:
# df_test = pd.read_parquet('macro_indicators_daily.parquet')
# print(df_test.tail(10))  # Display a sample of the fetched macro indicators data
# 1. Sort & resample
df = df_test.sort_index()
df_daily = df.resample('D').ffill()

# 2. Check around the end of June → early July
print(df_daily['2025-06-20':'2025-07-15'][['CPI','Unemployment','GDP']].head(10))

# 3. Confirm last-valid indices
for name in ['CPI','Unemployment','GDP']:
    print(f"{name} last raw obs: {df[name].last_valid_index()}")
    print(f"{name} value on 2025-07-02: {df_daily.at[pd.Timestamp('2025-07-02'),'CPI']}")



            CPI  Unemployment  GDP
2025-06-20  NaN           NaN  NaN
2025-06-21  NaN           NaN  NaN
2025-06-22  NaN           NaN  NaN
2025-06-23  NaN           NaN  NaN
2025-06-24  NaN           NaN  NaN
2025-06-25  NaN           NaN  NaN
2025-06-26  NaN           NaN  NaN
2025-06-27  NaN           NaN  NaN
2025-06-28  NaN           NaN  NaN
2025-06-29  NaN           NaN  NaN
CPI last raw obs: 2025-05-01 00:00:00
CPI value on 2025-07-02: nan
Unemployment last raw obs: 2025-06-01 00:00:00
Unemployment value on 2025-07-02: nan
GDP last raw obs: 2025-01-01 00:00:00
GDP value on 2025-07-02: nan


In [4]:
import os
import asyncio
import uvloop
import aiohttp

# ─── Setup uvloop as the event loop for maximal performance ─────────────────
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

# ─── Configuration ─────────────────────────────────────────────────────────
API_KEY   = os.getenv('FRED_API_KEY')
BASE_URL  = 'https://api.stlouisfed.org/fred'
TIMEOUT   = aiohttp.ClientTimeout(total=5)   # total timeout for the single call

if not API_KEY:
    raise RuntimeError("FRED_API_KEY not set in environment")

# ─── Main coroutine ────────────────────────────────────────────────────────
async def main(search_term: str):
    # Use an unlimited connector so aiohttp can fully parallelize if you ever fan-out
    connector = aiohttp.TCPConnector(limit=0)

    # Single HTTP call to fetch titles & IDs in one go
    params = {
        'search_text': search_term,
        'api_key':     API_KEY,
        'file_type':   'json'
    }

    async with aiohttp.ClientSession(connector=connector, timeout=TIMEOUT) as session:
        async with session.get(f'{BASE_URL}/series/search', params=params) as resp:
            resp.raise_for_status()
            data = await resp.json()

    series = data.get('seriess', [])
    for s in series:
        print(f"{s['title']}: {s['id']}")

    print(f"\nTotal series found for '{search_term}': {len(series)}")

In [14]:
# ─── Entrypoint ─────────────────────────────────────────────────────────────
if __name__ == '__main__':
    term = 'finance'
    # asyncio.run(main(term))
    await main(term)  # Use await directly in Jupyter or interactive environments

Secured Overnight Financing Rate: SOFR
30-Day Average SOFR: SOFR30DAYAVG
Finance Rate on Consumer Installment Loans at Commercial Banks, New Autos 48 Month Loan: TERMCBAUTO48NS
Finance Rate on Consumer Installment Loans at Commercial Banks, New Autos 60 Month Loan: RIFLPBCIANM60NM
Average Amount Financed for New Car Loans at Finance Companies: DTCTLVENANM
Average Amount Financed for New Car Loans at Finance Companies (DISCONTINUED): DTCTLVENANQ
90-Day Average SOFR: SOFR90DAYAVG
Finance Rate on Personal Loans at Commercial Banks, 24 Month Loan: TERMCBPER24NS
Average Finance Rate of Used Car Loans at Finance Companies, Amount of Finance Weighted (DISCONTINUED): RIELPCFAUNQ
SOFR Index: SOFRINDEX
Secured Overnight Financing Volume: SOFRVOL
Finance Rate on Consumer Installment Loans at Commercial Banks, New Autos 72 Month Loan: RIFLPBCIANM72NM
Value Added by Industry: Finance, Insurance, Real Estate, Rental, and Leasing: Finance and Insurance as a Percentage of GDP: VAPGDPFI
Average Finance

In [8]:
import os
from dotenv import load_dotenv

# point to your vars.env file
load_dotenv(dotenv_path='/workspaces/smart_dev/ml-unified/vars.env')

api_key = os.getenv('FRED_API_KEY')
if not api_key:
    raise RuntimeError("FRED_API_KEY not set in vars.env")
print(f"FRED API Key: {api_key[:4]}... (truncated for security)")

FRED API Key: e8d2... (truncated for security)


In [13]:
import requests

api_key = 'e8d2479cd2afdf52f8216bb44daffe9f'  # Get at: research.stlouisfed.org/docs/api/api_key.html
search_term = 'finance'
url = f'https://api.stlouisfed.org/fred/series/search?search_text={search_term}&api_key={api_key}&file_type=json'

counter = 0
response = requests.get(url).json()
for series in response['seriess']:
    print(f"{series['title']}: {series['id']}")
    counter += 1

print(f"Total series found for '{search_term}': {counter}")

Secured Overnight Financing Rate: SOFR
30-Day Average SOFR: SOFR30DAYAVG
Finance Rate on Consumer Installment Loans at Commercial Banks, New Autos 48 Month Loan: TERMCBAUTO48NS
Finance Rate on Consumer Installment Loans at Commercial Banks, New Autos 60 Month Loan: RIFLPBCIANM60NM
Average Amount Financed for New Car Loans at Finance Companies: DTCTLVENANM
Average Amount Financed for New Car Loans at Finance Companies (DISCONTINUED): DTCTLVENANQ
90-Day Average SOFR: SOFR90DAYAVG
Finance Rate on Personal Loans at Commercial Banks, 24 Month Loan: TERMCBPER24NS
Average Finance Rate of Used Car Loans at Finance Companies, Amount of Finance Weighted (DISCONTINUED): RIELPCFAUNQ
SOFR Index: SOFRINDEX
Secured Overnight Financing Volume: SOFRVOL
Finance Rate on Consumer Installment Loans at Commercial Banks, New Autos 72 Month Loan: RIFLPBCIANM72NM
Value Added by Industry: Finance, Insurance, Real Estate, Rental, and Leasing: Finance and Insurance as a Percentage of GDP: VAPGDPFI
Average Finance

### Analysis Results Table

In [4]:
import pandas as pd
import duckdb
import json
from pathlib import Path

def load_and_save_results(db_path="/workspaces/smart_dev/projects/Notebooks/Stocks/flint/results/audit_log.duckdb", output_csv="audit_log_results.csv"):
    """
    Loads all analysis results from the DuckDB database, saves them to a CSV,
    and returns them as a pandas DataFrame.

    Args:
        db_path (str): The path to the DuckDB database file.
        output_csv (str): The filename for the output CSV file.

    Returns:
        pandas.DataFrame: A DataFrame containing all analysis results,
                          or an empty DataFrame if an error occurs.
    """
    db_file = Path(db_path)
    if not db_file.exists():
        print(f"Error: Database file not found at '{db_path}'")
        return pd.DataFrame()

    print(f"Connecting to '{db_path}'...")
    with duckdb.connect(database=str(db_file), read_only=True) as con:
        try:
            # Fetch the entire table directly into a pandas DataFrame
            df = con.execute("SELECT * FROM analysis_results").fetchdf()
            print(f"Successfully loaded {len(df)} records.")
        except duckdb.Error as e:
            print(f"An error occurred while querying the database: {e}")
            return pd.DataFrame()

    if df.empty:
        print("The 'analysis_results' table is empty.")
        return df

    # Save the full, raw DataFrame to a CSV file
    df.to_csv(output_csv, index=False)
    print(f"All {len(df)} records have been saved to '{output_csv}'.")
    
    return df

# --- Usage in your Jupyter Notebook cell ---

# 1. Call the function to get your data
results_df = load_and_save_results()

# 2. Display the first few rows of the main DataFrame
#    In Jupyter, this will automatically render as a nice HTML table.
results_df

Connecting to '/workspaces/smart_dev/projects/Notebooks/Stocks/flint/results/audit_log.duckdb'...
Successfully loaded 92 records.
All 92 records have been saved to 'audit_log_results.csv'.


Unnamed: 0,run_id,ticker,execution_timestamp,model_name,git_hash,run_config,predictions,metrics,shap_importance
0,1751732807225363968,RYCEY,2025-07-05 16:26:47.225379,Ensemble_v1,b30175110da76bb385640f9872c173302a06fd84,"{""XGBoost"": {""n_estimators"": 100, ""random_stat...","{""probabilities"": [0.7497308254241943, 0.59436...","{""accuracy"": 0.5645161290322581, ""kelly_fracti...","{""features"": [""RSI14"", ""MACD_line"", ""MACD_sign..."
1,1751817325451127040,RYCEY,2025-07-06 15:55:25.451139,Ensemble_v2,0b0fe2ac9b86f4c401a367f576935813e735955e,"{""XGBoost"": {""n_estimators"": 100, ""random_stat...","{""probabilities"": [0.6810305714607239, 0.70815...","{""accuracy"": 0.7027649769585254, ""kelly_fracti...","{""features"": [""RSI14"", ""MACD_line"", ""MACD_sign..."
2,1751819020634740992,RYCEY,2025-07-06 16:23:40.634752,Ensemble_v2,0b0fe2ac9b86f4c401a367f576935813e735955e,"{""XGBoost"": {""n_estimators"": 100, ""random_stat...","{""probabilities"": [0.6794436573982239, 0.70998...","{""accuracy"": 0.6935483870967742, ""kelly_fracti...","{""features"": [""RSI14"", ""MACD_line"", ""MACD_sign..."
3,1751819696203219968,RYCEY,2025-07-06 16:34:56.203232,Ensemble_v2,0b0fe2ac9b86f4c401a367f576935813e735955e,"{""XGBoost"": {""n_estimators"": 100, ""random_stat...","{""probabilities"": [0.7475588917732239, 0.77200...","{""accuracy"": 0.6889400921658986, ""kelly_fracti...","{""features"": [""RSI14"", ""MACD_line"", ""MACD_sign..."
4,1751819958613993984,RYCEY,2025-07-06 16:39:18.614006,Ensemble_v2,0b0fe2ac9b86f4c401a367f576935813e735955e,"{""XGBoost"": {""n_estimators"": 100, ""random_stat...","{""probabilities"": [0.7443850636482239, 0.75662...","{""accuracy"": 0.6935483870967742, ""kelly_fracti...","{""features"": [""RSI14"", ""MACD_line"", ""MACD_sign..."
...,...,...,...,...,...,...,...,...,...
87,1753560444047149824,QS,2025-07-26 20:07:24.047166,RandomForest_v1,748edaf5d1809cadfe8b1136fe314c92269c60cc,"{""model_type"": ""rf"", ""n_estimators"": 100, ""ran...","{""probabilities"": [0.28, 0.27, 0.32, 0.34, 0.3...","{""accuracy"": 0.5549597855227882, ""kelly_fracti...","{""features"": [""RSI14"", ""MACD_line"", ""MACD_sign..."
88,1753579134935243008,QS,2025-07-27 01:18:54.935253,XGBoost_v2_Tuned,3d5e7033cbde310936c04c191854dd0cffd272e4,"{""model_type"": ""xgb"", ""xgboost_params"": {""n_es...","{""probabilities"": [0.01212555542588234, 0.0210...","{""accuracy"": 0.6005361930294906, ""kelly_fracti...","{""features"": [""RSI14"", ""MACD_line"", ""MACD_sign..."
89,1753622229299156992,QS,2025-07-27 13:17:09.299169,Ensemble_v3_Tuned,3d5e7033cbde310936c04c191854dd0cffd272e4,"{""model_type"": ""ensemble"", ""xgboost_params"": {...","{""probabilities"": [0.23689773678779602, 0.2102...","{""accuracy"": 0.45222929936305734, ""kelly_fract...","{""features"": [""RSI14"", ""MACD_line"", ""MACD_sign..."
90,1754137456235713024,TSLA,2025-08-02 12:24:16.235725,Ensemble_v3_Tuned,0311ca1722fb7b45db86936b8e8cf00044f7a6ab,"{""model_type"": ""ensemble"", ""xgboost_params"": {...","{""probabilities"": [0.5424229502677917, 0.66515...","{""accuracy"": 0.5894134477825465, ""kelly_fracti...","{""features"": [""RSI14"", ""MACD_line"", ""MACD_sign..."


### Inspecting Runs

In [12]:
from IPython.display import display, HTML

def inspect_run_details(df, index):
    """
    Selects a single run by its index and displays its nested JSON columns
    (metrics, predictions, shap_importance) in a readable format.

    Args:
        df (pandas.DataFrame): The DataFrame containing all results.
        index (int): The integer index of the row you want to inspect.
    """
    if index not in df.index:
        print(f"Error: Index {index} is out of bounds. Please choose an index between 0 and {len(df)-1}.")
        return

    # Select the specific run (row) you want to inspect
    run_data = df.loc[index]
    
    print("-" * 50)
    display(HTML(f"<h3>Inspecting Run for Ticker: {run_data['ticker']} (Index: {index})</h3>"))
    print(f"Timestamp: {run_data['execution_timestamp']}")
    print("-" * 50)

    # --- 1. Unpack and display METRICS ---
    display(HTML("<h4>Metrics</h4>"))
    try:
        metrics_dict = json.loads(run_data['metrics'])
        metrics_series = pd.Series(metrics_dict, name="Value")
        display(metrics_series.to_frame())
    except (json.JSONDecodeError, TypeError):
        print("Could not parse the 'metrics' column.")

    # --- 2. Unpack and display PREDICTIONS ---
    display(HTML("<h4>Predictions (first 20 probabilities)</h4>"))
    try:
        predictions_dict = json.loads(run_data['predictions'])
        # The probabilities are a list inside the dictionary
        if 'probabilities' in predictions_dict:
            predictions_df = pd.DataFrame(predictions_dict['probabilities'], columns=['probability'])
            display(predictions_df.head(20))
        else:
            print("'probabilities' key not found in predictions JSON.")
    except (json.JSONDecodeError, TypeError):
        print("Could not parse the 'predictions' column.")

    # --- 3. Unpack and display SHAP IMPORTANCE ---
    # This assumes 'shap_importance' and 'feature_names' are in the 'run_config' JSON
    display(HTML("<h4>Feature Importance (SHAP)</h4>"))
    try:
        # The data is in the 'shap_importance' column
        shap_dict = json.loads(run_data['shap_importance'])
        
        if 'values' in shap_dict and 'features' in shap_dict:
            shap_series = pd.Series(
                shap_dict['values'], 
                index=shap_dict['features'],
                name="Mean SHAP Value"
            )
            # Sort for better readability
            display(shap_series.sort_values(ascending=False).to_frame())
        else:
            print("'values' or 'features' keys not found in 'shap_importance' JSON.")

    except (json.JSONDecodeError, TypeError):
        print("Could not parse the 'shap_importance' column. It might be malformed or empty.")
        print("Raw content:", run_data['shap_importance'])


# --- Usage in your Jupyter Notebook cell ---

# Assuming 'results_df' is the DataFrame from Part 1

# Inspect the details of the first run (index 0)
# inspect_run_details(results_df, 0)  # Inspect the last run by default

# Inspect the details of another run (e.g., index 5)
inspect_run_details(results_df, len(results_df) - 1)

--------------------------------------------------


Timestamp: 2025-07-08 21:43:23.934957
--------------------------------------------------


Unnamed: 0,Value
accuracy,0.520833
kelly_fraction,-0.179866
up_prob,0.281327
down_prob,0.718673
trend_strength,-0.245035
ci,"[-0.13128162274837252, 0.18427183735271793]"
n_sims,814
simulated_slopes,"[-0.12401038314700369, 0.006282805975799272, -..."


Unnamed: 0,probability
0,0.853861
1,0.8453
2,0.809569
3,0.763304
4,0.518225
5,0.714415
6,0.751109
7,0.739759
8,0.665142
9,0.672605


Unnamed: 0,Mean SHAP Value
ATR14,0.824338
BB_upper,0.820164
BB_mid,0.751472
BB_lower,0.56748
CCI20,0.471244
Volume,0.444994
MACD_hist,0.406409
MACD_line,0.360028
OBV,0.346973
%K,0.300573


### Potential duckdb compatible optuna table

In [None]:
import duckdb

def create_optuna_schema(conn):
    """Creates Optuna-compatible schema in DuckDB"""
    conn.execute("""
    -- Studies table
    CREATE TABLE studies (
        study_id INTEGER NOT NULL PRIMARY KEY,
        study_name VARCHAR(512) NOT NULL
    );
    CREATE UNIQUE INDEX ix_studies_study_name ON studies (study_name);
    
    -- Version info
    CREATE TABLE version_info (
        version_info_id INTEGER PRIMARY KEY CHECK (version_info_id=1),
        schema_version INTEGER,
        library_version VARCHAR(256)
    );
    
    -- Study directions
    CREATE TABLE study_directions (
        study_direction_id INTEGER PRIMARY KEY,
        direction VARCHAR(8) NOT NULL,
        study_id INTEGER NOT NULL REFERENCES studies(study_id),
        objective INTEGER NOT NULL,
        UNIQUE (study_id, objective)
    );
    
    -- Study attributes
    CREATE TABLE study_user_attributes (
        study_user_attribute_id INTEGER PRIMARY KEY,
        study_id INTEGER REFERENCES studies(study_id),
        key VARCHAR(512),
        value_json TEXT,
        UNIQUE (study_id, key)
    );
    
    CREATE TABLE study_system_attributes (
        study_system_attribute_id INTEGER PRIMARY KEY,
        study_id INTEGER REFERENCES studies(study_id),
        key VARCHAR(512),
        value_json TEXT,
        UNIQUE (study_id, key)
    );
    
    -- Trials table
    CREATE TABLE trials (
        trial_id INTEGER PRIMARY KEY,
        number INTEGER,
        study_id INTEGER REFERENCES studies(study_id),
        state VARCHAR(8) NOT NULL,
        datetime_start TIMESTAMP,
        datetime_complete TIMESTAMP
    );
    CREATE INDEX ix_trials_study_id ON trials (study_id);
    
    -- Trial attributes
    CREATE TABLE trial_user_attributes (
        trial_user_attribute_id INTEGER PRIMARY KEY,
        trial_id INTEGER REFERENCES trials(trial_id),
        key VARCHAR(512),
        value_json TEXT,
        UNIQUE (trial_id, key)
    );
    
    CREATE TABLE trial_system_attributes (
        trial_system_attribute_id INTEGER PRIMARY KEY,
        trial_id INTEGER REFERENCES trials(trial_id),
        key VARCHAR(512),
        value_json TEXT,
        UNIQUE (trial_id, key)
    );
    
    -- Trial parameters
    CREATE TABLE trial_params (
        param_id INTEGER PRIMARY KEY,
        trial_id INTEGER REFERENCES trials(trial_id),
        param_name VARCHAR(512),
        param_value DOUBLE,
        distribution_json TEXT,
        UNIQUE (trial_id, param_name)
    );
    
    -- Trial values
    CREATE TABLE trial_values (
        trial_value_id INTEGER PRIMARY KEY,
        trial_id INTEGER NOT NULL REFERENCES trials(trial_id),
        objective INTEGER NOT NULL,
        value DOUBLE,
        value_type VARCHAR(7) NOT NULL,
        UNIQUE (trial_id, objective)
    );
    
    -- Intermediate values
    CREATE TABLE trial_intermediate_values (
        trial_intermediate_value_id INTEGER PRIMARY KEY,
        trial_id INTEGER NOT NULL REFERENCES trials(trial_id),
        step INTEGER NOT NULL,
        intermediate_value DOUBLE,
        intermediate_value_type VARCHAR(7) NOT NULL,
        UNIQUE (trial_id, step)
    );
    
    -- Heartbeats
    CREATE TABLE trial_heartbeats (
        trial_heartbeat_id INTEGER PRIMARY KEY,
        trial_id INTEGER NOT NULL REFERENCES trials(trial_id),
        heartbeat TIMESTAMP NOT NULL,
        UNIQUE (trial_id)
    );
    
    -- Alembic version
    CREATE TABLE alembic_version (
        version_num VARCHAR(32) PRIMARY KEY
    );
    
    -- Initialize version info
    INSERT INTO version_info VALUES (1, 2, '3.4.0');
    INSERT INTO alembic_version VALUES ('c8c7b2ef4a1a');
    """)

# Test connection
conn = duckdb.connect("/optuna.duckdb")
create_optuna_schema(conn)
print("Optuna schema created successfully in DuckDB!")

In [None]:
import requests

# replace the "demo" apikey below with your own key from https://www.alphavantage.co/support/#api-key
url = 'https://www.alphavantage.co/query?function=NEWS_SENTIMENT&tickers=AAPL&apikey=demo'
r = requests.get(url)
data = r.json()

print(data)



### Inspect torch fles

In [1]:
import torch
import pandas as pd
from IPython.display import display
import ipywidgets as widgets

# Load the file
pt_file = torch.load("/workspaces/smart_dev/projects/Notebooks/Stocks/flint/data/processed/TSLA_data.pt", weights_only=False)

display(pt_file['df_features'].shape)

cols = pt_file['df_features'].columns.tolist()
cols.sort()

# ===== CUSTOMIZE THESE VALUES =====
BACKGROUND = "#250B80"   # Your specified background
DEFAULT_TEXT_COLOR = "yellow"  # For normal columns
HIGHLIGHT_COLOR = "red"        # For special columns
SPECIAL_COLS = ['Open', 'High', 'Low', 'Close', 'Volume']
COLUMNS = 5              # Number of columns
COLUMN_GAP = "15px"      # Space between columns
FONT_SIZE = "16px"       # Font size
HEIGHT = "400px"         # Container height
BORDER_RADIUS = "8px"    # Rounded corners
# ==================================

html_content = f"""
<div style="
  column-count: {COLUMNS};
  column-gap: {COLUMN_GAP};
  font-family: monospace;
">
"""
for col in cols:
    # Determine styling per column
    if col in SPECIAL_COLS:
        text_color = HIGHLIGHT_COLOR
        display_text = f"<b>{col}</b>"  # Make bold
    else:
        text_color = DEFAULT_TEXT_COLOR
        display_text = col

    html_content += f"""<div style="
      break-inside: avoid;
      padding: 3px 0;
      color: {text_color};
      font-size: {FONT_SIZE};
      transition: all 0.3s;
      " onmouseover="this.style.color='white'" onmouseout="this.style.color='{text_color}'"
    ">{display_text}</div>"""

display(widgets.HTML(value=f"""
<div style="
  height: {HEIGHT};
  overflow-y: auto;
  background: {BACKGROUND};
  padding: 15px;
  border-radius: {BORDER_RADIUS};
  box-shadow: 0 4px 8px rgba(0,0,0,0.2);
">
{html_content}
</div>
"""))

# Print the head of the file
display(pt_file['df_features'].head(10))

display(pt_file.keys())

pd.read_parquet("/workspaces/smart_dev/projects/Notebooks/Stocks/flint/data/TSLA.parquet").tail(10)

(2525, 154)

HTML(value='\n<div style="\n  height: 400px;\n  overflow-y: auto;\n  background: #250B80;\n  padding: 15px;\n …

Unnamed: 0,Date,Close,High,Low,Open,Volume,Return1,RSI14,MACD_line,MACD_signal,...,Money_Supply_InEventWindow,Bank_Credit_IsUpdateDay,Bank_Credit_DaysSinceUpdate,Bank_Credit_InEventWindow,Corr_Stock_FedFunds_60D,Corr_Stock_CPI_60D,TreasurySpread_RealVol_21D,Real_FedFunds,Stock_vs_GDP_Ratio,CPI_ROC_3M
0,2015-07-20,18.817333,19.110001,18.169333,18.333332,74677500,-5.487851,0.0,0.0,0.0,...,0,0,5,0,0.079969,0.102264,0.361691,-0.478363,0.001023,-0.126453
1,2015-07-21,17.784666,18.233334,17.77,18.003332,91630500,-5.487851,0.0,-0.082378,-0.016476,...,0,0,6,0,0.079969,0.102264,0.361691,-0.478363,0.000966,-0.126453
2,2015-07-22,17.858,17.962667,17.390667,17.417999,46575000,0.412343,0.543293,-0.14013,-0.041207,...,0,1,0,1,0.079969,0.102264,0.361691,-0.478363,0.00097,-0.126453
3,2015-07-23,17.813334,17.993334,17.684668,17.976667,33408000,-0.250119,0.541364,-0.187344,-0.070434,...,0,0,1,1,0.079969,0.102264,0.361691,-0.478363,0.000968,-0.126453
4,2015-07-24,17.694,18.072666,17.594667,17.825333,42547500,-0.66991,0.535889,-0.23172,-0.102691,...,0,0,2,1,0.079969,0.102264,0.361691,-0.478363,0.000962,-0.126453
5,2015-07-27,16.867332,17.628668,16.719334,17.495333,70413000,-4.672023,0.498293,-0.329791,-0.148111,...,0,0,5,0,0.079969,0.102264,0.361691,-0.478363,0.000917,-0.126453
6,2015-07-28,17.654667,17.693333,16.789333,17.049999,58437000,4.667807,7.177603,-0.340062,-0.186501,...,0,0,6,0,0.079969,0.102264,0.361691,-0.478363,0.000959,-0.126453
7,2015-07-29,17.587999,17.859333,17.466667,17.618,41851500,-0.37762,7.133934,-0.349552,-0.219111,...,0,1,0,1,0.079969,0.102264,0.361691,-0.478363,0.000956,-0.126453
8,2015-07-30,17.785999,17.796,17.474001,17.512667,30519000,1.125767,8.906544,-0.337208,-0.242731,...,0,0,1,1,0.079969,0.102264,0.361691,-0.478363,0.000967,-0.126453
9,2015-07-31,17.743334,17.957333,17.674667,17.84,33339000,-0.239882,8.867267,-0.327098,-0.259604,...,0,0,2,1,0.079969,0.102264,0.361691,-0.478363,0.000964,-0.126453


dict_keys(['X_train', 'y_train', 'X_test', 'y_test_orig', 'feature_cols', 'X_seq_train', 'y_seq_train', 'X_seq_test', 'y_seq_test', 'df_features', 'test_df'])

Unnamed: 0,Date,Close,High,Low,Open,Volume
2515,2025-07-21,328.48999,338.0,326.880005,334.399994,75768800
2516,2025-07-22,332.109985,335.410004,321.549988,329.73999,77370400
2517,2025-07-23,332.559998,336.200012,328.670013,330.899994,92553800
2518,2025-07-24,305.299988,310.149994,300.410004,310.0,156966000
2519,2025-07-25,316.059998,323.630005,308.01001,308.73999,148227000
2520,2025-07-28,325.589996,330.48999,315.690002,318.450012,112673800
2521,2025-07-29,321.200012,326.25,318.25,325.549988,87358900
2522,2025-07-30,319.040009,324.450012,311.619995,322.179993,83931900
2523,2025-07-31,308.269989,321.369995,306.100006,319.609985,85270900
2524,2025-08-01,302.630005,309.309998,297.820007,306.204987,88358015


### Conversion Script

In [5]:
import os
from pathlib import Path
from typing import Union, List, Tuple

def convert_files_to_text(
    root_dir: str,
    output_dir: str,
    include_subdirs: Union[bool, List[str]] = True,
    exclude_dirs: List[str] = None,
    extensions: Tuple[str, ...] = ('.py', '.yml', '.toml'),
    exclude_files: List[str] = None
):
    """
    Convert files to text with flexible directory processing options.
    
    Args:
        root_dir: Directory containing source files
        output_dir: Where to save text versions
        include_subdirs: True=all, False=none, List=specific subdirs + root
        exclude_dirs: Directories to always exclude
        extensions: File extensions to process
        exclude_files: Specific filenames to exclude (including extensions)
    """
    # Create output directory if needed
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    # Default exclude directories and files
    if exclude_dirs is None:
        exclude_dirs = [".venv", ".vscode", ".git", "__pycache__"]
    if exclude_files is None:
        exclude_files = [".python-version", "*.ipynb", "*.md"]
    if extensions is None:
        extensions = '*'
    
    # Prepare files to process
    files_to_process = []
    
    # Always include root directory files
    files_to_process.extend(Path(root_dir).glob('*'))
    
    # Add subdirectories if specified
    if isinstance(include_subdirs, list):
        for subdir in include_subdirs:
            subdir_path = Path(root_dir) / subdir
            if subdir_path.exists():
                files_to_process.extend(subdir_path.rglob('*'))
    elif include_subdirs is True:
        # Include all subdirectories recursively
        files_to_process.extend(Path(root_dir).rglob('*'))
    
    # Track output filenames to prevent overwrites
    output_files = set()
    
    # NEW: Handle wildcard extension logic
    process_all_files = False
    if extensions == '*' or extensions == ('*',) or extensions is None:
        process_all_files = True
    
    for filepath in files_to_process:
        # Skip directories and excluded files
        if (not filepath.is_file() or 
            any(excluded in filepath.parts for excluded in exclude_dirs) or
            filepath.name in exclude_files):
            continue
            
        # MODIFIED: Extension check logic
        if process_all_files or filepath.suffix.lower() in extensions:
            # Get relative path to maintain structure
            rel_path = filepath.relative_to(root_dir)
            
            # Create unique output filename that preserves original extension
            output_filename = f"{rel_path.stem}_{rel_path.suffix[1:]}.txt"
            output_path = Path(output_dir) / rel_path.with_name(output_filename)
            
            # Ensure we don't overwrite files
            if output_path in output_files:
                print(f"Warning: Skipping potential overwrite of {output_path}")
                continue
                
            output_files.add(output_path)
            
            # Ensure parent directory exists
            output_path.parent.mkdir(parents=True, exist_ok=True)
            
            # Read and write the file
            try:
                with open(filepath, 'r', encoding='utf-8') as infile, \
                     open(output_path, 'w', encoding='utf-8') as outfile:
                    outfile.write(infile.read())
                
                print(f"Converted: {filepath} → {output_path}")
            except Exception as e:
                print(f"Error processing {filepath}: {str(e)}")


# Example usage
if __name__ == "__main__":
    convert_files_to_text(
        root_dir="/workspaces/smart_dev/projects/Notebooks/Stocks",
        output_dir="/workspaces/smart_dev/projects/Notebooks/Stocks/text_output",
        include_subdirs=['flint'],
        extensions=None,
        exclude_dirs=[".venv", ".vscode", ".git", "__pycache__", ".ruff_cache", "data", "results", "ml-unified"],
        exclude_files=[".python-version", "local_settings.py", "stock_algo.ipynb", "ruff_check.log", "README.md"]  # Additional excludes
    )

Converted: /workspaces/smart_dev/projects/Notebooks/Stocks/audit_log_results.csv → /workspaces/smart_dev/projects/Notebooks/Stocks/text_output/audit_log_results_csv.txt
Converted: /workspaces/smart_dev/projects/Notebooks/Stocks/audit_log.txt → /workspaces/smart_dev/projects/Notebooks/Stocks/text_output/audit_log_txt.txt
Converted: /workspaces/smart_dev/projects/Notebooks/Stocks/predictors.py → /workspaces/smart_dev/projects/Notebooks/Stocks/text_output/predictors_py.txt
Converted: /workspaces/smart_dev/projects/Notebooks/Stocks/main.py → /workspaces/smart_dev/projects/Notebooks/Stocks/text_output/main_py.txt
Converted: /workspaces/smart_dev/projects/Notebooks/Stocks/flint/profile.html → /workspaces/smart_dev/projects/Notebooks/Stocks/text_output/flint/profile_html.txt
Converted: /workspaces/smart_dev/projects/Notebooks/Stocks/flint/validate.py → /workspaces/smart_dev/projects/Notebooks/Stocks/text_output/flint/validate_py.txt
Converted: /workspaces/smart_dev/projects/Notebooks/Stocks/f

In [2]:
import yfinance as yf
tkr = yf.Ticker("MSFT")
income_statement = tkr.financials
balance_sheet = tkr.balance_sheet

In [14]:
income_statement = income_statement.T
balance_sheet = balance_sheet.T
display(income_statement)
display(balance_sheet)

Unnamed: 0,Tax Effect Of Unusual Items,Tax Rate For Calcs,Normalized EBITDA,Total Unusual Items,Total Unusual Items Excluding Goodwill,Net Income From Continuing Operation Net Minority Interest,Reconciled Depreciation,Reconciled Cost Of Revenue,EBITDA,EBIT,Net Interest Income,Interest Expense,Interest Income,Normalized Income,Net Income From Continuing And Discontinued Operation,Total Expenses,Total Operating Income As Reported,Diluted Average Shares,Basic Average Shares,Diluted EPS,Basic EPS,Diluted NI Availto Com Stockholders,Net Income Common Stockholders,Net Income,Net Income Including Noncontrolling Interests,Net Income Continuous Operations,Tax Provision,Pretax Income,Other Income Expense,Other Non Operating Income Expenses,Special Income Charges,Write Off,Gain On Sale Of Security,Net Non Operating Interest Income Expense,Interest Expense Non Operating,Interest Income Non Operating,Operating Income,Operating Expense,Research And Development,Selling General And Administration,Selling And Marketing Expense,General And Administrative Expense,Other Gand A,Gross Profit,Cost Of Revenue,Total Revenue,Operating Revenue
2025-06-30,-77217840.0,0.176296,160603000000.0,-438000000.0,-438000000.0,101832000000.0,34153000000.0,87831000000.0,160165000000.0,126012000000.0,262000000.0,2385000000.0,2647000000.0,102192800000.0,101832000000.0,153196000000.0,128528000000.0,7465000000.0,7433000000.0,13.64,13.7,101832000000.0,101832000000.0,101832000000.0,101832000000.0,101832000000.0,21795000000.0,123627000000.0,-5163000000.0,-4725000000.0,-943000000.0,943000000.0,505000000.0,262000000.0,2385000000.0,2647000000.0,128528000000.0,65365000000.0,32488000000.0,32877000000.0,25654000000.0,7223000000.0,7223000000.0,193893000000.0,87831000000.0,281724000000.0,281724000000.0
2024-06-30,-100090000.0,0.182313,133558000000.0,-549000000.0,-549000000.0,88136000000.0,22287000000.0,74114000000.0,133009000000.0,110722000000.0,222000000.0,2935000000.0,3157000000.0,88584910000.0,88136000000.0,135689000000.0,109433000000.0,7469000000.0,7431000000.0,11.8,11.86,88136000000.0,88136000000.0,88136000000.0,88136000000.0,88136000000.0,19651000000.0,107787000000.0,-1868000000.0,-1319000000.0,-206000000.0,206000000.0,-343000000.0,222000000.0,2935000000.0,3157000000.0,109433000000.0,61575000000.0,29510000000.0,32065000000.0,24456000000.0,7609000000.0,7609000000.0,171008000000.0,74114000000.0,245122000000.0,245122000000.0
2023-06-30,-2850000.0,0.19,105155000000.0,-15000000.0,-15000000.0,72361000000.0,13861000000.0,65863000000.0,105140000000.0,91279000000.0,1026000000.0,1968000000.0,2994000000.0,72373150000.0,72361000000.0,123392000000.0,88523000000.0,7472000000.0,7446000000.0,9.68,9.72,72361000000.0,72361000000.0,72361000000.0,72361000000.0,72361000000.0,16950000000.0,89311000000.0,-238000000.0,-223000000.0,-30000000.0,30000000.0,15000000.0,1026000000.0,1968000000.0,2994000000.0,88523000000.0,57529000000.0,27195000000.0,30334000000.0,22759000000.0,7575000000.0,7575000000.0,146052000000.0,65863000000.0,211915000000.0,211915000000.0
2022-06-30,43754000.0,0.131,99905000000.0,334000000.0,334000000.0,72738000000.0,14460000000.0,62650000000.0,100239000000.0,85779000000.0,31000000.0,2063000000.0,2094000000.0,72447750000.0,72738000000.0,114887000000.0,83383000000.0,7540000000.0,7496000000.0,9.65,9.7,72738000000.0,72738000000.0,72738000000.0,72738000000.0,72738000000.0,10978000000.0,83716000000.0,302000000.0,-32000000.0,-101000000.0,101000000.0,435000000.0,31000000.0,2063000000.0,2094000000.0,83383000000.0,52237000000.0,24512000000.0,27725000000.0,21825000000.0,5900000000.0,5900000000.0,135620000000.0,62650000000.0,198270000000.0,198270000000.0


Unnamed: 0,Ordinary Shares Number,Share Issued,Net Debt,Total Debt,Tangible Book Value,Invested Capital,Working Capital,Net Tangible Assets,Capital Lease Obligations,Common Stock Equity,Total Capitalization,Total Equity Gross Minority Interest,Stockholders Equity,Gains Losses Not Affecting Retained Earnings,Other Equity Adjustments,Retained Earnings,Capital Stock,Common Stock,Total Liabilities Net Minority Interest,Total Non Current Liabilities Net Minority Interest,Other Non Current Liabilities,Tradeand Other Payables Non Current,Non Current Deferred Liabilities,Non Current Deferred Revenue,Non Current Deferred Taxes Liabilities,Long Term Debt And Capital Lease Obligation,Long Term Capital Lease Obligation,Long Term Debt,Current Liabilities,Other Current Liabilities,Current Deferred Liabilities,Current Deferred Revenue,Current Debt And Capital Lease Obligation,Current Debt,Other Current Borrowings,Commercial Paper,Pensionand Other Post Retirement Benefit Plans Current,Payables And Accrued Expenses,Payables,Total Tax Payable,Income Tax Payable,Accounts Payable,Total Assets,Total Non Current Assets,Other Non Current Assets,Financial Assets,Investments And Advances,Investmentin Financial Assets,Available For Sale Securities,Long Term Equity Investment,Goodwill And Other Intangible Assets,Other Intangible Assets,Goodwill,Net PPE,Accumulated Depreciation,Gross PPE,Leases,Other Properties,Machinery Furniture Equipment,Buildings And Improvements,Land And Improvements,Properties,Current Assets,Other Current Assets,Hedging Assets Current,Inventory,Finished Goods,Work In Process,Raw Materials,Receivables,Accounts Receivable,Allowance For Doubtful Accounts Receivable,Gross Accounts Receivable,Cash Cash Equivalents And Short Term Investments,Other Short Term Investments,Cash And Cash Equivalents,Cash Equivalents,Cash Financial
2025-06-30,7434000000.0,7434000000.0,12909000000.0,60588000000.0,201366000000.0,386630000000.0,49913000000.0,201366000000.0,17437000000.0,343479000000.0,383631000000.0,343479000000.0,343479000000.0,-3347000000.0,-3347000000.0,237731000000.0,109095000000.0,109095000000.0,275524000000.0,134306000000.0,45186000000.0,25986000000.0,5545000000.0,2710000000.0,2835000000.0,57589000000.0,17437000000.0,40152000000.0,141218000000.0,25020000000.0,64555000000.0,64555000000.0,2999000000.0,2999000000.0,2999000000.0,0.0,13709000000.0,34935000000.0,34935000000.0,7211000000.0,7211000000.0,27724000000.0,619003000000.0,427872000000.0,40565000000.0,272000000.0,15133000000.0,2460000000.0,2460000000.0,12673000000.0,142113000000.0,22604000000.0,119509000000.0,229789000000.0,-93653000000.0,323442000000.0,12117000000.0,24823000000.0,139243000000.0,137921000000.0,9338000000.0,0.0,191131000000.0,25723000000.0,10000000.0,938000000.0,,,,69905000000.0,69905000000.0,-944000000.0,70849000000.0,94555000000.0,64313000000.0,30242000000.0,18531000000.0,11711000000.0
2024-06-30,7434139000.0,7434139000.0,33315000000.0,67127000000.0,121660000000.0,320107000000.0,34448000000.0,121660000000.0,15497000000.0,268477000000.0,311165000000.0,268477000000.0,268477000000.0,-5590000000.0,-5590000000.0,173144000000.0,100923000000.0,100923000000.0,243686000000.0,118400000000.0,27064000000.0,27931000000.0,5220000000.0,2602000000.0,2618000000.0,58185000000.0,15497000000.0,42688000000.0,125286000000.0,19185000000.0,57582000000.0,57582000000.0,8942000000.0,8942000000.0,2249000000.0,6693000000.0,12564000000.0,27013000000.0,27013000000.0,5017000000.0,5017000000.0,21996000000.0,512163000000.0,352429000000.0,36460000000.0,0.0,14600000000.0,1500000000.0,1500000000.0,13100000000.0,146817000000.0,27597000000.0,119220000000.0,154552000000.0,-76421000000.0,230973000000.0,9594000000.0,18961000000.0,100312000000.0,93943000000.0,8163000000.0,0.0,159734000000.0,26021000000.0,12000000.0,1246000000.0,845000000.0,7000000.0,394000000.0,56924000000.0,56924000000.0,-830000000.0,57754000000.0,75531000000.0,57216000000.0,18315000000.0,6744000000.0,11571000000.0
2023-06-30,7432000000.0,7432000000.0,12533000000.0,59965000000.0,128971000000.0,253460000000.0,80108000000.0,128971000000.0,12728000000.0,206223000000.0,248213000000.0,206223000000.0,206223000000.0,-6343000000.0,-6343000000.0,118848000000.0,93718000000.0,93718000000.0,205753000000.0,101604000000.0,17981000000.0,25560000000.0,3345000000.0,2912000000.0,433000000.0,54718000000.0,12728000000.0,41990000000.0,104149000000.0,14745000000.0,50901000000.0,50901000000.0,5247000000.0,5247000000.0,5247000000.0,0.0,11009000000.0,22247000000.0,22247000000.0,4152000000.0,4152000000.0,18095000000.0,411976000000.0,227719000000.0,30601000000.0,0.0,9879000000.0,0.0,,9879000000.0,77252000000.0,9366000000.0,67886000000.0,109987000000.0,-68251000000.0,178238000000.0,8537000000.0,14346000000.0,81207000000.0,68465000000.0,5683000000.0,0.0,184257000000.0,21807000000.0,6000000.0,2500000000.0,1768000000.0,23000000.0,709000000.0,48688000000.0,48688000000.0,-650000000.0,49338000000.0,111256000000.0,76552000000.0,34704000000.0,26226000000.0,8478000000.0
2022-06-30,7464000000.0,7464000000.0,35850000000.0,61270000000.0,87720000000.0,216323000000.0,74602000000.0,87720000000.0,11489000000.0,166542000000.0,213574000000.0,166542000000.0,166542000000.0,-4678000000.0,-4678000000.0,84281000000.0,86939000000.0,86939000000.0,198298000000.0,103216000000.0,15526000000.0,26069000000.0,3100000000.0,2870000000.0,230000000.0,58521000000.0,11489000000.0,47032000000.0,95082000000.0,13067000000.0,45538000000.0,45538000000.0,2749000000.0,2749000000.0,,,10661000000.0,23067000000.0,23067000000.0,4067000000.0,4067000000.0,19000000000.0,364840000000.0,195156000000.0,21897000000.0,0.0,6891000000.0,0.0,,6891000000.0,78822000000.0,11298000000.0,67524000000.0,87546000000.0,-59660000000.0,147206000000.0,7819000000.0,13148000000.0,66491000000.0,55014000000.0,4734000000.0,0.0,169684000000.0,16924000000.0,8000000.0,3742000000.0,2516000000.0,82000000.0,1144000000.0,44261000000.0,44261000000.0,-633000000.0,44894000000.0,104749000000.0,90818000000.0,13931000000.0,5673000000.0,8258000000.0
2021-06-30,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1367000000.0,79000000.0,1190000000.0,,,,,,,,,


In [8]:
import yfinance as yf
from datetime import date
import pandas as pd
from IPython.display import display, HTML, Markdown

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 8)

def section_header(title):
    display(Markdown(f"## {title}"))
    
def sub_header(title):
    display(Markdown(f"**{title}**"))

# ========================
# OHLCV_VARIANTS
# ========================
section_header("OHLCV Variants")

sub_header("1. Intraday 1-minute data (TSLA)")
display(yf.download("TSLA", period="7d", interval="1h", prepost=True).tail(3))

sub_header("2. Corporate action-adjusted data (AAPL)")
display(yf.download("AAPL", start="2023-01-01", auto_adjust=True, actions=True).tail(3))

sub_header("3. Continuous futures (ES=F)")
display(yf.download("ES=F", period="1y", interval="1d", auto_adjust=False, back_adjust=True).tail(3))

sub_header("4. Repaired prices (BA)")
display(yf.download("BA", start="2020-01-01", end="2020-05-31", interval="1d", repair=True))

# ========================
# FUNDAMENTALS
# ========================
section_header("Fundamentals: MSFT")
tkr = yf.Ticker("MSFT")

fundamentals = pd.Series({
    "Forward P/E": tkr.info.get('forwardPE', 'N/A'),
    "Beta": tkr.info.get('beta', 'N/A'),
    "Trailing EPS": tkr.info.get('trailingEps', 'N/A'),
    "Dividend Yield": tkr.info.get('dividendYield', 'N/A'),
    "Float Shares": tkr.info.get('floatShares', 'N/A'),
    "Short % of Float": tkr.info.get('shortPercentOfFloat', 'N/A'),
    "Institutional Ownership": tkr.info.get('heldPercentInstitutions', 'N/A')
})
display(fundamentals.to_frame(name="Value"))

sub_header("Earnings Dates")
display(tkr.earnings_dates.tail(3))

sub_header("Recent Dividends")
display(tkr.dividends.tail(3).to_frame(name="Dividend"))

sub_header("Major Holders")
display(tkr.major_holders)

# ========================
# OPTIONS_CHAINS
# ========================
section_header("Options Chains: MSFT")

try:
    exp_date = tkr.options[0]
    opt = tkr.option_chain(exp_date)
    
    sub_header(f"Calls for {exp_date}")
    display(opt.calls[['strike', 'openInterest', 'impliedVolatility']].head(3))
    
    sub_header(f"Puts for {exp_date}")
    puts_columns = ['strike', 'inTheMoney', 'openInterest', 'impliedVolatility']
    if 'delta' in opt.puts.columns:
        puts_columns.append('delta')
    display(opt.puts[puts_columns].head(3))
except IndexError:
    display(HTML("<i>No options data available</i>"))


## OHLCV Variants

**1. Intraday 1-minute data (TSLA)**

  display(yf.download("TSLA", period="7d", interval="1h", prepost=True).tail(3))
[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,TSLA,TSLA,TSLA,TSLA,TSLA
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2025-08-01 21:00:00+00:00,302.47,323.3682,295.1414,302.6982,0
2025-08-01 22:00:00+00:00,302.16,302.5,302.14,302.5,0
2025-08-01 23:00:00+00:00,301.05,302.2,301.0,302.16,0


**2. Corporate action-adjusted data (AAPL)**

[*********************100%***********************]  1 of 1 completed


Price,Close,Dividends,High,Low,Open,Stock Splits,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2025-07-30,209.050003,0.0,212.389999,207.720001,211.899994,0.0,45512500
2025-07-31,207.570007,0.0,209.839996,207.160004,208.490005,0.0,80698400
2025-08-01,202.380005,0.0,213.580002,201.5,210.869995,0.0,104301700


**3. Continuous futures (ES=F)**

[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,ES=F,ES=F,ES=F,ES=F,ES=F
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2025-07-30,6396.25,6435.25,6366.75,6404.0,1302795
2025-07-31,6374.25,6468.5,6357.5,6435.5,1857020
2025-08-01,6264.5,6373.5,6239.5,6359.5,2207468


**4. Repaired prices (BA)**

  display(yf.download("BA", start="2020-01-01", end="2020-05-31", interval="1d", repair=True))
[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Repaired?,Volume
Ticker,BA,BA,BA,BA,BA,BA
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2020-01-02,331.348572,331.378393,325.761816,326.606765,False,4544400
2020-01-03,330.791901,332.909308,328.346428,328.674494,False,3875900
2020-01-06,331.766083,332.879454,325.940756,327.352341,False,5355000
2020-01-07,335.285156,342.154291,328.754007,332.283029,False,9898600
...,...,...,...,...,...,...
2020-05-26,144.729996,145.910004,142.610001,145.210007,False,30338300
2020-05-27,149.520004,149.649994,141.240005,149.139999,False,32799900
2020-05-28,149.820007,156.699997,149.050003,156.100006,False,34734300
2020-05-29,145.850006,152.000000,142.940002,145.300003,False,33853900


## Fundamentals: MSFT

Unnamed: 0,Value
Forward P/E,35.05753
Beta,1.033
Trailing EPS,13.63
Dividend Yield,0.63
Float Shares,7422537000.0
Short % of Float,0.0072
Institutional Ownership,0.74707


**Earnings Dates**

Unnamed: 0_level_0,EPS Estimate,Reported EPS,Surprise(%),Event Type
Earnings Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-04-25 16:05:00-04:00,2.23,2.45,9.81,Earnings
2023-01-24 16:04:00-05:00,2.29,2.32,1.09,Earnings
2022-10-25 16:02:00-04:00,2.3,2.35,2.05,Earnings


**Recent Dividends**

Unnamed: 0_level_0,Dividend
Date,Unnamed: 1_level_1
2024-11-21 00:00:00-05:00,0.83
2025-02-20 00:00:00-05:00,0.83
2025-05-15 00:00:00-04:00,0.83


**Major Holders**

Breakdown,Value
insidersPercentHeld,0.00063
institutionsPercentHeld,0.74707
institutionsFloatPercentHeld,0.74755
institutionsCount,7489.0


## Options Chains: MSFT

**Calls for 2025-08-08**

Unnamed: 0,strike,openInterest,impliedVolatility
0,320.0,1,1.613283
1,330.0,0,1.521487
2,335.0,1,1.49512


**Puts for 2025-08-08**

Unnamed: 0,strike,inTheMoney,openInterest,impliedVolatility
0,250.0,False,1121,1.656252
1,260.0,False,3,2.242192
2,270.0,False,2,2.130864


In [19]:

# ========================
# SEARCH_LOOKUP
# ========================
section_header("Search & Lookup")

lookup_data = {
    "BRK.B ISIN": yf.Ticker("BRK.B").isin,
    "NVDA Company Name": yf.Ticker("NVDA").info['shortName'],
    "AMD Confirmed Symbol": yf.Tickers(["AMD"]).tickers['AMD'].info['symbol']
}
display(pd.Series(lookup_data).to_frame(name="Value"))

# ========================
# SECTOR_INDUSTRY
# ========================
section_header("Sector & Industry: JPM")
jpm = yf.Ticker("JPM")

sector_info = pd.Series({
    "Sector": jpm.info.get('sector', 'N/A'),
    "Industry": jpm.info.get('industry', 'N/A'),
    "Employees": jpm.info.get('fullTimeEmployees', 'N/A')
})
display(sector_info.to_frame(name="Value"))

# ========================
# SCREENER_QUERY
# ========================
section_header("Screener Query: MSFT")
msft = yf.Ticker("MSFT")

screener_results = []
if msft.info.get('marketCap', 0) > 1e12:
    screener_results.append("Large-cap stock")
if msft.info.get('volume', 0) > 1e7:
    screener_results.append("Highly liquid")
if msft.info.get('trailingPE', float('inf')) < 20:
    screener_results.append("Low P/E")

display(pd.Series(screener_results, name="Screener Results") if screener_results 
        else display("No screener matches"))

# ========================
# MULTI_TICKER
# ========================
section_header("Multi-Ticker Download")

sub_header("Grouped by Ticker (Threaded)")
display(yf.download(["TSLA", "RIVN", "LCID"], period="1d", group_by="ticker", threads=True))

sub_header("Grouped by Column")
display(yf.download("AAPL MSFT", period="1d", group_by="column", threads=False).tail(3))

# ========================
# ERROR_HANDLING
# ========================
section_header("Error Handling")

sub_header("Invalid Ticker")
try:
    yf.download("INVALID_TICKER")
except Exception as e:
    display(HTML(f"<span style='color:red'>Error: {str(e)}</span>"))

sub_header("Stale Data Check")
hist = yf.download("MSFT", period="5d")
if hist.index[-1].date() < date.today():
    display(HTML("<span style='color:orange'>Warning: Data might be stale</span>"))
else:
    display("Data is up-to-date")

## Search & Lookup

Unnamed: 0,Value
BRK.B ISIN,US0846707026
NVDA Company Name,NVIDIA Corporation
AMD Confirmed Symbol,AMD


## Sector & Industry: JPM

Unnamed: 0,Value
Sector,Financial Services
Industry,Banks - Diversified
Employees,318477


## Screener Query: MSFT

0    Large-cap stock
Name: Screener Results, dtype: object

## Multi-Ticker Download

**Grouped by Ticker (Threaded)**

  display(yf.download(["TSLA", "RIVN", "LCID"], period="1d", group_by="ticker", threads=True))
[*********************100%***********************]  3 of 3 completed


Ticker,TSLA,TSLA,TSLA,TSLA,TSLA,RIVN,RIVN,RIVN,RIVN,RIVN,LCID,LCID,LCID,LCID,LCID
Price,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
2025-07-08,296.880005,304.049408,295.641602,301.070892,56739675,12.79,13.29,12.78,13.155,13494175,2.12,2.27,2.11,2.265,108747111


**Grouped by Column**

  display(yf.download("AAPL MSFT", period="1d", group_by="column", threads=False).tail(3))
[*********************100%***********************]  2 of 2 completed


Price,Close,Close,High,High,Low,Low,Open,Open,Volume,Volume
Ticker,AAPL,MSFT,AAPL,MSFT,AAPL,MSFT,AAPL,MSFT,AAPL,MSFT
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2025-07-08,210.550003,495.790009,211.429993,498.200012,208.449997,494.109985,210.130005,497.410004,20469083,4737048


## Error Handling

**Invalid Ticker**

  yf.download("INVALID_TICKER")
HTTP Error 404: 
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['INVALID_TICKER']: YFPricesMissingError('possibly delisted; no price data found  (period=1mo) (Yahoo error = "No data found, symbol may be delisted")')


**Stale Data Check**

  hist = yf.download("MSFT", period="5d")
[*********************100%***********************]  1 of 1 completed


'Data is up-to-date'

In [16]:
%load_ext scalene

Scalene extension successfully loaded. Note: Scalene currently only
supports CPU+GPU profiling inside Jupyter notebooks. For full Scalene
profiling, use the command line version. To profile in line mode, use
`%scrun [options] statement`. To profile in cell mode, use `%%scalene
[options]` followed by your code.


In [29]:
%%scalene --profile
import numpy as np
import pandas as pd
import time
from numba import jit

# Install numba if not already installed
# !pip install numba

@jit(nopython=True)
def _hurst_numba(ts, min_lag=2, max_lag=50):
    """Numba-optimized Hurst calculation"""
    n = len(ts)
    if n < max_lag + 1:
        return np.nan
        
    lags = np.arange(min_lag, min(max_lag, n//2))
    n_lags = len(lags)
    
    if n_lags < 2:
        return np.nan
        
    log_lags = np.log(lags.astype(np.float64))
    log_tau = np.empty(n_lags)
    
    for i in range(n_lags):
        lag = lags[i]
        diffs = ts[lag:] - ts[:-lag]
        tau = np.sqrt(np.var(diffs))
        if tau <= 0:
            return np.nan
        log_tau[i] = np.log(tau)
    
    # Manual linear regression for numba compatibility
    n_points = len(log_lags)
    sum_x = np.sum(log_lags)
    sum_y = np.sum(log_tau)
    sum_xy = np.sum(log_lags * log_tau)
    sum_x2 = np.sum(log_lags * log_lags)
    
    slope = (n_points * sum_xy - sum_x * sum_y) / (n_points * sum_x2 - sum_x * sum_x)
    return slope * 2.0

@jit(nopython=True)
def _vectorized_katz_fd(close_values, window_size):
    """Fully vectorized Katz fractal dimension calculation"""
    n = len(close_values)
    fd_values = np.full(n, np.nan)
    
    for i in range(window_size - 1, n):
        start_idx = i - window_size + 1
        ts = close_values[start_idx:i+1]
        
        n_points = len(ts) - 1
        if n_points <= 0:
            continue
            
        # Calculate differences
        diffs = np.diff(ts)
        
        # Path length
        L = np.sum(np.sqrt(1 + diffs**2))
        
        # Diameter  
        d = np.max(np.abs(ts - ts[0]))
        
        if d <= 0 or L <= 0:
            continue
            
        fd_values[i] = np.log(n_points) / (np.log(n_points) + np.log(d/L))
    
    return fd_values

@jit(nopython=True)
def _vectorized_hurst(close_values, window_size, min_lag=2, max_lag=50):
    """Fully vectorized Hurst calculation"""
    n = len(close_values)
    hurst_values = np.full(n, np.nan)
    
    for i in range(window_size, n):
        start_idx = max(0, i - window_size + 1)
        ts = close_values[start_idx:i+1]
        hurst_values[i] = _hurst_numba(ts, min_lag, max_lag)
    
    return hurst_values

# Pre-compile the numba functions to avoid first-call overhead
def _warmup_numba():
    """Warm up numba functions to avoid compilation overhead in timing"""
    dummy_data = np.random.randn(100)
    _hurst_numba(dummy_data)
    _vectorized_katz_fd(dummy_data, 14)
    _vectorized_hurst(dummy_data, 50)

class OptimizedIndicators:
    def __init__(self, df):
        self.df = df.copy()
    
    def _add_hurst_optimized(self, window=100):
        """Fully optimized Hurst calculation"""
        close_values = self.df['Close'].values
        hurst_values = _vectorized_hurst(close_values, window)
        self.df['Hurst'] = hurst_values
        return self
    
    def _add_fractal_dimension_optimized(self, window: int = 14):
        """Fully optimized fractal dimension calculation"""
        close_values = self.df['Close'].values
        fd_values = _vectorized_katz_fd(close_values, window)
        self.df['Fractal_Dim'] = fd_values
        return self
    
    def _add_fractal_dimension_numpy(self, window: int = 14):
        """Pure numpy vectorized version (often fastest for small windows)"""
        close = self.df['Close'].values
        n = len(close)
        fd_values = np.full(n, np.nan)
        
        # Vectorized sliding window approach
        for i in range(window-1, n):
            ts = close[i-window+1:i+1]
            n_points = len(ts) - 1
            
            if n_points > 0:
                diffs = np.diff(ts)
                L = np.sum(np.sqrt(1 + diffs**2))
                d = np.max(np.abs(ts - ts[0]))
                
                if d > 0 and L > 0:
                    fd_values[i] = np.log(n_points) / (np.log(n_points) + np.log(d/L))
        
        self.df['Fractal_Dim'] = fd_values
        return self

# Original methods for comparison
class OriginalIndicators:
    def __init__(self, df):
        self.df = df.copy()
        
    def _add_hurst(self, window=100):
        def get_hurst(ts):
            lags = range(2, 50)
            tau = [np.sqrt(np.std(np.subtract(ts[lag:], ts[:-lag]))) for lag in lags]
            poly = np.polyfit(np.log(lags), np.log(tau), 1)
            return poly[0]*2.0
        self.df['Hurst'] = self.df['Close'].expanding(window).apply(get_hurst, raw=True)
        return self
    
    def _add_fractal_dimension(self, window: int = 14):
        def katz_fd(ts):
            n = len(ts) - 1
            L = np.sum(np.sqrt(1 + np.diff(ts)**2))
            d = np.max(np.abs(ts - ts[0]))
            return np.log(n) / (np.log(n) + np.log(d/L)) if d > 0 and L > 0 else np.nan
        self.df['Fractal_Dim'] = self.df['Close'].rolling(window).apply(katz_fd, raw=True)
        return self

# Generate sample data for testing
def create_sample_data(size=1000):
    np.random.seed(42)
    dates = pd.date_range('2020-01-01', periods=size, freq='D')
    prices = 100 + np.cumsum(np.random.randn(size) * 0.02)
    return pd.DataFrame({'Close': prices}, index=dates)

# Performance comparison function
def benchmark_indicators(data_size=1000, warmup=True):
    print(f"Creating sample data with {data_size} points...")
    df = create_sample_data(data_size)
    
    if warmup:
        print("Warming up numba functions...")
        _warmup_numba()
    
    print("\n" + "="*50)
    print("FRACTAL DIMENSION COMPARISON")
    print("="*50)
    
    # Test original fractal dimension
    print("Testing original fractal dimension...")
    orig = OriginalIndicators(df)
    start_time = time.time()
    orig._add_fractal_dimension()
    orig_fd_time = time.time() - start_time
    print(f"Original FD time: {orig_fd_time:.4f}s")
    
    # Test numba optimized fractal dimension
    print("Testing numba optimized fractal dimension...")
    opt_numba = OptimizedIndicators(df)
    start_time = time.time()
    opt_numba._add_fractal_dimension_optimized()
    opt_numba_fd_time = time.time() - start_time
    print(f"Numba optimized FD time: {opt_numba_fd_time:.4f}s")
    
    # Test pure numpy fractal dimension
    print("Testing numpy optimized fractal dimension...")
    opt_numpy = OptimizedIndicators(df)
    start_time = time.time()
    opt_numpy._add_fractal_dimension_numpy()
    opt_numpy_fd_time = time.time() - start_time
    print(f"Numpy optimized FD time: {opt_numpy_fd_time:.4f}s")
    
    best_fd_time = min(opt_numba_fd_time, opt_numpy_fd_time)
    best_method = "Numba" if opt_numba_fd_time < opt_numpy_fd_time else "Numpy"
    
    print(f"Best FD method: {best_method}")
    print(f"FD Speedup: {orig_fd_time/best_fd_time:.2f}x faster")
    
    # Verify results are similar - handle potential size differences
    orig_fd_clean = orig.df['Fractal_Dim'].dropna()
    opt_fd_clean = opt_numba.df['Fractal_Dim'].dropna()
    
    # Align the arrays by taking the minimum length
    min_len = min(len(orig_fd_clean), len(opt_fd_clean))
    if min_len > 10:  # Only compare if we have enough data points
        orig_aligned = orig_fd_clean.iloc[-min_len:]
        opt_aligned = opt_fd_clean.iloc[-min_len:]
        correlation = np.corrcoef(orig_aligned, opt_aligned)[0,1]
        print(f"Result correlation: {correlation:.6f}")
    else:
        print("Not enough data points for correlation comparison")
    
    print("\n" + "="*50)
    print("HURST EXPONENT COMPARISON")
    print("="*50)
    
    if data_size <= 2000:  # Test on reasonable size
        # Test original Hurst
        print("Testing original Hurst exponent...")
        orig_hurst = OriginalIndicators(df)
        start_time = time.time()
        orig_hurst._add_hurst(window=100)
        orig_hurst_time = time.time() - start_time
        print(f"Original Hurst time: {orig_hurst_time:.4f}s")
        
        # Test optimized Hurst
        print("Testing optimized Hurst exponent...")
        opt_hurst = OptimizedIndicators(df)
        start_time = time.time()
        opt_hurst._add_hurst_optimized(window=100)
        opt_hurst_time = time.time() - start_time
        print(f"Optimized Hurst time: {opt_hurst_time:.4f}s")
        print(f"Hurst Speedup: {orig_hurst_time/opt_hurst_time:.2f}x faster")
        
        # Verify results are similar - handle potential size differences
        orig_hurst_clean = orig_hurst.df['Hurst'].dropna()
        opt_hurst_clean = opt_hurst.df['Hurst'].dropna()
        
        # Align the arrays by taking the minimum length
        min_len = min(len(orig_hurst_clean), len(opt_hurst_clean))
        if min_len > 10:  # Only compare if we have enough data points
            orig_aligned = orig_hurst_clean.iloc[-min_len:]
            opt_aligned = opt_hurst_clean.iloc[-min_len:]
            hurst_corr = np.corrcoef(orig_aligned, opt_aligned)[0,1]
            print(f"Hurst correlation: {hurst_corr:.6f}")
        else:
            print("Not enough data points for correlation comparison")
        
        return orig_hurst.df, opt_hurst.df
    else:
        print("Testing optimized Hurst only (large dataset)...")
        opt_hurst = OptimizedIndicators(df)
        start_time = time.time()
        opt_hurst._add_hurst_optimized(window=100)
        opt_hurst_time = time.time() - start_time
        print(f"Optimized Hurst time: {opt_hurst_time:.4f}s")
        return None, opt_hurst.df

# Test on different data sizes
def comprehensive_benchmark():
    sizes = [500, 1000, 2000, 5000]
    
    print("COMPREHENSIVE BENCHMARK")
    print("="*60)
    
    for size in sizes:
        print(f"\n--- TESTING WITH {size} DATA POINTS ---")
        benchmark_indicators(size, warmup=(size==sizes[0]))

# Example usage function
def example_usage():
    print("\n" + "="*50)
    print("EXAMPLE USAGE")
    print("="*50)
    
    # Create your data
    df = create_sample_data(500)
    
    # Use optimized indicators
    indicators = OptimizedIndicators(df)
    
    # Add both indicators
    indicators._add_hurst_optimized(window=100)
    indicators._add_fractal_dimension_numpy(window=14)  # Use the fastest method
    
    # Display results
    print("\nResults sample:")
    print(indicators.df[['Close', 'Hurst', 'Fractal_Dim']].tail(10))
    
    print(f"\nHurst stats:")
    print(f"Mean: {indicators.df['Hurst'].mean():.4f}")
    print(f"Std: {indicators.df['Hurst'].std():.4f}")
    
    print(f"\nFractal Dimension stats:")
    print(f"Mean: {indicators.df['Fractal_Dim'].mean():.4f}")
    print(f"Std: {indicators.df['Fractal_Dim'].std():.4f}")
    
    return indicators.df

# Run the comprehensive benchmark
comprehensive_benchmark()

# Show example usage
result_df = example_usage()

print("\n" + "="*50)
print("READY TO USE!")
print("="*50)
print("Best practices based on testing:")
print("- For Fractal Dimension: Use _add_fractal_dimension_numpy() for small windows")
print("- For Hurst Exponent: Use _add_hurst_optimized() - significant speedup")
print("\nUsage:")
print("indicators = OptimizedIndicators(your_dataframe)")
print("indicators._add_hurst_optimized(window=100)")
print("indicators._add_fractal_dimension_numpy(window=14)  # or _optimized")
print("optimized_df = indicators.df")

COMPREHENSIVE BENCHMARK

--- TESTING WITH 500 DATA POINTS ---
Creating sample data with 500 points...
Warming up numba functions...

FRACTAL DIMENSION COMPARISON
Testing original fractal dimension...
Original FD time: 0.0161s
Testing numba optimized fractal dimension...
Numba optimized FD time: 0.1296s
Testing numpy optimized fractal dimension...
Numpy optimized FD time: 0.1025s
Best FD method: Numpy
FD Speedup: 0.16x faster
Result correlation: 1.000000

HURST EXPONENT COMPARISON
Testing original Hurst exponent...
Original Hurst time: 1.0619s
Testing optimized Hurst exponent...
Optimized Hurst time: 0.0112s
Hurst Speedup: 94.43x faster
Hurst correlation: 0.532035

--- TESTING WITH 1000 DATA POINTS ---
Creating sample data with 1000 points...

FRACTAL DIMENSION COMPARISON
Testing original fractal dimension...
Original FD time: 0.1308s
Testing numba optimized fractal dimension...
Numba optimized FD time: 0.0014s
Testing numpy optimized fractal dimension...
Numpy optimized FD time: 0.0571

Scalene: An exception of type ImportError occurred. Arguments:
("cannot import name 'display' from 'IPython.core.display' (/workspaces/smart_dev/ml-unified/.venv/lib/python3.11/site-packages/IPython/core/display.py)",)
Traceback (most recent call last):
  File "/workspaces/smart_dev/ml-unified/.venv/lib/python3.11/site-packages/scalene/scalene_profiler.py", line 2133, in run_profiler
    exit_status = profiler.profile_code(
                  ^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/smart_dev/ml-unified/.venv/lib/python3.11/site-packages/scalene/scalene_profiler.py", line 1868, in profile_code
    ScaleneJupyter.display_profile(
  File "/workspaces/smart_dev/ml-unified/.venv/lib/python3.11/site-packages/scalene/scalene_jupyter.py", line 35, in display_profile
    try:
ImportError: cannot import name 'display' from 'IPython.core.display' (/workspaces/smart_dev/ml-unified/.venv/lib/python3.11/site-packages/IPython/core/display.py)



In [1]:
from rich.console import Console
console = Console()

# Styles and Markup
console.print("[bold]Bold[/bold], [italic]Italic[/], [underline]Underline[/]")
console.print("[strike]Strikethrough[/], [dim]Dim text[/]")
console.print("[reverse]Reverse colors[/] and [blink]Blinking[/] text")

# Colors and Backgrounds
console.print("[red]Red[/] on [green]Green[/] background")
console.print("[white on blue]White on blue background[/]")

# Hex and RGB colors
console.print("Hex color", style="#ff00ff")         # Magenta text
console.print("RGB color", style="rgb(128,128,255)") # Pastel blue text

# Alignment and Overflow
console.print("Centered Text", style="on blue", justify="center", width=40)
console.print("Too long to show completely", overflow="ellipsis", width=18)
