In [1]:
import tensorflow as tf
print(tf.__version__)  # should be 2.15.0
print(tf.config.list_physical_devices('GPU'))  


2025-05-25 16:34:27.141099: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-25 16:34:27.141160: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-25 16:34:27.142007: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


2.15.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as e:
        print(e)

### 🧽 Warning Suppression

To maintain a clean output and avoid cluttering the notebook with non-critical warnings (e.g., future deprecations), we suppress all warnings using Python's built-in `warnings` module:

```python
import warnings
warnings.filterwarnings('ignore')


In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import importlib
import utilities  

importlib.reload(utilities)


<module 'utilities' from '/home/jovyan/files/utilities.py'>

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
from datetime import datetime,timedelta
import requests
import ta
import base64
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.backends import default_backend
from snowflake.sqlalchemy import URL
from sqlalchemy import create_engine
from sqlalchemy import text
import networkx as nx


In [6]:
import os
from dotenv import load_dotenv

load_dotenv()

True

# Snowflake Connection


In [7]:
with open(os.getenv("PRIVATE_KEY_PATH"), "rb") as key_file:
    private_key = serialization.load_pem_private_key(
        key_file.read(),
        password=None,
        backend=default_backend()
    )

private_key_pkcs8 = base64.b64encode(
    private_key.private_bytes(
        encoding=serialization.Encoding.DER,
        format=serialization.PrivateFormat.PKCS8,
        encryption_algorithm=serialization.NoEncryption()
    )
).decode("utf-8")  # ✅ base64 string, not bytes!

engine = create_engine(URL(
    account=os.getenv("SNOWFLAKE_ACCOUNT"),
    user=os.getenv("SNOWFLAKE_USER"),
    private_key=private_key_pkcs8,
    warehouse=os.getenv("SNOWFLAKE_WAREHOUSE"),
    database=os.getenv("SNOWFLAKE_DATABASE"),
    schema=os.getenv("SNOWFLAKE_SCHEMA"),
    role=os.getenv("SNOWFLAKE_ROLE")
))

with engine.connect() as conn:
    result = conn.execute(text("SELECT CURRENT_USER(), CURRENT_ROLE(), CURRENT_TIMESTAMP();"))
    for row in result:
        print("✅ Snowflake connected:", row)


✅ Snowflake connected: ('CRYPTO_USER', 'CRYPTO_ROLE', datetime.datetime(2025, 5, 25, 9, 34, 32, 53000, tzinfo=<DstTzInfo 'America/Los_Angeles' PDT-1 day, 17:00:00 DST>))


In [8]:
from datetime import datetime

execution_time = datetime.now().strftime("%Y%m%d%H%M%S")

# Reading assets file

In [9]:
top100 = pd.read_csv('Assets_Categorized.csv')

In [10]:
top100['staging'] = '@cryptodatasource'
top100.drop(columns=['realticker'],inplace=True)


### 🗂️ Incremental Data Ingestion from Yahoo Finance

This section ensures that price data for the top 100 tickers is **up-to-date**, downloading only the missing historical records and storing them into a Snowflake staging area.

#### **Logic Breakdown**
- **Latest Date Check**: For each ticker, the script queries the latest available date from the `VW_CRYPTO` view in Snowflake.
- **Smart Start Date**: If no data exists, it defaults to `2019-01-01`. Otherwise, it starts from the next day after the most recent entry.
- **Skip Up-to-Date Tickers**: If the latest date is today or later, the ticker is skipped to avoid redundant downloads.
- **Download Missing Data**: For all other cases, the `download_yahoo_to_stage()` utility fetches historical daily prices from Yahoo Finance and stages them.

#### **Why This Matters**
- Ensures efficient **delta loading** — no unnecessary API calls or overwrites.
- Maintains **data freshness** for modeling and clustering without full reprocessing.
- Reduces compute costs and network traffic by avoiding full dataset reloads.

> ⚙️ This incremental ingestion design is perfect for **automated daily jobs** (e.g., via Airflow or dbt) and supports long-term project scalability.


In [11]:
# from utilities import download_yahoo_to_stage

# now = datetime(datetime.now().year,datetime.now().month,datetime.now().day)
# start_date = datetime(2019, 1, 1)

# for index, row in top100.iterrows():
#     query = text("SELECT coalesce(dateadd(day,1,MAX(date)),'2019-01-01') as date FROM PUBLIC.VW_CRYPTO WHERE ticker = :ticker ")
#     df_date = pd.read_sql(query, con=engine, params={"ticker": row['ticker']})
#     start_date = pd.to_datetime(df_date['date'].iloc[0])
     
#     if start_date > now:
#         print(f"✅ {row['ticker']}: Up to date — skipping download.")
#     else:
#         # Download only what’s missing
#         download_yahoo_to_stage(
#             ticker=row['ticker'],
#             private_key=private_key,
#             stage_area=row['staging'],
#             interval="1d",
#             start=start_date,
#             time="12:00 AM",
#             execution_time=execution_time
#         )


### 🧹 Data Loading & Cleaning: OHLCV Time Series

This step loads the OHLCV (Open, High, Low, Close, Volume) price data for crypto tickers from Snowflake and applies a multi-step cleaning process to ensure consistency across the dataset.

---

#### **Step 1: Load & Filter**
- SQL query pulls price data from `VW_CRYPTO` starting from `2020-01-01`.
- Filters out **stablecoins** (non-volatile by design) to focus on more predictive assets.
- Merges with the `top100` metadata to bring in staging/category info.

#### **Step 2: Missing Value Handling**
Each OHLCV column is cleaned with a **multi-stage imputation** strategy:
1. **Forward-fill** (fill with previous known value)
2. **Backward-fill** (fill with next known value)
3. **Fill with per-ticker mean** if still missing

This preserves temporal continuity and avoids dropping valuable rows due to sparsity.

#### **Step 3: Pivot to Close Price Matrix**
- A matrix of close prices is created (`price_df`), with:
  - **Rows** = Dates
  - **Columns** = Tickers
  - **Values** = Closing prices

#### **Step 4: Final Cleanup**
- Any remaining NaNs in `price_df` (likely isolated values) are filled with the **average close price** for that ticker.
- A final NaN check is performed and printed to validate the dataset is ready for analysis.

---

> 📌 This data wrangling stage ensures a **complete and consistent** dataset — a critical foundation for accurate clustering, modeling, and evaluation.


In [12]:
from sqlalchemy import text
import pandas as pd
import numpy as np

# Step 1: Load and merge
query = text("""
    SELECT DISTINCT date, open,high,low,close, volume, ticker
    FROM PUBLIC.VW_CRYPTO
    WHERE date >= '2020-01-01'
    and ticker in ( select distinct ticker from vw_crypto where date <='2020-01-01')
    ORDER BY ticker, date
""")
data = pd.read_sql(query, con=engine)
data.drop_duplicates(['date', 'ticker'], keep='last', inplace=True)

data = pd.merge(data, top100, on='ticker', how='left')
data = data[data['category'] != 'Stablecoin']

print("✅ Loaded data for tickers:", data['ticker'].nunique())


ohlcv_cols = ['open', 'high', 'low', 'close', 'volume']
data = data.sort_values(by=['ticker', 'date'])

for col in ohlcv_cols:
    # Fill forward, then backward, then with mean (per ticker)
    data[col] = (
        data.groupby('ticker')[col]
        .apply(lambda grp: grp.ffill().bfill().fillna(grp.mean()))
        .reset_index(level=0, drop=True)
    )

nan_summary = data[ohlcv_cols].isna().sum()
print("🧹 Still NaNs in `data` (should be 0):")
print(nan_summary[nan_summary > 0])


# Step 2: Pivot close prices
price_df = data.pivot(index='date', columns='ticker', values='close')

# b. Fill remaining NaNs with the average of the column
price_df = price_df.apply(lambda col: col.fillna(col.mean()), axis=0)

# Step 4: Optional check
nan_summary = price_df.isna().sum()
print("🧹 Still NaN per ticker (should be 0):")
print(nan_summary[nan_summary > 0])





✅ Loaded data for tickers: 142
🧹 Still NaNs in `data` (should be 0):
Series([], dtype: int64)
🧹 Still NaN per ticker (should be 0):
Series([], dtype: int64)


In [13]:
price_df.head()

ticker,AAPL,ADA-USD,ADBE,ADI,ADP,ADSK,AEP,ALGO-USD,AMAT,AMD,...,XEL,XLM-USD,XMR-USD,XRP-USD,XTZ-USD,ZEC-USD,ZS,^GSPC,^NDX,^VIX
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-01,156.905111,0.033458,468.000236,163.30215,210.340759,240.050804,80.751228,0.219938,126.195807,103.856844,...,60.089335,0.045451,45.753544,0.192667,1.37021,28.050165,175.537954,4368.310738,14608.588987,21.44444
2020-01-02,72.620834,0.032751,334.429993,109.436218,152.39386,187.830002,76.578613,0.213518,59.05666,49.099998,...,53.112019,0.044112,45.74947,0.188043,1.241036,27.118073,47.330002,3257.850098,8872.219727,12.47
2020-01-03,71.914803,0.03418,331.809998,107.509766,152.071762,184.949997,76.496651,0.228098,58.116684,48.599998,...,53.367439,0.045234,51.092037,0.193521,1.282225,28.618681,47.380001,3234.850098,8793.900391,14.02
2020-01-04,156.905111,0.034595,468.000236,163.30215,210.340759,240.050804,80.751228,0.236382,126.195807,103.856844,...,60.089335,0.046272,50.536694,0.194355,1.261942,30.23868,175.537954,4368.310738,14608.588987,21.44444
2020-01-05,156.905111,0.034721,468.000236,163.30215,210.340759,240.050804,80.751228,0.231657,126.195807,103.856844,...,60.089335,0.045359,54.096893,0.195537,1.263569,31.021275,175.537954,4368.310738,14608.588987,21.44444


In [14]:
btc_df = data[data['ticker'] == 'BTC-USD'].copy()
btc_df.head()

Unnamed: 0,date,open,high,low,close,volume,ticker,name,category,market,staging
46100,2020-01-01,7194.89209,7254.330566,7174.944336,7200.174316,18565660000.0,BTC-USD,Bitcoin,Layer 1,Crypto,@cryptodatasource
46101,2020-01-02,7202.55127,7212.155273,6935.27002,6985.470215,20802080000.0,BTC-USD,Bitcoin,Layer 1,Crypto,@cryptodatasource
46102,2020-01-03,6984.428711,7413.715332,6914.996094,7344.884277,28111480000.0,BTC-USD,Bitcoin,Layer 1,Crypto,@cryptodatasource
46103,2020-01-04,7345.375488,7427.385742,7309.51416,7410.656738,18444270000.0,BTC-USD,Bitcoin,Layer 1,Crypto,@cryptodatasource
46104,2020-01-05,7410.45166,7544.49707,7400.535645,7411.317383,19725070000.0,BTC-USD,Bitcoin,Layer 1,Crypto,@cryptodatasource


In [15]:
# ----- Prepare price delta as target -----
btc_df['target'] = btc_df['close'].shift(-1) - btc_df['close']
btc_features = btc_df.set_index('date')[['target']].dropna()

In [16]:
btc_features

Unnamed: 0_level_0,target
date,Unnamed: 1_level_1
2020-01-01,-214.704102
2020-01-02,359.414062
2020-01-03,65.772461
2020-01-04,0.660645
2020-01-05,357.901855
...,...
2025-05-20,2886.992188
2025-05-21,1995.203125
2025-05-22,-4385.484375
2025-05-23,503.359375


In [17]:
data.head()

Unnamed: 0,date,open,high,low,close,volume,ticker,name,category,market,staging
0,2020-01-02,71.627084,72.681281,71.373211,72.620834,135480400.0,AAPL,Apple Inc.,Nasdaq-100,Stock,@cryptodatasource
1,2020-01-03,71.847102,72.676431,71.689942,71.914803,146322800.0,AAPL,Apple Inc.,Nasdaq-100,Stock,@cryptodatasource
2,2020-01-06,71.034709,72.526533,70.783248,72.487846,118387200.0,AAPL,Apple Inc.,Nasdaq-100,Stock,@cryptodatasource
3,2020-01-07,72.497522,72.753816,71.926907,72.146935,108872000.0,AAPL,Apple Inc.,Nasdaq-100,Stock,@cryptodatasource
4,2020-01-08,71.84954,73.609752,71.84954,73.307518,132079200.0,AAPL,Apple Inc.,Nasdaq-100,Stock,@cryptodatasource


In [18]:
# Merge with peer prices
final_df = btc_features.join(price_df, how='inner').dropna()

In [19]:
final_df.head()

Unnamed: 0_level_0,target,AAPL,ADA-USD,ADBE,ADI,ADP,ADSK,AEP,ALGO-USD,AMAT,...,XEL,XLM-USD,XMR-USD,XRP-USD,XTZ-USD,ZEC-USD,ZS,^GSPC,^NDX,^VIX
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-01,-214.704102,156.905111,0.033458,468.000236,163.30215,210.340759,240.050804,80.751228,0.219938,126.195807,...,60.089335,0.045451,45.753544,0.192667,1.37021,28.050165,175.537954,4368.310738,14608.588987,21.44444
2020-01-02,359.414062,72.620834,0.032751,334.429993,109.436218,152.39386,187.830002,76.578613,0.213518,59.05666,...,53.112019,0.044112,45.74947,0.188043,1.241036,27.118073,47.330002,3257.850098,8872.219727,12.47
2020-01-03,65.772461,71.914803,0.03418,331.809998,107.509766,152.071762,184.949997,76.496651,0.228098,58.116684,...,53.367439,0.045234,51.092037,0.193521,1.282225,28.618681,47.380001,3234.850098,8793.900391,14.02
2020-01-04,0.660645,156.905111,0.034595,468.000236,163.30215,210.340759,240.050804,80.751228,0.236382,126.195807,...,60.089335,0.046272,50.536694,0.194355,1.261942,30.23868,175.537954,4368.310738,14608.588987,21.44444
2020-01-05,357.901855,156.905111,0.034721,468.000236,163.30215,210.340759,240.050804,80.751228,0.231657,126.195807,...,60.089335,0.045359,54.096893,0.195537,1.263569,31.021275,175.537954,4368.310738,14608.588987,21.44444


In [20]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping



# Feature/target split
feature_cols = final_df.columns.drop('target')
X = final_df[feature_cols].values
y = final_df['target'].values

# Scaling
feature_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()

X_scaled = feature_scaler.fit_transform(X)
y_scaled = target_scaler.fit_transform(y.reshape(-1, 1))

# ----- Sequence builder -----
def create_sequences(X_data, y_data, time_step):
    X_seq, y_seq = [], []
    for i in range(len(X_data) - time_step):
        X_seq.append(X_data[i:i + time_step])
        y_seq.append(y_data[i + time_step - 1])  # predict delta for t+1
    return np.array(X_seq), np.array(y_seq)

time_step = 60
X_seq, y_seq = create_sequences(X_scaled, y_scaled, time_step)

# Split
split = int(len(X_seq) * 0.8)
X_train, X_test = X_seq[:split], X_seq[split:]
y_train, y_test = y_seq[:split], y_seq[split:]

# ----- Model -----
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    LSTM(50),
    Dense(25, activation='relu'),
    Dense(1)
])
model.compile(optimizer='adam', loss='mean_squared_error')

# Train
early_stop = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)
model.fit(X_train, y_train, epochs=25, batch_size=32, callbacks=[early_stop])

# Evaluate
loss = model.evaluate(X_test, y_test)
print(f"✅ LSTM Test Loss (MSE): {loss:.4f}")

# ----- Inverse Transform & Rebuild Price -----
# Predict deltas (scaled -> real)
y_pred_scaled = model.predict(X_test)
y_pred_delta = target_scaler.inverse_transform(y_pred_scaled)
y_test_delta = target_scaler.inverse_transform(y_test.reshape(-1, 1))


btc_close = btc_df.set_index('date')['close']
btc_close = btc_close.loc[final_df.index]  # Match dates used in final_df
btc_close = btc_close.values[time_step - 1:]  # Align with LSTM shift
base_prices = btc_close[-len(y_test):]

y_pred_price = base_prices + y_pred_delta.flatten()
y_test_price = base_prices + y_test_delta.flatten()


# ----- Metrics -----
rmse = np.sqrt(mean_squared_error(y_test_price, y_pred_price))
mae = mean_absolute_error(y_test_price, y_pred_price)
r2 = r2_score(y_test_price, y_pred_price)

print(f"\n📉 RMSE (price): {rmse:.2f}")
print(f"📈 MAE (price): {mae:.2f}")
print(f"📊 R² Score (price): {r2:.4f}")

Epoch 1/25


I0000 00:00:1748190886.824856   20660 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
✅ LSTM Test Loss (MSE): 0.0162

📉 RMSE (price): 2085.94
📈 MAE (price): 1481.33
📊 R² Score (price): 0.9845


In [21]:
from utilities import explain_lstm_with_shap

print("🚀 Running warm-up prediction to initialize GPU...")
_ = model.predict(X_train[:1])


explain_lstm_with_shap(
    model=model,
    X_train=X_train,
    X_test=X_test,
    feature_cols=feature_cols,  
    time_steps=60,
    background_size=300,
    sample_size=150,
    nsamples=100,
    plot_filename="shap_lstm_v1.png",
    plot_influence ='shap_crypto_influence_louvain'
)

🚀 Running warm-up prediction to initialize GPU...
🔎 Input shapes:
 - X_train shape: (1528, 60, 142)
 - X_test shape: (383, 60, 142)
🚀 Warming up model to initialize GPU...
⚙️ Trying SHAP DeepExplainer (fastest)...
⚠️ DeepExplainer failed: in user code:

    File "/usr/local/lib/python3.11/dist-packages/shap/explainers/_deep/deep_tf.py", line 245, in grad_graph  *
        out = self.model(shap_rAnD)
    File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "/usr/local/lib/python3.11/dist-packages/shap/explainers/_deep/deep_tf.py", line 394, in custom_grad
        out = op_handlers[type_name](self, op, *grads)  # we cut off the shap_ prefix before the lookup
    File "/usr/local/lib/python3.11/dist-packages/shap/explainers/_deep/deep_tf.py", line 691, in handler
        return linearity_with_excluded_handler(input_inds, explainer, op, *grads)
    File "/usr/local/lib/

Using 300 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
  0%|          | 0/150 [00:00<?, ?it/s]



  1%|          | 1/150 [00:08<22:09,  8.92s/it]



  1%|▏         | 2/150 [00:17<21:52,  8.87s/it]



  2%|▏         | 3/150 [00:25<20:45,  8.48s/it]



  3%|▎         | 4/150 [00:34<20:35,  8.46s/it]



  3%|▎         | 5/150 [00:42<19:53,  8.23s/it]



  4%|▍         | 6/150 [00:50<19:44,  8.23s/it]



  5%|▍         | 7/150 [00:59<20:21,  8.54s/it]



  5%|▌         | 8/150 [01:07<20:04,  8.48s/it]



  6%|▌         | 9/150 [01:17<20:33,  8.75s/it]



  7%|▋         | 10/150 [01:25<20:01,  8.58s/it]



  7%|▋         | 10/150 [01:38<22:58,  9.84s/it]


InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.