**Cryptocurrency Volatility Prediction**

In [3]:
# Cryptocurrency Volatility Prediction
# Colab-ready notebook for the project prompt you provided.
# Assumes dataset.csv.zip has been uploaded to /mnt/data/dataset.csv.zip
# If you run on your local machine, change paths accordingly.

# ----
# 0. Install (optional) and imports
# ----
# Uncomment if running in an environment missing libraries
# !pip install xgboost lightgbm catboost streamlit --quiet

import os, zipfile, io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
sns.set_style("whitegrid")
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.linear_model import Ridge, LogisticRegression

# Try optional advanced models.
# Each HAS_* flag records whether the corresponding library could be imported.
# `except Exception` (rather than a bare `except`) keeps the import best-effort
# for any import-time failure while still letting KeyboardInterrupt and
# SystemExit propagate.
try:
    import xgboost as xgb
    from xgboost import XGBRegressor, XGBClassifier
    HAS_XGB = True
except Exception:
    HAS_XGB = False

try:
    import lightgbm as lgb
    HAS_LGB = True
except Exception:
    HAS_LGB = False

# ----
# 1. Load dataset from uploaded zip (path: /mnt/data/dataset.csv.zip)
# ----
zip_path = "/mnt/data/dataset.csv.zip"
if not os.path.exists(zip_path):
    raise FileNotFoundError(f"{zip_path} not found. Upload dataset.csv.zip to /mnt/data/")

with zipfile.ZipFile(zip_path, 'r') as archive:
    # Collect every archive member whose name ends in .csv (case-insensitive);
    # the first one found is the one that gets loaded.
    csv_names = [
        member
        for member in archive.namelist()
        if member.lower().endswith('.csv')
    ]
    if not csv_names:
        raise FileNotFoundError("No CSV found inside the uploaded zip.")

    csv_name = csv_names[0]
    print("Found CSV inside zip:", csv_name)

    # Read the CSV straight from the archive without extracting it to disk.
    with archive.open(csv_name) as csv_file:
        df = pd.read_csv(csv_file)

print("Raw data shape:", df.shape)
df.head()

# ----
# 2. Quick schema cleanup and type conversions
# ----
# Strip stray whitespace from the column headers so the name lookups below match.
df.columns = [c.strip() for c in df.columns]

# Basic columns we expect: open, high, low, close, volume, marketCap, crypto_name.
# This check runs before anything indexes those columns, so the warning is
# printed even when a later step fails on the missing column.
expected_cols = ['open','high','low','close','volume','marketCap','crypto_name']
for c in expected_cols:
    if c not in df.columns:
        print(f"Warning: expected column '{c}' not found. Check dataset. Available columns: {df.columns.tolist()}")

# Ensure a date column exists: the first of these candidate names that is present wins.
date_col = next(
    (candidate for candidate in ('date', 'timestamp', 'time') if candidate in df.columns),
    None,
)
if date_col is None:
    raise ValueError("No date/timestamp column found in dataset. Expecting 'date', 'timestamp' or 'time'.")

# Parse the chosen column into a canonical 'date' column, then order rows by
# crypto and time so that later per-crypto rolling/shift operations see them in sequence.
df['date'] = pd.to_datetime(df[date_col])
df = df.sort_values(['crypto_name', 'date']).reset_index(drop=True)

# Check NA counts
print("\nMissing values per column:\n", df.isna().sum())

# ----
# 3. Basic EDA (per-crypto summary)
# ----
# Number of rows available per cryptocurrency, most frequent first.
crypto_counts = df['crypto_name'].value_counts()

print("\nCryptocurrencies in dataset:", df['crypto_name'].nunique())
print(crypto_counts.head(10))

# Plot the close-price series of the five cryptos with the most records.
top_n = crypto_counts.head(5).index.tolist()
plt.figure(figsize=(12,6))
for name in top_n:
    sub = df.loc[df['crypto_name'] == name]
    plt.plot(sub['date'], sub['close'], label=name)
plt.legend()
plt.title("Close price for top 5 cryptos")
plt.xlabel("Date")
plt.ylabel("Close")
plt.show()

# ----
# 4. Preprocessing
#    - Handle missing values
#    - Fill forward/backward per crypto for OHLC and volume/mcap
# ----
# Forward-fill, then backward-fill, OHLC / volume / market cap within each
# crypto, and finally drop rows whose prices are still missing.
# GroupBy.ffill()/bfill() return a frame on the original row index, so the
# assignment lines up row by row; this also avoids the deprecated
# fillna(method=...) form and the group-keyed index that GroupBy.apply can return.
df.sort_values(['crypto_name','date'], inplace=True)
fill_cols = ['open','high','low','close','volume','marketCap']
df[fill_cols] = df.groupby('crypto_name')[fill_cols].ffill()
df[fill_cols] = df.groupby('crypto_name')[fill_cols].bfill()
# After that, drop rows still having NaN in price
df = df.dropna(subset=['open','high','low','close'])
print("\nAfter imputation, remaining NA counts:\n", df.isna().sum())

# ----
# 5. Feature engineering (per crypto)
#    Features:
#      - log_return
#      - rolling volatility (std of log returns) for windows: 7, 14, 30
#      - ATR-like measure: True Range and rolling average (ATR)
#      - Bollinger Bands %B or bandwidth
#      - liquidity ratio: volume / marketCap
#      - moving averages (ma_7, ma_14, ma_30, plus ma_20 for the Bollinger bands)
# ----
def add_features(g):
    """Return a copy of ``g`` extended with return, range and volatility columns.

    ``g`` must provide 'open'-independent columns 'high', 'low', 'close',
    'volume' and 'marketCap'; rows are used in their existing order, so the
    caller is responsible for passing one series sorted in time.

    Columns added: log_close, log_return, tr1/tr2/tr3, true_range, ATR_14,
    roll_vol_{7,14,30}, ma_{7,14,30}, ma_20, std_20, bb_upper, bb_lower,
    bb_width and liquidity. Any +/-inf in the resulting frame is replaced by NaN.
    The input frame itself is left unmodified.
    """
    out = g.copy()
    close = out['close']
    prev_close = close.shift()

    # Log price and its first difference; a zero close becomes NaN before the log.
    out['log_close'] = np.log(close.replace(0, np.nan))
    out['log_return'] = out['log_close'].diff()

    # True range: largest of the three candidate ranges on each row.
    out['tr1'] = out['high'] - out['low']
    out['tr2'] = (out['high'] - prev_close).abs()
    out['tr3'] = (out['low'] - prev_close).abs()
    out['true_range'] = out[['tr1', 'tr2', 'tr3']].max(axis=1)

    # ATR-like measure: 14-row rolling mean of the true range.
    out['ATR_14'] = out['true_range'].rolling(14, min_periods=1).mean()

    # Rolling volatility of log returns and rolling mean of close per window.
    for window in (7, 14, 30):
        out[f'roll_vol_{window}'] = out['log_return'].rolling(window=window, min_periods=1).std()
        out[f'ma_{window}'] = close.rolling(window=window, min_periods=1).mean()

    # Bollinger bands over 20 rows, and their width relative to the mid-line.
    bollinger_window = close.rolling(20, min_periods=1)
    out['ma_20'] = bollinger_window.mean()
    out['std_20'] = bollinger_window.std()
    band_offset = 2 * out['std_20']
    out['bb_upper'] = out['ma_20'] + band_offset
    out['bb_lower'] = out['ma_20'] - band_offset
    out['bb_width'] = (out['bb_upper'] - out['bb_lower']) / out['ma_20']

    # Liquidity ratio; a zero market cap is treated as missing to avoid dividing by zero.
    out['liquidity'] = out['volume'] / out['marketCap'].replace({0: np.nan})

    # Infinite values anywhere in the frame are turned into NaN.
    return out.replace([np.inf, -np.inf], np.nan)

# Run add_features on each crypto's rows separately so that rolling windows and
# shifts never cross from one asset into another. Iterating over the groups
# (instead of GroupBy.apply) hands add_features the full sub-frame, so the
# 'crypto_name' column is still present in the result regardless of how the
# installed pandas treats grouping columns inside apply.
df_fe = pd.concat(
    [add_features(group) for _, group in df.groupby('crypto_name')],
    ignore_index=True,
)
print("\nAfter feature engineering shape:", df_fe.shape)
df_fe[[ 'crypto_name','date','close','log_return','roll_vol_7','roll_vol_14','roll_vol_30','ATR_14','bb_width','liquidity']].head(10)

# ----
# 6. Target construction
#   Option A: Regression → predict next-day volatility (e.g., roll_vol_7 shifted -1)
#   Option B: Classification → bin volatility into levels (low/medium/high) using quantiles
# ----
# We'll create both: a numeric target 'target_vol_7_next' and a categorical 'vol_level'
# The shift is done within each crypto, so the last row of every crypto gets NaN.
# NOTE(review): assuming one row per day, consecutive 7-row windows share six
# returns, so this target is strongly tied to the current 'roll_vol_7' —
# confirm that this horizon is the intended one.
df_fe['target_vol_7_next'] = df_fe.groupby('crypto_name')['roll_vol_7'].shift(-1)  # next day's 7-day rolling vol

# For classification, compute per-crypto quantiles (to account for different base vol levels)
def assign_levels(g):
    """Return a copy of ``g`` with a 'vol_level' column derived from 'roll_vol_7'.

    The 0.33 and 0.66 quantiles of the frame's own 'roll_vol_7' values are the
    cut points: values up to the lower cut map to 0 (low), values up to the
    upper cut map to 1 (medium), anything above maps to 2 (high). Missing
    volatility stays NaN. The input frame is not modified.
    """
    out = g.copy()
    vol = out['roll_vol_7']
    low_cut, high_cut = vol.quantile([0.33, 0.66])

    def to_level(value):
        # Missing volatility has no level.
        if pd.isna(value):
            return np.nan
        if value > high_cut:
            return 2  # high
        if value > low_cut:
            return 1  # medium
        return 0  # low

    out['vol_level'] = vol.apply(to_level)
    return out

# Assign volatility levels crypto by crypto so each asset is binned against its
# own quantiles. Iterating over the groups (instead of GroupBy.apply) hands
# assign_levels the full sub-frame, so 'crypto_name' is still present in the
# result regardless of how the installed pandas treats grouping columns in apply.
df_fe = pd.concat(
    [assign_levels(group) for _, group in df_fe.groupby('crypto_name')],
    ignore_index=True,
)
print("\nDistribution of levels (sample):")
print(df_fe['vol_level'].value_counts(dropna=True))

# Drop rows where target is NaN (end of series)
df_model = df_fe.dropna(subset=['target_vol_7_next']).copy()

# ----
# 7. Choose one cryptocurrency (or build cross-crypto model)
#    For demonstration, we'll build two workflows:
#      (A) Per-crypto model for Bitcoin (if present)
#      (B) Cross-crypto model (includes crypto_name as categorical via one-hot or label encoding)
# ----
cryptos = df_model['crypto_name'].unique().tolist()
print("Available cryptos:", cryptos[:10])
# If Bitcoin is present, pick it; otherwise pick the crypto with the most rows.
# value_counts().idxmax() actually selects the largest series, whereas taking
# cryptos[0] would just return whichever name happens to come first.
record_counts = df_model['crypto_name'].value_counts()
target_crypto = 'Bitcoin' if 'Bitcoin' in cryptos else record_counts.idxmax()
print("Selected crypto for per-crypto demo:", target_crypto)

# Rows of the selected crypto only; copied so later edits do not touch df_model.
df_btc = df_model[df_model['crypto_name']==target_crypto].copy()
print("Per-crypto shape:", df_btc.shape)

# ----
# 8A. Regression (predict next-day volatility) — per-crypto example (Bitcoin)
# Features to use: roll_vol_7, roll_vol_14, roll_vol_30, ATR_14, bb_width, liquidity, ma_7, ma_14, ma_30
# Model inputs: rolling volatilities, ATR, Bollinger width, liquidity and moving averages.
features = [
    'roll_vol_7', 'roll_vol_14', 'roll_vol_30',
    'ATR_14', 'bb_width', 'liquidity',
    'ma_7', 'ma_14', 'ma_30',
]
df_btc = df_btc.dropna(subset=[*features, 'target_vol_7_next'])
X = df_btc[features].values
y = df_btc['target_vol_7_next'].values

# Hold-out split by row position: the first 80% of rows train, the rest test
# (rows are taken in their existing order, no shuffling).
split_idx = int(len(df_btc)*0.8)
X_tr, y_tr = X[:split_idx], y[:split_idx]
X_te, y_te = X[split_idx:], y[split_idx:]

# Standardise using statistics from the training rows only.
scaler = StandardScaler().fit(X_tr)
X_tr_s = scaler.transform(X_tr)
X_te_s = scaler.transform(X_te)

# Baseline model: random forest regressor evaluated on the held-out rows.
rfr = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rfr.fit(X_tr_s, y_tr)
pred_te = rfr.predict(X_te_s)

print("\nRegression (per-crypto) metrics — RandomForestRegressor:")
print("RMSE:", np.sqrt(mean_squared_error(y_te, pred_te)))
print("MAE:", mean_absolute_error(y_te, pred_te))
print("R2:", r2_score(y_te, pred_te))

# Feature importances, largest first.
fi = pd.Series(rfr.feature_importances_, index=features)
fi = fi.sort_values(ascending=False)
print("\nFeature importances:\n", fi)

# ----
# 8B. Classification (predict vol_level) — cross-crypto example
# We'll build a cross-crypto model to predict current vol level (0,1,2) using features
# Prepare dataset: drop rows with NaN vol_level
# Keep only rows that have every feature and a volatility level.
df_class = df_model.dropna(subset=[*features, 'vol_level']).copy()

# Encode the crypto name as an integer label so it can be used as a feature.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(df_class['crypto_name'])
df_class['crypto_label'] = le.transform(df_class['crypto_name'])

# Feature matrix (market features + crypto label) and integer class target.
class_features = [*features, 'crypto_label']
Xc = df_class[class_features].fillna(0).values
yc = df_class['vol_level'].astype(int).values

# Random 80/20 split, stratified on the class label.
Xc_tr, Xc_te, yc_tr, yc_te = train_test_split(
    Xc,
    yc,
    test_size=0.2,
    random_state=42,
    stratify=yc,
)

# Standardise using statistics from the training split only.
scaler_c = StandardScaler().fit(Xc_tr)
Xc_tr_s = scaler_c.transform(Xc_tr)
Xc_te_s = scaler_c.transform(Xc_te)

# Baseline classifier: random forest.
rfc = RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=-1)
rfc.fit(Xc_tr_s, yc_tr)
yc_pred = rfc.predict(Xc_te_s)

print("\nClassification metrics — RandomForestClassifier (cross-crypto):")
print("Accuracy:", accuracy_score(yc_te, yc_pred))
print("Precision (macro):", precision_score(yc_te, yc_pred, average='macro'))
print("Recall (macro):", recall_score(yc_te, yc_pred, average='macro'))
print("\nClassification report:\n", classification_report(yc_te, yc_pred))

# Confusion matrix rendered as an annotated heatmap.
cm = confusion_matrix(yc_te, yc_pred)
plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix (vol_level)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# ----
# 9. Hyperparameter tuning (example grid for RandomForest regression)
# ----
from sklearn.model_selection import TimeSeriesSplit

print("\nHyperparameter tuning example (RandomForestRegressor on BTC):")
param_grid = {
    'n_estimators': [50,100],
    'max_depth': [5,10,None],
    'min_samples_leaf': [1,3]
}
# The training rows are in time order, so each validation fold must come after
# the rows used to fit it. TimeSeriesSplit provides such forward-only folds;
# a plain integer `cv` would let some folds train on later rows to score
# earlier ones, which disagrees with the ordered hold-out split used above.
grid = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=TimeSeriesSplit(n_splits=3),
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid.fit(X_tr_s, y_tr)
print("Best params:", grid.best_params_)
# best_estimator_ is the best configuration refit on the full training set.
best_rfr = grid.best_estimator_
pred_best = best_rfr.predict(X_te_s)
print("Tuned RMSE:", np.sqrt(mean_squared_error(y_te, pred_best)))

# ----
# 10. Save model and scaler (for deployment)
# ----
import joblib

# Persist the fitted scaler and the tuned regressor side by side under models/.
os.makedirs('models', exist_ok=True)
for artifact, target_path in (
    (scaler, 'models/btc_scaler.joblib'),
    (best_rfr, 'models/btc_rfr_best.joblib'),
):
    joblib.dump(artifact, target_path)
print("Saved scaler and model to models/ directory.")

# ----
# 11. Simple Streamlit app stub (for local deploy) — written to streamlit_app_stub.py
# ----
# Source of the generated app. It loads the scaler and model saved under
# models/, collects one numeric input per feature and shows the prediction.
# The run instruction inside the stub names the file that is actually written below.
streamlit_app = r'''
# Generated as streamlit_app_stub.py; run: streamlit run streamlit_app_stub.py
import joblib
import pandas as pd
import streamlit as st
import numpy as np

scaler = joblib.load("models/btc_scaler.joblib")
model = joblib.load("models/btc_rfr_best.joblib")

st.title("Crypto Volatility Predictor (BTC example)")
st.markdown("Provide feature values to predict next-day 7-day rolling volatility.")

# Collect features
features = ["roll_vol_7","roll_vol_14","roll_vol_30","ATR_14","bb_width","liquidity","ma_7","ma_14","ma_30"]
vals = []
for f in features:
    vals.append(st.number_input(f, value=0.0))

if st.button("Predict"):
    X = np.array(vals).reshape(1,-1)
    Xs = scaler.transform(X)
    pred = model.predict(Xs)[0]
    st.write("Predicted next-day 7-day rolling volatility:", pred)
'''
# Explicit encoding so the written file does not depend on the platform default.
with open('streamlit_app_stub.py', 'w', encoding='utf-8') as f:
    f.write(streamlit_app)
print("\nStreamlit app stub saved to streamlit_app_stub.py (run locally after downloading models).")

# ----
# 12. Evaluation report notes (what to include in your submission)
# ----
# Include:
# - Data cleaning steps and rationale (how missing values handled)
# - Feature engineering list and motivation (why rolling vol, ATR, Bollinger width, liquidity)
# - Model selection choices and baseline vs tuned results (RMSE, MAE, R2, accuracy/precision/recall for classification)
# - Plots: time-series price, volatility time-series, feature importances, confusion matrix
# - HLD & LLD bullets:
#   HLD: Data ingestion -> Preprocessing -> Feature Engineering -> Model Training -> Model Serving (Streamlit/Flask)
#   LLD: scripts for extract, transform, features (e.g., preprocess.py), training (train.py), inference (predict.py), app (streamlit app)
#
# ----
# 13. Next steps & enhancements
# ----
# - Use more advanced models (XGBoost/LightGBM/CatBoost) and compare
# - Use time-series specific models: ARIMA/GARCH for volatility, LSTM/Transformer for sequence modeling
# - Use walk-forward validation instead of random CV for time-series robustness
# - Calibrate classification thresholds using business costs (false negative vs false positive)



FileNotFoundError: /mnt/data/dataset.csv.zip not found. Upload dataset.csv.zip to /mnt/data/