In [None]:
import importlib
import utilities  # your module

# Re-execute the already-imported local `utilities` module so that edits made
# to it after the kernel started are picked up without restarting the kernel.
importlib.reload(utilities)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
from datetime import datetime,timedelta
import requests
import ta
import base64
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.backends import default_backend
from snowflake.sqlalchemy import URL
from sqlalchemy import create_engine
from sqlalchemy import text

In [None]:
import os
from dotenv import load_dotenv

# Populate the process environment from a .env file; the cells below read
# their settings (key path, Snowflake account details) through os.getenv.
load_dotenv()

In [None]:
# Read the PEM file named by PRIVATE_KEY_PATH and parse it as an
# unencrypted private key.
with open(os.getenv("PRIVATE_KEY_PATH"), "rb") as pem_handle:
    pem_bytes = pem_handle.read()

private_key = serialization.load_pem_private_key(
    pem_bytes,
    password=None,
    backend=default_backend(),
)

# Re-serialise the key as unencrypted PKCS#8 DER and base64-encode it, so the
# value handed to the connection URL is text rather than raw bytes.
der_bytes = private_key.private_bytes(
    encoding=serialization.Encoding.DER,
    format=serialization.PrivateFormat.PKCS8,
    encryption_algorithm=serialization.NoEncryption(),
)
private_key_pkcs8 = base64.b64encode(der_bytes).decode("utf-8")

# Every connection setting comes from the environment; nothing is hard-coded.
snowflake_url = URL(
    account=os.getenv("SNOWFLAKE_ACCOUNT"),
    user=os.getenv("SNOWFLAKE_USER"),
    private_key=private_key_pkcs8,
    warehouse=os.getenv("SNOWFLAKE_WAREHOUSE"),
    database=os.getenv("SNOWFLAKE_DATABASE"),
    schema=os.getenv("SNOWFLAKE_SCHEMA"),
    role=os.getenv("SNOWFLAKE_ROLE"),
)
engine = create_engine(snowflake_url)

# Smoke test: print the identity the session is running under.
with engine.connect() as connection:
    identity_rows = connection.execute(
        text("SELECT CURRENT_USER(), CURRENT_ROLE(), CURRENT_TIMESTAMP();")
    )
    for identity in identity_rows:
        print("✅ Snowflake connected:", identity)


In [None]:
from datetime import datetime

# Run identifier: the current local time rendered as 14 digits
# (YYYYMMDDHHMMSS). It is passed to the download helper further down.
execution_time = f"{datetime.now():%Y%m%d%H%M%S}"

In [None]:
# Load the asset universe from a CSV in the working directory. Later cells
# rely on its 'ticker', 'realticker' and 'category' columns.
top100 = pd.read_csv('Assets_Categorized.csv')

In [None]:
# Tag every asset with the stage name handed to the download helper and drop
# the 'realticker' column, which no later cell reads.
#
# Built as a new frame (no inplace mutation) with errors='ignore', so running
# this cell a second time does not raise KeyError on the already-removed column.
top100 = (
    top100
    .assign(staging='@cryptodatasource')
    .drop(columns=['realticker'], errors='ignore')
)

# NOTE(review): an earlier draft tried to append '-USD' to crypto tickers with
# `if top100['market'] == 'Crypto': ...`, which cannot work on a whole column.
# If that behaviour is ever wanted, use a row mask instead, e.g.
#   top100.loc[top100['market'] == 'Crypto', 'ticker'] += '-USD'
# (assumes a 'market' column exists — TODO confirm).


In [None]:
# Quick visual check of the prepared asset list (first five rows).
top100.head()

In [None]:
from utilities import download_yahoo_to_stage

# Midnight of the current day, derived from a SINGLE clock read. Calling
# datetime.now() separately for year, month and day can mix two different
# dates when the cell happens to run across midnight.
now = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)

# Default start of history; replaced per ticker inside the loop.
start_date = datetime(2019, 1, 1)

# Loop-invariant, so built once. Returns the day after the newest stored date
# for the bound ticker, or 2019-01-01 when the ticker has no rows yet.
# The ticker is passed as a bound parameter, never formatted into the SQL.
query = text("SELECT coalesce(dateadd(day,1,MAX(date)),'2019-01-01') as date FROM PUBLIC.VW_CRYPTO WHERE ticker = :ticker ")

for index, row in top100.iterrows():
    df_date = pd.read_sql(query, con=engine, params={"ticker": row['ticker']})
    start_date = pd.to_datetime(df_date['date'].iloc[0])

    if start_date > now:
        # The first missing day lies in the future: nothing to fetch.
        print(f"✅ {row['ticker']}: Up to date — skipping download.")
    else:
        # Download only what’s missing
        download_yahoo_to_stage(
            ticker=row['ticker'],
            private_key=private_key,
            stage_area=row['staging'],
            interval="1d",
            start=start_date,
            time="12:00 AM",
            execution_time=execution_time
        )


In [None]:
from sqlalchemy import text
import pandas as pd
import numpy as np

# Step 1: Load and merge
# Daily OHLCV rows from 2020 onwards, restricted to tickers that already had
# data on or before 2020-01-01.
query = text("""
    SELECT DISTINCT date, open,high,low,close, volume, ticker
    FROM PUBLIC.VW_CRYPTO
    WHERE date >= '2020-01-01'
    and ticker in ( select distinct ticker from vw_crypto where date <='2020-01-01')
    ORDER BY ticker, date
""")
data = pd.read_sql(query, con=engine)
# Keep one row per (date, ticker); assignment instead of inplace keeps the
# cell free of hidden mutation.
data = data.drop_duplicates(['date', 'ticker'], keep='last')

# Attach the asset metadata and remove stablecoins. The explicit .copy()
# makes `data` an independent frame, so the column assignments below do not
# write into a filtered view (SettingWithCopyWarning).
data = pd.merge(data, top100, on='ticker', how='left')
data = data[data['category'] != 'Stablecoin'].copy()

print("✅ Loaded data for tickers:", data['ticker'].nunique())


ohlcv_cols = ['open', 'high', 'low', 'close', 'volume']
data = data.sort_values(by=['ticker', 'date'])

for col in ohlcv_cols:
    # Fill forward, then backward, then with mean (per ticker).
    # transform() always returns a result indexed like `data`, so the
    # assignment aligns row-for-row on every pandas version. The previous
    # apply(...).reset_index(level=0, drop=True) only aligned when apply
    # prepended the group key; otherwise it discarded the real index and
    # misassigned values after the filter above.
    data[col] = data.groupby('ticker')[col].transform(
        lambda grp: grp.ffill().bfill().fillna(grp.mean())
    )

nan_summary = data[ohlcv_cols].isna().sum()
print("🧹 Still NaNs in `data` (should be 0):")
print(nan_summary[nan_summary > 0])


# Step 2: Pivot close prices into a date x ticker matrix
price_df = data.pivot(index='date', columns='ticker', values='close')

# Step 3: Fill remaining NaNs (dates a ticker has no row for) with the
# average of that ticker's column
price_df = price_df.apply(lambda col: col.fillna(col.mean()), axis=0)

# Step 4: Optional check
nan_summary = price_df.isna().sum()
print("🧹 Still NaN per ticker (should be 0):")
print(nan_summary[nan_summary > 0])





In [None]:
topN = 70
required_tickers = ['BTC-USD', 'ETH-USD','GC=F','SI=F','^GSPC','^NDX','BTC-RUB','BTC-GBP','BTC-CNY','BTC-INR','^VIX']

# Rank tickers by how many real daily closes they have. This must be measured
# on the long `data` frame: `price_df` has already had its gaps filled with
# column means, so counting non-NaN cells there gives (almost) every ticker
# the same score and makes the top-N selection arbitrary.
coverage = (
    data.groupby('ticker')['close'].count()
    .reindex(price_df.columns, fill_value=0)
)
top_tickers = coverage.sort_values(ascending=False).head(topN).index.tolist()

# Always include the reference assets when they exist in the price matrix.
top_tickers += [
    ticker for ticker in required_tickers
    if ticker not in top_tickers and ticker in price_df.columns
]

# Optional: Deduplicate while preserving order
top_tickers = list(dict.fromkeys(top_tickers))

filtered_price_df = price_df[top_tickers]

# Step 3: Clean prices (remove 0s), then compute log returns.
# dropna() keeps only the dates on which every selected ticker has a return.
cleaned_prices = filtered_price_df.replace(0, np.nan)
returns = np.log(cleaned_prices / cleaned_prices.shift(1)).dropna()

# Step 4: Correlation + Distance matrix, d = sqrt(2 * (1 - rho)).
# Floating-point rounding can leave 1 - rho a hair below zero; clipping at 0
# stops sqrt from turning those cells into NaN.
corr_matrix = returns.corr()
distance_matrix = np.sqrt((2 * (1 - corr_matrix)).clip(lower=0))

In [None]:
# Show which tickers made it into the network analysis.
print(top_tickers)

In [None]:
import networkx as nx
import pandas as pd

# Clean the distance matrix.
# Keep only tickers that have at least one finite distance, and select them
# on BOTH axes so the matrix stays square. The former
# dropna(how='any') on rows then columns removed every row as soon as one
# ticker had an all-NaN row/column (e.g. zero-variance returns), because every
# other row carries a NaN in that ticker's column.
has_distance = distance_matrix.notna().any(axis=0)
valid_tickers = has_distance[has_distance].index.tolist()
clean_distance = distance_matrix.loc[valid_tickers, valid_tickers]

# Step 1: Create an undirected graph with one node per ticker
G = nx.Graph()
G.add_nodes_from(valid_tickers)

# Add one weighted edge per unordered ticker pair. Distances are looked up by
# label rather than by position, so rows and columns can never get out of step.
for i, t1 in enumerate(valid_tickers):
    for t2 in valid_tickers[i + 1:]:
        dist = clean_distance.at[t1, t2]
        if pd.notnull(dist):
            G.add_edge(t1, t2, weight=dist)

# Step 2: Create Minimum Spanning Tree (MST) over the 'weight' attribute
mst = nx.minimum_spanning_tree(G)

# Step 3: Calculate centrality metrics on the tree
centrality = {
    'degree': nx.degree_centrality(mst),
    'betweenness': nx.betweenness_centrality(mst),
    'closeness': nx.closeness_centrality(mst),
    'eigenvector': nx.eigenvector_centrality_numpy(mst)  # numpy-based solver
}

# Combine into a DataFrame: one row per ticker, one column per metric
centrality_df = pd.DataFrame(centrality)

# Preview top nodes by eigenvector centrality
print(centrality_df.sort_values(by='eigenvector', ascending=False).head())


In [None]:
# Report how large the spanning tree is.
node_total = mst.number_of_nodes()
edge_total = mst.number_of_edges()
print(f"Total nodes in MST: {node_total}")
print(f"Total edges in MST: {edge_total}")


In [None]:
# First rows of the per-ticker centrality table (unsorted).
centrality_df.head()

In [None]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

# Use spring layout for positioning (fixed seed keeps the figure reproducible)
pos = nx.spring_layout(mst, seed=42, k=0.3)

# Centrality scores, explicitly ordered like the graph's node list. The
# drawing call consumes node sizes and colours positionally, so the order
# must match mst.nodes() even if centrality_df was re-sorted elsewhere.
node_order = list(mst.nodes())
eigen = centrality_df['eigenvector'].reindex(node_order)
closeness = centrality_df['closeness'].reindex(node_order)


def _min_max(values):
    """Scale a Series to [0, 1]; a constant Series maps to zeros, not NaN."""
    span = values.max() - values.min()
    if span == 0:
        return values * 0.0
    return (values - values.min()) / span


# Normalize node sizes (300..1300) and colors (0..1)
node_sizes = 1000 * _min_max(eigen) + 300
norm_closeness = _min_max(closeness)

# Edge weights and colors based on inverse distance (shorter = thicker)
edge_weights = [1 / mst[u][v]['weight'] for u, v in mst.edges()]
edge_colors = edge_weights

# Set up plot
fig, ax = plt.subplots(figsize=(14, 10))

# Draw nodes
nodes = nx.draw_networkx_nodes(
    mst, pos,
    node_size=node_sizes,
    node_color=norm_closeness,
    cmap=plt.cm.turbo,
    ax=ax
)

# Draw edges with color mapping
edges = nx.draw_networkx_edges(
    mst, pos,
    width=edge_weights,
    edge_color=edge_colors,
    edge_cmap=plt.cm.plasma,
    edge_vmin=min(edge_colors),
    edge_vmax=max(edge_colors),
    alpha=0.6,
    ax=ax
)

# Draw labels
nx.draw_networkx_labels(
    mst, pos,
    font_size=10,
    font_family='serif',
    ax=ax
)

# Add colorbar for node color, labelled in raw closeness units
sm = plt.cm.ScalarMappable(cmap=plt.cm.turbo, norm=plt.Normalize(vmin=closeness.min(), vmax=closeness.max()))
sm.set_array([])
cbar = plt.colorbar(sm, ax=ax)
cbar.set_label("Closeness Centrality")

# Final formatting
ax.set_title("Crypto Network (MST)\nSize = Eigenvector | Color = Closeness | Edge = Correlation Strength", fontsize=14)
plt.axis('off')
plt.tight_layout()
plt.show()




In [None]:
# First rows of the date x ticker close-price matrix for the selected tickers.
filtered_price_df.head()

In [None]:
from utilities import cluster_from_correlation

# Missing-value count per column of the long OHLCV frame, checked before
# handing it to the clustering helpers.
data.isna().sum()



In [None]:
# Cluster the assets with the project helper. The result is used below as a
# frame with 'ticker' and 'cluster' columns.
# NOTE(review): k=4 is presumably the number of clusters — confirm in utilities.
cluster_corr = cluster_from_correlation(data, k=4)
cluster_corr.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many tickers fall into each correlation cluster.
cluster_axes = sns.countplot(data=cluster_corr, x='cluster', palette='tab10')
cluster_axes.set_title('Number of Coins per Cluster')
cluster_axes.set_xlabel('Cluster')
cluster_axes.set_ylabel('Count')
plt.show()


In [None]:
# Cluster label assigned to BTC-USD (first match; raises IndexError if absent).
btc_rows = cluster_corr['ticker'] == 'BTC-USD'
btc_cluster = cluster_corr.loc[btc_rows, 'cluster'].values[0]

In [None]:
# All tickers that share BTC-USD's correlation cluster (BTC-USD included).
cluster_corr[cluster_corr['cluster']==btc_cluster]

In [None]:
from utilities import louvain_from_returns
#print(data['ticker'].unique())
# Community detection via the project helper. It returns three objects that
# later cells use as: a ticker/cluster frame, a networkx graph, and a returns
# table.
# NOTE(review): min_corr=0.5 is presumably the correlation threshold for
# drawing an edge — confirm in utilities.
cluster_louvain,G,clustered_returns = louvain_from_returns(data, min_corr=0.5, plot=True)


In [None]:
from utilities import technical_analysis

# Work on an independent copy of the BTC-USD rows and let the project helper
# add its indicator columns to it.
is_btc_row = data['ticker'] == 'BTC-USD'
btc_df = technical_analysis(data[is_btc_row].copy())

In [None]:
# Louvain community of BTC-USD and the other tickers that share it.
is_btc = cluster_louvain['ticker'] == 'BTC-USD'
btc_cluster = cluster_louvain.loc[is_btc, 'cluster'].values[0]
in_btc_cluster = cluster_louvain['cluster'] == btc_cluster
peer_tickers = cluster_louvain.loc[in_btc_cluster & ~is_btc, 'ticker'].tolist()

# Close prices of those peers as a date x ticker matrix, oldest date first.
peer_df = data[data['ticker'].isin(peer_tickers)]
peer_prices = peer_df.pivot(index='date', columns='ticker', values='close').sort_index()


# Every community label that occurs
unique_clusters = cluster_louvain['cluster'].unique()

# One figure per community, drawn from the matching subgraph of G
for cluster_id in sorted(unique_clusters):
    belongs = cluster_louvain['cluster'] == cluster_id
    tickers_in_cluster = cluster_louvain.loc[belongs, 'ticker'].tolist()
    subgraph = G.subgraph(tickers_in_cluster)

    # Singleton (or empty) communities have no edges, so nothing is drawn
    if len(subgraph.nodes) > 1:
        pos = nx.spring_layout(subgraph, seed=42)
        plt.figure(figsize=(8, 6))
        nx.draw(
            subgraph,
            pos,
            with_labels=True,
            node_color='lightcoral',
            edge_color='gray',
            node_size=800,
        )
        plt.title(f"Louvain Cluster #{cluster_id} ({len(subgraph.nodes)} nodes)")
        plt.axis('off')
        plt.tight_layout()
        plt.show()




In [None]:
# Tickers in BTC-USD's Louvain community; their closes become model features.
print (peer_tickers)

In [None]:
# Daily simple return of BTC.
btc_df['return'] = btc_df['close'].pct_change()

# Classification target: 1.0 if the NEXT day's return is positive, else 0.0.
# The final row has no next day; it is left as NaN instead of being silently
# labelled 0 ("down"), so the dropna() in the feature-building cell removes it
# rather than training on a made-up label. The column is therefore float.
next_return = btc_df['return'].shift(-1)
btc_df['target'] = (next_return > 0).astype(float).where(next_return.notna())


In [None]:
# Indicator columns produced by the technical-analysis helper, plus the label.
indicator_columns = [
    'MACD', 'MACD_Diff', 'RSI', 'MFI', 'EMA_Short', 'EMA_Long',
    'Bollinger_Upper', 'Bollinger_Lower', 'Stochastic', 'SAR', 'target',
]
btc_by_date = btc_df.set_index('date')
btc_features = btc_by_date[indicator_columns].dropna()

# Align with the peer close prices on dates present in both tables, then
# discard any date on which a peer price is missing.
final_df = btc_features.join(peer_prices, how='inner')
final_df = final_df.dropna()


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Split the modelling table into the label column and everything else.
y = final_df['target']
X = final_df.drop(columns='target')

# Chronological hold-out: with shuffle=False the last 20% of rows form the
# test set, so the model is never evaluated on dates older than its training data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# Standardise the features, then fit a seeded random forest.
scaler_step = ('scaler', StandardScaler())
forest_step = ('rf', RandomForestClassifier(random_state=42))
pipeline = Pipeline([scaler_step, forest_step])

# Train on the early rows and report accuracy on the held-out rows.
pipeline.fit(X_train, y_train)
test_accuracy = pipeline.score(X_test, y_test)
print("✅ Model accuracy:", test_accuracy)



In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Predictions for the held-out rows and the resulting confusion matrix
y_pred = pipeline.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Plain-language description of each cell, laid out like `cm`
# (rows = actual outcome, columns = predicted outcome)
labels = np.array([
    ["TN: Correctly predicted BTC will go down", "FP: Predicted up but it went down"],
    ["FN: Predicted down but it went up", "TP: Correctly predicted BTC will go up"]
])

# Two panels side by side: raw counts on the left, descriptions on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

predicted_names = ["Predicted Down", "Predicted Up"]
actual_names = ["Actual Down", "Actual Up"]
panels = (
    (axes[0], True, 'd', "Confusion Matrix (Counts)"),
    (axes[1], labels, '', "Confusion Matrix with BTC Prediction Labels"),
)

for panel_ax, annotation, cell_format, heading in panels:
    sns.heatmap(cm, annot=annotation, fmt=cell_format, cmap='Blues', cbar=False,
                xticklabels=predicted_names,
                yticklabels=actual_names,
                ax=panel_ax)
    panel_ax.set_title(heading)
    panel_ax.set_xlabel("Prediction")
    panel_ax.set_ylabel("Actual")

plt.tight_layout()
plt.show()




In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out rows; the two classes are
# displayed as "Down" and "Up".
print(classification_report(y_test, y_pred, target_names=["Down", "Up"]))


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error

# Independent BTC copy whose target is the NEXT row's close price
btc_lregression = btc_df.copy()
btc_lregression['target'] = btc_lregression['close'].shift(-1)

# Indicator columns plus the target, indexed by date; rows with gaps removed
regression_columns = [
    'MACD', 'MACD_Diff', 'RSI', 'MFI', 'EMA_Short', 'EMA_Long',
    'Bollinger_Upper', 'Bollinger_Lower', 'Stochastic', 'SAR', 'target',
]
btc_features = btc_lregression.set_index('date')[regression_columns].dropna()

# Add the peer close prices on shared dates and drop incomplete rows
final_lregression = btc_features.join(peer_prices, how='inner').dropna()

# Label column vs. predictor columns
y = final_lregression['target']
X = final_lregression.drop(columns='target')

# Chronological split: the final 20% of rows are held out
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# Standardise, then ordinary least squares
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('lr', LinearRegression()),
])

# fit() returns the fitted pipeline, so training and prediction chain
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Goodness of fit on the held-out rows
r2_value = r2_score(y_test, y_pred)
rmse_value = np.sqrt(mean_squared_error(y_test, y_pred))
print("✅ R² score:", r2_value)
print("📉 RMSE:", rmse_value)



In [None]:
import matplotlib.pyplot as plt

# Held-out actual next-day closes against the regression's predictions.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(y_test.index, y_test, label="Actual BTC Price", color='blue')
ax.plot(y_test.index, y_pred, label="Predicted BTC Price", color='orange')
ax.set_title("BTC Close Price: Actual vs Predicted")
ax.set_xlabel("Date")
ax.set_ylabel("Price (USD)")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Signed prediction error per held-out date (positive = model too low).
residuals = y_test - y_pred

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(y_test.index, residuals, label='Residuals (Actual - Predicted)', color='red')
ax.axhline(0, linestyle='--', color='gray')  # zero-error reference line
ax.set_title("Prediction Residuals")
ax.set_xlabel("Date")
ax.set_ylabel("Error")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Signed errors of the full-feature regression on the held-out rows
residuals = y_test - y_pred

# Histogram with a KDE overlay; the dashed line marks zero error
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(residuals, kde=True, bins=40, color='purple', ax=ax)
ax.axvline(0, linestyle='--', color='gray')
ax.set_title("Distribution of Prediction Residuals")
ax.set_xlabel("Prediction Error (Actual - Predicted)")
ax.set_ylabel("Frequency / Density")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
import statsmodels.api as sm

# Add intercept manually (statsmodels doesn't do it by default).
# The model is fitted on the TRAINING rows only: its p-values drive the
# feature selection in the next cell, and fitting on all of X / y would let
# the held-out rows influence which features the reduced model is then
# scored on (test-set leakage).
X_sm = sm.add_constant(X_train)

# Fit the model
model = sm.OLS(y_train, X_sm).fit()

# Show summary
print(model.summary())


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Select significant features (p < 0.05), keeping only names that are real
# columns of X. That removes the intercept term when it is significant,
# without the old positional `[1:]` slice, which silently threw away the
# first genuine feature whenever the intercept was NOT significant.
significant_features = [
    name for name in model.pvalues[model.pvalues < 0.05].index
    if name in X.columns
]
if not significant_features:
    raise ValueError("No feature is significant at p < 0.05; nothing to fit.")
X_reduced = X[significant_features]

# Build reduced pipeline
pipeline_reduced = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', LinearRegression())
])

# Train model with reduced features
pipeline_reduced.fit(X_train[X_reduced.columns], y_train)

# Predict with reduced features
y_pred_reduced = pipeline_reduced.predict(X_test[X_reduced.columns])

# Evaluate
print("✅ R² score (reduced):", r2_score(y_test, y_pred_reduced))
print("📉 RMSE (reduced):", np.sqrt(mean_squared_error(y_test, y_pred_reduced)))

# Residuals
residuals = y_test - y_pred_reduced

# Plot histogram + KDE
plt.figure(figsize=(10, 5))
sns.histplot(residuals, kde=True, bins=40, color='purple')
plt.axvline(0, linestyle='--', color='gray')
plt.title("Distribution of Prediction Residuals (Reduced Features)")
plt.xlabel("Prediction Error (Actual - Predicted)")
plt.ylabel("Frequency / Density")
plt.grid(True)
plt.tight_layout()
plt.show()




In [None]:
# Side-by-side table of actual value, reduced-model prediction and signed error.
residuals_df = pd.DataFrame(
    {
        'Actual': y_test,
        'Predicted': y_pred_reduced,
        'Residual': residuals,
    }
)

# Keep only the dates where the model missed by more than 7000 either way
is_large_miss = residuals_df['Residual'].abs() > 7000
significant_errors = residuals_df.loc[is_large_miss]

# List them, worst miss first
print("🔍 Residuals with absolute error > $7,000:")
worst_first = significant_errors.sort_values(by='Residual', key=abs, ascending=False)
print(worst_first)