# Data Exploration and Visualization

This notebook demonstrates how to explore and visualize cryptocurrency market data using the BTB data loading utilities.

In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Add the parent directory (assumed to be the project root) to the import path
# so the `btb` package resolves. The membership check keeps the cell idempotent:
# re-running it does not append the same directory again.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

from btb.data.loader import DataLoader
from btb.data.preprocessing import DataPreprocessor
from btb.utils.config import load_config

## 1. Load Configuration

In [None]:
# Load backtest configuration
config = load_config("../config/backtest_config.yaml")

# Report the first configured symbol/timeframe. The message is assembled from
# adjacent f-strings: a backslash continuation inside a single string literal
# would embed the next line's leading indentation in the printed text.
backtest_cfg = config["backtest"]
print(
    f"Configuration loaded for {backtest_cfg['symbols'][0]} "
    f"with {backtest_cfg['timeframes'][0]} timeframe"
)

## 2. Load Historical Data

In [None]:
# Create the loader with its default settings; the request itself (symbols,
# timeframes, date range) is passed to load_data below.
data_loader = DataLoader()

# Read the backtest window and the first configured symbol/timeframe
backtest_settings = config["backtest"]
start_date = backtest_settings["start_date"]
end_date = backtest_settings["end_date"]
symbol = backtest_settings["symbols"][0]
timeframe = backtest_settings["timeframes"][0]

# Fetch the market data for that single symbol/timeframe pair
data = data_loader.load_data(
    symbols=[symbol],
    timeframes=[timeframe],
    start_date=start_date,
    end_date=end_date,
)

# The result is looked up with a "<symbol>_<timeframe>" key
frame_key = f"{symbol}_{timeframe}"
df = data[frame_key]

# Preview the first rows
df.head()

## 3. Basic Data Statistics and Visualization

In [None]:
# Print the frame's dimensions, then show the summary table as the cell output
print(f"Data shape: {df.shape}\n\nBasic statistics:")
df.describe()

In [None]:
# Closing price over the full loaded period, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(df.index, df["close"], label="Close Price")
ax.set_title(f"{symbol} Price History")
ax.set_xlabel("Date")
ax.set_ylabel("Price")
ax.legend()
ax.grid(True)
plt.show()

## 4. Generate Technical Indicators

In [None]:
# Build the preprocessor and derive the indicator columns from the raw frame
preprocessor = DataPreprocessor()
df_with_features = preprocessor.add_technical_indicators(df)

# List every column now present, then preview the first rows
generated_columns = df_with_features.columns.tolist()
print("Generated features:", generated_columns)
df_with_features.head()

## 5. Visualize Technical Indicators

In [None]:
# Restrict the chart to the most recent rows so the lines stay readable
RECENT_ROWS = 100
subset_period = df_with_features[-RECENT_ROWS:].copy()

# Close price plus whichever moving-average columns are present
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(subset_period.index, subset_period["close"], label="Close Price")
for ma_column, ma_label in (("ma_21", "MA 21"), ("ma_50", "MA 50")):
    # Skip an average that is not among the frame's columns
    if ma_column in subset_period.columns:
        ax.plot(subset_period.index, subset_period[ma_column], label=ma_label)
ax.set_title(f"{symbol} Price with Moving Averages")
ax.set_xlabel("Date")
ax.set_ylabel("Price")
ax.legend()
ax.grid(True)
plt.show()

## 6. Correlation Analysis

In [None]:
# Pairwise correlation of all numeric columns
correlation = df_with_features.select_dtypes(include=[np.number]).corr()

# Hide the upper triangle (and diagonal), which only mirrors the lower one.
# The mask is built from an all-True boolean matrix rather than from the
# correlation values: masking with np.triu(correlation) would leave any
# upper-triangle cell whose correlation is exactly 0 unmasked.
mask = np.triu(np.ones_like(correlation, dtype=bool))

# Plot correlation heatmap
plt.figure(figsize=(16, 14))
sns.heatmap(correlation, mask=mask, annot=False, cmap="coolwarm", linewidths=0.5)
plt.title("Feature Correlation Matrix")
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

## 7. Anomaly Detection

In [None]:
# Period-over-period percentage change of the close, stored on the frame
returns = df_with_features["close"].pct_change()
df_with_features["daily_return"] = returns

# Centre and spread of the return distribution
mean_return = returns.mean()
std_return = returns.std()

# A return is flagged when it lies more than three standard deviations from the mean
band = 3 * std_return
is_outlier = (returns - mean_return).abs() > band
outliers = df_with_features[is_outlier]

# Returns series with the flagged points and the +/- 3 sigma band overlaid
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(df_with_features.index, returns, label="Daily Returns", alpha=0.5)
ax.scatter(outliers.index, outliers["daily_return"], color="red", label="Anomalies (>3σ)", alpha=1)
ax.axhline(y=mean_return, color="g", linestyle="-", alpha=0.3, label="Mean Return")
ax.axhline(y=mean_return + band, color="r", linestyle="--", alpha=0.3, label="Upper Bound (3σ)")
ax.axhline(y=mean_return - band, color="r", linestyle="--", alpha=0.3, label="Lower Bound (3σ)")
ax.set_title("Daily Returns with Anomaly Detection")
ax.set_xlabel("Date")
ax.set_ylabel("Daily Return")
ax.legend()
ax.grid(True)
plt.show()

# Report how many points were flagged and list them when there are any
outlier_count = len(outliers)
print(f"Found {outlier_count} anomalies in the price data")
if outlier_count:
    print(outliers[["close", "daily_return"]])

## 8. Feature Importance Analysis

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Define target (next period's return): pct_change(1) is the return into each
# row, and shift(-1) moves it up one row so row t holds the return from t to t+1.
df_with_features["target"] = df_with_features["close"].pct_change(1).shift(-1)

# Drop incomplete rows. Infinite values (e.g. from a division by zero in a
# ratio-style column) are converted to NaN first, because dropna() alone keeps
# them and scikit-learn estimators reject non-finite input.
df_clean = df_with_features.replace([np.inf, -np.inf], np.nan).dropna()
if df_clean.empty:
    raise ValueError("No complete rows remain after dropping NaN/inf values; cannot fit the model")

# Select features (exclude raw price/volume data, the target, and the return it is derived from)
excluded_cols = {"open", "high", "low", "close", "volume", "target", "daily_return"}
feature_cols = [col for col in df_clean.columns if col not in excluded_cols]

# Prepare data
X = df_clean[feature_cols]
y = df_clean["target"]

# Standardize features (tree ensembles do not need scaling; kept so X_scaled
# remains available and the cell's outputs are unchanged)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train a Random Forest model; the fixed random_state keeps importances reproducible
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_scaled, y)

# Rank features by the fitted model's importances, highest first
feature_importance = pd.DataFrame({"Feature": feature_cols, "Importance": model.feature_importances_}).sort_values(
    by="Importance", ascending=False
)

# Plot the 15 highest-ranked features
plt.figure(figsize=(12, 8))
sns.barplot(x="Importance", y="Feature", data=feature_importance.head(15))
plt.title("Top 15 Most Important Features for Price Prediction")
plt.grid(True, axis="x")
plt.tight_layout()
plt.show()

## 9. Save Processed Data for Model Training

In [None]:
# Normalize the feature frame through the preprocessor's public process() call.
# process() is handed a dict of frames and its result is indexed with the same
# key, so the frame is wrapped before the call and unwrapped afterwards.
dataset_key = f"{symbol}_{timeframe}"
data_to_process = {dataset_key: df_with_features.copy()}
processed_data_dict = preprocessor.process(
    data_to_process,
    add_technical_indicators=False,  # indicators were already added earlier
    normalize="min_max",  # normalization method
    # NOTE(review): only df_clean had incomplete rows dropped; df_with_features is
    # passed here unfiltered, so confirm that fill_missing=None is intended.
    fill_missing=None,
)

# Unwrap the processed frame
df_normalized = processed_data_dict[dataset_key]

# Keep the feature columns plus 'close'. At this point df_with_features also
# carries 'daily_return' and 'target'; neither is selected, so the target has to
# be rebuilt by whatever consumes the saved data.
columns_to_save = feature_cols + ["close"]
df_to_save = df_normalized[columns_to_save].copy()

# Preview the normalized data
df_to_save.head()

In [None]:
# Write the normalized features to disk for the model-training notebook.
# The target column is not part of df_to_save; it is expected to be generated
# later, during model training sequence creation.
processed_data_path = "../data/processed/"
output_file = f"{processed_data_path}{symbol}_{timeframe}_processed.csv"

# Create the output directory if it does not exist yet, then write the CSV
os.makedirs(processed_data_path, exist_ok=True)
df_to_save.to_csv(output_file)
print(f"Processed data saved to: {output_file}")

## 10. Summary and Next Steps

In this notebook, we:
1. Loaded historical market data using the BTB DataLoader
2. Visualized price history and patterns
3. Generated technical indicators using the DataPreprocessor
4. Performed correlation analysis to understand feature relationships
5. Detected anomalies in the price data
6. Analyzed feature importance for predicting the next period's return
7. Prepared and saved normalized data for model training

Next steps:
- Use the processed data to train ML models in the model development notebook
- Test different feature combinations for improved model performance
- Experiment with different lookback windows and prediction horizons