# In [None]:
# Phase 2: Crypto Market Intelligence System
# Full ML Pipeline: Data → Features → Models → Evaluation

# ============================================================================
# SECTION 1: SETUP & DATA LOADING
# ============================================================================

import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, confusion_matrix, classification_report
)
import ta
import warnings
warnings.filterwarnings('ignore')

# Plot defaults: seaborn white-grid theme and a wide default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Pipeline banner
print("=" * 80, "PHASE 2: CRYPTO MARKET INTELLIGENCE SYSTEM", "=" * 80, sep="\n")

# ============================================================================
# SECTION 2: DATA COLLECTION
# ============================================================================

print("\n[1] FETCHING DATA FROM COINGECKO...")

# CoinGecko market-chart endpoint, queried for 365 days of daily BTC/USD data
url = "https://api.coingecko.com/api/v3/coins/bitcoin/market_chart"
params = {
    'vs_currency': 'usd',
    'days': '365',
    'interval': 'daily'
}

# requests applies no timeout by default; without one a stalled connection
# would block the whole pipeline indefinitely.
r = requests.get(url, params=params, timeout=30)
# Fail fast on HTTP errors (e.g. rate limiting) instead of letting an error
# payload surface later as an opaque KeyError.
r.raise_for_status()
data = r.json()

# Validate the payload before indexing into it
missing_keys = [key for key in ('prices', 'total_volumes') if key not in data]
if missing_keys:
    raise ValueError(f"CoinGecko response is missing expected keys: {missing_keys}")

# Convert to DataFrame: each entry is loaded as a (timestamp, value) row
prices = pd.DataFrame(data['prices'], columns=['timestamp', 'price'])
volumes = pd.DataFrame(data['total_volumes'], columns=['timestamp', 'volume'])

# Merge price and volume on their shared timestamp
df = prices.merge(volumes, on='timestamp')

# Convert the millisecond epoch timestamp to datetime and drop the raw column
df['date'] = pd.to_datetime(df['timestamp'], unit='ms')
df = df.drop('timestamp', axis=1)

print(f"✓ Loaded {len(df)} days of Bitcoin data")
print(f"Date range: {df['date'].min().date()} to {df['date'].max().date()}")
print(f"\nFirst few rows:\n{df.head()}")

# ============================================================================
# SECTION 3: FEATURE ENGINEERING
# ============================================================================

print("\n" + "=" * 80)
print("[2] FEATURE ENGINEERING")
print("=" * 80)

# Work on a date-ordered copy so the raw frame stays untouched
df_features = df.copy().sort_values('date').reset_index(drop=True)

# --- PRICE-BASED FEATURES ---
print("\n[Price-Based Features]")

price_series = df_features['price']

# One-day percentage change in price
df_features['daily_return'] = price_series.pct_change()
print("  ✓ Daily return")

# Percentage change over 5-row and 10-row horizons
df_features['momentum_5d'] = price_series.pct_change(periods=5)
df_features['momentum_10d'] = price_series.pct_change(periods=10)
print("  ✓ Momentum (5d, 10d)")

# Standard deviation of daily returns over a 30-row rolling window
df_features['volatility_30d'] = df_features['daily_return'].rolling(window=30).std()
print("  ✓ Volatility (30d rolling)")

# Spread between the 20-row rolling max and min, relative to the current price
window_20 = price_series.rolling(window=20)
df_features['price_range'] = (window_20.max() - window_20.min()) / price_series
print("  ✓ Price range (20d)")

# --- TECHNICAL INDICATORS (using TA library) ---
print("\n[Technical Indicators]")

price_series = df_features['price']

# Simple moving averages of price over 20 and 50 rows
for sma_col, sma_window in (('sma_20', 20), ('sma_50', 50)):
    df_features[sma_col] = ta.trend.sma_indicator(price_series, window=sma_window)
print("  ✓ SMA (20, 50)")

# Exponential moving averages of price over 12 and 26 rows
for ema_col, ema_window in (('ema_12', 12), ('ema_26', 26)):
    df_features[ema_col] = ta.trend.ema_indicator(price_series, window=ema_window)
print("  ✓ EMA (12, 26)")

# MACD line, computed with the library's default windows
df_features['macd'] = ta.trend.macd(price_series)
print("  ✓ MACD")

# 14-row Relative Strength Index
df_features['rsi_14'] = ta.momentum.rsi(price_series, window=14)
print("  ✓ RSI (14)")

# Bollinger Bands (20-row window, 2 standard deviations): upper band,
# lower band, and the price's relative position between them
bb = ta.volatility.BollingerBands(price_series, window=20, window_dev=2)
band_getters = (
    ('bb_upper', bb.bollinger_hband),
    ('bb_lower', bb.bollinger_lband),
    ('bb_pct', bb.bollinger_pband),
)
for band_col, band_getter in band_getters:
    df_features[band_col] = band_getter()
print("  ✓ Bollinger Bands")

# --- VOLUME-BASED FEATURES ---
print("\n[Volume-Based Features]")

# Volume ratio: current volume relative to its 20-row rolling mean
df_features['volume_ratio'] = df_features['volume'] / df_features['volume'].rolling(20).mean()
print("  ✓ Volume ratio")

# Money Flow Index (price/volume oscillator).
# ta.volume.money_flow_index takes separate high, low, close and volume
# series; passing only (price, volume) raises a TypeError. This dataset has a
# single price per row, so that series stands in for high, low and close.
# The column keeps its historical name 'pvt' because later sections read it
# under that name, even though the value stored is the Money Flow Index.
df_features['pvt'] = ta.volume.money_flow_index(
    high=df_features['price'],
    low=df_features['price'],
    close=df_features['price'],
    volume=df_features['volume'],
    window=14,
)
print("  ✓ Price-Volume Trend (Money Flow Index)")

# --- DERIVED FEATURES ---
print("\n[Derived Features]")

# Ratio of price to each simple moving average (distance from trend)
for sma_col, ratio_col in (('sma_20', 'price_to_sma20'), ('sma_50', 'price_to_sma50')):
    df_features[ratio_col] = df_features['price'] / df_features[sma_col]
print("  ✓ Price to SMA ratios")

# 1 when the 12-row EMA is above the 26-row EMA, otherwise 0
df_features['ema_signal'] = df_features['ema_12'].gt(df_features['ema_26']).astype(int)
print("  ✓ EMA crossover signal")

# Drop every row that still contains a NaN (rolling windows and indicators
# have no value until enough history is available)
initial_rows = len(df_features)
df_features = df_features.dropna(axis=0, how='any')
removed_rows = initial_rows - len(df_features)
print(f"\nRemoved {removed_rows} rows with NaN values (technical indicators need historical data)")
print(f"Final dataset: {len(df_features)} rows")

# ============================================================================
# SECTION 4: TARGET VARIABLE CREATION
# ============================================================================

print("\n" + "=" * 80)
print("[3] TARGET VARIABLE CREATION")
print("=" * 80)

# Price of the following row, aligned to the current row
next_day_price = df_features['price'].shift(-1)

# REGRESSION TARGET: the next row's price
df_features['target_price_tomorrow'] = next_day_price

# CLASSIFICATION TARGET: 1 if the next price is strictly higher, else 0
df_features['target_direction'] = next_day_price.gt(df_features['price']).astype(int)

# The final row has no following price, so it cannot carry a target
df_features = df_features.iloc[:-1].reset_index(drop=True)

print("✓ Regression target: next day's price")
print("✓ Classification target: up (1) or down (0)")
print("\nTarget distribution:")
up_flags = df_features['target_direction']
down_flags = 1 - up_flags
print(f"  Up days: {up_flags.sum()} ({up_flags.mean():.1%})")
print(f"  Down days: {down_flags.sum()} ({down_flags.mean():.1%})")

# ============================================================================
# SECTION 5: EXPLORATORY DATA ANALYSIS
# ============================================================================

print("\n" + "=" * 80)
print("[4] FEATURE CORRELATION & ANALYSIS")
print("=" * 80)

# Subset of columns included in the correlation study
key_features = [
    'price',
    'daily_return',
    'volatility_30d',
    'sma_20',
    'rsi_14',
    'volume_ratio',
    'pvt',
    'target_price_tomorrow',
]

# Pairwise correlations between the selected columns
correlation_matrix = df_features.loc[:, key_features].corr()
target_correlations = correlation_matrix['target_price_tomorrow']
print("\nFeature Correlations with Target Price:")
print(target_correlations.sort_values(ascending=False))

# Annotated heatmap of the full correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# ============================================================================
# SECTION 6: DATA PREPARATION FOR MODELING
# ============================================================================

print("\n" + "=" * 80)
print("[5] DATA PREPARATION")
print("=" * 80)

# Select features (exclude price, volume, date, and both targets)
feature_cols = [col for col in df_features.columns
                if col not in ['date', 'price', 'volume', 'target_price_tomorrow', 'target_direction']]

X = df_features[feature_cols].copy()
y_regression = df_features['target_price_tomorrow'].copy()
y_classification = df_features['target_direction'].copy()

print(f"\n✓ Selected {len(feature_cols)} features for modeling")
print(f"  Features: {', '.join(feature_cols[:5])}... (showing first 5)")

# Train-test split (80-20), chronological.
# The rows are a date-ordered time series whose features are built from
# overlapping rolling windows. A shuffled split would put future rows in the
# training set and near-duplicates of training rows in the test set, which
# inflates every test metric. shuffle=False keeps the first 80% of rows for
# training and the most recent 20% for testing.
X_train, X_test, y_reg_train, y_reg_test, y_clf_train, y_clf_test = train_test_split(
    X, y_regression, y_classification, test_size=0.2, shuffle=False
)

print(f"\nTrain-test split (80-20):")
print(f"  Training set: {len(X_train)} samples")
print(f"  Test set: {len(X_test)} samples")

# Scale features (important for linear models). The scaler is fitted on the
# training rows only, so test-set statistics do not leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\n✓ Features scaled using StandardScaler")

# ============================================================================
# SECTION 7: REGRESSION MODELS (Price Prediction)
# ============================================================================

print("\n" + "=" * 80)
print("[6] REGRESSION MODELS - PREDICTING NEXT DAY'S PRICE")
print("=" * 80)

# Candidate regressors, keyed by display name
regression_models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'Decision Tree': DecisionTreeRegressor(max_depth=10, random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
}

# Per-model fitted estimator and metrics, keyed by display name
regression_results = {}

for model_name in regression_models:
    regressor = regression_models[model_name]
    print(f"\n{model_name}:")

    # Fit on the scaled training data, then predict both partitions
    regressor.fit(X_train_scaled, y_reg_train)
    fitted_train = regressor.predict(X_train_scaled)
    fitted_test = regressor.predict(X_test_scaled)

    # Test MSE is computed once and reused for the RMSE
    held_out_mse = mean_squared_error(y_reg_test, fitted_test)
    metrics = {
        'model': regressor,
        'train_mse': mean_squared_error(y_reg_train, fitted_train),
        'test_mse': held_out_mse,
        'test_rmse': np.sqrt(held_out_mse),
        'test_mae': mean_absolute_error(y_reg_test, fitted_test),
        'test_r2': r2_score(y_reg_test, fitted_test),
    }
    regression_results[model_name] = metrics

    print(f"  Train MSE: {metrics['train_mse']:.2f}")
    print(f"  Test RMSE: {metrics['test_rmse']:.2f}")
    print(f"  Test MAE: ${metrics['test_mae']:.2f}")
    print(f"  Test R²: {metrics['test_r2']:.4f}")

# ============================================================================
# SECTION 8: CLASSIFICATION MODELS (Up/Down Prediction)
# ============================================================================

print("\n" + "=" * 80)
print("[7] CLASSIFICATION MODELS - PREDICTING UP/DOWN MOVEMENT")
print("=" * 80)

# Candidate classifiers, keyed by display name
classification_models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(max_depth=10, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
}

# Per-model fitted estimator and metrics, keyed by display name
classification_results = {}

for model_name in classification_models:
    classifier = classification_models[model_name]
    print(f"\n{model_name}:")

    # Fit on the scaled training data
    classifier.fit(X_train_scaled, y_clf_train)

    # Hard labels for the threshold metrics; the second probability column
    # feeds the ROC-AUC
    predicted_labels = classifier.predict(X_test_scaled)
    positive_proba = classifier.predict_proba(X_test_scaled)[:, 1]

    metrics = {
        'model': classifier,
        'accuracy': accuracy_score(y_clf_test, predicted_labels),
        'precision': precision_score(y_clf_test, predicted_labels),
        'recall': recall_score(y_clf_test, predicted_labels),
        'f1': f1_score(y_clf_test, predicted_labels),
        'roc_auc': roc_auc_score(y_clf_test, positive_proba),
    }
    classification_results[model_name] = metrics

    print(f"  Accuracy: {metrics['accuracy']:.4f}")
    print(f"  Precision: {metrics['precision']:.4f}")
    print(f"  Recall: {metrics['recall']:.4f}")
    print(f"  F1 Score: {metrics['f1']:.4f}")
    print(f"  ROC-AUC: {metrics['roc_auc']:.4f}")

# ============================================================================
# SECTION 9: MODEL COMPARISON & VISUALIZATION
# ============================================================================

print("\n" + "=" * 80)
print("[8] MODEL COMPARISON")
print("=" * 80)

# Regression table: one row per model, ordered by ascending test RMSE
print("\nREGRESSION MODELS COMPARISON:")
reg_table = pd.DataFrame(regression_results).T
reg_comparison = reg_table[['test_rmse', 'test_mae', 'test_r2']].sort_values('test_rmse')
print(reg_comparison)

# Classification table: one row per model, ordered by descending ROC-AUC
print("\nCLASSIFICATION MODELS COMPARISON:")
clf_table = pd.DataFrame(classification_results).T
clf_comparison = clf_table[['accuracy', 'precision', 'recall', 'f1', 'roc_auc']].sort_values(
    'roc_auc', ascending=False
)
print(clf_comparison)

# Side-by-side horizontal bar charts: regression RMSE (left), classifier F1 (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = (
    (axes[0], regression_results, 'test_rmse', 'steelblue',
     'RMSE (Lower is Better)', 'Regression Models - RMSE Comparison'),
    (axes[1], classification_results, 'f1', 'coral',
     'F1 Score (Higher is Better)', 'Classification Models - F1 Score Comparison'),
)
for panel_ax, results, metric_key, bar_color, x_label, panel_title in panels:
    # Bars follow the insertion order of the results dict
    model_labels = list(results)
    metric_values = [results[label][metric_key] for label in model_labels]
    panel_ax.barh(model_labels, metric_values, color=bar_color)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_title(panel_title)
    # Put the first model at the top of the chart
    panel_ax.invert_yaxis()

plt.tight_layout()
plt.show()

# ============================================================================
# SECTION 10: FEATURE IMPORTANCE
# ============================================================================

print("\n" + "=" * 80)
print("[9] FEATURE IMPORTANCE ANALYSIS")
print("=" * 80)

# Number of features shown in the tables and charts below
top_n = 10

# Importances are read from the Random Forest models because they expose
# `feature_importances_`; this choice is not based on the evaluation metrics.
best_reg_model = regression_models['Random Forest']
best_clf_model = classification_models['Random Forest']

# The forests are normally already fitted by the model-training sections, so
# fit only when one has not been trained yet (a fitted forest exposes
# `estimators_`). This avoids retraining identical models.
for forest, train_target in ((best_reg_model, y_reg_train), (best_clf_model, y_clf_train)):
    if not hasattr(forest, 'estimators_'):
        forest.fit(X_train_scaled, train_target)

# Feature importance tables, most important feature first
reg_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': best_reg_model.feature_importances_
}).sort_values('importance', ascending=False)

clf_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': best_clf_model.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\nTop {top_n} Features - Regression:")
print(reg_importance.head(top_n))

print(f"\nTop {top_n} Features - Classification:")
print(clf_importance.head(top_n))

# Visualization: feature importance, one panel per model
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Re-sort ascending so the largest bar is drawn at the top of each barh chart
reg_importance.head(top_n).sort_values('importance').plot(
    kind='barh', x='feature', y='importance', ax=axes[0], legend=False, color='steelblue'
)
axes[0].set_title(f'Top {top_n} Features - Regression Model')
axes[0].set_xlabel('Importance')

clf_importance.head(top_n).sort_values('importance').plot(
    kind='barh', x='feature', y='importance', ax=axes[1], legend=False, color='coral'
)
axes[1].set_title(f'Top {top_n} Features - Classification Model')
axes[1].set_xlabel('Importance')

plt.tight_layout()
plt.show()

# ============================================================================
# SECTION 11: SUMMARY & INSIGHTS
# ============================================================================

print("\n" + "=" * 80)
print("[10] SUMMARY & KEY INSIGHTS")
print("=" * 80)

# Pick the winners from the measured metrics instead of assuming a model:
# lowest test RMSE for regression, highest ROC-AUC for classification
# (the same criteria the comparison tables are sorted by).
best_reg_name = min(regression_results, key=lambda m: regression_results[m]['test_rmse'])
best_clf_name = max(classification_results, key=lambda m: classification_results[m]['roc_auc'])

best_reg = regression_results[best_reg_name]
best_clf = classification_results[best_clf_name]

# NOTE: the key features listed below come from `reg_importance`, i.e. the
# Random Forest regressor's importances, regardless of which model won.
print(f"""
BEST REGRESSION MODEL: {best_reg_name}
  - RMSE: ${best_reg['test_rmse']:.2f}
  - MAE: ${best_reg['test_mae']:.2f}
  - R² Score: {best_reg['test_r2']:.4f}
  
BEST CLASSIFICATION MODEL: {best_clf_name}
  - Accuracy: {best_clf['accuracy']:.4f}
  - F1 Score: {best_clf['f1']:.4f}
  - ROC-AUC: {best_clf['roc_auc']:.4f}

KEY FEATURES FOR PREDICTION:
  1. {reg_importance.iloc[0]['feature']} (Importance: {reg_importance.iloc[0]['importance']:.3f})
  2. {reg_importance.iloc[1]['feature']} (Importance: {reg_importance.iloc[1]['importance']:.3f})
  3. {reg_importance.iloc[2]['feature']} (Importance: {reg_importance.iloc[2]['importance']:.3f})
""")

print("=" * 80)
print("PHASE 2 PIPELINE COMPLETE")
print("=" * 80)