In [31]:
import ccxt
import pandas as pd
import pandas_ta as ta
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score

# Prettify plots: apply seaborn's 'whitegrid' style.
sns.set_style('whitegrid')

In [32]:
# -----------------------------------------------------------------------------
# 1. Configuration & Data Fetching
# -----------------------------------------------------------------------------
# Tunable inputs for the notebook; they are passed to fetch_historical_data()
# in the call further down this cell.
SYMBOL = 'ETH/USDT'   # market pair requested from the exchange
TIMEFRAME = '1h'      # candle interval
LIMIT = 1000          # number of candles requested in the single fetch call

# Initialize Exchange
# Shared ccxt Binance client created with the 'enableRateLimit' option switched
# on. fetch_historical_data() reads this module-level object.
exchange = ccxt.binance({'enableRateLimit': True})

def fetch_historical_data(symbol, timeframe, limit, client=None):
    """
    Fetch OHLCV candles and return them as a datetime-indexed DataFrame.

    Parameters
    ----------
    symbol : str
        Market pair, e.g. 'ETH/USDT'.
    timeframe : str
        Candle interval, e.g. '1h'.
    limit : int
        Number of candles to request in the single fetch_ohlcv call.
    client : object, optional
        Any object exposing ``fetch_ohlcv(symbol, timeframe, limit=...)``.
        Defaults to the module-level ``exchange`` client, so existing
        three-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Columns 'timestamp', 'open', 'high', 'low', 'close', 'volume',
        indexed by 'datetime' (parsed from the millisecond 'timestamp').

    Notes
    -----
    NOTE(review): timestamps are parsed as naive datetimes — presumably UTC;
    confirm against the exchange. The most recent candle returned may still be
    forming; confirm whether it should be excluded before modelling.
    """
    if client is None:
        # Fall back to the notebook-wide client defined in the config section.
        client = exchange

    print(f"Fetching {limit} {timeframe} candles for {symbol}...")
    ohlcv = client.fetch_ohlcv(symbol, timeframe, limit=limit)

    columns = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
    df = pd.DataFrame(ohlcv, columns=columns)
    df['datetime'] = pd.to_datetime(df['timestamp'], unit='ms')
    df = df.set_index('datetime')

    # Ensure numeric types. Convert column by column: a row-wise apply would
    # build one Series per candle and force every row to a common dtype.
    for col in columns[1:]:
        df[col] = pd.to_numeric(df[col])

    return df

# Pull the configured candles into `df`, the frame the later cells work on.
df = fetch_historical_data(SYMBOL, TIMEFRAME, LIMIT)
print(f"Data Loaded: {df.shape}")
df.head()  # last expression of the cell: shown as the cell output

Fetching 1000 1h candles for ETH/USDT...
Data Loaded: (1000, 6)


Unnamed: 0_level_0,timestamp,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-12-18 00:00:00,1766016000000,2833.5,2838.44,2826.35,2836.27,5773.9902
2025-12-18 01:00:00,1766019600000,2836.28,2836.28,2821.34,2827.2,7793.2702
2025-12-18 02:00:00,1766023200000,2827.21,2847.65,2822.0,2837.15,9395.8155
2025-12-18 03:00:00,1766026800000,2837.15,2838.17,2822.5,2832.06,7152.7251
2025-12-18 04:00:00,1766030400000,2832.06,2841.76,2831.64,2838.24,5700.637


In [33]:
# -----------------------------------------------------------------------------
# 2. Feature Engineering
# -----------------------------------------------------------------------------
# NOTE: this cell mutates `df` in place (adds columns, then drops rows).
# Re-running it without first re-running the fetch cell recomputes everything
# on the already trimmed frame, so every re-run drops additional rows.

# Calculate Log Returns
# Per-candle log return ln(close_t / close_{t-1}); the first row has no
# predecessor, so its value is NaN.
df['log_returns'] = np.log(df['close'] / df['close'].shift(1))

# Calculate Technical Indicators (RSI, MACD, ATR)
# Computed through the pandas_ta DataFrame accessor; append=True writes the
# result columns into `df` (their names are listed by the print below).
df.ta.rsi(length=14, append=True)
df.ta.macd(fast=12, slow=26, signal=9, append=True)
df.ta.atr(length=14, append=True)

# Drop NaNs generated by indicators
# dropna() with default arguments removes every row holding a NaN in any
# column, which also removes the first row (NaN log return).
df.dropna(inplace=True)

# Verify columns
print("Columns:", df.columns)

Columns: Index(['timestamp', 'open', 'high', 'low', 'close', 'volume', 'log_returns',
       'RSI_14', 'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_12_26_9', 'ATRr_14'],
      dtype='object')


In [None]:
import statsmodels.api as sm

# -----------------------------------------------------------------------------
# 1. Prepare the Variables
# -----------------------------------------------------------------------------
# DEPENDENT VARIABLE (Y): the NEXT candle's log return.
# shift(-1) moves each return up by one row, so a row's indicators (known at
# time t) sit next to the return realised over the following candle.
df['future_return'] = df['log_returns'].shift(-1)
Y = df['future_return']

# INDEPENDENT VARIABLES (X): indicator values observed on the current candle.
# .copy() keeps X detached from later changes to df.
X = df.loc[:, ['RSI_14', 'MACDh_12_26_9', 'ATRr_14']].copy()

# -----------------------------------------------------------------------------
# 2. Align target and features, drop incomplete rows
# -----------------------------------------------------------------------------
# The shift leaves a NaN in the final row of Y; dropping rows with any NaN after
# placing Y and X side by side keeps the two perfectly aligned.
data = pd.concat([Y, X], axis='columns').dropna(how='any')

Y_clean = data.loc[:, 'future_return']
X_clean = data.loc[:, ['RSI_14', 'MACDh_12_26_9', 'ATRr_14']]

# -----------------------------------------------------------------------------
# 3. Run OLS Regression
# -----------------------------------------------------------------------------
# statsmodels does not add an intercept on its own, so add the 'const' column
# explicitly before fitting.
X_clean = sm.add_constant(X_clean, prepend=True)

# Fit ordinary least squares: future_return ~ const + RSI + MACD histogram + ATR
model = sm.OLS(endog=Y_clean, exog=X_clean).fit()

# -----------------------------------------------------------------------------
# 4. Print the Summary
# -----------------------------------------------------------------------------
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:          future_return   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.072
Date:                Wed, 28 Jan 2026   Prob (F-statistic):              0.360
Time:                        18:23:53   Log-Likelihood:                 3759.6
No. Observations:                 966   AIC:                            -7511.
Df Residuals:                     962   BIC:                            -7492.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -0.0009      0.001     -0.839

In [36]:
# -----------------------------------------------------------------------------
# 3. New Hypothesis: Can we predict RISK (Volatility)?
# -----------------------------------------------------------------------------
# Y = Squared next-candle log return (a proxy for realized variance).
# The forward return is derived directly from 'log_returns', so this cell does
# not rely on the 'future_return' column that the return-regression cell adds
# to df; the values are identical to df['future_return'] ** 2.
df['future_volatility'] = df['log_returns'].shift(-1) ** 2
Y_vol = df['future_volatility']

# X = Current ATR (Average True Range), plus an explicit intercept column
# because statsmodels does not add one automatically.
X_vol = df[['ATRr_14']]
X_vol = sm.add_constant(X_vol)

# Align Data: the shift leaves a NaN target in the last row; drop it so the
# target and regressors cover exactly the same rows.
data_vol = pd.concat([Y_vol, X_vol], axis=1).dropna()

# Run OLS: future_volatility ~ const + ATRr_14
model_vol = sm.OLS(data_vol['future_volatility'], data_vol[['const', 'ATRr_14']]).fit()

print("\n--- Volatility Prediction Model ---")
print(model_vol.summary())


--- Volatility Prediction Model ---
                            OLS Regression Results                            
Dep. Variable:      future_volatility   R-squared:                       0.017
Model:                            OLS   Adj. R-squared:                  0.016
Method:                 Least Squares   F-statistic:                     17.10
Date:                Wed, 28 Jan 2026   Prob (F-statistic):           3.86e-05
Time:                        18:31:13   Log-Likelihood:                 7551.5
No. Observations:                 966   AIC:                        -1.510e+04
Df Residuals:                     964   BIC:                        -1.509e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -1.