# 1. ETHUSD Minute Data
- Resampling into Daily Data
- Daily RV
- daily returns (pct change)

In [2]:
import pandas as pd
import numpy as np
import os
import io
import requests
import pandas as pd
import numpy as np
from dotenv import load_dotenv


# File path for ETH‑USD minute data
ethusd_file_path = "../../data/raw/minute/ethusd.csv"

# Load the minute-level data
data = pd.read_csv(ethusd_file_path)

# Convert Unix Timestamp (milliseconds) to datetime (keeping it as a column)
data['time'] = pd.to_datetime(data['time'], unit='ms')

# Calculate intraday log returns
data['Log Return'] = np.log(data['close'] / data['close'].shift(1))
data.dropna(subset=['Log Return'], inplace=True)

# Set 'time' as index temporarily to resample
data = data.set_index('time')

# Calculate daily realized variance from log returns
rv_1d = data['Log Return'].resample('1D').apply(lambda x: np.sum(x**2))

# Aggregate minute data into daily data:
daily_data = data.resample('1D').agg({
    'open': 'first',        # First open price of the day
    'close': 'last',        # Last close price of the day
    'high': 'max',          # Highest price of the day
    'low': 'min',           # Lowest price of the day
    'volume': 'sum',        # Total volume for the day
})

# Add daily realized variance and rename the column
daily_data['1D RV'] = rv_1d
daily_data.rename(columns={'1D RV': 'RV_d,t'}, inplace=True)

# Drop days with incomplete data
daily_data.dropna(subset=['open', 'close', 'high', 'low', 'volume', 'RV_d,t'], inplace=True)

# Calculate daily returns from the daily close price
daily_data['daily_return'] = daily_data['close'].pct_change()

# Compute rolling averages for realized variance: 7-day and 30-day
daily_data['RV_w,t'] = daily_data['RV_d,t'].rolling(window=7).mean()
daily_data['RV_m,t'] = daily_data['RV_d,t'].rolling(window=30).mean()

# Apply logarithmic transformation to the realized variance measures
daily_data['ln_RV_d,t'] = np.log(daily_data['RV_d,t'])
daily_data['ln_RV_w,t'] = np.log(daily_data['RV_w,t'])
daily_data['ln_RV_m,t'] = np.log(daily_data['RV_m,t'])

# Reset the index so that 'time' becomes a column again
daily_data = daily_data.reset_index()


In [3]:
def calculate_parkinson(data, window):
    """
    Calculate Parkinson Volatility using high and low prices over a rolling window.
    Haven't found a use for this yet.
    
    Parameters:
    - data: DataFrame with 'High' and 'Low' columns.
    - window: Rolling window size for Parkinson Volatility (e.g., 7 or 30 days).
    
    Returns:
    - Series containing Parkinson Volatility values.
    """
    parkinson_vol = np.sqrt(
        (1 / (4 * np.log(2) * window)) *
        (np.log(data['High'] / data['Low']) ** 2).rolling(window=window).sum()
    )
    return parkinson_vol


# 2. Fred MD data
We will be using the **PublicDataReader** package to pull Fred MD data. You will need to use the FRED API key which can be obtained [here](https://fredaccount.stlouisfed.org/apikey). Store the API key as a local variable using .env file

## 2.1 Creating .env and pulling data
Uncomment the code and put it in the .env file

In [4]:
# FRED_API="your_fred_api_key_here"

## 2.2 Download the full dataset from Fred MD


In [5]:
# Load API Key
load_dotenv()
api_key = os.getenv("FRED_API")

# Download the full dataset
fred_md_url = "https://files.stlouisfed.org/files/htdocs/fred-md/monthly/current.csv"

response = requests.get(fred_md_url)

# Need to use io.StringIO
df = pd.read_csv(io.StringIO(response.content.decode('utf-8')))

# Too many white spaces in the column names
df.columns = df.columns.str.strip()

# Convert 'sasdate' to datetime
df["sasdate"] = pd.to_datetime(df["sasdate"], errors="coerce")

# Remove the sasdate rows with NaT
df = df.dropna(subset=["sasdate"])

# Remove the sasdate rows that are not dates
df = df[df["sasdate"].dt.year > 1960]


  df["sasdate"] = pd.to_datetime(df["sasdate"], errors="coerce")


# 3. Merge the datasets and then save

In [6]:
daily_data['month_year'] = pd.to_datetime(daily_data['time']).dt.to_period('M')
df['month_year'] = df['sasdate'].dt.to_period('M')

# Merge the data using the helper column 'month_year'
merged_data = pd.merge(daily_data, df, on='month_year', how='inner')

# Drop the helper column 'month_year'
merged_data.drop(columns='month_year', inplace=True)

# Drop sasdate column (Repeated with time column)
merged_data.drop(columns='sasdate', inplace=True)

# Save the merged data to a CSV file
merged_data.to_csv("../../data/ethusd_group_project.csv", index=False)


# Feature Selection

## Adding lagged data as a new feature

In [7]:
data = pd.read_csv("../../data/ethusd_group_project.csv")

# Lag the daily, weekly, and monthly rv measures by one day
data['ln_RV_d,t-1'] = data['ln_RV_d,t'].shift(1)
data['ln_RV_w,t-1'] = data['ln_RV_w,t'].shift(1)
data['ln_RV_m,t-1'] = data['ln_RV_m,t'].shift(1)

# Do it for the non-logarithmic RV measures as well
data['RV_d,t-1'] = data['RV_d,t'].shift(1)
data['RV_w,t-1'] = data['RV_w,t'].shift(1)
data['RV_m,t-1'] = data['RV_m,t'].shift(1)


In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV

# exclude warnings
import warnings
warnings.filterwarnings('ignore')

data = pd.read_csv("../../data/ethusd_group_project.csv")

data = data.dropna()
# Also exclude the lagged RV measures
to_exclude = ['time', 'ln_RV_d,t', 'ln_RV_w,t', 'ln_RV_m,t', 'RV_d,t', 'RV_w,t', 'RV_m,t']
features = [col for col in data.columns if col not in to_exclude]  # Exclude time & target
X = data[features].values
y = data['ln_RV_d,t'].values  # Predicting next day's realized variance

# Standardize Features
scaler_X = StandardScaler()
scaler_y = StandardScaler()
X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

# Apply LASSO Regression with Cross-Validation
lasso = LassoCV(
    alphas=np.logspace(-5, 1, 50), 
    cv=5, 
    max_iter=10000,  
    tol=1e-4,  
    random_state=42
).fit(X_scaled, y_scaled)

print(lasso.coef_)
# Select Important Features
selected_features = np.array(features)[lasso.coef_ != 0]  # Keep non-zero coefficients
print(f"Selected {len(selected_features)} features: {selected_features}")

# Create a New Dataset with Selected Features
data_selected = data[to_exclude + list(selected_features)]
data_selected.to_csv("../../data/ethusd_lasso_selected.csv", index=False)

print("Feature selection complete! Saved dataset with important features.")

Selected 4 features: ['volume' 'CLF16OV' 'UEMPMEAN' 'AAAFFM']
Feature selection complete! Saved dataset with important features.
