# 0. Install and Import dependencies

In [7]:
!pip install finta



In [1]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# 1. Load BTCUSD Data 
https://www.coingecko.com/it/monete/bitcoin/historical_data?start=2025-01-01&end=2025-05-26
https://www.marketwatch.com/investing/cryptocurrency/btcusd/download-data?mod=mw_quote_ta


In [10]:
# Load DataFrame
df_btc = pd.read_csv('data/BTCUSD_processed.csv')
df_eth = pd.read_csv('data/ETHUSD_processed.csv')

df_btc.head()

Unnamed: 0,date,open,high,low,close,volume,sma,rsi,sma_200,distance_from_sma_200,volatility_100,position_in_200_range
0,2022-12-18,16706,16798,16669,16760,15086380000.0,17151.083333,43.94132,20502.19,-0.182526,0.029084,0.063269
1,2022-12-19,16760,16832,16531,16595,11134050000.0,17131.833333,41.757976,20433.79,-0.187865,0.029052,0.052746
2,2022-12-20,16595,17046,16288,16862,17719670000.0,17103.833333,46.399199,20370.34,-0.172228,0.029107,0.069775
3,2022-12-21,16861,16924,16738,16780,24938220000.0,17077.083333,45.207694,20306.07,-0.173646,0.02884,0.064545
4,2022-12-22,16780,16863,16566,16781,16062770000.0,17044.416667,45.226167,20239.8,-0.170891,0.027252,0.064609


# 2. Data Cleaning

#### BTC

In [11]:
# Ensure 'date' is datetime
df_btc['date'] = pd.to_datetime(df_btc['date'])

# 0. Check for missing dates (assuming daily frequency)
expected_dates_btc = pd.date_range(start=df_btc['date'].min(), end=df_btc['date'].max(), freq='D')
missing_dates_btc = expected_dates_btc.difference(df_btc['date'])
if not missing_dates_btc.empty:
    print("Missing BTC dates:")
    print(missing_dates_btc.date.tolist())

# 1. Check for zero-variance columns
zero_var_cols_btc = df_btc.columns[df_btc.nunique() <= 1]
if not zero_var_cols_btc.empty:
    print("Zero-variance BTC columns:")
    print(zero_var_cols_btc.tolist())
    # df_btc = df_btc.drop(columns=zero_var_cols_btc)

# 2. Check for near-zero variance columns (numeric only)
numeric_df_btc = df_btc.select_dtypes(include='number')
threshold_btc = 0.01
low_var_cols_btc = numeric_df_btc.var()[numeric_df_btc.var() <= threshold_btc].index
if not low_var_cols_btc.empty:
    print("Near-zero variance BTC columns:")
    print(low_var_cols_btc.tolist())
    # df_btc = df_btc.drop(columns=low_var_cols_btc)

# 3. Check for duplicate rows
if df_btc.duplicated().any():
    print("Duplicate BTC rows found:", df_btc.duplicated().sum())
    # df_btc = df_btc.drop_duplicates()

# 4. Check for NaNs
if df_btc.isna().any().any():
    print("Missing BTC values (NaNs) found:")
    print(df_btc.isna().sum()[df_btc.isna().sum() > 0])

# 5. Check for zero values
zero_mask_btc = (numeric_df_btc == 0)
if zero_mask_btc.any().any():
    print("Zero BTC values found in columns:")
    print(zero_mask_btc.sum()[zero_mask_btc.sum() > 0])

Near-zero variance BTC columns:
['volatility_100']


#### ETH

In [17]:
# Ensure 'date' is datetime
df_eth['date'] = pd.to_datetime(df_eth['date'])

# 0. Check for missing dates (assuming daily frequency)
expected_dates_eth = pd.date_range(start=df_eth['date'].min(), end=df_eth['date'].max(), freq='D')
missing_dates_eth = expected_dates_eth.difference(df_eth['date'])
if not missing_dates_eth.empty:
    print("Missing ETH dates:")
    print(missing_dates_eth.date.tolist())

# 1. Check for zero-variance columns
zero_var_cols_eth = df_eth.columns[df_eth.nunique() <= 1]
if not zero_var_cols_eth.empty:
    print("Zero-variance ETH columns:")
    print(zero_var_cols_eth.tolist())
    # df_eth = df_eth.drop(columns=zero_var_cols_eth)

# 2. Check for near-zero variance columns (numeric only)
numeric_df_eth = df_eth.select_dtypes(include='number')
threshold_eth = 0.01
low_var_cols_eth = numeric_df_eth.var()[numeric_df_eth.var() <= threshold_eth].index
if not low_var_cols_eth.empty:
    print("Near-zero variance ETH columns:")
    print(low_var_cols_eth.tolist())
    # df_eth = df_eth.drop(columns=low_var_cols_eth)

# 3. Check for duplicate rows
if df_eth.duplicated().any():
    print("Duplicate ETH rows found:", df_eth.duplicated().sum())
    # df_eth = df_eth.drop_duplicates()

# 4. Check for NaNs
if df_eth.isna().any().any():
    print("Missing ETH values (NaNs) found:")
    print(df_eth.isna().sum()[df_eth.isna().sum() > 0])

# 5. Check for zero values
zero_mask_eth = (numeric_df_eth == 0)
if zero_mask_eth.any().any():
    print("Zero ETH values found in columns:")
    print(zero_mask_eth.sum()[zero_mask_eth.sum() > 0])

Near-zero variance ETH columns:
['volatility_100']
Zero ETH values found in columns:
position_in_200_range    13
dtype: int64


In [19]:
import seaborn as sns

# Compute correlation matrix (numeric only)
corr_eth = df_eth.select_dtypes(include='number').corr()

# Find pairs of features with high correlation
threshold_corr_eth = 0.9
corr_pairs_eth = corr_eth.abs().unstack().sort_values(ascending=False)
high_corr_pairs_eth = corr_pairs_eth[(corr_pairs_eth < 1.0) & (corr_pairs_eth > threshold_corr_eth)]

print("Highly correlated ETH feature pairs:")
print(high_corr_pairs_eth)

Highly correlated ETH feature pairs:
high                   open                     0.997301
open                   high                     0.997301
high                   close                    0.997243
close                  high                     0.997243
low                    close                    0.996735
close                  low                      0.996735
open                   low                      0.995524
low                    open                     0.995524
high                   low                      0.994964
low                    high                     0.994964
open                   close                    0.993721
close                  open                     0.993721
sma                    open                     0.982882
open                   sma                      0.982882
sma                    high                     0.981675
high                   sma                      0.981675
close                  sma                      0.9