In [5]:
import pandas as pd
import os
from datetime import datetime

def transform_crypto_data_with_date_check(csv_path_or_df, ticker):
    # Load the CSV or use DataFrame directly
    if isinstance(csv_path_or_df, str):
        df = pd.read_csv(csv_path_or_df)
    else:
        df = csv_path_or_df.copy()

    df.rename(columns={
        'datetime': 'date',
        'symbol': 'tic',
        'open': 'open',
        'high': 'high',
        'low': 'low',
        'close': 'close',
        'volume': 'volume'
    }, inplace=True)

    df['date'] = pd.to_datetime(df['date'])
    df['day'] = df['date'].dt.weekday
    df['date'] = df['date'].dt.date
    df['tic'] = ticker.lower()
    df = df[['date', 'close', 'high', 'low', 'open', 'volume', 'tic', 'day']]

    return df

def align_dataframes_on_common_range(*dfs):
    start_dates = [df['date'].min() for df in dfs]
    end_dates = [df['date'].max() for df in dfs]

    # common_start = max(start_dates)
    common_start = datetime.strptime("2015-02-01", "%Y-%m-%d").date()
    common_end = min(end_dates)

    aligned_dfs = [
        df[(df['date'] >= common_start) & (df['date'] <= common_end)].reset_index(drop=True)
        for df in dfs
    ]

    print(f"All DataFrames aligned to date range: {common_start} → {common_end}")
    return aligned_dfs


def build_crypto_dataset(
    data_dir="data",
    tickers=["SPY", "VO", "VB", "AGG", "VNQ", "GLD", "BIL", "VWO", "BTCUSD"],
    interval="daily",
    start_date=None,
    end_date=None,
    output_csv="data.csv"
):
    all_dfs = []

    for ticker in tickers:
        file_path = os.path.join(data_dir, ticker, f"{ticker}_{interval}.csv")
        if not os.path.exists(file_path):
            print(f" File not found: {file_path}")
            continue

        df = transform_crypto_data_with_date_check(file_path, ticker)

        # Filter by date range if specified
        if start_date:
            df = df[df['date'] >= pd.to_datetime(start_date).date()]
        if end_date:
            df = df[df['date'] <= pd.to_datetime(end_date).date()]

        all_dfs.append(df)

    if len(all_dfs) < 2:
        print("Need at least 2 tickers with valid data.")
        return None

    aligned_dfs = align_dataframes_on_common_range(*all_dfs)
    merged_df = pd.concat(aligned_dfs, ignore_index=True)
    merged_df.sort_values(by=['date', 'tic'], inplace=True)
    merged_df.to_csv(output_csv, index=False)

    print(f"Final merged dataset saved to: {output_csv}")
    return merged_df


In [6]:
build_crypto_dataset(data_dir="newdata")

All DataFrames aligned to date range: 2015-02-01 → 2025-04-11
Final merged dataset saved to: data.csv


Unnamed: 0,date,close,high,low,open,volume,tic,day
20520,2015-02-01,228.99,233.790,210.0000,218.67,7220.0,btcusd,6
7695,2015-02-02,112.20,112.230,112.0000,112.06,2792120.0,agg,0
15390,2015-02-02,91.46,91.480,91.4600,91.48,3557487.0,bil,0
20521,2015-02-02,237.83,240.100,220.8900,228.39,7421.0,btcusd,0
12825,2015-02-02,122.42,123.155,121.8200,121.84,8885189.0,gld,0
...,...,...,...,...,...,...,...,...
2564,2025-04-11,533.94,536.430,520.0700,523.01,97866334.0,spy,4
7694,2025-04-11,205.82,206.440,199.2843,202.69,1652602.0,vb,4
12824,2025-04-11,83.63,83.800,81.0700,82.05,5276100.0,vnq,4
5129,2025-04-11,243.62,244.330,236.5926,239.25,1396703.0,vo,4


In [16]:
import pandas as pd

# Load your CSV
df = pd.read_csv("data(Kang).csv")  # Replace with your actual file path

# Convert 'date' column to datetime.date
df['date'] = pd.to_datetime(df['date']).dt.date

# Group by date and count unique 'tic' values
tic_counts = df.groupby('date')['tic'].transform('count')

# Filter out rows where tic == 'btcusd' and it's the only ticker that day
filtered_df = df[~((df['tic'] == 'btcusd') & (tic_counts == 1))]

# Save to new CSV
filtered_df.to_csv("filtered_btcusd.csv", index=False)
