In [1]:
import pandas as pd
import os
from datetime import datetime

def transform_data_with_date_check(csv_path_or_df, ticker):
    """Normalise one ticker's price data into the merged-dataset column layout.

    Parameters
    ----------
    csv_path_or_df : str, os.PathLike or pandas.DataFrame
        Path of a CSV file to load, or an already-loaded frame. A frame is
        copied first, so the caller's object is never modified.
    ticker : str
        Symbol written, lower-cased, into the ``tic`` column of every row.
        It overwrites any value that came from a ``symbol`` column.

    Returns
    -------
    pandas.DataFrame
        Exactly the columns ``date, close, high, low, open, volume, tic, day``
        in that order. ``date`` holds ``datetime.date`` objects and ``day`` is
        the weekday number from ``Series.dt.weekday`` (Monday is 0).

    Raises
    ------
    KeyError
        If the input has no ``datetime``/``date`` column or lacks any of the
        price/volume columns selected at the end.
    """
    # Accept pathlib.Path and other path-like objects as well as plain
    # strings; anything else is treated as a DataFrame.
    if isinstance(csv_path_or_df, (str, os.PathLike)):
        df = pd.read_csv(csv_path_or_df)
    else:
        df = csv_path_or_df.copy()

    # Only these two names differ from the target layout; the price and
    # volume columns already carry their final names.
    df = df.rename(columns={'datetime': 'date', 'symbol': 'tic'})

    # Parse once, then derive both the weekday and the plain calendar date
    # from the same parsed values.
    parsed = pd.to_datetime(df['date'])
    df['day'] = parsed.dt.weekday
    df['date'] = parsed.dt.date
    df['tic'] = ticker.lower()

    return df[['date', 'close', 'high', 'low', 'open', 'volume', 'tic', 'day']]

def align_dataframes_on_common_range(dfs, forced_start_date=None):
    """Trim every frame to the date window that all frames share.

    The window ends at the earliest of the frames' last dates. It starts at
    ``forced_start_date`` when that is given (truthy), otherwise at the
    latest of the frames' first dates. Both bounds are inclusive.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames with a ``date`` column whose values can be compared with each
        other and with ``forced_start_date``.
    forced_start_date : optional
        Start bound used instead of the computed one. It is compared directly
        against the ``date`` column, so it must be of a comparable type.

    Returns
    -------
    list of pandas.DataFrame
        One trimmed frame per input, in input order, each with a fresh
        0-based index. An empty input yields an empty list.
    """
    # Materialise once so generators can be walked several times.
    dfs = list(dfs)
    if not dfs:
        # No frames means no range to compute; min()/max() would raise on
        # an empty sequence.
        return []

    if forced_start_date:
        common_start = forced_start_date
    else:
        common_start = max(df['date'].min() for df in dfs)
    common_end = min(df['date'].max() for df in dfs)

    aligned_dfs = []
    for df in dfs:
        in_window = (df['date'] >= common_start) & (df['date'] <= common_end)
        aligned_dfs.append(df[in_window].reset_index(drop=True))

    print(f"[INFO] Aligned to common date range: {common_start} → {common_end}")
    return aligned_dfs

def build_dataset(
    data_dir="data",
    tickers=None,
    interval="daily",
    start_date=None,
    end_date=None,
    output_csv="merged_data.csv",
    force_common_start=None
):
    """Load per-ticker CSVs, align them on a shared date range and merge them.

    For each ticker the file ``<data_dir>/<ticker>/<ticker>_<interval>.csv``
    is loaded through ``transform_data_with_date_check``. Tickers whose file
    is missing are skipped with a warning. The loaded frames are trimmed by
    ``align_dataframes_on_common_range``, concatenated, sorted by
    ``date`` then ``tic`` and written to ``output_csv`` without the index.

    Parameters
    ----------
    data_dir : str
        Root directory containing one sub-directory per ticker.
    tickers : list of str, optional
        Symbols to load. Defaults to the built-in nine-symbol list below.
    interval : str
        Suffix used in the CSV file name.
    start_date, end_date : optional
        Inclusive per-ticker bounds applied before alignment. Anything
        ``pd.to_datetime`` accepts; falsy values disable the bound. They are
        parsed once up front, so an unparseable value raises before any file
        is read.
    output_csv : str
        Destination path for the merged CSV.
    force_common_start : optional
        Forwarded as ``forced_start_date`` to the alignment step.

    Returns
    -------
    pandas.DataFrame or None
        The merged, sorted frame, or ``None`` (after printing an error and
        writing nothing) when fewer than two tickers could be loaded.
    """
    if tickers is None:
        tickers = ["SPY", "VO", "VB", "AGG", "VNQ", "GLD", "BIL", "VWO", "BTCUSD"]

    # The bounds do not depend on the ticker, so convert them once here
    # rather than on every loop iteration.
    start_bound = pd.to_datetime(start_date).date() if start_date else None
    end_bound = pd.to_datetime(end_date).date() if end_date else None

    all_dfs = []

    for ticker in tickers:
        file_path = os.path.join(data_dir, ticker, f"{ticker}_{interval}.csv")
        # isfile (not exists) so that a directory sitting at the expected
        # path is reported as missing instead of failing inside read_csv.
        if not os.path.isfile(file_path):
            print(f"[Warning] File not found: {file_path}")
            continue

        df = transform_data_with_date_check(file_path, ticker)

        # Optional per-ticker window, applied before the common alignment.
        if start_bound is not None:
            df = df[df['date'] >= start_bound]
        if end_bound is not None:
            df = df[df['date'] <= end_bound]

        all_dfs.append(df)

    if len(all_dfs) < 2:
        print("[Error] Need at least 2 tickers with valid data.")
        return None

    aligned_dfs = align_dataframes_on_common_range(all_dfs, forced_start_date=force_common_start)
    merged_df = pd.concat(aligned_dfs, ignore_index=True).sort_values(by=['date', 'tic'])
    merged_df.to_csv(output_csv, index=False)

    print(f"[SUCCESS] Merged dataset saved to: {output_csv}")
    return merged_df


In [2]:
# Build three merged datasets from the CSVs under "newdata". Each call writes
# its own CSV; the last call is left as a bare expression so the notebook
# displays the frame it returns.
ETF_TICKERS = ["SPY", "VO", "VB", "AGG", "VNQ", "GLD", "BIL", "VWO"]
START_2015 = datetime.strptime("2015-02-01", "%Y-%m-%d").date()
START_2007 = datetime.strptime("2007-06-01", "%Y-%m-%d").date()

# Default ticker list, forced to start on 2015-02-01.
build_dataset(
    data_dir="newdata",
    force_common_start=START_2015,
    output_csv="2015-2025_crypto.csv",
)

# Same start date, restricted to the explicit ticker list.
build_dataset(
    data_dir="newdata",
    tickers=ETF_TICKERS,
    force_common_start=START_2015,
    output_csv="2015-2025_no_crypto.csv",
)

# Explicit ticker list again, with the earlier 2007-06-01 start.
build_dataset(
    data_dir="newdata",
    tickers=ETF_TICKERS,
    force_common_start=START_2007,
    output_csv="2007-2025_no_crypto.csv",
)

[INFO] Aligned to common date range: 2015-02-01 → 2025-04-11
[SUCCESS] Merged dataset saved to: 2015-2025_crypto.csv
[INFO] Aligned to common date range: 2015-02-01 → 2025-04-11
[SUCCESS] Merged dataset saved to: 2015-2025_no_crypto.csv
[INFO] Aligned to common date range: 2007-06-01 → 2025-04-11
[SUCCESS] Merged dataset saved to: 2007-2025_no_crypto.csv


Unnamed: 0,date,close,high,low,open,volume,tic,day
13488,2007-06-01,98.63000,98.77000,98.53000,98.72,942500.0,agg,4
26976,2007-06-01,91.62000,91.62000,91.62000,91.62,1450.0,bil,4
22480,2007-06-01,66.44000,66.53999,65.70000,65.70,5634898.0,gld,4
0,2007-06-01,154.08000,154.39999,153.50999,153.88,107774432.0,spy,4
8992,2007-06-01,76.21001,76.44000,75.96001,76.05,68400.0,vb,4
...,...,...,...,...,...,...,...,...
4495,2025-04-11,533.94000,536.43000,520.07000,523.01,97866334.0,spy,4
13487,2025-04-11,205.82000,206.44000,199.28430,202.69,1652602.0,vb,4
22479,2025-04-11,83.63000,83.80000,81.07000,82.05,5276100.0,vnq,4
8991,2025-04-11,243.62000,244.33000,236.59260,239.25,1396703.0,vo,4
