In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

import os

In [6]:
from pathlib import Path

# Configurable parts
# SOURCE, SYMBOL, START_DATE and END_DATE are the components of the raw and
# resampled file names built below.
SOURCE = "dukascopy"
SYMBOL = "usdjpy"
# NOTE(review): MINUTES is not referenced in the visible cells — confirm it is still needed.
MINUTES = 1
# NOTE(review): in the visible cells EVENT is reassigned to '584m-dollar' before
# its first use, so this initial value appears to be stale.
EVENT = '100m-dollar'
START_DATE = "2020-01-01"
END_DATE = "2024-12-31"

# Build base name
# Shared stem of the raw tick file: <source>-<symbol>-tick-<start>-<end>.
BASE_NAME = f"{SOURCE}-{SYMBOL}-tick-{START_DATE}-{END_DATE}"

# Base directories
# Paths are relative to the notebook's working directory.
BASE_DIR = Path("../data")
RAW_DIR = BASE_DIR / "raw"
RAW_FILE_PATH = RAW_DIR / f"{BASE_NAME}.csv"

In [7]:
# Load the whole raw tick CSV into memory; in the run recorded here it holds
# 163,024,077 rows x 5 columns, so this cell is slow and memory-hungry.
df = pd.read_csv(RAW_FILE_PATH)

In [8]:
# Sanity check: (rows, columns) of the loaded tick frame.
df.shape

(163024077, 5)

In [9]:
# Peek at the first rows: the raw timestamp is an integer, and each row
# carries ask/bid prices and ask/bid volumes.
df.head()

Unnamed: 0,timestamp,askPrice,bidPrice,askVolume,bidVolume
0,1577916000219,108.786,108.73,750.0,750.0
1,1577916000433,108.79,108.73,750.0,750.0
2,1577916023533,108.79,108.729,750.0,750.0
3,1577916028663,108.79,108.728,750.0,1309.999943
4,1577916041516,108.791,108.728,750.0,560.000002


In [10]:
# Decode the integer epoch-millisecond timestamps into pandas datetimes.
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')

# Per-tick derived columns, added in place on the same frame:
#   mid    - midpoint between ask and bid price
#   volume - ask volume plus bid volume
#   dollar - mid price times total volume (the quantity the bars accumulate)
#   spread - ask price minus bid price
# NOTE(review): for USD/JPY the mid is presumably quoted in JPY per USD, so
# mid * volume may be a JPY notional rather than a USD value — confirm the
# intended unit of 'dollar'.
df['mid'] = df['askPrice'].add(df['bidPrice']).div(2)
df['volume'] = df['askVolume'].add(df['bidVolume'])
df['dollar'] = df['mid'].mul(df['volume'])
df['spread'] = df['askPrice'].sub(df['bidPrice'])

In [14]:
# Move 'timestamp' into the index so the frame can be resampled by time in the
# daily-volume calculation below.
df = df.set_index('timestamp')
df

Unnamed: 0_level_0,askPrice,bidPrice,askVolume,bidVolume,mid,volume,dollar,spread
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-01 22:00:00.219,108.786,108.730,750.000000,750.000000,108.7580,1500.000000,163137.000000,0.056
2020-01-01 22:00:00.433,108.790,108.730,750.000000,750.000000,108.7600,1500.000000,163140.000000,0.060
2020-01-01 22:00:23.533,108.790,108.729,750.000000,750.000000,108.7595,1500.000000,163139.250000,0.061
2020-01-01 22:00:28.663,108.790,108.728,750.000000,1309.999943,108.7590,2059.999943,224043.533777,0.062
2020-01-01 22:00:41.516,108.791,108.728,750.000000,560.000002,108.7595,1310.000002,142474.945259,0.063
...,...,...,...,...,...,...,...,...
2024-12-30 23:59:55.557,157.012,157.004,1200.000048,3599.999905,157.0080,4799.999952,753638.392513,0.008
2024-12-30 23:59:55.760,157.012,157.003,1200.000048,3599.999905,157.0075,4799.999952,753635.992513,0.009
2024-12-30 23:59:57.531,157.012,157.004,1200.000048,3599.999905,157.0080,4799.999952,753638.392513,0.008
2024-12-30 23:59:57.634,157.012,157.003,1200.000048,3599.999905,157.0075,4799.999952,753635.992513,0.009


### Calculate daily average dollar volume

In [15]:
# Sum the per-tick 'dollar' values into 1-day buckets (this relies on the
# timestamp index set above), then average those daily totals over the sample.
# NOTE(review): resample('1D') presumably also emits days without ticks
# (e.g. weekends) with a sum of 0, which would make this a mean per calendar
# day rather than per trading day — confirm that is intended.
avg_dollar_volume = df['dollar'].resample('1D').sum().mean()
avg_dollar_volume

np.float64(58405330128.192024)

In [16]:
# Dollar threshold per bar: 1/100 of the average daily dollar volume, which
# targets roughly 100 bars per day on average.
# NOTE(review): the name `threshold_1000` suggests ~1000 bars a day, which
# would need a division by 1000 instead — confirm which one is intended.
threshold_1000 = avg_dollar_volume / 100
threshold_1000

np.float64(584053301.2819202)

In [17]:
# Turn 'timestamp' back into a regular column: resample_to_dollar_bar reads it
# from each row as `row.timestamp`.
df = df.reset_index()
df

Unnamed: 0,timestamp,askPrice,bidPrice,askVolume,bidVolume,mid,volume,dollar,spread
0,2020-01-01 22:00:00.219,108.786,108.730,750.000000,750.000000,108.7580,1500.000000,163137.000000,0.056
1,2020-01-01 22:00:00.433,108.790,108.730,750.000000,750.000000,108.7600,1500.000000,163140.000000,0.060
2,2020-01-01 22:00:23.533,108.790,108.729,750.000000,750.000000,108.7595,1500.000000,163139.250000,0.061
3,2020-01-01 22:00:28.663,108.790,108.728,750.000000,1309.999943,108.7590,2059.999943,224043.533777,0.062
4,2020-01-01 22:00:41.516,108.791,108.728,750.000000,560.000002,108.7595,1310.000002,142474.945259,0.063
...,...,...,...,...,...,...,...,...,...
163024072,2024-12-30 23:59:55.557,157.012,157.004,1200.000048,3599.999905,157.0080,4799.999952,753638.392513,0.008
163024073,2024-12-30 23:59:55.760,157.012,157.003,1200.000048,3599.999905,157.0075,4799.999952,753635.992513,0.009
163024074,2024-12-30 23:59:57.531,157.012,157.004,1200.000048,3599.999905,157.0080,4799.999952,753638.392513,0.008
163024075,2024-12-30 23:59:57.634,157.012,157.003,1200.000048,3599.999905,157.0075,4799.999952,753635.992513,0.009


In [19]:
def resample_to_dollar_bar(df, threshold: float):
    """
    Aggregate tick-level rows into dollar bars.

    Ticks are consumed in row order. Each tick's ``dollar`` value is added to a
    running total; the tick that brings the total to ``threshold`` or above
    closes the current bar, and the total restarts from zero for the next tick
    (any overshoot is not carried over). Ticks left over after the last
    completed bar are discarded.

    Parameters:
        df (pd.DataFrame): Tick data that must contain the columns
            ['timestamp', 'mid', 'volume', 'dollar', 'spread'].
            Any other columns are ignored.
        threshold (float): Cumulative ``dollar`` value that closes a bar
            (e.g., 100_000).

    Returns:
        pd.DataFrame: One row per completed bar with the columns
            ['timestamp', 'open', 'high', 'low', 'close', 'volume', 'spread']:
            'timestamp' is the time of the bar's last tick, open/high/low/close
            are taken from 'mid', 'volume' is the summed tick volume and
            'spread' is the mean tick spread. The columns are present even
            when the input is empty or no bar completes.
    """
    columns = ['timestamp', 'open', 'high', 'low', 'close', 'volume', 'spread']
    bars = []

    # State of the bar currently being built; tick_count == 0 means no tick
    # has been assigned to it yet.
    cum_dollar = 0.0
    bar_open = None
    bar_high = -float('inf')
    bar_low = float('inf')
    bar_volume = 0.0
    spread_sum = 0.0
    tick_count = 0

    # index=False: the row index is never read, so skip materialising it.
    for row in df.itertuples(index=False):
        price = row.mid

        if tick_count == 0:
            bar_open = price
        bar_high = max(bar_high, price)
        bar_low = min(bar_low, price)
        bar_volume += row.volume
        spread_sum += row.spread
        tick_count += 1
        cum_dollar += row.dollar

        if cum_dollar >= threshold:
            # The current tick closes the bar: it supplies both the close
            # price and the bar timestamp. tick_count is at least 1 here.
            bars.append({
                'timestamp': row.timestamp,
                'open': bar_open,
                'high': bar_high,
                'low': bar_low,
                'close': price,
                'volume': bar_volume,
                'spread': spread_sum / tick_count,
            })
            # Start a fresh bar with the next tick.
            cum_dollar = 0.0
            bar_open = None
            bar_high = -float('inf')
            bar_low = float('inf')
            bar_volume = 0.0
            spread_sum = 0.0
            tick_count = 0

    # Passing the column list keeps the schema stable when `bars` is empty.
    return pd.DataFrame(bars, columns=columns)


In [21]:
# Output naming for the resampled bars. EVENT is overridden here so that the
# file name records the bar type; '584m' presumably mirrors the 584e6 dollar
# threshold used in the resampling cell below.
EVENT = '584m-dollar'
RESAMPLED_NAME = f"{SOURCE}-{SYMBOL}-{EVENT}-{START_DATE}-{END_DATE}"
# Resampled bars are stored next to the raw data, under <BASE_DIR>/resampled.
RESAMPLED_DIR = BASE_DIR / "resampled"
RESAMPLED_FILE_PATH = RESAMPLED_DIR / f"{RESAMPLED_NAME}.pkl"

In [22]:
%%time
# Build dollar bars from the full tick frame, closing a bar each time 584e6 of
# 'dollar' value has accumulated.
# NOTE(review): 584e6 looks like `threshold_1000` (~584,053,301) rounded by
# hand — consider deriving it from that variable so the two cannot drift apart.
df_584m = resample_to_dollar_bar(df, 584e6)
df_584m

Unnamed: 0,timestamp,open,high,low,close,volume,spread
0,2020-01-01 23:10:58.298,108.7580,108.7700,108.6050,108.6520,5.374390e+06,0.026176
1,2020-01-02 00:32:54.813,108.6520,108.7360,108.6460,108.6915,5.373900e+06,0.002801
2,2020-01-02 01:31:42.107,108.6940,108.7320,108.6380,108.6565,5.375870e+06,0.002325
3,2020-01-02 02:39:19.796,108.6565,108.6895,108.6460,108.6695,5.375360e+06,0.002285
4,2020-01-02 04:47:22.068,108.6700,108.7320,108.6610,108.7235,5.375220e+06,0.002371
...,...,...,...,...,...,...,...
182425,2024-12-30 21:47:51.641,156.8010,156.8685,156.7810,156.8455,3.730800e+06,0.010174
182426,2024-12-30 21:53:46.988,156.8445,156.8805,156.8445,156.8670,3.728790e+06,0.013855
182427,2024-12-30 23:05:37.529,156.8670,156.9225,156.8345,156.9100,3.725650e+06,0.041753
182428,2024-12-30 23:33:37.983,156.9090,157.0675,156.9085,157.0325,3.721790e+06,0.015361


In [23]:
# to_pickle does not create missing parent directories, so make sure the
# output folder exists before writing; otherwise the (slow) resampling result
# would be lost to a FileNotFoundError on a fresh checkout.
RESAMPLED_FILE_PATH.parent.mkdir(parents=True, exist_ok=True)
df_584m.to_pickle(RESAMPLED_FILE_PATH)