In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

import os

In [2]:
from pathlib import Path

# --- Dataset identity (edit these to point at another extract) ---
SOURCE = "dukascopy"
SYMBOL = "eurusd"
START_DATE = "2020-01-01"
END_DATE = "2024-12-31"

# File stem of the raw tick extract: <source>-<symbol>-tick-<start>-<end>
BASE_NAME = f"{SOURCE}-{SYMBOL}-tick-{START_DATE}-{END_DATE}"

# --- Locations (relative paths, resolved against the working directory) ---
BASE_DIR = Path("../data")
RAW_DIR = BASE_DIR.joinpath("raw")
RAW_FILE_PATH = RAW_DIR.joinpath(f"{BASE_NAME}.csv")

In [3]:
# Read the entire raw tick CSV into memory in one call
# (the recorded run below reports 134,667,284 rows x 5 columns).
df = pd.read_csv(RAW_FILE_PATH)

In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(134667284, 5)

In [5]:
# Preview the first rows to check the raw column layout.
df.head()

Unnamed: 0,timestamp,askPrice,bidPrice,askVolume,bidVolume
0,1577916072821,1.1216,1.12106,93.699999,750.0
1,1577916077176,1.1216,1.1212,93.699999,750.0
2,1577916078545,1.1216,1.12117,93.699999,750.0
3,1577916079145,1.12161,1.12123,839.999974,750.0
4,1577916079246,1.12161,1.1212,839.999974,750.0


In [6]:
# Turn the epoch-millisecond stamps into datetimes.
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')

# Per-tick derived columns, added in place (column order matters downstream):
#   mid    - midpoint of the ask and bid quotes
#   volume - ask volume plus bid volume
#   dollar - mid price times combined volume
#   spread - ask minus bid
df['mid'] = df['askPrice'].add(df['bidPrice']).div(2)
df['volume'] = df['askVolume'].add(df['bidVolume'])
df['dollar'] = df['mid'].mul(df['volume'])
df['spread'] = df['askPrice'].sub(df['bidPrice'])

In [7]:
# Index the ticks by timestamp so the frame can be resampled by time.
df = df.set_index('timestamp')
# Bare expression: displays the frame (pandas abbreviates long frames).
df

Unnamed: 0_level_0,askPrice,bidPrice,askVolume,bidVolume,mid,volume,dollar,spread
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-01 22:01:12.821,1.12160,1.12106,93.699999,750.000000,1.121330,843.699999,946.066120,0.00054
2020-01-01 22:01:17.176,1.12160,1.12120,93.699999,750.000000,1.121400,843.699999,946.125179,0.00040
2020-01-01 22:01:18.545,1.12160,1.12117,93.699999,750.000000,1.121385,843.699999,946.112524,0.00043
2020-01-01 22:01:19.145,1.12161,1.12123,839.999974,750.000000,1.121420,1589.999974,1783.057771,0.00038
2020-01-01 22:01:19.246,1.12161,1.12120,839.999974,750.000000,1.121405,1589.999974,1783.033921,0.00041
...,...,...,...,...,...,...,...,...
2024-12-30 23:59:44.482,1.04062,1.04057,899.999976,7739.999771,1.040595,8639.999747,8990.740537,0.00005
2024-12-30 23:59:55.733,1.04062,1.04058,899.999976,899.999976,1.040600,1799.999952,1873.079950,0.00004
2024-12-30 23:59:57.849,1.04061,1.04058,899.999976,899.999976,1.040595,1799.999952,1873.070950,0.00003
2024-12-30 23:59:58.005,1.04060,1.04057,899.999976,449.999988,1.040585,1349.999964,1404.789713,0.00003


### Calculate daily average dollar volume

In [8]:
# Total dollar value per calendar day, then the mean of those daily totals.
# NOTE(review): resample('1D') emits a bin for every calendar day in the range
# and sum() of an empty bin is 0, so days without ticks (e.g. market closures)
# pull this average down - confirm that is the intended definition of "a day".
daily_dollar_totals = df['dollar'].resample('1D').sum()
avg_dollar_volume = daily_dollar_totals.mean()
avg_dollar_volume

np.float64(421558182.3168614)

In [9]:
# Size each bar so that an average day yields roughly this many bars.
TARGET_BARS_PER_DAY = 1000
dollar_threshold = int(avg_dollar_volume / TARGET_BARS_PER_DAY)
dollar_threshold

421558

In [10]:
# Move timestamp back to an ordinary column: resample_to_dollar_bar reads it
# from each row as `row.timestamp`.
df = df.reset_index()
# Bare expression: displays the frame (pandas abbreviates long frames).
df

Unnamed: 0,timestamp,askPrice,bidPrice,askVolume,bidVolume,mid,volume,dollar,spread
0,2020-01-01 22:01:12.821,1.12160,1.12106,93.699999,750.000000,1.121330,843.699999,946.066120,0.00054
1,2020-01-01 22:01:17.176,1.12160,1.12120,93.699999,750.000000,1.121400,843.699999,946.125179,0.00040
2,2020-01-01 22:01:18.545,1.12160,1.12117,93.699999,750.000000,1.121385,843.699999,946.112524,0.00043
3,2020-01-01 22:01:19.145,1.12161,1.12123,839.999974,750.000000,1.121420,1589.999974,1783.057771,0.00038
4,2020-01-01 22:01:19.246,1.12161,1.12120,839.999974,750.000000,1.121405,1589.999974,1783.033921,0.00041
...,...,...,...,...,...,...,...,...,...
134667279,2024-12-30 23:59:44.482,1.04062,1.04057,899.999976,7739.999771,1.040595,8639.999747,8990.740537,0.00005
134667280,2024-12-30 23:59:55.733,1.04062,1.04058,899.999976,899.999976,1.040600,1799.999952,1873.079950,0.00004
134667281,2024-12-30 23:59:57.849,1.04061,1.04058,899.999976,899.999976,1.040595,1799.999952,1873.070950,0.00003
134667282,2024-12-30 23:59:58.005,1.04060,1.04057,899.999976,449.999988,1.040585,1349.999964,1404.789713,0.00003


In [11]:
def resample_to_dollar_bar(df, threshold: float):
    """
    Resample tick-level quote data into dollar bars.

    Ticks are consumed in row order. Each tick's dollar value is added to a
    running total; as soon as that total reaches ``threshold`` the ticks
    gathered so far are emitted as one bar and the total restarts from zero
    (overshoot is not carried into the next bar). Ticks remaining after the
    last completed bar are not emitted.

    Parameters:
        df (pd.DataFrame): Tick data with a 'timestamp' column plus either the
            pre-computed columns 'mid', 'volume', 'dollar' and 'spread', or the
            raw quote columns 'askPrice', 'bidPrice', 'askVolume', 'bidVolume'
            from which any missing pre-computed column is derived.
            ``df`` itself is not modified.
        threshold (float): Dollar value that closes a bar (e.g., 100_000).

    Returns:
        pd.DataFrame: One row per bar with columns ['timestamp', 'open', 'high',
        'low', 'close', 'volume', 'spread']. 'timestamp' is the time of the
        bar's last tick, the OHLC values are mid prices, 'volume' is the summed
        tick volume and 'spread' is the mean tick spread. These columns are
        present even when no bar is produced.

    Raises:
        KeyError: If 'timestamp', or a column needed to derive a missing
            pre-computed column, is absent from ``df``.
    """
    bar_columns = ['timestamp', 'open', 'high', 'low', 'close', 'volume', 'spread']

    # Prefer the pre-computed per-tick columns; fall back to deriving them from
    # the raw quotes so the function also works on an untouched tick frame.
    mid = df['mid'] if 'mid' in df.columns else (df['askPrice'] + df['bidPrice']) / 2
    volume = df['volume'] if 'volume' in df.columns else df['askVolume'] + df['bidVolume']
    dollar = df['dollar'] if 'dollar' in df.columns else mid * volume
    spread = df['spread'] if 'spread' in df.columns else df['askPrice'] - df['bidPrice']

    bars = []

    # State of the bar currently being built.
    cum_dollar = 0.0
    tick_count = 0
    open_price = None
    high, low = -float('inf'), float('inf')
    bar_volume = 0.0
    spread_sum = 0.0

    # Iterate only the five columns the loop needs; this avoids building a
    # namedtuple over every column of a potentially very large frame.
    for ts, price, vol, dol, spr in zip(df['timestamp'], mid, volume, dollar, spread):
        if tick_count == 0:
            open_price = price

        high = max(high, price)
        low = min(low, price)
        bar_volume += vol
        spread_sum += spr
        tick_count += 1
        cum_dollar += dol

        if cum_dollar >= threshold:
            bars.append({
                'timestamp': ts,          # time of the tick that closed the bar
                'open': open_price,
                'high': high,
                'low': low,
                'close': price,
                'volume': bar_volume,
                'spread': spread_sum / tick_count,  # tick_count >= 1 here
            })
            # Start a fresh bar.
            cum_dollar = 0.0
            tick_count = 0
            open_price = None
            high, low = -float('inf'), float('inf')
            bar_volume = 0.0
            spread_sum = 0.0

    # Passing `columns` keeps the schema stable when `bars` is empty.
    return pd.DataFrame(bars, columns=bar_columns)


In [15]:
# Tag identifying the bar type in the output file name.
# NOTE(review): dollar_threshold is a plain integer dollar amount; what the
# 'm' suffix stands for is not evident here - confirm before relying on it.
EVENT = f'{dollar_threshold}m-dollar'

# Output file stem: <source>-<symbol>-<event>-<start>-<end>
RESAMPLED_NAME = f"{SOURCE}-{SYMBOL}-{EVENT}-{START_DATE}-{END_DATE}"

# Resampled bars are stored under the data directory as a pickle file.
RESAMPLED_DIR = BASE_DIR.joinpath("resampled")
RESAMPLED_FILE_PATH = RESAMPLED_DIR.joinpath(f"{RESAMPLED_NAME}.pkl")
RESAMPLED_FILE_PATH

PosixPath('../data/resampled/dukascopy-eurusd-421558m-dollar-2020-01-01-2024-12-31.pkl')

In [13]:
%%time
# Build the dollar bars in a single pass over every tick; the recorded run
# below took roughly three minutes of wall time.
dollar_bar_df = resample_to_dollar_bar(df, dollar_threshold)
# Bare expression: displays the resulting bars (pandas abbreviates long frames).
dollar_bar_df

CPU times: user 2min 55s, sys: 3.05 s, total: 2min 58s
Wall time: 2min 59s


Unnamed: 0,timestamp,open,high,low,close,volume,spread
0,2020-01-01 22:15:37.539,1.121330,1.121585,1.121235,1.121425,376142.501839,0.000310
1,2020-01-01 22:29:44.514,1.121435,1.121605,1.121380,1.121530,377179.999530,0.000137
2,2020-01-01 22:46:34.934,1.121520,1.121640,1.121490,1.121620,376300.000668,0.000123
3,2020-01-01 22:49:57.855,1.121630,1.121645,1.121550,1.121605,376090.000331,0.000108
4,2020-01-01 22:51:42.030,1.121610,1.121695,1.121600,1.121615,377029.997289,0.000120
...,...,...,...,...,...,...,...
1787189,2024-12-30 23:07:02.466,1.040235,1.040275,1.039970,1.040115,407969.998635,0.000121
1787190,2024-12-30 23:15:00.261,1.040095,1.040165,1.039920,1.040150,409559.999228,0.000079
1787191,2024-12-30 23:28:07.241,1.040170,1.040380,1.040130,1.040320,414089.995466,0.000048
1787192,2024-12-30 23:38:26.048,1.040325,1.040565,1.040245,1.040555,405659.995377,0.000031


In [14]:
# to_pickle does not create missing parent directories, so make sure the
# output directory exists before writing (no-op when it is already there).
RESAMPLED_FILE_PATH.parent.mkdir(parents=True, exist_ok=True)
dollar_bar_df.to_pickle(RESAMPLED_FILE_PATH)