In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

import os

In [2]:
# Load Forex tick data
data_dir = "../data/raw/"
file_name = "usdjpy-tick-2020-01-01-2024-12-31.csv"
file_path = os.path.join(data_dir, file_name)
pkl_dir = "../data/pkl"
os.makedirs(pkl_dir, exist_ok=True)
name = os.path.splitext(os.path.basename(file_name))[0]
pkl_file_path = os.path.join(pkl_dir, f"{name}.pkl")

resampled_dir = "../data/resampled"
os.makedirs(resampled_dir, exist_ok=True)
name = os.path.splitext(os.path.basename(file_name))[0]
resampled_file_path = os.path.join(resampled_dir, "usdjpy-bar-m3-2020-01-01-2024-12-31.pkl")

In [3]:
def resample_to_ohlcv_minute(df, minutes: int):
    df_tmp = pd.DataFrame()
    # Convert timestamp to datetime
    df_tmp['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')

    # Compute mid price as the average of askPrice and bidPrice
    df_tmp['midPrice'] = (df['askPrice'] + df['bidPrice']) / 2

    df_tmp['volume'] = df['askVolume'] + df['bidVolume']

    # Set timestamp as index for resampling
    df_tmp.set_index('timestamp', inplace=True)

    # Resample by minute and aggregate OHLCV
    ohlcv_df = df_tmp.resample(f'{minutes}min').agg({
        'midPrice': ['first', 'max', 'min', 'last'],  # OHLC for mid price
        'volume': 'sum',  # Total volume
    })

    # Flatten the column names
    ohlcv_df.columns = [
        'open', 'high', 'low', 'close', 'volume',
    ]

    # Reset the index to have timestamp as a column
    ohlcv_df.reset_index(inplace=True)

    return ohlcv_df

In [4]:
import pandas as pd
import numpy as np

def check_time_continuity(ohlcv_df):
    # Ensure timestamp is in datetime format
    ohlcv_df['timestamp'] = pd.to_datetime(ohlcv_df['timestamp'])

    # Set timestamp as index for easier continuity check
    ohlcv_df.set_index('timestamp', inplace=True)

    # Generate a full datetime index at 1-minute intervals from the first to the last timestamp
    full_time_index = pd.date_range(start=ohlcv_df.index.min(), end=ohlcv_df.index.max(), freq='min')

    # Find missing timestamps (gaps in the data)
    missing_timestamps = full_time_index.difference(ohlcv_df.index)

    # Print missing statistics
    missing_count = len(missing_timestamps)
    total_count = len(full_time_index)
    missing_percentage = (missing_count / total_count) * 100
    
    print(f"Total data points: {total_count}")
    print(f"Missing data points: {missing_count}")
    print(f"Percentage of missing data: {missing_percentage:.2f}%")
    print(f"First missing timestamp: {missing_timestamps.min() if missing_timestamps.size > 0 else 'N/A'}")
    print(f"Last missing timestamp: {missing_timestamps.max() if missing_timestamps.size > 0 else 'N/A'}")
    ohlcv_df.reset_index(inplace=True)
    return missing_timestamps



In [5]:
df = pd.read_csv(file_path)

In [6]:
df.shape

(163024077, 5)

In [7]:
df.head()

Unnamed: 0,timestamp,askPrice,bidPrice,askVolume,bidVolume
0,1577916000219,108.786,108.73,750.0,750.0
1,1577916000433,108.79,108.73,750.0,750.0
2,1577916023533,108.79,108.729,750.0,750.0
3,1577916028663,108.79,108.728,750.0,1309.999943
4,1577916041516,108.791,108.728,750.0,560.000002


In [8]:
ohlcv_df = resample_to_ohlcv_minute(df, 3)

In [9]:
ohlcv_df[:100]

Unnamed: 0,timestamp,open,high,low,close,volume
0,2020-01-01 22:00:00,108.7580,108.7600,108.7495,108.7535,2.698000e+04
1,2020-01-01 22:03:00,108.7540,108.7700,108.7535,108.7700,2.315000e+04
2,2020-01-01 22:06:00,108.7685,108.7685,108.7495,108.7545,1.632430e+06
3,2020-01-01 22:09:00,108.7565,108.7625,108.7395,108.7400,6.893100e+05
4,2020-01-01 22:12:00,108.7340,108.7600,108.6495,108.7450,4.687000e+05
...,...,...,...,...,...,...
95,2020-01-02 02:45:00,108.6655,108.6720,108.6610,108.6650,1.212300e+05
96,2020-01-02 02:48:00,108.6660,108.6715,108.6610,108.6715,1.008900e+05
97,2020-01-02 02:51:00,108.6715,108.6800,108.6710,108.6765,1.090300e+05
98,2020-01-02 02:54:00,108.6755,108.6790,108.6745,108.6790,4.363000e+04


In [10]:
ohlcv_df.isna().sum()

timestamp         0
open         252912
high         252912
low          252912
close        252912
volume            0
dtype: int64

In [11]:
print(check_time_continuity(ohlcv_df))

Total data points: 2628118
Missing data points: 1752078
Percentage of missing data: 66.67%
First missing timestamp: 2020-01-01 22:01:00
Last missing timestamp: 2024-12-30 23:56:00
DatetimeIndex(['2020-01-01 22:01:00', '2020-01-01 22:02:00',
               '2020-01-01 22:04:00', '2020-01-01 22:05:00',
               '2020-01-01 22:07:00', '2020-01-01 22:08:00',
               '2020-01-01 22:10:00', '2020-01-01 22:11:00',
               '2020-01-01 22:13:00', '2020-01-01 22:14:00',
               ...
               '2024-12-30 23:43:00', '2024-12-30 23:44:00',
               '2024-12-30 23:46:00', '2024-12-30 23:47:00',
               '2024-12-30 23:49:00', '2024-12-30 23:50:00',
               '2024-12-30 23:52:00', '2024-12-30 23:53:00',
               '2024-12-30 23:55:00', '2024-12-30 23:56:00'],
              dtype='datetime64[ns]', length=1752078, freq=None)


In [12]:
ohlcv_df.to_pickle(resampled_file_path)