In [11]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

import os

In [12]:
# Load Forex tick data
data_dir = "../data/raw/"
file_name = "usdjpy-tick-2020-01-01-2024-12-31.csv"
file_path = os.path.join(data_dir, file_name)

pkl_dir = "../data/pkl"
os.makedirs(pkl_dir, exist_ok=True)
name = os.path.splitext(os.path.basename(file_name))[0]
pkl_file_path = os.path.join(pkl_dir, f"{name}.pkl")

resampled_dir = "../data/resampled"
os.makedirs(resampled_dir, exist_ok=True)
name = os.path.splitext(os.path.basename(file_name))[0]
resampled_file_path = os.path.join(resampled_dir, "usdjpy-bar-test-2020-01-01-2024-12-31.pkl")

In [13]:
def resample_to_ohlcv_minute(df):
    df_tmp = pd.DataFrame()
    # Convert timestamp to datetime
    df_tmp['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')

    # Compute mid price as the average of askPrice and bidPrice
    df_tmp['midPrice'] = (df['askPrice'] + df['bidPrice']) / 2

    df_tmp['volume'] = df['askVolume'] + df['bidVolume']

    # Set timestamp as index for resampling
    df_tmp.set_index('timestamp', inplace=True)

    # Resample by minute and aggregate OHLCV
    ohlcv_df = df_tmp.resample('min').agg({
        'midPrice': ['first', 'max', 'min', 'last'],  # OHLC for mid price
        'volume': 'sum',  # Total volume
    })

    # Flatten the column names
    ohlcv_df.columns = [
        'open', 'high', 'low', 'close', 'volume',
    ]

    # Reset the index to have timestamp as a column
    ohlcv_df.reset_index(inplace=True)

    return ohlcv_df

In [14]:
import pandas as pd
import numpy as np

def check_time_continuity(ohlcv_df):
    # Ensure timestamp is in datetime format
    ohlcv_df['timestamp'] = pd.to_datetime(ohlcv_df['timestamp'])

    # Set timestamp as index for easier continuity check
    ohlcv_df.set_index('timestamp', inplace=True)

    # Generate a full datetime index at 1-minute intervals from the first to the last timestamp
    full_time_index = pd.date_range(start=ohlcv_df.index.min(), end=ohlcv_df.index.max(), freq='min')

    # Find missing timestamps (gaps in the data)
    missing_timestamps = full_time_index.difference(ohlcv_df.index)

    # Print missing statistics
    missing_count = len(missing_timestamps)
    total_count = len(full_time_index)
    missing_percentage = (missing_count / total_count) * 100
    
    print(f"Total data points: {total_count}")
    print(f"Missing data points: {missing_count}")
    print(f"Percentage of missing data: {missing_percentage:.2f}%")
    print(f"First missing timestamp: {missing_timestamps.min() if missing_timestamps.size > 0 else 'N/A'}")
    print(f"Last missing timestamp: {missing_timestamps.max() if missing_timestamps.size > 0 else 'N/A'}")
    ohlcv_df.reset_index(inplace=True)
    return missing_timestamps



In [15]:
df = pd.read_csv(file_path)

In [16]:
df.shape

(163024077, 5)

In [17]:
df.head()

Unnamed: 0,timestamp,askPrice,bidPrice,askVolume,bidVolume
0,1577916000219,108.786,108.73,750.0,750.0
1,1577916000433,108.79,108.73,750.0,750.0
2,1577916023533,108.79,108.729,750.0,750.0
3,1577916028663,108.79,108.728,750.0,1309.999943
4,1577916041516,108.791,108.728,750.0,560.000002


In [18]:
ohlcv_df = resample_to_ohlcv_minute(df)

In [19]:
print(check_time_continuity(ohlcv_df))

Total data points: 2628120
Missing data points: 0
Percentage of missing data: 0.00%
First missing timestamp: N/A
Last missing timestamp: N/A
DatetimeIndex([], dtype='datetime64[ns]', freq='min')


In [20]:
ohlcv_df.to_pickle(resampled_file_path)