In [29]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from datetime import timedelta
import json 


# load to df function

In [30]:
def file_to_df(file_path):
    file_path = file_path
    with open(file_path,'r')as f:
        data = json.load(f)
    df = pd.DataFrame(data['data'])
    return df


# time stamp preparation function

In [31]:


def prep_timestamp_column(df):
    """
    Prepare timestamp column by standardizing to milliseconds

    Parameters:
    -----------
    df : pandas.DataFrame
        Input DataFrame with timestamp column
    Returns:
    --------
    pandas.DataFrame
        DataFrame with converted timestamp column
    """
    # Identify the timestamp column
    possible_cols = ['time', 'timestamp']
    col_name = next((col for col in possible_cols if col in df.columns), None)
    if col_name is None:
        raise ValueError("DataFrame must contain either 'time' or 'timestamp' column")
    

    # Vectorized standardization
    def standardize_timestamps(series):
        # Convert to numeric, coercing errors to NaN
        numeric_series = pd.to_numeric(series, errors='coerce')
        # Handle NaN values
        numeric_series = numeric_series.fillna(0)
        # Convert to string and handle integer/float inputs
        ts_str = numeric_series.astype(str).str.split('.').str[0]
        # Trim long timestamps
        ts_str = ts_str.str.slice(0, 13)
        # Pad short timestamps
        ts_str = ts_str.str.pad(13, side='right', fillchar='0')
        return ts_str.astype(np.int64)
    
    # Standardize timestamps
    df['timestamp'] = standardize_timestamps(df[col_name])
    # Convert to datetime using milliseconds
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
    # Drop the original column if it's different from 'timestamp'
    if col_name != 'timestamp':
        df.drop(columns=[col_name], inplace=True)
    
    return df

# set correct datatypes for further processing

In [32]:

def adjust_match_columns_and_types(df):
    list_to_drop = ['sequence', 'symbol', 'tradeId', 'type', 'makerOrderId', 'takerOrderId','time_received']

    for column in list_to_drop:
        if column in df.columns:
            df = df.drop(column, axis=1)

    # Convert columns to appropriate data types
    df['price'] = df['price'].astype(float)
    df['side'] = df['side'].astype(str)
    df['size'] = df['size'].astype(float)
    # move timestamp column 
    df = df[['timestamp', 'price', 'side', 'size']]


    return df

# call all functions to prep match data

In [33]:

path_to_match_file = '/root/trading_systems/kucoin_dir/kucoin_release_data_initial/2024-12-08_11-40_XRP/XRP_match_data.json'

df_match = file_to_df(path_to_match_file)
df_match = prep_timestamp_column(df_match)
df_match = adjust_match_columns_and_types(df_match)
df_match.info()
df_match.to_pickle('df_match_preped.pkl')
df_match

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8100 entries, 0 to 8099
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   timestamp  8100 non-null   datetime64[ns]
 1   price      8100 non-null   float64       
 2   side       8100 non-null   object        
 3   size       8100 non-null   float64       
dtypes: datetime64[ns](1), float64(2), object(1)
memory usage: 253.2+ KB


Unnamed: 0,timestamp,price,side,size
0,2024-12-08 11:40:00.524,2.54541,buy,328.2866
1,2024-12-08 11:40:01.667,2.54626,sell,21.7891
2,2024-12-08 11:40:05.186,2.54743,buy,3.2935
3,2024-12-08 11:40:06.023,2.54779,buy,0.1971
4,2024-12-08 11:40:06.169,2.54797,sell,20.3782
...,...,...,...,...
8095,2024-12-08 12:40:02.797,2.56072,sell,175.5000
8096,2024-12-08 12:40:02.797,2.56071,sell,171.7576
8097,2024-12-08 12:40:03.247,2.56048,sell,194.9394
8098,2024-12-08 12:40:04.566,2.56025,sell,144.6600


# aggregate match data 

In [34]:
interval_seconds =interval_seconds

def aggregate_match_data(df, interval_seconds=1):
    # Ensure we're working with a copy to avoid modifying the original DataFrame
    df = df.copy()
    # Set the timestamp as the index
    df.set_index('timestamp', inplace=True)
    
    # Resample and aggregate
    # setting coorect timestamp
    buy_volume = df[df['side'] == 'buy'].resample(f'{interval_seconds}s', label='right', closed='right')['size'].sum()
    sell_volume = df[df['side'] == 'sell'].resample(f'{interval_seconds}s', label='right', closed='right')['size'].sum()
    trade_buy = df[df['side'] == 'buy'].resample(f'{interval_seconds}s', label='right', closed='right').size()
    trade_sell = df[df['side'] == 'sell'].resample(f'{interval_seconds}s', label='right', closed='right').size()
    avg_buy_price = df[df['side'] == 'buy'].resample(f'{interval_seconds}s', label='right', closed='right')['price'].mean()
    avg_sell_price = df[df['side'] == 'sell'].resample(f'{interval_seconds}s', label='right', closed='right')['price'].mean()
    
    # Combine the results into a single DataFrame
    aggregated = pd.DataFrame({
        'trade_buy': trade_buy,
        'trade_sell': trade_sell,
        'buy_volume': buy_volume,
        'sell_volume': sell_volume,
        'avg_match_buy_price': avg_buy_price,
        'avg_match_sell_price': avg_sell_price
    }).fillna(0)
    # Reset the index to get 'interval_start' as a column
    aggregated.reset_index(inplace=True)

    # calculate the match imbalance
    aggregated['match_imbalance'] = (aggregated['buy_volume'] - aggregated['sell_volume']) / (aggregated['buy_volume'] + aggregated['sell_volume'])
    
    return aggregated

df_match_agg = aggregate_match_data(df_match, interval_seconds=interval_seconds)
df_match_agg

NameError: name 'interval_seconds' is not defined