In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from datetime import timedelta
import json 
import pandas as pd

def file_to_df(file_path):
    file_path = file_path
    with open(file_path,'r')as f:
        data = json.load(f)
    df = pd.DataFrame(data['data'])
    return df


def prep_timestamp_column(df):
    """
    Prepare timestamp column by standardizing to milliseconds

    Parameters:
    -----------
    df : pandas.DataFrame
        Input DataFrame with timestamp column
    Returns:
    --------
    pandas.DataFrame
        DataFrame with converted timestamp column
    """
    # Identify the timestamp column
    possible_cols = ['time', 'timestamp']
    col_name = next((col for col in possible_cols if col in df.columns), None)
    if col_name is None:
        raise ValueError("DataFrame must contain either 'time' or 'timestamp' column")
    
    # Vectorized standardization
    def standardize_timestamps(series):
        # Convert to numeric, coercing errors to NaN
        numeric_series = pd.to_numeric(series, errors='coerce')
        # Handle NaN values
        numeric_series = numeric_series.fillna(0)
        # Convert to string and handle integer/float inputs
        ts_str = numeric_series.astype(str).str.split('.').str[0]
        # Trim long timestamps
        ts_str = ts_str.str.slice(0, 13)
        # Pad short timestamps
        ts_str = ts_str.str.pad(13, side='right', fillchar='0')
        return ts_str.astype(np.int64)
    
    # Standardize timestamps
    df['timestamp'] = standardize_timestamps(df[col_name])
    # Convert to datetime using milliseconds
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
    # Drop the original column if it's different from 'timestamp'
    if col_name != 'timestamp':
        df.drop(columns=[col_name], inplace=True)
    
    return df

# load and prep deep5 data 

In [3]:
import pandas as pd
import numpy as np
level2_deep5_filepath = '/root/trading_systems/kucoin_dir/kucoin_release_data_initial/2024-12-02_10-00_QUILL/QUILL_level2Depth5_data.json'

# Assuming df is your DataFrame loaded from the file
df_deep5 = file_to_df(level2_deep5_filepath)
def prep_deep5_df(df):
    # Convert lists to numpy arrays
    asks_array = np.array(df['asks'].tolist(), dtype=float)
    bids_array = np.array(df['bids'].tolist(), dtype=float)
    # Access the first value of each array
    first_ask_values = asks_array[:, 0, 0]
    first_bid_values = bids_array[:, 0, 0]

    # create column with best ask and best bid and average of the two
    df['best_ask'] = first_ask_values
    df['best_bid'] = first_bid_values
    df['mean_price'] = df[['best_ask', 'best_bid']].mean(axis=1)

    # Extract prices and sizes
    ask_prices = asks_array[:, :, 0]
    ask_sizes = asks_array[:, :, 1]
    bid_prices = bids_array[:, :, 0]
    bid_sizes = bids_array[:, :, 1]

    # Calculate weighted average prices
    weighted_avg_ask_price = np.sum(ask_prices * ask_sizes, axis=1) / np.sum(ask_sizes, axis=1)
    weighted_avg_bid_price = np.sum(bid_prices * bid_sizes, axis=1) / np.sum(bid_sizes, axis=1)

    # Add the results to the DataFrame
    df['weighted_avg_ask_price'] = weighted_avg_ask_price
    df['weighted_avg_bid_price'] = weighted_avg_bid_price

    # Calculate total bid size and total ask size
    total_bid_size = np.sum(bid_sizes, axis=1)
    total_ask_size = np.sum(ask_sizes, axis=1)

    # Calculate imbalance
    imbalance = (total_bid_size - total_ask_size) / (total_bid_size + total_ask_size)

    # Add the imbalance to the DataFrame
    df['imbalance'] = imbalance
    df.drop(['asks', 'bids','time_received'], axis=1, inplace=True)
    return df

df_deep5 = prep_timestamp_column(df_deep5)
df_deep5 = prep_deep5_df(df_deep5)
df_deep5

Unnamed: 0,timestamp,best_ask,best_bid,mean_price,weighted_avg_ask_price,weighted_avg_bid_price,imbalance
0,2024-12-02 10:00:00.085,0.7333,0.6500,0.69165,0.838110,0.604367,0.674873
1,2024-12-02 10:00:00.114,0.7333,0.6500,0.69165,0.838110,0.604367,0.674873
2,2024-12-02 10:00:00.172,1.3143,1.1000,1.20715,1.364230,0.672417,0.909640
3,2024-12-02 10:00:00.277,1.3143,1.1000,1.20715,1.364230,0.725581,0.903226
4,2024-12-02 10:00:00.388,1.5500,1.5000,1.52500,1.614378,1.269828,0.624693
...,...,...,...,...,...,...,...
6029,2024-12-02 10:19:58.148,1.3837,1.3825,1.38310,1.384218,1.360975,-0.054032
6030,2024-12-02 10:19:58.246,1.3837,1.3825,1.38310,1.384218,1.362178,-0.414337
6031,2024-12-02 10:19:58.568,1.3837,1.3825,1.38310,1.384218,1.362178,-0.414337
6032,2024-12-02 10:19:58.986,1.3837,1.3825,1.38310,1.384218,1.363617,-0.446000


### aggregate 

# prep and load match data 

In [4]:


def adjust_match_columns_and_types(df):
    list_to_drop = ['sequence', 'symbol', 'tradeId', 'type', 'makerOrderId', 'takerOrderId','time_received']

    for column in list_to_drop:
        if column in df.columns:
            df = df.drop(column, axis=1)

    # Convert columns to appropriate data types
    df['price'] = df['price'].astype(float)
    df['side'] = df['side'].astype(str)
    df['size'] = df['size'].astype(float)


    return df

path_to_match_file = '/root/trading_systems/kucoin_dir/kucoin_release_data_initial/2024-12-02_10-00_QUILL/QUILL_match_data.json'
df_match = file_to_df(path_to_match_file)
df_match = adjust_match_columns_and_types(df_match)
df_match = prep_timestamp_column(df_match)
df_match.info()
df_match

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4331 entries, 0 to 4330
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   price      4331 non-null   float64       
 1   side       4331 non-null   object        
 2   size       4331 non-null   float64       
 3   timestamp  4331 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(2), object(1)
memory usage: 135.5+ KB


Unnamed: 0,price,side,size,timestamp
0,0.6000,sell,1.00,2024-12-02 10:00:00.000
1,0.6444,buy,57.90,2024-12-02 10:00:00.052
2,0.6889,buy,101.20,2024-12-02 10:00:00.052
3,0.7333,buy,146.95,2024-12-02 10:00:00.068
4,0.7778,buy,173.97,2024-12-02 10:00:00.068
...,...,...,...,...
4326,1.3625,sell,1.46,2024-12-02 10:19:54.464
4327,1.3583,sell,20.97,2024-12-02 10:19:54.464
4328,1.3530,sell,2.22,2024-12-02 10:19:54.464
4329,1.3627,buy,18.83,2024-12-02 10:19:54.977


# aggregate match data 

In [5]:
def aggregate_match_data(df, interval_seconds=1):
    # Ensure we're working with a copy to avoid modifying the original DataFrame
    df = df.copy()
    # Set the timestamp as the index
    df.set_index('timestamp', inplace=True)
    
    # Resample and aggregate
    # setting coorect timestamp
    buy_volume = df[df['side'] == 'buy'].resample(f'{interval_seconds}s', label='right', closed='right')['size'].sum()
    sell_volume = df[df['side'] == 'sell'].resample(f'{interval_seconds}s', label='right', closed='right')['size'].sum()
    trade_buy = df[df['side'] == 'buy'].resample(f'{interval_seconds}s', label='right', closed='right').size()
    trade_sell = df[df['side'] == 'sell'].resample(f'{interval_seconds}s', label='right', closed='right').size()
    avg_buy_price = df[df['side'] == 'buy'].resample(f'{interval_seconds}s', label='right', closed='right')['price'].mean()
    avg_sell_price = df[df['side'] == 'sell'].resample(f'{interval_seconds}s', label='right', closed='right')['price'].mean()
    
    # Combine the results into a single DataFrame
    aggregated = pd.DataFrame({
        'trade_buy': trade_buy,
        'trade_sell': trade_sell,
        'buy_volume': buy_volume,
        'sell_volume': sell_volume,
        'avg_match_buy_price': avg_buy_price,
        'avg_match_sell_price': avg_sell_price
    }).fillna(0)
    # Reset the index to get 'interval_start' as a column
    aggregated.reset_index(inplace=True)
    
    return aggregated

df_match_agg_new = aggregate_match_data(df_match, interval_seconds=10)
df_match_agg_new

Unnamed: 0,timestamp,trade_buy,trade_sell,buy_volume,sell_volume,avg_match_buy_price,avg_match_sell_price
0,2024-12-02 10:00:00,0.0,1,0.00,1.00,0.000000,0.600000
1,2024-12-02 10:00:10,117.0,86,10672.70,4431.97,1.485815,1.522079
2,2024-12-02 10:00:20,4.0,78,64.70,3241.60,1.245650,1.126608
3,2024-12-02 10:00:30,18.0,42,926.88,1196.68,0.888878,0.804736
4,2024-12-02 10:00:40,31.0,7,1280.70,229.50,0.886271,0.923543
...,...,...,...,...,...,...,...
116,2024-12-02 10:19:20,45.0,9,103.87,215.92,1.361476,1.351000
117,2024-12-02 10:19:30,33.0,0,43.36,0.00,1.366542,0.000000
118,2024-12-02 10:19:40,17.0,0,24.53,0.00,1.373765,0.000000
119,2024-12-02 10:19:50,5.0,6,7.22,8.72,1.375920,1.370417


In [6]:

level2_deep5_filepath = '/root/trading_systems/kucoin_dir/kucoin_release_data_initial/2024-12-02_10-00_QUILL/QUILL_level2Depth5_data.json'

# Assuming df is your DataFrame loaded from the file
df_deep5 = file_to_df(level2_deep5_filepath)

# Convert lists to numpy arrays
asks_array = np.array(df_deep5['asks'].tolist(), dtype=float)
bids_array = np.array(df_deep5['bids'].tolist(), dtype=float)

# Extract prices and sizes
ask_prices = asks_array[:, :, 0]
ask_sizes = asks_array[:, :, 1]
bid_prices = bids_array[:, :, 0]
bid_sizes = bids_array[:, :, 1]

# Calculate weighted average prices
weighted_avg_ask_price = np.sum(ask_prices * ask_sizes, axis=1) / np.sum(ask_sizes, axis=1)
weighted_avg_bid_price = np.sum(bid_prices * bid_sizes, axis=1) / np.sum(bid_sizes, axis=1)

# Add the results to the DataFrame
df_deep5['weighted_avg_ask_price'] = weighted_avg_ask_price
df_deep5['weighted_avg_bid_price'] = weighted_avg_bid_price

# Calculate total bid size and total ask size
total_bid_size = np.sum(bid_sizes, axis=1)
total_ask_size = np.sum(ask_sizes, axis=1)

# Calculate imbalance
imbalance = (total_bid_size - total_ask_size) / (total_bid_size + total_ask_size)

# Add the imbalance to the DataFrame
df_deep5['imbalance'] = imbalance

# Display the DataFrame with the new columns
df_deep5[['timestamp', 'weighted_avg_ask_price', 'weighted_avg_bid_price', 'imbalance']]

df_deep5 = prep_timestamp_column(df_deep5)
df_deep5.drop(columns=['time_received','bids','asks'], inplace=True)
df_deep5.head(20)

Unnamed: 0,timestamp,weighted_avg_ask_price,weighted_avg_bid_price,imbalance
0,2024-12-02 10:00:00.085,0.83811,0.604367,0.674873
1,2024-12-02 10:00:00.114,0.83811,0.604367,0.674873
2,2024-12-02 10:00:00.172,1.36423,0.672417,0.90964
3,2024-12-02 10:00:00.277,1.36423,0.725581,0.903226
4,2024-12-02 10:00:00.388,1.614378,1.269828,0.624693
5,2024-12-02 10:00:00.485,1.688976,1.417163,0.40543
6,2024-12-02 10:00:00.590,1.735614,1.500009,0.155187
7,2024-12-02 10:00:00.696,1.735677,1.499972,0.155694
8,2024-12-02 10:00:00.803,1.735708,1.499998,0.155949
9,2024-12-02 10:00:00.906,1.735788,1.500073,0.156584


In [10]:
def aggregate_imbalance(df, interval_seconds=1):    
    # copy to prevent altering original df
    df_copy =df.copy()

    print(df_copy.columns)
    df_copy.set_index('timestamp', inplace=True)


    # Resample the DataFrame to the specified time interval (e.g., 10 seconds)
    resampled_df = df_copy.resample(f'{interval_seconds}s',label='right',closed='right').agg({
        'imbalance': 'mean'
    })
    resampled_df= resampled_df.reset_index()
    return resampled_df
resampled_df = aggregate_imbalance(df_deep5)
resampled_df

# merge resampled_df with df_deep5 to get the imbalance matched with avagerage weighted ask and averegaed weighted bit 
df_deep5_interval = pd.merge_asof(resampled_df, df_deep5, on='timestamp', direction='backward')
df_deep5_interval
 


Index(['timestamp', 'weighted_avg_ask_price', 'weighted_avg_bid_price',
       'imbalance'],
      dtype='object')


In [8]:
# Function to create candlestick data
def create_candlestick_data(df, time_frame='1T'):
    # Resample the data
    ohlc_dict = {
        'avg_bid_ask_price_open': ('avg_bid_ask_price', 'first'),
        'avg_bid_ask_price_high': ('avg_bid_ask_price', 'max'),
        'avg_bid_ask_price_low': ('avg_bid_ask_price', 'min'),
        'avg_bid_ask_price_close': ('avg_bid_ask_price', 'last')
    }
    candlestick_df = df.resample(time_frame, on='timestamp').agg(**ohlc_dict)
    candlestick_df = candlestick_df.dropna()
    return candlestick_df

# Create candlestick data with a 1-minute time frame
candlestick_df = create_candlestick_data(df_match_deep_merge, time_frame='10s')
candlestick_df

NameError: name 'df_match_deep_merge' is not defined

In [117]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create subplots with shared x-axis
fig = make_subplots(rows=2, cols=1, shared_xaxes=True, 
                    vertical_spacing=0.1, 
                    subplot_titles=('Candlestick Chart', 'Volume'))

# Add candlestick chart to the first subplot
fig.add_trace(go.Candlestick(
    x=candlestick_df.index,
    open=candlestick_df['avg_bid_ask_price_open'],
    high=candlestick_df['avg_bid_ask_price_high'],
    low=candlestick_df['avg_bid_ask_price_low'],
    close=candlestick_df['avg_bid_ask_price_close'],
    name='Candlestick'
), row=1, col=1)

# Add buy_volume line to the second subplot
fig.add_trace(go.Scatter(
    x=df_match_agg['timestamp'],
    y=df_match_agg['buy_volume'],
    mode='lines',
    name='Buy Volume'
), row=2, col=1)

# Add sell_volume line to the second subplot
fig.add_trace(go.Scatter(
    x=df_match_agg['timestamp'],
    y=df_match_agg['sell_volume'],
    mode='lines',
    name='Sell Volume'
), row=2, col=1)

# Update layout for better visualization
fig.update_layout(
    title='Candlestick Chart with Volume',
    xaxis_title='Timestamp',
    yaxis_title='Average Bid-Ask Price',
    xaxis2_title='Timestamp',
    yaxis2_title='Volume',
    xaxis_rangeslider_visible=False,
    hovermode='x unified'  # Enable cursor lines indicating x and y axis
)

# Display the chart
fig.show()

In [118]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create subplots with shared x-axis
fig = make_subplots(rows=3, cols=1, shared_xaxes=True, 
                    vertical_spacing=0.1, 
                    subplot_titles=('Candlestick Chart', 'Volume', 'Price Data'))

# Add candlestick chart to the first subplot
fig.add_trace(go.Candlestick(
    x=candlestick_df.index,
    open=candlestick_df['avg_bid_ask_price_open'],
    high=candlestick_df['avg_bid_ask_price_high'],
    low=candlestick_df['avg_bid_ask_price_low'],
    close=candlestick_df['avg_bid_ask_price_close'],
    name='Candlestick'
), row=1, col=1)

# Add buy_volume line to the second subplot
fig.add_trace(go.Scatter(
    x=df_match_agg['timestamp'],
    y=df_match_agg['buy_volume'],
    mode='lines',
    name='Buy Volume'
), row=2, col=1)

# Add sell_volume line to the second subplot
fig.add_trace(go.Scatter(
    x=df_match_agg['timestamp'],
    y=df_match_agg['sell_volume'],
    mode='lines',
    name='Sell Volume'
), row=2, col=1)

# Add traces for the specified price columns to the third subplot
price_columns_to_plot = [
    'avg_match_buy_price', 'avg_match_sell_price', 
    'ask_price_1', 'ask_price_2', 'ask_price_3',
    'ask_price_4', 'ask_price_5',
    'bid_price_1', 'bid_price_2', 'bid_price_3',
    'bid_price_4', 'bid_price_5'
]

for column in price_columns_to_plot:
    fig.add_trace(go.Scatter(
        x=df_merge['timestamp'],
        y=df_merge[column],
        mode='lines',
        name=column
    ), row=3, col=1)

# Update layout for better visualization
fig.update_layout(
    title='Candlestick Chart with Volume and Price Data',
    xaxis_title='Timestamp',
    yaxis_title='Average Bid-Ask Price',
    xaxis2_title='Timestamp',
    yaxis2_title='Volume',
    xaxis3_title='Timestamp',
    yaxis3_title='Price',
    xaxis_rangeslider_visible=False,
    hovermode='x unified'  # Enable cursor lines indicating x and y axis
)

# Display the chart
fig.show()

In [119]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create subplots with shared x-axis
fig = make_subplots(rows=3, cols=1, shared_xaxes=True, 
                    vertical_spacing=0.1, 
                    subplot_titles=('Candlestick Chart', 'Volume', 'Size Data'))

# Add candlestick chart to the first subplot
fig.add_trace(go.Candlestick(
    x=candlestick_df.index,
    open=candlestick_df['avg_bid_ask_price_open'],
    high=candlestick_df['avg_bid_ask_price_high'],
    low=candlestick_df['avg_bid_ask_price_low'],
    close=candlestick_df['avg_bid_ask_price_close'],
    name='Candlestick'
), row=1, col=1)

# Add buy_volume line to the second subplot
fig.add_trace(go.Scatter(
    x=df_match_agg['timestamp'],
    y=df_match_agg['buy_volume'],
    mode='lines',
    name='Buy Volume'
), row=2, col=1)

# Add sell_volume line to the second subplot
fig.add_trace(go.Scatter(
    x=df_match_agg['timestamp'],
    y=df_match_agg['sell_volume'],
    mode='lines',
    name='Sell Volume'
), row=2, col=1)

# Add traces for the specified size columns to the third subplot
size_columns_to_plot = [
    'ask_size_1', 'ask_size_2', 'ask_size_3', 'ask_size_4', 'ask_size_5',
    'bid_size_1', 'bid_size_2', 'bid_size_3', 'bid_size_4', 'bid_size_5'
]

for column in size_columns_to_plot:
    fig.add_trace(go.Scatter(
        x=df_merge['timestamp'],
        y=df_merge[column],
        mode='lines',
        name=column
    ), row=3, col=1)

# Update layout for better visualization
fig.update_layout(
    title='Candlestick Chart with Volume and Size Data',
    xaxis_title='Timestamp',
    yaxis_title='Average Bid-Ask Price',
    xaxis2_title='Timestamp',
    yaxis2_title='Volume',
    xaxis3_title='Timestamp',
    yaxis3_title='Size',
    xaxis_rangeslider_visible=False,
    hovermode='x unified'  # Enable cursor lines indicating x and y axis
)

# Display the chart
fig.show()

In [120]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create subplots with shared x-axis
fig = make_subplots(rows=3, cols=1, shared_xaxes=True, 
                    vertical_spacing=0.1, 
                    subplot_titles=('Candlestick Chart', 'Volume', 'Bid and Ask Prices and Sizes'),
                    specs=[[{"secondary_y": False}], [{"secondary_y": False}], [{"secondary_y": True}]])

# Add candlestick chart to the first subplot
fig.add_trace(go.Candlestick(
    x=candlestick_df.index,
    open=candlestick_df['avg_bid_ask_price_open'],
    high=candlestick_df['avg_bid_ask_price_high'],
    low=candlestick_df['avg_bid_ask_price_low'],
    close=candlestick_df['avg_bid_ask_price_close'],
    name='Candlestick'
), row=1, col=1)

# Add buy_volume line to the second subplot
fig.add_trace(go.Scatter(
    x=df_match_agg['timestamp'],
    y=df_match_agg['buy_volume'],
    mode='lines',
    name='Buy Volume'
), row=2, col=1)

# Add sell_volume line to the second subplot
fig.add_trace(go.Scatter(
    x=df_match_agg['timestamp'],
    y=df_match_agg['sell_volume'],
    mode='lines',
    name='Sell Volume'
), row=2, col=1)

# Add traces for the specified bid and ask prices and sizes to the third subplot
price_columns_to_plot = [
    'ask_price_1', 'ask_price_2', 'ask_price_3', 'ask_price_4', 'ask_price_5',
    'bid_price_1', 'bid_price_2', 'bid_price_3', 'bid_price_4', 'bid_price_5'
]

size_columns_to_plot = [
    'ask_size_1', 'ask_size_2', 'ask_size_3', 'ask_size_4', 'ask_size_5',
    'bid_size_1', 'bid_size_2', 'bid_size_3', 'bid_size_4', 'bid_size_5'
]

# Add bid and ask prices to the primary y-axis
for column in price_columns_to_plot:
    fig.add_trace(go.Scatter(
        x=df_merge['timestamp'],
        y=df_merge[column],
        mode='lines',
        name=column
    ), row=3, col=1, secondary_y=False)

# Add bid and ask sizes to the secondary y-axis
for column in size_columns_to_plot:
    fig.add_trace(go.Scatter(
        x=df_merge['timestamp'],
        y=df_merge[column],
        mode='lines',
        name=column
    ), row=3, col=1, secondary_y=True)

# Update layout for better visualization
fig.update_layout(
    title='Candlestick Chart with Volume and Bid/Ask Prices and Sizes',
    xaxis_title='Timestamp',
    yaxis_title='Average Bid-Ask Price',
    xaxis2_title='Timestamp',
    yaxis2_title='Volume',
    xaxis3_title='Timestamp',
    yaxis3_title='Price',
    yaxis4_title='Size',
    xaxis_rangeslider_visible=False,
    hovermode='x unified'  # Enable cursor lines indicating x and y axis
)

# Display the chart
fig.show()