In [1]:
# Setup
import numpy as np
import pandas as pd
import pyarrow as pa

import pykx as kx
# Limit q console output to 15 rows by 100 columns so displayed PyKX objects stay compact
kx.q.system.console_size = [15, 100]

# Section #2: Built-In Methods for Analytics on PyKX Objects

## Interacting with PyKX Objects: Indexing & Slicing

In [2]:
# Use PyKX to read the BTC CSV into a PyKX table
# This dataset holds bitcoin trading data for every minute from 2017.08.17 to early 2024
tab_BTC = kx.q.read.csv('data/BTCUSDT.csv')
tab_BTC.head()

Unnamed: 0,timestamp,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,ignore
,,,,,,,,,,,,
0.0,2017.08.17D04:00:00.000000000,4261.48e,4261.48e,4261.48e,4261.48e,1.775183,2017.08.17D04:00:59.999000000,7564.907,3h,0.075183,320.3909,0b
1.0,2017.08.17D04:01:00.000000000,4261.48e,4261.48e,4261.48e,4261.48e,0f,2017.08.17D04:01:59.999000000,0f,0h,0f,0f,0b
2.0,2017.08.17D04:02:00.000000000,4280.56e,4280.56e,4280.56e,4280.56e,0.261074,2017.08.17D04:02:59.999000000,1117.543,2h,0.261074,1117.543,0b
3.0,2017.08.17D04:03:00.000000000,4261.48e,4261.48e,4261.48e,4261.48e,0.012008,2017.08.17D04:03:59.999000000,51.17185,3h,0.012008,51.17185,0b
4.0,2017.08.17D04:04:00.000000000,4261.48e,4261.48e,4261.48e,4261.48e,0.140796,2017.08.17D04:04:59.999000000,599.9993,1h,0.140796,599.9993,0b


#### Use indexing to identify the number of trades in the first row (the first one-minute interval):

In [3]:
# Select the column by name, then take row 0 (indexing is zero-based); the result is a single atom
tab_BTC['number_of_trades'][0]

pykx.ShortAtom(pykx.q('3h'))

##### Exercise 4
+ Using the `tab_BTC` table, return the close price for the 100th row (the 100th one-minute interval)

In [4]:
# Zero-based indexing: the 100th row lives at index 99
tab_BTC['close'][99]

pykx.RealAtom(pykx.q('4291.37e'))

#### Use Slicing to get the first 24 hours (1440 minutes) of the dataset:

In [5]:
# One row per minute, so the first 1440 rows cover the first 24 hours of close prices
first_day_prices_BTC = tab_BTC['close'][:1440]
first_day_prices_BTC

pykx.RealVector(pykx.q('4261.48 4261.48 4280.56 4261.48 4261.48 4261.48 4261.48 4261.48 4261.48 4261.48 4261.48 4261.48 4..'))

##### Exercise 5
+ Create a variable called `third_day_prices_BTC` and use slicing to get the close prices for the first 72 hours (4320 minutes) of the dataset.

In [6]:
# 72 hours * 60 minutes = 4320 rows; note this covers days 1-3 combined, not only the third day
third_day_prices_BTC = tab_BTC['close'][:4320]
third_day_prices_BTC

pykx.RealVector(pykx.q('4261.48 4261.48 4280.56 4261.48 4261.48 4261.48 4261.48 4261.48 4261.48 4261.48 4261.48 4261.48 4..'))

## Data Manipulation Operations

#### Using Pandas-like API

In [7]:
first_day_prices_BTC.mean()

pykx.FloatAtom(pykx.q('4343.483'))

In [8]:
first_day_prices_BTC.min()

pykx.RealAtom(pykx.q('4142.67e'))

In [9]:
first_day_prices_BTC.max()

pykx.RealAtom(pykx.q('4485.39e'))

In [10]:
first_day_prices_BTC.median()

pykx.FloatAtom(pykx.q('4328.67'))

##### Exercise 6

+ Using `third_day_prices_BTC`, calculate the sum of the close prices over the first three days (72 hours).

In [11]:
# Sums every close price in the 72-hour slice (all three days, since the slice starts at row 0)
third_day_prices_BTC.sum()

pykx.RealAtom(pykx.q('1.816895e+07e'))

## Custom Functions with apply()

In [12]:
# Custom function for custom analytics
def custom_function(x, y):
    """Return the product of ``x`` and ``y``.

    Relies only on the ``*`` operator, so it accepts anything that supports
    multiplication (plain numbers as well as vector-like objects).

    Args:
        x: Left-hand operand.
        y: Right-hand operand (the multiplier).

    Returns:
        The result of ``x * y``.
    """
    product = x * y
    return product

In [13]:
# The extra positional argument is passed through to the function as y,
# so every first-day close price is multiplied by 5
first_day_prices_BTC.apply(custom_function,5)

pykx.RealVector(pykx.q('21307.4 21307.4 21402.8 21307.4 21307.4 21307.4 21307.4 21307.4 21307.4 21307.4 21307.4 21307.4 2..'))

## PyKX Native Functions

PyKX native functions generally run faster and more efficiently than the equivalent NumPy or pandas operations.
See Documentation: https://code.kx.com/pykx/2.5/api/pykx-execution/q.html

In [14]:
# Max
kx.q.max(first_day_prices_BTC)

pykx.RealAtom(pykx.q('4485.39e'))

In [15]:
# Min
kx.q.min(first_day_prices_BTC)

pykx.RealAtom(pykx.q('4142.67e'))

In [16]:
# Average
kx.q.avg(first_day_prices_BTC)

pykx.FloatAtom(pykx.q('4343.483'))

In [17]:
# Median
kx.q.med(first_day_prices_BTC)

pykx.FloatAtom(pykx.q('4328.67'))

In [18]:
# Count
kx.q.count(first_day_prices_BTC)

pykx.LongAtom(pykx.q('1440'))

In [19]:
# Sorting (ascending); the `s# prefix in the output is q's "sorted" attribute on the result
kx.q.asc(first_day_prices_BTC)

pykx.RealVector(pykx.q('`s#4142.67 4164.28 4172.81 4196.73 4196.73 4196.73 4200.74 4202.73 4202.73 4202.75 4202.75 4208.1..'))

##### Exercise 7
+ Sort `first_day_prices_BTC` in descending order

In [20]:
kx.q.desc(first_day_prices_BTC)

pykx.RealVector(pykx.q('4485.39 4485.39 4485.39 4485.39 4485.39 4485.39 4485.39 4485.39 4485.39 4485.39 4485.39 4485.39 4..'))

### Analytics Between Datasets

#### Let's load another CSV containing data on ETH to do analytics across datasets:

In [21]:
# Load both CSVs with pandas so the same calculations can later be timed against PyKX
btc_df = pd.read_csv("data/BTCUSDT.csv")
eth_df = pd.read_csv("data/ETHUSDT.csv")
print(eth_df.shape)
eth_df.head()

(3565390, 12)


Unnamed: 0,timestamp,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,ignore
0,2017-08-17 04:00:00,301.13,301.13,301.13,301.13,0.42643,2017-08-17 04:00:59.999,128.410866,2,0.42643,128.410866,0
1,2017-08-17 04:01:00,301.13,301.13,301.13,301.13,2.75787,2017-08-17 04:01:59.999,830.477393,4,2.75787,830.477393,0
2,2017-08-17 04:02:00,300.0,300.0,300.0,300.0,0.0993,2017-08-17 04:02:59.999,29.79,2,0.0993,29.79,0
3,2017-08-17 04:03:00,300.0,300.0,300.0,300.0,0.31389,2017-08-17 04:03:59.999,94.167,3,0.0,0.0,0
4,2017-08-17 04:04:00,301.13,301.13,301.13,301.13,0.23202,2017-08-17 04:04:59.999,69.868183,1,0.23202,69.868183,0


In [22]:
# Load with PyKX
tab_ETH = kx.q.read.csv('data/ETHUSDT.csv')
tab_ETH.head()

Unnamed: 0,timestamp,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,ignore
,,,,,,,,,,,,
0.0,2017.08.17D04:00:00.000000000,301.13e,301.13e,301.13e,301.13e,0.42643,2017.08.17D04:00:59.999000000,128.4109,2h,0.42643,128.4109,0b
1.0,2017.08.17D04:01:00.000000000,301.13e,301.13e,301.13e,301.13e,2.75787,2017.08.17D04:01:59.999000000,830.4774,4h,2.75787,830.4774,0b
2.0,2017.08.17D04:02:00.000000000,300e,300e,300e,300e,0.0993,2017.08.17D04:02:59.999000000,29.79,2h,0.0993,29.79,0b
3.0,2017.08.17D04:03:00.000000000,300e,300e,300e,300e,0.31389,2017.08.17D04:03:59.999000000,94.167,3h,0f,0f,0b
4.0,2017.08.17D04:04:00.000000000,301.13e,301.13e,301.13e,301.13e,0.23202,2017.08.17D04:04:59.999000000,69.86818,1h,0.23202,69.86818,0b


In [23]:
# Show the ETH table's shape as (rows, columns); compare with tab_BTC.shape to confirm both datasets are the same size
tab_ETH.shape

(pykx.LongAtom(pykx.q('3565390')), pykx.LongAtom(pykx.q('12')))

In [24]:
# Create a PyKX vector of the first day (first 1440 one-minute rows) of ETH close prices
first_day_prices_ETH = tab_ETH['close'][:1440]
print(first_day_prices_ETH)

301.13 301.13 300 300 301.13 301.13 300.1 300.1 298 298 298 298 298 298 298 298 298 298 298 299.0..


#### For the first day of prices, find the correlation between ETH and BTC using both PyKX and pandas functionality. In the timings below, PyKX runs ~40% faster than pandas.

In [25]:
%%timeit
kx.q.cor(first_day_prices_BTC, first_day_prices_ETH)

115 µs ± 10.5 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [26]:
%%timeit
# NOTE: unlike the PyKX timing cell, this timed statement also performs the column lookups and
# [:1440] slices on every loop, so the two timings are not strictly like-for-like.
btc_df['close'][:1440].corr(eth_df['close'][:1440])

161 µs ± 13.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


#### Let's scale this correlation measurement to the entire dataset for BTC and ETH, which spans 3565390 rows. In the timings below, PyKX runs ~5x faster than pandas.

In [27]:
%%timeit
kx.q.cor(tab_BTC['close'], tab_ETH['close'])

9.46 ms ± 417 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [28]:
%%timeit
btc_df['close'].corr(eth_df['close'])

44.8 ms ± 4.02 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
