In [1]:
import sys
import json

import numpy as np
import matplotlib.pyplot as plt

from io import StringIO
from numba import jit

from typing import Tuple

# Testing the logic

## Set up the arrays to use for storing and analyzing data

Note that the arrays get quite large (`market_data` alone is roughly 38 GB as float64), so make sure you have enough RAM.

In [2]:
num_markets = 12001
num_data_points = 100001

# np.full writes NaN straight into a single allocation. The previous
# `np.empty(...) * np.nan` produced the same all-NaN result but had to build a
# second full-size temporary for the multiplication, doubling peak memory.
# Last axis of market_data: 0 = volume, 1 = price, 2 = volume * price, 3 = is_buy.
market_data = np.full((num_markets, num_data_points, 4), np.nan, dtype=np.float64)
# One row of five summary statistics per market.
market_analysis = np.full((num_markets, 5), np.nan, dtype=np.float64)

In [3]:
# Sanity check: display the dtype of the analysis array (requested as float64 at allocation).
market_analysis.dtype

dtype('float64')

## Use the numba JIT compiler for core analysis
Roughly 10x speedup over vanilla numpy

In [8]:
# Only the fast-math flags that do not assume NaN/Inf are absent are enabled.
# The arrays use NaN as the "no data" sentinel, and plain fastmath=True also
# turns on LLVM's `nnan`/`ninf` flags, under which the NaN tests inside
# np.nansum / np.nanmean are not guaranteed to survive optimisation.
@jit(nopython=True, fastmath={"nsz", "arcp", "contract", "afn", "reassoc"})
def analyze_data(market: int, idx: int, market_data: np.ndarray, market_analysis: np.ndarray):
    """Recompute the summary statistics of one market from rows ``0..idx``.

    Reads ``market_data[market, :idx + 1]`` (last axis: 0 = volume, 1 = price,
    3 = is_buy) and overwrites ``market_analysis[market, 0:5]`` in place with:

    0. total volume (NaN rows ignored)
    1. mean price (NaN rows ignored)
    2. mean volume, i.e. total volume / (idx + 1)
    3. volume-weighted mean price, sum(volume * price) / sum(volume);
       NaN when the total volume is zero
    4. percentage of rows flagged as buys, relative to (idx + 1)

    Statistics 2 and 4 divide by ``idx + 1``, so rows that are still NaN count
    as zero-volume, non-buy rows.

    Parameters
    ----------
    market : int
        Row of ``market_data`` / ``market_analysis`` to process.
    idx : int
        Index of the last data point to include (inclusive).
    market_data : np.ndarray
        Array indexed as ``[market, data point, field]``.
    market_analysis : np.ndarray
        Array indexed as ``[market, statistic]``; modified in place.

    Returns
    -------
    None
    """
    n_rows = idx + 1

    # Total volume
    total_volume = np.nansum(market_data[market, :idx + 1, 0])
    market_analysis[market, 0] = total_volume
    # Mean price
    market_analysis[market, 1] = np.nanmean(market_data[market, :idx + 1, 1])
    # Mean volume
    market_analysis[market, 2] = total_volume / n_rows
    # Volume weighted price mean: sum(v * p) / sum(v), not the plain mean of v * p.
    # The explicit zero check matters because numba's default error model raises
    # ZeroDivisionError instead of returning NaN.
    if total_volume != 0.0:
        weighted_sum = np.nansum(market_data[market, :idx + 1, 0] * market_data[market, :idx + 1, 1])
        market_analysis[market, 3] = weighted_sum / total_volume
    else:
        market_analysis[market, 3] = np.nan
    # Percentage buys: the is_buy flag lives in column 3 (column 0 is the volume).
    # NaN == 1 is False, so unfilled rows are never counted as buys.
    market_analysis[market, 4] = np.count_nonzero(market_data[market, :idx + 1, 3] == 1) / n_rows * 100
    
# Call the function once up front — presumably to trigger numba's JIT compilation
# so that the timed runs further down measure execution only (TODO confirm intent).
analyze_data(0, 0, market_data, market_analysis)

## Extract data from the json

In [9]:
def get_data(line: dict) -> Tuple[int, int, np.ndarray]:
    """Store one decoded JSON record in the module-level ``market_data`` array.

    Parameters
    ----------
    line : dict
        Decoded record (e.g. the result of ``json.loads``) with the keys
        ``"id"``, ``"market"``, ``"volume"``, ``"price"`` and ``"is_buy"``.

    Returns
    -------
    tuple
        ``(idx, market, market_data)``: the record id, the market number and
        the (mutated) module-level array. Note that the id comes first.

    Raises
    ------
    KeyError
        If ``line`` lacks one of the expected keys.
    IndexError
        If ``id`` or ``market`` is negative or beyond the array bounds.
    """
    # Read every field before writing, so a missing key cannot leave a
    # half-populated row behind.
    idx = line["id"]
    market = line["market"]
    volume = line["volume"]
    price = line["price"]
    is_buy = line["is_buy"]

    # numpy would silently wrap negative indices around to the end of the
    # array and overwrite unrelated data; reject them explicitly.
    if idx < 0 or market < 0:
        raise IndexError(f"negative index in record: id={idx}, market={market}")

    market_data[market, idx, 0] = volume
    market_data[market, idx, 1] = price
    market_data[market, idx, 2] = volume * price
    market_data[market, idx, 3] = is_buy
    return idx, market, market_data

## Simulated stdin

In [10]:
# Replace the process-wide stdin with an in-memory stream holding a single JSON record.
# NOTE: sys.stdin is not restored afterwards.
sys.stdin = StringIO('{"id":100000,"market":5773,"price":1.234,"volume":1234.56,"is_buy":true}')
# Read that one line back and decode it into a dict.
line = json.loads(sys.stdin.readline())
# get_data returns (id, market, array); `market_data` is rebound to the returned array.
idx, market, market_data = get_data(line)

### Time the code execution

In [None]:
%%timeit
# Time one analysis pass using the `market` and `idx` of the record loaded above.
analyze_data(market, idx, market_data, market_analysis)

## Variation of analysis time with id
Data shown on a log-log plot

Limitations
- The additional elements are just NaNs, not actual floats, so these timings are a lower bound on the analysis time

In [None]:
# Record ids that were timed: powers of ten from 1 up to 100000.
ids = [10 ** exponent for exponent in range(6)]
# Analysis time in seconds for each id above (same order).
# Presumably copied by hand from %%timeit runs — TODO confirm and re-measure if the code changes.
analysis_time = [
    314e-9,
    340e-9,
    884e-9,
    6.01e-6,
    58.6e-6,
    578e-6,
]

In [None]:
# Log-log plot of analysis time against record id, written to disk as a JPEG.
fig, ax = plt.subplots()
ax.plot(ids, analysis_time, "o-")
ax.set(
    xlabel="ID of element",
    ylabel="Analysis time (s)",
    xscale="log",
    yscale="log",
)
fig.savefig("variation_analysis_time_with_id.jpg", dpi=300)