# Formatting Gemini Data
## Ver 2
Formatting Gemini data for input into d3 dashboard

In [29]:
import pandas as pd
from tqdm import tqdm

Read in gemini data

In [3]:
%%time
with open("./gemini_pull_2019_01_16_18_07_06.txt", "r") as file:
    gemini = file.read()

CPU times: user 2.17 ms, sys: 4.69 ms, total: 6.86 ms
Wall time: 80.7 ms


Separate messages

In [4]:
%%time
gemini_list = gemini.replace("}{", "}\n{").split("\n")

CPU times: user 7.25 ms, sys: 7.61 ms, total: 14.9 ms
Wall time: 16.8 ms


In [5]:
len(gemini_list)

10844

The first message is larger, gives remaining interest for all price points at the time of socket open

In [7]:
[len(i) for i in gemini_list[:10]]

[386154, 219, 222, 220, 226, 224, 222, 220, 222, 219]

In [8]:
first_msg = gemini_list[0]

In [10]:
first_msg[:400]

'{"type":"update","eventId":5368619062,"socket_sequence":0,"events":[{"type":"change","reason":"initial","price":"0.01","delta":"191642.102","remaining":"191642.102","side":"bid"},{"type":"change","reason":"initial","price":"0.02","delta":"22959.5","remaining":"22959.5","side":"bid"},{"type":"change","reason":"initial","price":"0.03","delta":"33","remaining":"33","side":"bid"},{"type":"change","rea'

In [11]:
first_message_df = pd.read_json(gemini_list[0])

In [12]:
first_message_df.head()

Unnamed: 0,eventId,events,socket_sequence,type
0,5368619062,"{'type': 'change', 'price': '0.01', 'reason': ...",0,update
1,5368619062,"{'type': 'change', 'price': '0.02', 'reason': ...",0,update
2,5368619062,"{'type': 'change', 'price': '0.03', 'reason': ...",0,update
3,5368619062,"{'type': 'change', 'price': '0.04', 'reason': ...",0,update
4,5368619062,"{'type': 'change', 'price': '0.05', 'reason': ...",0,update


In [13]:
event_keys = first_message_df["events"].iloc[0].keys()

In [14]:
for i in event_keys:
    first_message_df[i] = first_message_df["events"].map(lambda x: x[i])

In [15]:
first_message_df.head()

Unnamed: 0,eventId,events,socket_sequence,type,price,reason,delta,side,remaining
0,5368619062,"{'type': 'change', 'price': '0.01', 'reason': ...",0,change,0.01,initial,191642.102,bid,191642.102
1,5368619062,"{'type': 'change', 'price': '0.02', 'reason': ...",0,change,0.02,initial,22959.5,bid,22959.5
2,5368619062,"{'type': 'change', 'price': '0.03', 'reason': ...",0,change,0.03,initial,33.0,bid,33.0
3,5368619062,"{'type': 'change', 'price': '0.04', 'reason': ...",0,change,0.04,initial,0.00055,bid,0.00055
4,5368619062,"{'type': 'change', 'price': '0.05', 'reason': ...",0,change,0.05,initial,3693.0,bid,3693.0


each price shows up once:

In [17]:
first_message_df.price.value_counts().head()

16490.00    1
8880.00     1
16790.00    1
10600.00    1
4097.00     1
Name: price, dtype: int64

delta always equals remaining:

In [18]:
len(first_message_df[first_message_df.remaining != first_message_df.delta])

0

In [19]:
initial_interest = first_message_df[["side", "price", "remaining"]].drop_duplicates()

In [21]:
len(initial_interest)

3585

In [20]:
gemini_list[1]

'{"type":"update","eventId":5368619071,"timestamp":1547662175,"timestampms":1547662175932,"socket_sequence":1,"events":[{"type":"change","side":"bid","price":"3614.45","remaining":"0","delta":"-0.04","reason":"cancel"}]}'

In [22]:
%%time
update_str = ", ".join(gemini_list[1:])

CPU times: user 1.81 ms, sys: 281 µs, total: 2.09 ms
Wall time: 2.1 ms


In [23]:
%%time
all_updates_df = pd.read_json("[{}]".format(update_str))

CPU times: user 153 ms, sys: 38 ms, total: 191 ms
Wall time: 193 ms


In [24]:
%%time
all_updates_df["num_events"] = all_updates_df.events.map(len)

CPU times: user 5.2 ms, sys: 127 µs, total: 5.32 ms
Wall time: 7.23 ms


In [30]:
for i in tqdm(event_keys):
    all_updates_df[i] = all_updates_df["events"].map(lambda x: x[-1][i] if i in x[-1].keys() else "missing")

100%|██████████| 6/6 [00:00<00:00, 91.83it/s]


In [59]:
all_updates_df["price_double"] = all_updates_df.price.astype("double").round(2)

In [60]:
all_updates_df.num_events.value_counts()

1    10645
2      198
Name: num_events, dtype: int64

In [61]:
trades = all_updates_df[all_updates_df.num_events == 2]

In [62]:
trades.iloc[:2]

Unnamed: 0,eventId,events,socket_sequence,timestamp,timestampms,type,num_events,price,reason,delta,side,remaining,price_double
37,5368619504,"[{'type': 'trade', 'amount': '0.45', 'tid': 53...",38,2019-01-16 18:09:37,2019-01-16 18:09:37.570,change,2,3620.42,trade,-0.45,bid,0.518,3620.42
107,5368620111,"[{'type': 'trade', 'amount': '0.1', 'tid': 536...",108,2019-01-16 18:09:40,2019-01-16 18:09:40.469,change,2,3620.5,trade,-0.1,bid,0.611,3620.5


In [64]:
trades_range

[3612.6399999999999, 3622.96]

In [73]:
trades.sort_values("price_double").price_double.round(3).iloc[0]

3612.6399999999999

In [77]:
all_updates_df.price.map(len).value_counts()

7    10843
Name: price, dtype: int64

In [79]:
all_updates_df.price.nunique()

1775

In [83]:
all_updates_df["price_padded"] = all_updates_df.price.str.pad(7, "left", "0")

In [92]:
trades_range = [
    all_updates_df[all_updates_df.num_events == 2].price_padded.min(),
    all_updates_df[all_updates_df.num_events == 2].price_padded.max()]

In [86]:
interest_changes = all_updates_df[["timestampms", "price_padded", "remaining", "side"]]

In [96]:
interest_changes_in_trades_range = interest_changes[interest_changes.price_padded.between(*trades_range)]

In [99]:
interest_changes_in_trades_range.shape

(6716, 5)

In [104]:
interest_changes_in_trades_range["next_tim"] = interest_changes_in_trades_range.sort_values(
    "timestampms").groupby(
    ["price_padded", "side"])["timestampms"].shift(-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [105]:
interest_changes_in_trades_range.iloc[:10]

Unnamed: 0,timestampms,price_padded,remaining,side,next_tim
0,2019-01-16 18:09:35.932,3614.45,0.0,bid,2019-01-16 18:19:33.636
10,2019-01-16 18:09:36.331,3615.56,0.08,bid,2019-01-16 18:09:40.055
13,2019-01-16 18:09:36.470,3613.42,0.54,bid,2019-01-16 18:09:36.569
15,2019-01-16 18:09:36.546,3619.86,0.0,bid,NaT
16,2019-01-16 18:09:36.563,3619.54,6.8651,bid,2019-01-16 18:09:37.612
17,2019-01-16 18:09:36.569,3613.42,0.62,bid,2019-01-16 18:09:37.871
31,2019-01-16 18:09:37.209,3615.99,0.18,bid,2019-01-16 18:09:37.308
33,2019-01-16 18:09:37.308,3615.99,0.36,bid,2019-01-16 18:09:37.609
37,2019-01-16 18:09:37.570,3620.42,0.518,bid,2019-01-16 18:09:39.077
38,2019-01-16 18:09:37.609,3615.99,0.44,bid,2019-01-16 18:09:37.709


In [106]:
interest_changes_in_trades_range.to_json("gemini_interest.j")