In [1]:
import glob
import pandas as pd
import numpy as np
from typing import List

In [2]:
# Glob pattern selecting the pickle files to load; the relative path is
# resolved against the notebook's current working directory.
pickle_files_path = "../data/temp_test_data/*.pkl"

In [3]:
def _read_pickle_files(path: str) -> List:
  pickle_files  = glob.glob(path)
  data          = []

  pickle_files.sort()

  for pickle_file in pickle_files:
    with open(pickle_file, 'rb') as f:
      file_contents = pd.read_pickle(f)
      data.append(file_contents)

  return data 



def _calc_euclidean_distance(arr1, arr2):
  return np.sqrt(np.sum((np.array(arr1) - np.array(arr2)) ** 2))



def _calculate_slope(weights):
    t = np.arange(len(weights))
    slope, intercept = np.polyfit(t, weights, 1)
    return slope

In [4]:
# Load the contents of every pickle file matched by the glob pattern
# (the helper reads them in sorted path order).
raw_pickle_data = _read_pickle_files(pickle_files_path)

In [5]:
# Work with the first loaded file only; this raises IndexError when the glob
# pattern matched no files.
first_file = raw_pickle_data[0]

In [6]:
# Check which container type holds the price data.
type(first_file['price_data'])

pandas.core.frame.DataFrame

In [7]:
# Preview the close_time column of the price data.
# NOTE(review): the values look like Unix epoch seconds -- confirm the unit.
first_file['price_data']['close_time']

65982    1.641687e+09
65983    1.641687e+09
65984    1.641687e+09
65985    1.641688e+09
65986    1.641689e+09
             ...     
66073    1.641770e+09
66074    1.641770e+09
66075    1.641771e+09
66076    1.641771e+09
66077    1.641772e+09
Name: close_time, Length: 96, dtype: float64

In [8]:
# Keep only the close_time column as a one-column frame and renumber the rows
# 0..n-1, discarding the original index labels.
df1 = first_file['price_data'][['close_time']].reset_index(drop=True)

In [9]:
# Build a frame from the prediction details restricted to the 'long' and
# 'short' columns, with the rows renumbered 0..n-1.
df2 = (
    pd.DataFrame(first_file['prediction_details'], columns=['long', 'short'])
    .reset_index(drop=True)
)

In [10]:
# pd.concat(axis=1) aligns rows on index labels and pads with NaN when the two
# frames differ in length, which would silently pair timestamps with missing
# predictions (or vice versa). Fail loudly instead.
assert len(df1) == len(df2), (
    f"row count mismatch: {len(df1)} close_time rows vs {len(df2)} prediction rows"
)

# Put timestamps and predictions side by side, ordered by close_time.
df = pd.concat([df1, df2], axis=1).sort_values('close_time')

In [11]:
# Display the combined frame: close_time next to the long/short prediction vectors.
df

Unnamed: 0,close_time,long,short
0,1.641687e+09,"[0.00028395813, 0.00018264189, 0.00082501565, ...","[1.5434945e-05, 1.1722226e-05, 8.31921e-06, 3...."
1,1.641687e+09,"[6.961259e-05, 5.0820454e-05, 0.00030092167, 0...","[4.5889006e-05, 3.128108e-05, 1.8344024e-05, 6..."
2,1.641687e+09,"[0.00015814097, 0.0001234603, 0.00070723606, 0...","[5.8321515e-05, 2.863668e-05, 1.5200444e-05, 5..."
3,1.641688e+09,"[0.00012365845, 0.00010381089, 0.000671683, 0....","[3.4932964e-05, 2.065421e-05, 1.0802528e-05, 3..."
4,1.641689e+09,"[5.238538e-05, 4.9717157e-05, 7.11477e-05, 9.8...","[0.00023490381, 7.9151985e-05, 3.4724213e-05, ..."
...,...,...,...
91,1.641770e+09,"[0.0646079, 0.088854805, 0.12852715, 0.1199270...","[0.00012455195, 3.9227798e-05, 9.144978e-06, 4..."
92,1.641770e+09,"[0.06294112, 0.13441809, 0.38791734, 0.3512447...","[1.1209628e-05, 1.4858751e-06, 6.5118496e-07, ..."
93,1.641771e+09,"[0.15458693, 0.28162307, 0.34183595, 0.1739281...","[3.764567e-06, 1.0988465e-06, 7.35294e-07, 3.3..."
94,1.641771e+09,"[0.0842035, 0.18735978, 0.42329744, 0.2712586,...","[5.7789043e-06, 1.1860399e-06, 5.8768524e-07, ..."


In [12]:
# Slope of a straight-line fit through each row's prediction vector.
df['long_slope'] = df['long'].apply(_calculate_slope)
df['short_slope'] = df['short'].apply(_calculate_slope)

# Despite the column name, this is the Euclidean (L2) distance between the
# long and short prediction vectors, not an element-wise subtraction.
df['long_minus_short'] = df.apply(
    lambda record: _calc_euclidean_distance(record['long'], record['short']),
    axis=1,
)

In [13]:
# Display the frame again, now including the derived long_slope, short_slope
# and long_minus_short columns.
df

Unnamed: 0,close_time,long,short,long_slope,short_slope,long_minus_short
0,1.641687e+09,"[0.00028395813, 0.00018264189, 0.00082501565, ...","[1.5434945e-05, 1.1722226e-05, 8.31921e-06, 3....",-0.002697,0.000454,0.714199
1,1.641687e+09,"[6.961259e-05, 5.0820454e-05, 0.00030092167, 0...","[4.5889006e-05, 3.128108e-05, 1.8344024e-05, 6...",-0.002642,0.000602,0.753278
2,1.641687e+09,"[0.00015814097, 0.0001234603, 0.00070723606, 0...","[5.8321515e-05, 2.863668e-05, 1.5200444e-05, 5...",-0.002670,0.000548,0.711587
3,1.641688e+09,"[0.00012365845, 0.00010381089, 0.000671683, 0....","[3.4932964e-05, 2.065421e-05, 1.0802528e-05, 3...",-0.002742,0.000671,0.765314
4,1.641689e+09,"[5.238538e-05, 4.9717157e-05, 7.11477e-05, 9.8...","[0.00023490381, 7.9151985e-05, 3.4724213e-05, ...",-0.001814,0.001375,0.744892
...,...,...,...,...,...,...
91,1.641770e+09,"[0.0646079, 0.088854805, 0.12852715, 0.1199270...","[0.00012455195, 3.9227798e-05, 9.144978e-06, 4...",-0.002933,0.003952,0.803543
92,1.641770e+09,"[0.06294112, 0.13441809, 0.38791734, 0.3512447...","[1.1209628e-05, 1.4858751e-06, 6.5118496e-07, ...",-0.003559,0.002267,0.742605
93,1.641771e+09,"[0.15458693, 0.28162307, 0.34183595, 0.1739281...","[3.764567e-06, 1.0988465e-06, 7.35294e-07, 3.3...",-0.003674,0.001954,0.735269
94,1.641771e+09,"[0.0842035, 0.18735978, 0.42329744, 0.2712586,...","[5.7789043e-06, 1.1860399e-06, 5.8768524e-07, ...",-0.003612,0.002022,0.744928


In [14]:
# Pull the prediction vectors out of the first row (by position) of df.
long_arr = df['long'].iloc[0]
short_arr = df['short'].iloc[0]

In [15]:
# Recompute the three metrics for the single row selected above, so they can
# be checked against the values stored in df.
long_slope, short_slope = map(_calculate_slope, (long_arr, short_arr))
long_minus_short = _calc_euclidean_distance(long_arr, short_arr)

In [41]:
# One value per line: long slope, short slope, Euclidean distance.
print(long_slope, short_slope, long_minus_short, sep='\n')

-0.0026965933220893267
0.0004540277800336521
0.7141988


In [19]:
# Show the raw 'short' prediction vector taken from the first row of df.
short_arr

array([1.5434945e-05, 1.1722226e-05, 8.3192099e-06, 3.6023023e-06,
       4.2805736e-06, 3.9104411e-06, 3.2810940e-06, 1.7069186e-06,
       1.9690181e-06, 8.0227966e-07, 1.5354541e-06, 1.6222237e-06,
       1.6068579e-05, 1.4560219e-05, 1.6056418e-05, 1.9931197e-04,
       8.3784567e-04, 5.4696565e-03, 3.3471279e-02, 1.4629954e-01,
       2.4904542e-01, 3.5774630e-01, 1.8818459e-01, 1.1717024e-02,
       3.5268713e-03, 1.6545877e-03, 1.4383583e-04, 1.1129983e-04,
       8.1453858e-05, 8.3636267e-05, 1.6045864e-04, 5.6381605e-04,
       4.2643840e-04, 7.2842486e-05, 6.8814115e-05, 1.5250671e-05,
       3.8515818e-06, 1.1045595e-05], dtype=float32)