In [11]:
import pandas as pd
import numpy as np
from datamanager import DGMT
from sklearn.preprocessing import MinMaxScaler
from collections import deque
import random
import pickle
import os

In [12]:
dgmt = DGMT('../../ETHUSDT.csv','time')
dgmt.change_resolution('60min')
df = dgmt.df


In [13]:
def poly_reg(df,n):
    y = df.values.reshape(-1, 1)
    t = np.arange(len(y))
    X = np.c_[np.ones_like(y), t, t ** 2]
    betas = np.linalg.inv(X.T @ X) @ X.T @ y
    new_vals = np.array([1, t[-1]+n, (t[-1]+n)**2])
    pred = new_vals@betas  # beta0 + beta1 * t[-1]+n + beta2 * (t[-1]+n)**2
    return pred
def create_target(df):
    df['poly'] = df.close.rolling(12).apply(poly_reg, args=(3,), raw=False)
    df['poly_sig'] = (df.poly >= df.close) * 1
    df.drop(['poly'], axis=1 ,inplace=True)
    df = df[df.poly_sig != np.inf]
    df['target'] = (df['close'].shift(-3) > df['close']).astype(int)
    return df

def preprocessing_df(df:pd.DataFrame):
    for c in df.columns:
        if c != 'target':
            if c == 'poly_sig':
                continue
            print(c)
            df[c] = df[c].pct_change()
            df.dropna(inplace=True)
            scaler = MinMaxScaler()
            scaler.fit(np.array(df[c].values).reshape(-1,1))
            df[c] = scaler.transform(np.array(df[c].values).reshape(-1,1))
        df.dropna(inplace=True)
    df.dropna(inplace=True)
    return df

def create_sequences(df:pd.DataFrame, lookback:int):
    sequences =  []
    lookback_arr = deque(maxlen = lookback)
    for i in df.values:
        lookback_arr.append([x for x in i[:-1]])
        if len(lookback_arr) == lookback:
            sequences.append([np.array(lookback_arr), i[-1]])
    random.shuffle(sequences)


    buys = [sequences[x] for x in range(len(sequences)) if sequences[x][-1] == 1]
    sells = [sequences[x] for x in range(len(sequences)) if sequences[x][-1] == 0]
    

    min_idx = min(len(buys), len(sells))
    print(len(buys), len(sells))
    buys = buys[:min_idx]
    sells = sells[:min_idx]


    sequences = buys + sells
    random.shuffle(sequences)
    X = [sequences[x][0] for x in range(len(sequences))]
    y = [sequences[x][1] for x in range(len(sequences))]

    return np.array(X),np.array(y)


In [14]:
df = create_target(df)
df = preprocessing_df(df)
val_df = df.iloc[int(len(df)*0.7):]
df = df.iloc[:int(len(df)*0.7)]

volume
open
low
high
close


In [15]:
X_train, y_train = create_sequences(df, 30)
X_test, y_test = create_sequences(val_df,30)

17474 16645
7379 7227


In [16]:
data = {'X_train':X_train, 'y_train':y_train, 'X_test':X_test, 'y_test':y_test}

for i in data.keys():
    if 'eth_15min_filtered_data' not in os.listdir('./'):
        os.mkdir('./eth_15min_filtered_data')
    with open(f'eth_15min_filtered_data/{i}.pickle','wb') as f:
        pickle.dump(data.get(i),f)

In [17]:
len(X_train)

33290

In [18]:
df[df.values == np.inf]

Unnamed: 0_level_0,volume,open,low,high,close,poly_sig,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
