In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from torch.utils.data import TensorDataset, DataLoader
from db.session import get_last_data_by_symbol
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import tracemalloc
import time

In [47]:
# Query the stored rows for symbol "xar", passing the `until` bound shown below.
recordsAll = get_last_data_by_symbol(symbol="xar", until="2025-06-26 00:00:00")

# Materialise the result as a DataFrame: rows first, then the column labels
# reported by the result object itself.
record_rows = recordsAll.fetchall()
record_columns = list(recordsAll.keys())
df = pd.DataFrame(record_rows, columns=record_columns)

# Report how many rows were loaded.
print(f"Total records: {len(df)}")


Total records: 1946


In [48]:

# Feature engineering and target construction for the freshly loaded ticker frame.
# NOTE: this cell consumes the `created_at` column, so it has to run on the frame
# produced by the load cell; running it twice in a row raises KeyError in set_index.
# NOTE(review): every shift/rolling below assumes the rows are already in ascending
# time order - confirm against the query.

# Forward-return thresholds (in percent) at or above which a row is labelled positive.
RETURN_THRESHOLD_PCNT_3 = 2
RETURN_THRESHOLD_PCNT_6 = 5

# Set the timestamp column as the index
df = df.set_index("created_at")

# Row-over-row relative changes. A zero in the previous row makes pct_change return
# +/-inf, which dropna() does not remove; convert those to NaN here so the affected
# rows (and every rolling window that contains them) are discarded by the final dropna().
df["price_change"] = df["lastPrice"].pct_change().replace([np.inf, -np.inf], np.nan)
df["volume_change"] = df["volume24h"].pct_change().replace([np.inf, -np.inf], np.nan)
# The small epsilon keeps the division defined when ask1Size is zero.
df["bid_ask_ratio"] = df["bid1Size"] / (df["ask1Size"] + 1e-9)

# Short trailing means of the price change (an 8-row window was tried and is disabled).
df["mean_price_2"] = df["price_change"].rolling(window=2).mean()
df["mean_price_4"] = df["price_change"].rolling(window=4).mean()

# Trailing means of the volume change (an 8-row window was tried and is disabled).
df["mean_volume_2"] = df["volume_change"].rolling(window=2).mean()
df["mean_volume_4"] = df["volume_change"].rolling(window=4).mean()
# shift(8) followed by an 8-row window averages rows t-15 .. t-8.
df["mean_volume_8_16"] = df["volume_change"].shift(8).rolling(window=8).mean()

df["mean_bid_ask_ratio_4"] = df["bid_ask_ratio"].rolling(window=4).mean()
df["mean_bid_ask_ratio_8_16"] = df["bid_ask_ratio"].shift(8).rolling(window=8).mean()

# Target
# shift(-3) + 3-row window -> mean price of rows t+1 .. t+3
lp_shift_3 = df["lastPrice"].shift(-3).rolling(window=3).mean()
# shift(-6) + 3-row window -> mean price of rows t+4 .. t+6
lp_shift_3_6 = df["lastPrice"].shift(-6).rolling(window=3).mean()

# Forward return, in percent, of each future mean price relative to the current price.
df["future_return_pcnt_3"] = (lp_shift_3 - df["lastPrice"]) / df["lastPrice"] * 100
df["future_return_pcnt_6"] = (lp_shift_3_6 - df["lastPrice"]) / df["lastPrice"] * 100

# Rows whose forward return is NaN compare as False (label 0) here; they are removed
# by the dropna() at the end of the cell because the return columns stay in the frame.
df["target_lp_sh_3"] = (df["future_return_pcnt_3"] >= RETURN_THRESHOLD_PCNT_3).astype(int)
df["target_lp_sh_6"] = (df["future_return_pcnt_6"] >= RETURN_THRESHOLD_PCNT_6).astype(int)

# Final target: only the t+4 .. t+6 condition is used (target_lp_sh_3 is computed
# above but not combined into the label).
df["final_target"] = (df["target_lp_sh_6"]).astype(int)

# Descriptive statistics of the raw price, computed before any rows are dropped.
skewness = df["lastPrice"].skew()
kurtosis = df["lastPrice"].kurtosis()
min_price = df["lastPrice"].min()
max_price = df["lastPrice"].max()
price_difference = max_price - min_price
percentage_difference_from_min = (price_difference / min_price) * 100

print(f"Skewness of lastPrice: {skewness}")
print(f"Kurtosis of lastPrice: {kurtosis}")
print(f"Minimum of lastPrice: {min_price}")
print(f"Maximum of lastPrice: {max_price}")
print(f"Percentage Difference from Minimum: {percentage_difference_from_min}%")

# Drop rows with NaN values (window warm-up rows, rows without a full future
# horizon, and rows whose pct_change was non-finite).
df = df.dropna()

Skewness of lastPrice: -0.3037851097481479
Kurtosis of lastPrice: -1.4517396552829014
Minimum of lastPrice: 0.002351
Maximum of lastPrice: 0.005917
Percentage Difference from Minimum: 151.68013611229267%


In [None]:
# Forward return of the t+4 .. t+6 horizon, with the 5% line drawn for reference.
return_ax = df["future_return_pcnt_6"].plot(figsize=(17, 5), title="future_return_pcnt_6 over time")
return_ax.set_xlabel("Date")
return_ax.set_ylabel("future_return_pcnt_6")
return_ax.grid(True)
return_ax.axhline(y=5, color='r', linestyle='--', label='Threshold 5%')
return_ax.legend()
plt.show()

# Binary label over time, drawn on its own (shorter) figure.
target_ax = df["final_target"].plot(figsize=(17, 3), title="final_target over time")
target_ax.set_xlabel("Date")
target_ax.set_ylabel("final_target")
target_ax.grid(True)
plt.show()


In [50]:
# Columns fed to the model, followed by the label column.
feature_columns = ["price_change", "volume_change", "bid_ask_ratio", "mean_price_2", "mean_price_4"]
target_column = "final_target"

# Narrow the frame to the model inputs plus the label, then split the two apart.
df_for_loader = df[feature_columns + [target_column]]
X = df_for_loader[feature_columns]
y = df_for_loader[target_column]

In [None]:
# Chronological train / validation / test split, sequence construction and DataLoaders.
# pandas, torch, train_test_split and DataLoader come from the import cell at the top
# of the notebook; only the project-local helpers are imported here.
from db.dataset import TimeSeriesDataset, create_sequences

TEST_SIZE = 0.2  # 20% of the rows go to the test split
RANDOM_SEED = 42  # seeds the shuffling order of the training DataLoader
VALIDATION_SPLIT_RATIO = 0.15  # share of the train+validation rows used for validation
SEQUENCE_LENGTH = 16
BATCH_SIZE = 32

# 1. Split the source data into train+validation and test sets (chronologically).
# A smaller test_size keeps more data for training/validation.
# random_state is omitted on purpose: with shuffle=False there is nothing random to seed.
# NOTE(review): final_target looks up to 6 rows ahead, so the last labels of one split
# are computed from prices that fall inside the next split - consider leaving a gap
# between the splits.
X_train_val_df, X_test_df, y_train_val_series, y_test_series = train_test_split(
    X, y, test_size=TEST_SIZE, shuffle=False
)

print(f"Train/Validation X_df shape: {X_train_val_df.shape}, y_series shape: {y_train_val_series.shape}")
print(f"Test X_df shape: {X_test_df.shape}, y_series shape: {y_test_series.shape}")

# 2. Split train+validation into train and validation (chronologically!).
# The validation set is the trailing VALIDATION_SPLIT_RATIO share of the rows.
val_size = int(len(X_train_val_df) * VALIDATION_SPLIT_RATIO)
# Keep val_size at least 1 so neither slice below degenerates
# (iloc[:-0] would be empty and iloc[-0:] would be the whole frame).
val_size = max(1, val_size)

X_train_df = X_train_val_df.iloc[:-val_size]
y_train_series = y_train_val_series.iloc[:-val_size]
X_val_df = X_train_val_df.iloc[-val_size:]
y_val_series = y_train_val_series.iloc[-val_size:]

print(f"\nTrain X_df shape: {X_train_df.shape}, y_series shape: {y_train_series.shape}")
print(f"Validation X_df shape: {X_val_df.shape}, y_series shape: {y_val_series.shape}")

# Very important: check how many positives ended up in the val and test sets!
print(f"Classes in y_train_series:\n{y_train_series.value_counts()}")
print(f"Classes in y_val_series:\n{y_val_series.value_counts()}")
print(f"Classes in y_test_series:\n{y_test_series.value_counts()}")


# 3. Build the time sequences, separately per split so no window spans two splits.
X_train_sequences, y_train_sequences = create_sequences(X_train_df, y_train_series, SEQUENCE_LENGTH)
X_val_sequences, y_val_sequences = create_sequences(X_val_df, y_val_series, SEQUENCE_LENGTH)
X_test_sequences, y_test_sequences = create_sequences(X_test_df, y_test_series, SEQUENCE_LENGTH)

print(f"\nShapes of generated sequences:")
print(f"X_train_sequences: {X_train_sequences.shape}, y_train_sequences: {y_train_sequences.shape}")
print(f"X_val_sequences: {X_val_sequences.shape}, y_val_sequences: {y_val_sequences.shape}")
print(f"X_test_sequences: {X_test_sequences.shape}, y_test_sequences: {y_test_sequences.shape}")

# Fail loudly when a split produced no sequence at all, instead of handing an
# empty dataset to the training / evaluation code.
for split_name, split_sequences in (
    ("train", X_train_sequences),
    ("validation", X_val_sequences),
    ("test", X_test_sequences),
):
    if len(split_sequences) == 0:
        raise ValueError(
            f"The {split_name} split produced no sequences of length {SEQUENCE_LENGTH}; "
            "load more rows or adjust the split ratios."
        )

# 4. Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_sequences, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_sequences, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val_sequences, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val_sequences, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_sequences, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_sequences, dtype=torch.float32)

# 5. Create the DataLoaders
# Only the training loader shuffles; its order is driven by a dedicated seeded
# generator so a Restart & Run All reproduces the same batches.
train_shuffle_generator = torch.Generator().manual_seed(RANDOM_SEED)
train_dataset = TimeSeriesDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, generator=train_shuffle_generator
)

val_dataset = TimeSeriesDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

test_dataset = TimeSeriesDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)