Датасет: https://www.kaggle.com/datasets/olegshpagin/russia-stocks-prices-ohlcv

# Imports

In [1]:
from google.colab import auth, drive
from googleapiclient.discovery import build

auth.authenticate_user()
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd
import numpy as np
import random
import json

# Data (Not scaled)

In [None]:
root_dir = '/content/drive/MyDrive/archive/H1/'
moex_tickers = [
    'GAZP', 'SBER', 'SBERP', 'LKOH', 'GMKN', 'YNDX',
    'NVTK', 'TATN', 'TATNP', 'ROSN', 'SNGS', 'SNGSP',
    'MGNT', 'FIVE', 'MTSS', 'POLY', 'ALRS', 'CHMF',
    'PLZL', 'IRAO', 'NLMK', 'VTBR', 'MOEX', 'PHOR',
    'TRNFP', 'MAGN', 'RTKM', 'RUALR', 'AFLT', 'PIKK',
    'HYDR', 'FEES', 'AFKS', 'LSRG', 'CBOM', 'UPRO',
    'DSKY', 'LNTA', 'SFIN', 'RNFT', 'MVID', 'UWGN'
]

min_candles_per_day = 7
min_sessions = 200
train_ratio = 0.7
val_ratio = 0.1
test_ratio = 0.2

random.seed(42)

train_data = []
val_data = []
test_data = []

for filename in os.listdir(root_dir):
    if not filename.endswith('_H1.csv'):
        continue

    stock_id = filename.replace('_H1.csv', '')
    if stock_id not in moex_tickers:
        continue

    filepath = os.path.join(root_dir, filename)
    df = pd.read_csv(filepath, parse_dates=['datetime'])

    df['date'] = df['datetime'].dt.date
    df['stock_id'] = stock_id

    # Собираем все дни, где >= 7 свечей, и обрезаем до первых 7
    daily_groups = []
    for _, group in df.groupby('date'):
        if len(group) >= min_candles_per_day:
            trimmed = group.sort_values('datetime').iloc[:min_candles_per_day]
            daily_groups.append(trimmed)

    # Пропускаем акции с недостаточным числом сессий
    if len(daily_groups) < min_sessions:
        continue

    # Перемешиваем и делим
    random.shuffle(daily_groups)
    n = len(daily_groups)
    n_test = int(n * test_ratio)
    n_val = int(n * val_ratio)

    test_data.extend(daily_groups[:n_test])
    val_data.extend(daily_groups[n_test:n_test + n_val])
    train_data.extend(daily_groups[n_test + n_val:])

# Склеиваем в датафреймы
train_df = pd.concat(train_data).reset_index(drop=True)
val_df = pd.concat(val_data).reset_index(drop=True)
test_df = pd.concat(test_data).reset_index(drop=True)

# Вывод инфо
print("Tickets:", sorted(train_df['stock_id'].unique()))
print(f"Train: {train_df['stock_id'].nunique()} акций, {len(train_df)} свечей")
print(f"Val:   {val_df['stock_id'].nunique()} акций, {len(val_df)} свечей")
print(f"Test:  {test_df['stock_id'].nunique()} акций, {len(test_df)} свечей")

# Сохраняем
train_df.to_csv('/content/drive/MyDrive/archive/train_h1.csv', index=False)
val_df.to_csv('/content/drive/MyDrive/archive/val_h1.csv', index=False)
test_df.to_csv('/content/drive/MyDrive/archive/test_h1.csv', index=False)


Tickets: ['AFKS', 'AFLT', 'ALRS', 'CBOM', 'CHMF', 'DSKY', 'FEES', 'FIVE', 'GAZP', 'GMKN', 'HYDR', 'IRAO', 'LKOH', 'LSRG', 'MAGN', 'MGNT', 'MOEX', 'MTSS', 'MVID', 'NLMK', 'NVTK', 'PHOR', 'PIKK', 'PLZL', 'POLY', 'RNFT', 'ROSN', 'RTKM', 'SBER', 'SBERP', 'SFIN', 'SNGS', 'SNGSP', 'TATN', 'TATNP', 'TRNFP', 'UPRO', 'VTBR', 'YNDX']
Train: 39 акций, 810957 свечей
Val:   39 акций, 115689 свечей
Test:  39 акций, 231532 свечей


# Data (scaled by company (all history))

In [3]:
root_dir = '/content/drive/MyDrive/archive/H1/'
moex_tickers = [
    'GAZP', 'SBER', 'SBERP', 'LKOH', 'GMKN', 'YNDX',
    'NVTK', 'TATN', 'TATNP', 'ROSN', 'SNGS', 'SNGSP',
    'MGNT', 'FIVE', 'MTSS', 'POLY', 'ALRS', 'CHMF',
    'PLZL', 'IRAO', 'NLMK', 'VTBR', 'MOEX', 'PHOR',
    'TRNFP', 'MAGN', 'RTKM', 'RUALR', 'AFLT', 'PIKK',
    'HYDR', 'FEES', 'AFKS', 'LSRG', 'CBOM', 'UPRO',
    'DSKY', 'LNTA', 'SFIN', 'RNFT', 'MVID', 'UWGN'
]

min_candles_per_day = 7
min_sessions = 200
train_ratio = 0.7
val_ratio = 0.1
test_ratio = 0.2

random.seed(42)

train_data, val_data, test_data = [], [], []

for filename in os.listdir(root_dir):
    if not filename.endswith('_H1.csv'):
        continue

    stock_id = filename.replace('_H1.csv', '')
    if stock_id not in moex_tickers:
        continue

    filepath = os.path.join(root_dir, filename)
    df = pd.read_csv(filepath, parse_dates=['datetime'])

    df['date'] = df['datetime'].dt.date
    df['stock_id'] = stock_id

    daily_groups = []
    for _, group in df.groupby('date'):
        if len(group) >= min_candles_per_day:
            trimmed = group.sort_values('datetime').iloc[:min_candles_per_day]
            daily_groups.append(trimmed)

    if len(daily_groups) < min_sessions:
        continue

    random.shuffle(daily_groups)
    n = len(daily_groups)
    n_test = int(n * test_ratio)
    n_val = int(n * val_ratio)

    test_data.extend(daily_groups[:n_test])
    val_data.extend(daily_groups[n_test:n_test + n_val])
    train_data.extend(daily_groups[n_test + n_val:])

# Объединяем в датафреймы
train_df = pd.concat(train_data).reset_index(drop=True)
val_df = pd.concat(val_data).reset_index(drop=True)
test_df = pd.concat(test_data).reset_index(drop=True)

# Добавим log1p(volume)
for df in [train_df, val_df, test_df]:
    df['log_volume'] = np.log1p(df['volume'])

# Колонки для нормализации
features = ['open', 'high', 'low', 'close', 'log_volume']
scaler_stats = {}

train_df['year'] = train_df['datetime'].dt.year

# Статистика: stock_id -> year -> feature -> mean/std
yearly_stats = defaultdict(lambda: defaultdict(dict))

# Нормализуем по каждой акции отдельно, только по train
for stock_id in train_df['stock_id'].unique():
    df_stock = train_df[train_df['stock_id'] == stock_id]
    for year in sorted(df_stock['year'].unique()):
        df_year = df_stock[df_stock['year'] == year]
        stats = {}
        for col in features:
            mean = df_year[col].mean()
            std = df_year[col].std()
            stats[col] = {'mean': mean, 'std': std}
        yearly_stats[stock_id][str(year)] = stats

# Сохраняем scaler параметры в JSON
with open('/content/drive/MyDrive/archive/scaler_stats.json', 'w') as f:
    json.dump(scaler_stats, f, indent=2)

# Сохраняем CSV
train_df.to_csv('/content/drive/MyDrive/archive/train_h1_scaled.csv', index=False)
val_df.to_csv('/content/drive/MyDrive/archive/val_h1_scaled.csv', index=False)
test_df.to_csv('/content/drive/MyDrive/archive/test_h1_scaled.csv', index=False)

# Вывод инфо
print("Tickets:", sorted(train_df['stock_id'].unique()))
print(f"Train: {train_df['stock_id'].nunique()} акций, {len(train_df)} свечей")
print(f"Val:   {val_df['stock_id'].nunique()} акций, {len(val_df)} свечей")
print(f"Test:  {test_df['stock_id'].nunique()} акций, {len(test_df)} свечей")


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Tickets: ['AFKS', 'AFLT', 'ALRS', 'CBOM', 'CHMF', 'DSKY', 'FEES', 'FIVE', 'GAZP', 'GMKN', 'HYDR', 'IRAO', 'LKOH', 'LSRG', 'MAGN', 'MGNT', 'MOEX', 'MTSS', 'MVID', 'NLMK', 'NVTK', 'PHOR', 'PIKK', 'PLZL', 'POLY', 'RNFT', 'ROSN', 'RTKM', 'SBER', 'SBERP', 'SFIN', 'SNGS', 'SNGSP', 'TATN', 'TATNP', 'TRNFP', 'UPRO', 'VTBR', 'YNDX']
Train: 39 акций, 810957 свечей
Val:   39 акций, 115689 свечей
Test:  39 акций, 231532 свечей


In [4]:
df = pd.read_csv('/content/drive/MyDrive/archive/train_h1_scaled.csv', parse_dates=['datetime'])
df.head()

Unnamed: 0,datetime,open,high,low,close,volume,date,stock_id,log_volume
0,2016-11-08 10:00:00,0.439333,0.436659,0.368306,0.422081,463830,2016-11-08,ALRS,1.191913
1,2016-11-08 11:00:00,0.421481,0.489276,0.430674,0.4842,389900,2016-11-08,ALRS,1.10124
2,2016-11-08 12:00:00,0.482476,0.48557,0.488561,0.483084,384160,2016-11-08,ALRS,1.093495
3,2016-11-08 13:00:00,0.482848,0.481124,0.488561,0.488663,118060,2016-11-08,ALRS,0.477341
4,2016-11-08 14:00:00,0.488427,0.479642,0.469514,0.468949,165780,2016-11-08,ALRS,0.654619


# Data (scaled by company (every year))

In [3]:
root_dir = '/content/drive/MyDrive/archive/H1/'
moex_tickers = [
    'GAZP', 'SBER', 'SBERP', 'LKOH', 'GMKN', 'YNDX',
    'NVTK', 'TATN', 'TATNP', 'ROSN', 'SNGS', 'SNGSP',
    'MGNT', 'FIVE', 'MTSS', 'POLY', 'ALRS', 'CHMF',
    'PLZL', 'IRAO', 'NLMK', 'VTBR', 'MOEX', 'PHOR',
    'TRNFP', 'MAGN', 'RTKM', 'RUALR', 'AFLT', 'PIKK',
    'HYDR', 'FEES', 'AFKS', 'LSRG', 'CBOM', 'UPRO',
    'DSKY', 'LNTA', 'SFIN', 'RNFT', 'MVID', 'UWGN'
]

min_candles_per_day = 7
min_sessions = 200
train_ratio = 0.7
val_ratio = 0.1
test_ratio = 0.2

random.seed(42)

train_data, val_data, test_data = [], [], []

for filename in os.listdir(root_dir):
    if not filename.endswith('_H1.csv'):
        continue

    stock_id = filename.replace('_H1.csv', '')
    if stock_id not in moex_tickers:
        continue

    filepath = os.path.join(root_dir, filename)
    df = pd.read_csv(filepath, parse_dates=['datetime'])

    df['date'] = df['datetime'].dt.date
    df['year'] = df['datetime'].dt.year
    df['stock_id'] = stock_id
    df['log_volume'] = np.log1p(df['volume'])

    daily_groups = []
    for _, group in df.groupby('date'):
        if len(group) >= min_candles_per_day:
            trimmed = group.sort_values('datetime').iloc[:min_candles_per_day]
            daily_groups.append(trimmed)

    if len(daily_groups) < min_sessions:
        continue

    random.shuffle(daily_groups)
    n = len(daily_groups)
    n_test = int(n * test_ratio)
    n_val = int(n * val_ratio)

    test_data.extend(daily_groups[:n_test])
    val_data.extend(daily_groups[n_test:n_test + n_val])
    train_data.extend(daily_groups[n_test + n_val:])

# Объединяем в датафреймы
train_df = pd.concat(train_data).reset_index(drop=True)
val_df = pd.concat(val_data).reset_index(drop=True)
test_df = pd.concat(test_data).reset_index(drop=True)

# Колонки для нормализации
features = ['open', 'high', 'low', 'close', 'log_volume']
scaler_stats = {}

# Нормализуем по каждой акции и году, используя только train
for stock_id in train_df['stock_id'].unique():
    scaler_stats[stock_id] = {}
    for year in train_df[train_df['stock_id'] == stock_id]['year'].unique():
        stats = {}
        subset = train_df[(train_df['stock_id'] == stock_id) & (train_df['year'] == year)]

        for col in features:
            mean = subset[col].mean()
            std = subset[col].std()
            stats[col] = {'mean': mean, 'std': std}

            for df in [train_df, val_df, test_df]:
                mask = (df['stock_id'] == stock_id) & (df['year'] == year)
                df.loc[mask, col] = (df.loc[mask, col] - mean) / std

        scaler_stats[stock_id][str(year)] = stats

# Сохраняем scaler параметры в JSON
with open('/content/drive/MyDrive/archive/scaler_stats_by_year.json', 'w') as f:
    json.dump(scaler_stats, f, indent=2)

# Сохраняем CSV
train_df.to_csv('/content/drive/MyDrive/archive/train_h1_scaled_v2.csv', index=False)
val_df.to_csv('/content/drive/MyDrive/archive/val_h1_scaled_v2.csv', index=False)
test_df.to_csv('/content/drive/MyDrive/archive/test_h1_scaled_v2.csv', index=False)

# Вывод инфо
print("Tickets:", sorted(train_df['stock_id'].unique()))
print(f"Train: {train_df['stock_id'].nunique()} акций, {len(train_df)} свечей")
print(f"Val:   {val_df['stock_id'].nunique()} акций, {len(val_df)} свечей")
print(f"Test:  {test_df['stock_id'].nunique()} акций, {len(test_df)} свечей")


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Tickets: ['AFKS', 'AFLT', 'ALRS', 'CBOM', 'CHMF', 'DSKY', 'FEES', 'FIVE', 'GAZP', 'GMKN', 'HYDR', 'IRAO', 'LKOH', 'LSRG', 'MAGN', 'MGNT', 'MOEX', 'MTSS', 'MVID', 'NLMK', 'NVTK', 'PHOR', 'PIKK', 'PLZL', 'POLY', 'RNFT', 'ROSN', 'RTKM', 'SBER', 'SBERP', 'SFIN', 'SNGS', 'SNGSP', 'TATN', 'TATNP', 'TRNFP', 'UPRO', 'VTBR', 'YNDX']
Train: 39 акций, 810957 свечей
Val:   39 акций, 115689 свечей
Test:  39 акций, 231532 свечей


In [6]:
df = pd.read_csv('/content/drive/MyDrive/archive/train_h1_scaled_v2.csv', parse_dates=['datetime'])
df.describe()

Unnamed: 0,datetime,open,high,low,close,volume,year,log_volume
count,810957,810957.0,810957.0,810957.0,810957.0,810957.0,810957.0,810725.0
mean,2015-04-06 14:21:06.405986816,7.584197e-17,-1.794413e-17,6.364557e-17,2.3201200000000003e-17,10760310.0,2014.755108,-6.931675e-17
min,1999-06-01 11:00:00,-5.041008,-4.602066,-6.134346,-4.991584,-2146173000.0,1999.0,-12.03898
25%,2010-10-04 12:00:00,-0.7344968,-0.7340536,-0.7330132,-0.7344014,7040.0,2010.0,-0.6144841
50%,2015-12-16 12:00:00,-0.05905945,-0.06110914,-0.0574819,-0.05923327,32900.0,2015.0,0.02674942
75%,2020-05-05 14:00:00,0.7183251,0.715584,0.7200091,0.7182088,140066.0,2020.0,0.6506255
max,2024-08-27 16:00:00,7.956351,12.00131,5.490437,6.711188,2139326000.0,2024.0,6.108641
std,,0.9995621,0.9995621,0.9995621,0.9995621,85531650.0,6.088093,0.999562
