In [1]:
from pybit.unified_trading import HTTP

import dotenv
import os

from time import sleep, time
import datetime as dt

import pandas as pd

import tqdm

# dotenv.load_dotenv('secrets.env')

# API_KEY = os.getenv('API')
# SECRET_KEY = os.getenv('SECRET')

# session = HTTP(
#     testnet=False,
#     api_key=API_KEY,
#     api_secret=SECRET_KEY,
# )

In [None]:
base = int(dt.datetime(2022, 1, 1, 0, 00, 00, 000000).timestamp() * 1000)
dt.datetime.fromtimestamp(base / 1000).strftime('%Y-%m-%d %H:%M:%S')

In [None]:
60 * 24

In [None]:
def get_kline(interval, days_forward=365, start_year=2022, symbol='BTCUSD'):
    t1 = time()

    data = []
    # Начало отсчёта: 1 января start_year
    base = int(dt.datetime(start_year, 1, 1, 0, 0, 0, 0).timestamp() * 1000)

    # Определяем количество "кадров" (баров), которое безопасно получать за один запрос.
    safe_frames = 720  # для 1-минутного интервала – 720 баров (12 часов)
    # Вычисляем длительность одного кадра (в мс) исходя из переданного интервала (в минутах)
    frame_duration_ms = interval * 60 * 1000
    # Вычисляем общее окно запроса в мс, пропорционально интервалу
    window_ms = frame_duration_ms * safe_frames

    # Общий период выборки в миллисекундах: days_forward дней
    total_ms = (60 * 1000) * (60 * 24 * days_forward)

    if interval == 1440:
        interval = 'D'

    for i in tqdm.tqdm(range(base, base + total_ms, window_ms)):
        returns = session.get_kline(
            symbol=symbol,
            interval=interval,
            start=i,
            end=(i + window_ms) - 1,  # конец окна запроса
            limit=1000
        )
        
        data.extend(returns['result']['list'])

        # sleep(1)  # можно включить задержку, если API требует ограничения по запросам

    output_df = pd.DataFrame(data, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume', 'turnover'])

    # Преобразуем столбцы в числовой формат
    for col in output_df.columns:
        try:
            output_df[col] = output_df[col].astype(int)
        except ValueError:
            output_df[col] = output_df[col].astype(float)

    # Преобразуем метки времени в удобный формат
    output_df['timestamp'] = output_df['timestamp'].apply(
        lambda x: dt.datetime.fromtimestamp(x / 1000).strftime('%Y-%m-%d %H:%M:%S')
    )
    output_df['timestamp'] = pd.to_datetime(output_df['timestamp'])
    output_df.sort_values(by='timestamp', inplace=True)

    print(f'Done in {int(time() - t1)} seconds.')
    return output_df


# df = get_kline(1)
df = get_kline(interval=1440, days_forward=(365*4))


In [None]:
df.info()

In [None]:
df['timestamp'].max() - df['timestamp'].min()

In [None]:
df['timestamp'].min()

In [None]:
df['timestamp'].max()

In [None]:
def check_time_series(df=df, interval=5):
    df['diff'] = df['timestamp'].diff()

    # Проверяем, что все разницы равны 1 минуте
    if (df['diff'].iloc[1:] != pd.Timedelta(minutes=interval)).any():
        print("Временной ряд не является непрерывным")
    else:
        print("Временной ряд непрерывен")

    df.drop(columns=['diff'], inplace=True)

check_time_series(interval=1440)

In [None]:
# df.reset_index(drop=True, inplace=True)

In [None]:
# df.to_parquet('btcusd_1d_4years_2022.parquet')

---

In [34]:
df_min = pd.read_parquet('btcusd_1min_4years_2022.parquet')

In [35]:
df_min_filtered = df_min[df_min['timestamp'] < '2023-01-01 00:00:00']

In [36]:
df_h = pd.read_parquet('btcusd_1h_4years_2022.parquet')

In [37]:
df_h_filtered = df_h[df_h['timestamp'] < '2023-01-01 00:00:00']

In [38]:
df_min_filtered['dt_hour'] = df_min_filtered['timestamp'].dt.hour

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_min_filtered['dt_hour'] = df_min_filtered['timestamp'].dt.hour


In [39]:
df_min_groupped = df_min_filtered.groupby('dt_hour').agg(
        {'open': 'first',
         'close': 'last',
         'high': 'max',
         'low': 'min',
         'volume': 'sum',
         'timestamp': 'first'}
    ).reset_index(drop=True)

In [40]:
combine = df_h_filtered.join(df_min_groupped, lsuffix='_h', rsuffix='_m')
combine.head()

Unnamed: 0,timestamp_h,open_h,high_h,low_h,close_h,volume_h,turnover,open_m,close_m,high_m,low_m,volume_m,timestamp_m
0,2022-01-01 00:00:00,45849.5,46489.0,45755.0,46299.5,84360941,1828.673287,45849.5,16575.0,48141.0,15437.0,14418070000.0,2022-01-01 00:00:00
1,2022-01-01 01:00:00,46299.5,46544.5,46241.0,46293.0,57502020,1239.454159,46299.5,16583.0,47755.0,15450.0,16317590000.0,2022-01-01 01:00:00
2,2022-01-01 02:00:00,46293.0,46481.5,46136.0,46195.0,26773919,577.847611,46293.0,16606.0,47827.0,15512.0,14522900000.0,2022-01-01 02:00:00
3,2022-01-01 03:00:00,46195.0,46723.0,46195.0,46640.5,56585852,1216.558051,46195.0,16585.0,47745.0,15681.5,17276380000.0,2022-01-01 03:00:00
4,2022-01-01 04:00:00,46640.5,46934.0,46584.5,46779.5,38023186,812.870805,46640.5,16563.5,47648.0,15769.0,14267690000.0,2022-01-01 04:00:00


In [41]:
# проверка, что в агрегации минутных графиков равны часовым
for i in ['open', 'close', 'high', 'low', 'timestamp', 'volume']:
    print(f"{i}: {(combine[i+'_h'] - combine[i+'_m']).sum()}")

open: 0.0
close: 729567.5
high: -16112.0
low: 745359.0
timestamp: 0 days 00:00:00
volume: -407574396779.0


In [None]:
# base = int(dt.datetime(2022, 1, 1, 0, 00, 00, 000000).timestamp() * 1000)

# dt.datetime.fromtimestamp(base / 1000).strftime('%Y-%m-%d %H:%M:%S')

In [None]:
# base + 60 * 1000
# dt.datetime.fromtimestamp((base + 60 * 1000) / 1000).strftime('%Y-%m-%d %H:%M:%S')

In [None]:
# l = []

# for i in range(base, base + (60 * 1000) * ((60*24)*(365*1)), (60 * 1000)*(12*60)):
#     # l.append(i)
#     print(dt.datetime.fromtimestamp(i / 1000).strftime('%Y-%m-%d %H:%M:%S'))
#     print(dt.datetime.fromtimestamp(((i + (60*(60*1000))*12)-1) / 1000).strftime('%Y-%m-%d %H:%M:%S'))
#     # print('')