In [1]:
import shutil

import pandas as pd

In [2]:
# List the data files available in the local data folder.
# ../data_local is included in the .gitignore
!ls ../data_local/

binance_1h_2021-2025.csv       binance_1h_ohlcv_2021-2025.csv
binance_1h_2022-2025.csv       ohlcv.zip
binance_1h_2023-2025.csv


In [3]:
# Load the original OHLCV export from CSV; .info() then summarises the row and
# column counts, dtypes and memory footprint without printing the wide frame.
price_df = pd.read_csv(
    '../data_local/binance_1h_ohlcv_2021-2025.csv',
)
price_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20209 entries, 0 to 20208
Columns: 821 entries, Unnamed: 0 to volume_zrx-usdt
dtypes: float64(820), object(1)
memory usage: 126.6+ MB


In [4]:
# Name the unlabeled first CSV column 'datetime' and derive a 'year' column
# from it (used further down as the parquet partition key).
# Built as one non-mutating chain instead of inplace edits: if the datetime
# parse fails, price_df is left untouched rather than half-modified.
# The 'datetime' column itself is kept exactly as read from the CSV.
price_df = (
    price_df
    .rename(columns={'Unnamed: 0': 'datetime'})
    .assign(year=lambda df: pd.to_datetime(df['datetime']).dt.year)
)
price_df.head()

Unnamed: 0,datetime,open_1inch-usdt,open_aave-usdt,open_ach-usdt,open_ada-usdt,open_akro-usdt,open_algo-usdt,open_amp-usdt,open_ankr-usdt,open_ape-usdt,...,volume_woo-usdt,volume_xec-usdt,volume_xrp-usdt,volume_xvg-usdt,volume_yfi-usdt,volume_yfii-usdt,volume_ygg-usdt,volume_zec-usdt,volume_zrx-usdt,year
0,2023-01-01 00:00:00,0.383,52.0,0.0078,0.2458,0.00303,0.1725,0.00305,0.01535,3.636,...,216361.5,1338965000.0,11231966.0,306448.0,15.08445,12.4876,48403.4,527.123,167485.0,2023
1,2023-01-01 01:00:00,0.382,51.8,0.00774,0.2446,0.00303,0.1715,0.00305,0.01533,3.62,...,116588.6,1388112000.0,6013573.0,195568.0,3.14928,14.1337,87053.9,1328.673,22863.0,2023
2,2023-01-01 02:00:00,0.384,51.9,0.00774,0.2452,0.00302,0.1718,0.00304,0.01532,3.632,...,877513.8,364200700.0,6860256.0,90395.0,3.09037,16.8438,149015.0,1083.789,56707.0,2023
3,2023-01-01 03:00:00,0.384,51.9,0.00778,0.2456,0.00303,0.1715,0.00305,0.01541,3.639,...,286559.5,634258900.0,3024733.0,142658.0,2.34794,29.3838,15423.0,348.333,243078.0,2023
4,2023-01-01 04:00:00,0.384,51.7,0.00779,0.2454,0.00302,0.1731,0.00304,0.0154,3.637,...,90040.7,533842900.0,5002481.0,473194.0,2.61242,17.2952,26154.6,732.394,110919.0,2023


In [6]:
# Destination for the parquet output. Because the write below passes
# partition_cols, this path ends up as a directory, not a single file.
target_fpath = '../data_local/binance_1h_ohlcv_2023-2025.parquet'

In [7]:
# Save the dataset as a gzip-compressed parquet dataset partitioned by year.
# With partition_cols, target_fpath is a directory of per-year sub-folders, and
# pyarrow adds new uniquely named part files on every write. Clear any output
# left by a previous run first, so re-running this cell does not duplicate rows.
try:
    shutil.rmtree(target_fpath)
except FileNotFoundError:
    pass  # first run: nothing to clear

price_df.to_parquet(
    target_fpath,
    index=False,
    compression='gzip',
    engine='pyarrow',
    partition_cols=['year'],
)

In [8]:
# Read the partitioned dataset back and verify the round trip, not only that
# the read succeeds: head() alone looks the same even if rows were duplicated
# or dropped, so compare row count and column names against the source frame.
price_df_p = pd.read_parquet(
    target_fpath,
    engine='pyarrow',
)
assert len(price_df_p) == len(price_df), (
    f'row count changed in round trip: wrote {len(price_df)}, read {len(price_df_p)}'
)
assert set(price_df_p.columns) == set(price_df.columns), (
    'column names changed in round trip'
)
price_df_p.head()

Unnamed: 0,datetime,open_1inch-usdt,open_aave-usdt,open_ach-usdt,open_ada-usdt,open_akro-usdt,open_algo-usdt,open_amp-usdt,open_ankr-usdt,open_ape-usdt,...,volume_woo-usdt,volume_xec-usdt,volume_xrp-usdt,volume_xvg-usdt,volume_yfi-usdt,volume_yfii-usdt,volume_ygg-usdt,volume_zec-usdt,volume_zrx-usdt,year
0,2023-01-01 00:00:00,0.383,52.0,0.0078,0.2458,0.00303,0.1725,0.00305,0.01535,3.636,...,216361.5,1338965000.0,11231966.0,306448.0,15.08445,12.4876,48403.4,527.123,167485.0,2023
1,2023-01-01 01:00:00,0.382,51.8,0.00774,0.2446,0.00303,0.1715,0.00305,0.01533,3.62,...,116588.6,1388112000.0,6013573.0,195568.0,3.14928,14.1337,87053.9,1328.673,22863.0,2023
2,2023-01-01 02:00:00,0.384,51.9,0.00774,0.2452,0.00302,0.1718,0.00304,0.01532,3.632,...,877513.8,364200700.0,6860256.0,90395.0,3.09037,16.8438,149015.0,1083.789,56707.0,2023
3,2023-01-01 03:00:00,0.384,51.9,0.00778,0.2456,0.00303,0.1715,0.00305,0.01541,3.639,...,286559.5,634258900.0,3024733.0,142658.0,2.34794,29.3838,15423.0,348.333,243078.0,2023
4,2023-01-01 04:00:00,0.384,51.7,0.00779,0.2454,0.00302,0.1731,0.00304,0.0154,3.637,...,90040.7,533842900.0,5002481.0,473194.0,2.61242,17.2952,26154.6,732.394,110919.0,2023


In [9]:
# check the file size
!ls -lh ../data_local/ 

total 439704
-rw-rw-r--  1 son  staff    17M Apr  8 23:50 binance_1h_2021-2025.csv
-rw-rw-r--  1 son  staff    19M Apr  8 23:51 binance_1h_2022-2025.csv
-rw-rw-r--  1 son  staff    15M Apr  8 23:47 binance_1h_2023-2025.csv
-rw-rw-r--@ 1 son  staff   118M Apr 23 02:39 binance_1h_ohlcv_2021-2025.csv
drwxr-xr-x  5 son  staff   160B Apr 26 21:30 binance_1h_ohlcv_2023-2025.parquet
-rw-r--r--@ 1 son  staff    45M Apr 26 20:50 ohlcv.zip


In [10]:
# move the parquet file to the dataset folder
!mv ../data_local/binance_1h_ohlcv_2023-2025.parquet ../dataset/