# How to load data
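This notebook shows how to load coin data from files for a given period and interval, how to concatenate the loaded dataframes, and how to sort coins by fluctuation.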

## Import packages

In [1]:
from pathlib import Path

import pandas as pd

from utils import check_integrity, datestring_to_timestamp, load_data, get_sorted_fluctuation_coins

## Set parameters and read data

In [2]:
# Data files are read from <data_path>/<interval>.
data_path, interval = "../data", "3h"
# Bounds of the period to load, given as date strings.
start_from, end_before = "2021-01-11", "2021-03-15"

In [3]:
# load_data is called with timestamps below, so convert the date-string bounds once here.
start_from_timestamp = datestring_to_timestamp(start_from)
end_before_timestamp = datestring_to_timestamp(end_before)

# Every file in the chosen interval folder, and every file in the daily ('1D') folder.
data_interval_path_list_raw = [*Path(data_path, interval).iterdir()]
data_1D_path_list = [*Path(data_path, '1D').iterdir()]

# The coin code is the part of the file name before the first underscore.
# A coin is kept when check_integrity returns something other than None for its 1D file.
coins_1D_pass = [
    daily_file.name.split("_")[0]
    for daily_file in data_1D_path_list
    if check_integrity(start_from, end_before, daily_file) is not None
]
print(f"{len(coins_1D_pass)} coins have all 1D data")

86 coins have all 1D data


In [4]:
# Keep only the interval files whose coin code also passed the 1D check above.
data_interval_path_list = [
    interval_file
    for interval_file in data_interval_path_list_raw
    if interval_file.name.split('_')[0] in coins_1D_pass
]

**Load the data of one coin from the given period**

In [5]:
# Use the first remaining file as the example coin.
_file_path = data_interval_path_list[0]

# Load that coin's rows between the two timestamp bounds and summarise the frame.
_data_df = load_data(
    start_from_timestamp,
    end_before_timestamp,
    _file_path,
)
_data_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 505 entries, (1610323200000, 'BTC') to (1615766400000, 'BTC')
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype              
---  ------    --------------  -----              
 0   close     505 non-null    float64            
 1   volume    505 non-null    float64            
 2   datetime  505 non-null    datetime64[ns, UTC]
 3   is_fill   505 non-null    bool               
dtypes: bool(1), datetime64[ns, UTC](1), float64(2)
memory usage: 101.4+ KB


**Load the data of another coin from the same period (it is concatenated with the first one further below)**

In [6]:
# Pick a second file by position. NOTE(review): the list comes from Path.iterdir(),
# whose order is arbitrary, so index 9 may select a different coin on another machine.
_file_path_2 = data_interval_path_list[9]

# Same period as the first coin, default loading options.
_data_df_2 = load_data(
    start_from_timestamp,
    end_before_timestamp,
    _file_path_2,
)
_data_df_2.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 503 entries, (1610323200000, 'MKR') to (1615766400000, 'MKR')
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype              
---  ------    --------------  -----              
 0   close     503 non-null    float64            
 1   volume    503 non-null    float64            
 2   datetime  503 non-null    datetime64[ns, UTC]
 3   is_fill   503 non-null    bool               
dtypes: bool(1), datetime64[ns, UTC](1), float64(2)
memory usage: 100.6+ KB


**Load and fill null values**

In [7]:
# Reload the second coin's file, this time with fill_na=True.
# In the recorded run this yields 505 rows instead of the 503 loaded without it.
_data_df_2_filled = load_data(
    start_from_timestamp,
    end_before_timestamp,
    _file_path_2,
    fill_na=True,
)
_data_df_2_filled.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 505 entries, (1610323200000, 'MKR') to (1615766400000, 'MKR')
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype              
---  ------    --------------  -----              
 0   close     505 non-null    float64            
 1   volume    505 non-null    float64            
 2   datetime  505 non-null    datetime64[ns, UTC]
 3   is_fill   505 non-null    bool               
dtypes: bool(1), datetime64[ns, UTC](1), float64(2)
memory usage: 50.3+ KB


**Concatenate dataframes**

In [8]:
# Stack the two coins' frames row-wise (first coin's rows, then the second coin's).
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent and keeps the existing MultiIndex.
_new_data_df = pd.concat([_data_df, _data_df_2])
_new_data_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1008 entries, (1610323200000, 'BTC') to (1615766400000, 'MKR')
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype              
---  ------    --------------  -----              
 0   close     1008 non-null   float64            
 1   volume    1008 non-null   float64            
 2   datetime  1008 non-null   datetime64[ns, UTC]
 3   is_fill   1008 non-null   bool               
dtypes: bool(1), datetime64[ns, UTC](1), float64(2)
memory usage: 47.9+ KB


**Sort coins by fluctuation**

In [9]:
"""
get_sorted_fluctuation_coins(
    start_from (str, int): the timestamp or datestring of the first day
    end_before (str, int): the timestamp or datestring of the last day
    data_dir   (str):   the path if directory which has .csv files
    normalize_price (bool): use MinMaxScaler() to normalize the close price if True
    incl_coins (list, None): only return the coins in this provided list; None to return all coins
    return_details (bool): return the standard deviations in a DataFrame if True; False to only return a list with coin codes
    [DEPRECATED] start_from_timestamp (int): the timestamp of the first day
    [DEPRECATED] end_before_timestamp (int): the timestamp of the last day
)
"""

fluc_df = get_sorted_fluctuation_coins(
    start_from="2021-04-14",
    end_before="2021-06-15",
    data_dir='../data/1D',
    # incl_coins=ONLY_WANT_THESE_COINS,
    normalize_price=True,
    return_details=True,
    interval='1D'
)
fluc_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 136 entries, ORS to EOSDT
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   close_std   136 non-null    float64
 1   volume_sum  136 non-null    float64
dtypes: float64(2)
memory usage: 3.2+ KB


In [10]:
# First five rows of the normalized ranking.
fluc_df.head(n=5)

Unnamed: 0_level_0,close_std,volume_sum
coin,Unnamed: 1_level_1,Unnamed: 2_level_1
ORS,0.353142,677559100.0
LUNA,0.352348,1337474000.0
SAN,0.345045,1630334000.0
QTF,0.344317,4344900.0
ICE,0.340424,43910320.0


In [11]:
# Last five rows of the normalized ranking.
fluc_df.tail(n=5)

Unnamed: 0_level_0,close_std,volume_sum
coin,Unnamed: 1_level_1,Unnamed: 2_level_1
UDC,0.171852,25916560000.0
EUS,0.168937,192913600.0
TSD,0.154663,506718500.0
EUT,0.153312,91536540.0
EOSDT,0.127864,1214146.0


In [15]:
# Same period and data directory as the previous ranking, but with
# normalize_price=False so the close price is used as-is.
# This rebinds fluc_df, so the head()/tail() cells below show this ranking.
fluc_df = get_sorted_fluctuation_coins(
    data_dir='../data/1D',
    interval='1D',
    start_from="2021-04-14",
    end_before="2021-06-15",
    # incl_coins=ONLY_WANT_THESE_COINS,
    normalize_price=False,
    return_details=True,
)
fluc_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 136 entries, DOG to YGG
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   close_std   136 non-null    float64
 1   volume_sum  136 non-null    float64
dtypes: float64(2)
memory usage: 3.2+ KB


In [16]:
# First five rows of the ranking computed without price normalization.
fluc_df.head(n=5)

Unnamed: 0_level_0,close_std,volume_sum
coin,Unnamed: 1_level_1,Unnamed: 2_level_1
DOG,110046.581555,334.33656
YFI,10151.399072,4253.65761
BTC,9443.343583,735776.442452
WBT,9383.94786,11.386429
RBT,9313.638109,13.89412


In [17]:
# Last five rows of the ranking computed without price normalization.
fluc_df.tail(n=5)

Unnamed: 0_level_0,close_std,volume_sum
coin,Unnamed: 1_level_1,Unnamed: 2_level_1
DAI,0.001095,33762380.0
PAX,0.000927,34267210.0
UST,0.000914,9166260000.0
UDC,0.000693,411374000.0
YGG,0.000294,5663083000.0
