# How to load data
This notebook shows how to load coin data for a given period and interval from files, and how to combine the data of several coins.

## Import packages

In [30]:
from pathlib import Path

import pandas as pd

from utils import check_integrity, datestring_to_timestamp, load_data

## Set parameters and read data

In [23]:
# Root folder of the data files; the cells below read its "<interval>" and "1D" sub-folders.
data_root_dir = "../data"
# Date strings bounding the period to load; converted to timestamps in the next cell.
# NOTE(review): the sample outputs end at index 1615766400000 (2021-03-15 00:00 UTC),
# so the upper bound looks inclusive despite the name `end_before` — confirm in utils.
start_from = "2021-01-11"
end_before = "2021-03-15"
# Name of the sub-folder of `data_root_dir` whose files are loaded.
interval = "3h"

In [24]:
# Convert the date-string bounds to the timestamps passed to load_data() below.
start_from_timestamp = datestring_to_timestamp(start_from)
end_before_timestamp = datestring_to_timestamp(end_before)

# List the files for the chosen interval and for the daily ("1D") data.
# NOTE: Path.iterdir() yields entries in arbitrary order.
data_interval_path_list_raw = list((Path(data_root_dir) / interval).iterdir())
data_1D_path_list = list((Path(data_root_dir) / "1D").iterdir())

# Keep the coin code (file-name prefix before the first "_") of every 1D file
# for which check_integrity() returns something other than None.
coins_1D_pass = [
    path_1d.name.split("_")[0]
    for path_1d in data_1D_path_list
    if check_integrity(start_from, end_before, path_1d) is not None
]
print(f"{len(coins_1D_pass)} coins have all 1D data")

73 coins have all 1D data


In [25]:
# Restrict the interval files to coins whose code is listed in coins_1D_pass.
data_interval_path_list = [
    candidate
    for candidate in data_interval_path_list_raw
    if candidate.name.split("_")[0] in coins_1D_pass
]

**Load the data of one coin from the given period**

In [26]:
# Take the first interval file.
# NOTE(review): the list originates from Path.iterdir(), whose order is arbitrary,
# so index 0 may point at a different coin on another machine.
_file_path = data_interval_path_list[0]

_data_df = load_data(
    start_from_timestamp,
    end_before_timestamp,
    _file_path,
)
_data_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 505 entries, (1610323200000, 'BTC') to (1615766400000, 'BTC')
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype              
---  ------    --------------  -----              
 0   close     505 non-null    float64            
 1   volume    505 non-null    float64            
 2   datetime  505 non-null    datetime64[ns, UTC]
 3   is_fill   505 non-null    bool               
dtypes: bool(1), datetime64[ns, UTC](1), float64(2)
memory usage: 54.4+ KB


**Load the data of another coin from the same period (it is concatenated with the first one further below)**

In [27]:
# Pick a second interval file by position.
# NOTE(review): the list originates from Path.iterdir(), whose order is arbitrary,
# so index 9 may point at a different coin on another machine.
_file_path_2 = data_interval_path_list[9]

_data_df_2 = load_data(
    start_from_timestamp,
    end_before_timestamp,
    _file_path_2,
)
_data_df_2.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 503 entries, (1610323200000, 'MKR') to (1615766400000, 'MKR')
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype              
---  ------    --------------  -----              
 0   close     503 non-null    float64            
 1   volume    503 non-null    float64            
 2   datetime  503 non-null    datetime64[ns, UTC]
 3   is_fill   503 non-null    bool               
dtypes: bool(1), datetime64[ns, UTC](1), float64(2)
memory usage: 54.2+ KB


**Load the same coin again with `fill_na=True` to fill missing entries**

In [28]:
# Same file and period as the previous load, this time with fill_na=True.
_data_df_2_filled = load_data(
    start_from_timestamp,
    end_before_timestamp,
    _file_path_2,
    fill_na=True,
)
_data_df_2_filled.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 505 entries, (1610323200000, 'MKR') to (1615766400000, 'MKR')
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype              
---  ------    --------------  -----              
 0   close     505 non-null    float64            
 1   volume    505 non-null    float64            
 2   datetime  505 non-null    datetime64[ns, UTC]
 3   is_fill   505 non-null    bool               
dtypes: bool(1), datetime64[ns, UTC](1), float64(2)
memory usage: 50.3+ KB


**Concatenate dataframes**

In [29]:
# Stack the two coins' frames row-wise into one frame.
# DataFrame.append() was deprecated in pandas 1.4 and removed in pandas 2.0,
# so pd.concat() is used; it returns a new frame and leaves both inputs untouched.
_new_data_df = pd.concat([_data_df, _data_df_2])
_new_data_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1008 entries, (1610323200000, 'BTC') to (1615766400000, 'MKR')
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype              
---  ------    --------------  -----              
 0   close     1008 non-null   float64            
 1   volume    1008 non-null   float64            
 2   datetime  1008 non-null   datetime64[ns, UTC]
 3   is_fill   1008 non-null   bool               
dtypes: bool(1), datetime64[ns, UTC](1), float64(2)
memory usage: 47.9+ KB
