# HDF5 Datastore Creation

In [11]:
# Import libraries
from pathlib import Path
import requests
from io import BytesIO
from zipfile import ZipFile, BadZipFile

import numpy as np
import pandas as pd
import pandas_datareader.data as web
from sklearn.datasets import fetch_openml

# Print wide DataFrames on one line per row instead of wrapping columns
pd.set_option('display.expand_frame_repr', False)

# Suppress every warning category for the remainder of the session
import warnings
warnings.simplefilter('ignore')

In [12]:
# Location of the HDF5 file that every cell below writes its tables into
# (relative to the notebook's working directory)
DATA_STORE: Path = Path('data/assets.h5')

In [13]:
# Tesla price data via the Alpaca API, indexed by (timestamp, symbol) and
# stored under 'alpaca/tesla/prices'.
# NOTE(review): no `parse_dates` is requested, so the 'timestamp' index level
# is kept as plain strings (the former `infer_datetime_format=True` had no
# effect without it) — confirm that downstream readers expect strings.
df = (pd.read_csv('data_collection/data/tesla_prices.csv',
                  index_col=['timestamp', 'symbol'])
      .sort_index())

# `show_counts` replaces the deprecated `null_counts` keyword; `info()` prints
# its report itself and returns None, so it is not wrapped in print().
df.info(show_counts=True)
with pd.HDFStore(DATA_STORE) as store:
    store.put('alpaca/tesla/prices', df)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 934339 entries, ('2017-01-03 09:00:00+00:00', 'TSLA') to ('2022-10-06 23:59:00+00:00', 'TSLA')
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   open         934339 non-null  float64
 1   high         934339 non-null  float64
 2   low          934339 non-null  float64
 3   close        934339 non-null  float64
 4   volume       934339 non-null  int64  
 5   trade_count  934339 non-null  int64  
 6   vwap         934339 non-null  float64
dtypes: float64(5), int64(2)
memory usage: 93.7+ MB
None


In [14]:
# Stock listing read from 'data/wiki_stocks.csv' and stored under
# 'quandl/wiki/stocks' with its default integer index.
df = pd.read_csv('data/wiki_stocks.csv')

# `show_counts` replaces the deprecated `null_counts` keyword; `info()` prints
# its report itself and returns None, so it is not wrapped in print().
df.info(show_counts=True)
with pd.HDFStore(DATA_STORE) as store:
    store.put('quandl/wiki/stocks', df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3199 entries, 0 to 3198
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   code    3199 non-null   object
 1   name    3199 non-null   object
dtypes: object(2)
memory usage: 50.1+ KB
None


In [15]:
# Read the US equities metadata table and print its column / dtype /
# non-null summary; `df` is reused by the cell that writes it to the store.
df = pd.read_csv(filepath_or_buffer='data/us_equities_meta_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6834 entries, 0 to 6833
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ticker     6834 non-null   object 
 1   name       6834 non-null   object 
 2   lastsale   6718 non-null   float64
 3   marketcap  5766 non-null   float64
 4   ipoyear    3038 non-null   float64
 5   sector     5288 non-null   object 
 6   industry   5288 non-null   object 
dtypes: float64(3), object(4)
memory usage: 373.9+ KB


In [16]:
# Write the equities metadata to the HDF5 store, re-indexed by ticker symbol
with pd.HDFStore(DATA_STORE) as store:
    store.put(key='us_equities/stocks',
              value=df.set_index(keys='ticker'))