# HDF5 Storage

Let's convert our big DataFrame into HDF5.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os

os.chdir("../..")

In [39]:
import pandas as pd
import numpy as np

In [40]:
# Seed NumPy's global RNG so the random demo frames written below are the
# same on every run of the notebook.
np.random.seed(0)

hdf = pd.HDFStore('tmp/storage.h5')
# 'tables/t1' is written in table format ("t"); the fixed format is not
# appendable, and this node is appended to further down.
hdf.put('tables/t1', pd.DataFrame(np.random.rand(20,5)), format="t")
# No format given, so these two nodes use the default 'fixed' format.
hdf.put('tables/t2',pd.DataFrame(np.random.rand(10,3)))
hdf.put('new_tables/t1', pd.DataFrame(np.random.rand(15,2)))

In [41]:
# Print the docstring of HDFStore.put to check its `format` and `append` options.
help(hdf.put)

Help on method put in module pandas.io.pytables:

put(key, value, format=None, append=False, **kwargs) method of pandas.io.pytables.HDFStore instance
    Store object in HDFStore
    
    Parameters
    ----------
    key      : object
    value    : {Series, DataFrame, Panel}
    format   : 'fixed(f)|table(t)', default is 'fixed'
        fixed(f) : Fixed format
                   Fast writing/reading. Not-appendable, nor searchable
        table(t) : Table format
                   Write as a PyTables Table structure which may perform
                   worse but allow more flexible operations like searching
                   / selecting subsets of the data
    append   : boolean, default False
        This will force Table format, append the input data to the
        existing.
    data_columns : list of columns to create as data columns, or True to
        use all columns. See
        `here <http://pandas.pydata.org/pandas-docs/stable/io.html#query-via-data-columns>`__ # noqa
    en

In [42]:
# Read the 'tables/t1' node back from the store (dict-style access).
hdf["tables/t1"]

Unnamed: 0,0,1,2,3,4
0,0.179183,0.267437,0.722251,0.106144,0.349499
1,0.078775,0.80635,0.569484,0.746635,0.812667
2,0.499658,0.748355,0.086639,0.685492,0.257722
3,0.43189,0.574197,0.791111,0.434674,0.471247
4,0.613762,0.889421,0.394755,0.622091,0.86508
5,0.378108,0.740913,0.425776,0.64238,0.453861
6,0.02523,0.708406,0.892937,0.308665,0.466234
7,0.725209,0.619059,0.950994,0.509067,0.372927
8,0.770649,0.149383,0.683293,0.69328,0.365809
9,0.486929,0.506334,0.902914,0.871832,0.830428


In [5]:
# Build a second random 20x5 frame and append it to the existing
# 'tables/t1' node instead of replacing it.
extra_rows = pd.DataFrame(np.random.rand(20, 5))
hdf.put("tables/t1", extra_rows, format='t', append=True)

In [43]:
# Load 'tables/t1' into `df` for the index checks that follow, and display it.
df = hdf['tables/t1']
df

Unnamed: 0,0,1,2,3,4
0,0.179183,0.267437,0.722251,0.106144,0.349499
1,0.078775,0.80635,0.569484,0.746635,0.812667
2,0.499658,0.748355,0.086639,0.685492,0.257722
3,0.43189,0.574197,0.791111,0.434674,0.471247
4,0.613762,0.889421,0.394755,0.622091,0.86508
5,0.378108,0.740913,0.425776,0.64238,0.453861
6,0.02523,0.708406,0.892937,0.308665,0.466234
7,0.725209,0.619059,0.950994,0.509067,0.372927
8,0.770649,0.149383,0.683293,0.69328,0.365809
9,0.486929,0.506334,0.902914,0.871832,0.830428


Note that appending messes up the index: each appended frame keeps its own labels, so a label such as `0` now matches more than one row.

In [7]:
# Label-based lookup: returns every row whose index label is 0.
df.loc[0]

Unnamed: 0,0,1,2,3,4
0,0.756944,0.293185,0.46007,0.663484,0.495344
0,0.451373,0.719774,0.327892,0.532801,0.211725


In [8]:
# Position-based lookup: returns only the first row, as a Series.
df.iloc[0]

0    0.756944
1    0.293185
2    0.460070
3    0.663484
4    0.495344
Name: 0, dtype: float64

# Remove dataframe

In [9]:
# Print the docstring of HDFStore.remove (delete a node, or rows of a table).
help(hdf.remove)

Help on method remove in module pandas.io.pytables:

remove(key, where=None, start=None, stop=None) method of pandas.io.pytables.HDFStore instance
    Remove pandas object partially by specifying the where condition
    
    Parameters
    ----------
    key : string
        Node to remove or delete rows from
    where : list of Term (or convertable) objects, optional
    start : integer (defaults to None), row number to start selection
    stop  : integer (defaults to None), row number to stop selection
    
    Returns
    -------
    number of rows removed (or None if not a Table)
    
    Exceptions
    ----------
    raises KeyError if key is not a valid store



# Our dataset

In [61]:
# Release the handle on the previously opened store before rebinding `hdf`;
# otherwise that file stays open for the rest of the session.
hdf.close()
# Open the dataset read-only: the cells visible below only query it, and
# mode="r" raises on a missing file instead of silently creating an empty
# store at a mistyped path.
hdf = pd.HDFStore("output/instances.h5", mode="r")

In [62]:
# List the keys stored in the file; an empty list means the store has no nodes.
hdf.keys()

[]

In [60]:
# Load the whole "plain" node; raises KeyError when the store has no object
# under that name.
hdf.select(u"plain")

KeyError: 'No object named plain in the file'

In [19]:
# Print the docstring of HDFStore.select to check its `where` / `columns` options.
help(hdf.select)

Help on method select in module pandas.io.pytables:

select(key, where=None, start=None, stop=None, columns=None, iterator=False, chunksize=None, auto_close=False, **kwargs) method of pandas.io.pytables.HDFStore instance
    Retrieve pandas object stored in file, optionally based on where
    criteria
    
    Parameters
    ----------
    key : object
    where : list of Term (or convertable) objects, optional
    start : integer (defaults to None), row number to start selection
    stop  : integer (defaults to None), row number to stop selection
    columns : a list of columns that if not None, will limit the return
        columns
    iterator : boolean, return an iterator, default False
    chunksize : nrows to include in iteration, return an iterator
    auto_close : boolean, should automatically close the store when
        finished, default is False
    
    Returns
    -------
    The selected object



In [35]:
# Pull only the subject_id column from the "plain" node, then reduce it to
# its distinct values.
subject_id_column = hdf.select("plain", columns=["subject_id"])["subject_id"]
keys = subject_id_column.unique()

In [27]:

# Query the "plain" node once per subject (first four only) and print the
# shape of each result. Note that `df` is rebound on every iteration.
for subject_id in keys[:4]:
    subject_filter = "subject_id = '{}'".format(subject_id)
    df = hdf.select("plain", where=subject_filter)
    print(df.shape)

(1980, 9)
(1980, 9)
(2700, 9)
(1980, 9)


In [37]:
# Print the docstring of HDFStore.copy. The cell previously held a bare
# `hdf.put` attribute access, which does nothing useful and did not match
# the recorded output (the help text for `copy`).
help(hdf.copy)

Help on method copy in module pandas.io.pytables:

copy(file, mode='w', propindexes=True, keys=None, complib=None, complevel=None, fletcher32=False, overwrite=True) method of pandas.io.pytables.HDFStore instance
    copy the existing store to a new file, upgrading in place
    
    Parameters
    ----------
    propindexes: restore indexes in copied file (defaults to True)
    keys       : list of keys to include in the copy (defaults to all)
    overwrite  : overwrite (remove and replace) existing nodes in the
        new store (default is True)
    mode, complib, complevel, fletcher32 same as in HDFStore.__init__
    
    Returns
    -------
    open file handle of the new store

