Adapted from <a href="https://realpython.com/fast-flexible-pandas/">this Real Python tutorial</a> on speeding up pandas code.

In [1]:
import numpy as np
import pandas as pd

pd.__version__

'1.3.5'

In [2]:
# Load the demand profile. No date parsing is requested here, so the
# date_time column comes back as plain strings (object dtype).
df = pd.read_csv('../data/demand_profile.csv')
df.head()

Unnamed: 0,date_time,energy_kwh
0,1/1/13 0:00,0.586
1,1/1/13 1:00,0.58
2,1/1/13 2:00,0.572
3,1/1/13 3:00,0.596
4,1/1/13 4:00,0.592


In [3]:
df.dtypes

date_time      object
energy_kwh    float64
dtype: object

In [4]:
# Look at the first cell (row 0, column 0) to see which concrete Python type
# sits behind the `object` dtype reported for date_time.
type(df.iat[0, 0])

str

In [5]:
%%timeit
# Slow path: no explicit format, so pandas has to work out the layout itself.
# dayfirst=True keeps ambiguous stamps such as '2/1/13' consistent with the
# day/month/year layout ('%d/%m/%y %H:%M') used by the explicit-format cell;
# the default would read them month-first.
# The result is written to a separate column so date_time keeps its raw
# strings and every %%timeit loop parses the same input.
df['date_time2'] = pd.to_datetime(df.date_time, dayfirst=True)

408 ms ± 7.07 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
df.head()

Unnamed: 0,date_time,energy_kwh,date_time2
0,1/1/13 0:00,0.586,2013-01-01 00:00:00
1,1/1/13 1:00,0.58,2013-01-01 01:00:00
2,1/1/13 2:00,0.572,2013-01-01 02:00:00
3,1/1/13 3:00,0.596,2013-01-01 03:00:00
4,1/1/13 4:00,0.592,2013-01-01 04:00:00


In [7]:
%%timeit
df['date_time'] = pd.to_datetime(df.date_time, format='%d/%m/%y %H:%M')

4.87 ms ± 14 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [8]:
def apply_tariff(kwh, hour):
    '''Return the cost of <kwh> consumed during <hour>.

    Rates per kWh: 12 for hours [0, 7), 20 for [7, 17), 28 for [17, 24).
    Raises ValueError for any hour outside [0, 24).
    '''
    # Reject out-of-range hours first; NaN fails the comparison and is
    # rejected as well.
    if not 0 <= hour < 24:
        raise ValueError(f'Invalid hour: {hour}')
    if hour < 7:
        return 12 * kwh
    if hour < 17:
        return 20 * kwh
    return 28 * kwh

In [9]:
%%timeit
# BAD!
def apply_tariff_loop(df):
    '''Calculate cost in loop. Modifies <df> in place

    Walks the rows by integer position, reads <energy_kwh> and the hour of
    <date_time> for each one, prices the pair with apply_tariff, and stores
    all results in a <cost_cents> column.
    <date_time> must hold values that expose an .hour attribute.
    '''
    energy_cost_list = []
    for i in range(len(df)):
        # df.iloc[i] is evaluated twice per row, once for each field.
        energy_used = df.iloc[i]['energy_kwh']
        hour = df.iloc[i]['date_time'].hour
        energy_cost = apply_tariff(energy_used, hour)
        energy_cost_list.append(energy_cost)
    # The column is assigned once, after every row has been priced.
    df['cost_cents'] = energy_cost_list

apply_tariff_loop(df)

1.11 s ± 4.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%%timeit
def apply_tariff_iterrows(df):
    '''Calculate cost row by row with iterrows(). Modifies <df> in place'''
    # The index label yielded by iterrows() is not needed, only the row.
    df['cost_cents'] = [
        apply_tariff(record.energy_kwh, record.date_time.hour)
        for _, record in df.iterrows()
    ]
    
apply_tariff_iterrows(df)

289 ms ± 1.21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
%%timeit
def apply_tariff_apply(df):
    '''Calculate cost with DataFrame.apply over rows. Modifies <df> in place'''
    def price_row(record):
        # One row in, one cost out.
        return apply_tariff(record.energy_kwh, record.date_time.hour)

    df['cost_cents'] = df.apply(price_row, axis=1)
    
apply_tariff_apply(df)

135 ms ± 1.45 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
df.head()

Unnamed: 0,date_time,energy_kwh,date_time2,cost_cents
0,2013-01-01 00:00:00,0.586,2013-01-01 00:00:00,7.032
1,2013-01-01 01:00:00,0.58,2013-01-01 01:00:00,6.96
2,2013-01-01 02:00:00,0.572,2013-01-01 02:00:00,6.864
3,2013-01-01 03:00:00,0.596,2013-01-01 03:00:00,7.152
4,2013-01-01 04:00:00,0.592,2013-01-01 04:00:00,7.104


In [13]:
# Index by timestamp so the vectorized cells can read the hour from df.index.
# Plain reassignment replaces inplace=True, which gives no speed benefit and
# cannot be chained.
df = df.set_index('date_time')
df.head()

Unnamed: 0_level_0,energy_kwh,date_time2,cost_cents
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-01-01 00:00:00,0.586,2013-01-01 00:00:00,7.032
2013-01-01 01:00:00,0.58,2013-01-01 01:00:00,6.96
2013-01-01 02:00:00,0.572,2013-01-01 02:00:00,6.864
2013-01-01 03:00:00,0.596,2013-01-01 03:00:00,7.152
2013-01-01 04:00:00,0.592,2013-01-01 04:00:00,7.104


In [14]:
%%timeit
def apply_tariff_isin(df):
    '''Calculate cost with one boolean mask per tariff band. Modifies <df> in place

    Reads the hour from the index of <df> and <energy_kwh> from its columns;
    writes <cost_cents>.
    '''
    hours = df.index.hour
    # (hours covered, rate per kWh), in the order peak, mid, low.
    bands = (
        (range(17, 24), 28),
        (range(7, 17), 20),
        (range(7), 12),
    )
    for band_hours, rate in bands:
        in_band = hours.isin(band_hours)
        df.loc[in_band, 'cost_cents'] = df.loc[in_band, 'energy_kwh'] * rate
    
apply_tariff_isin(df)

1.93 ms ± 6.19 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [15]:
%%timeit
def apply_tariff_cut(df):
    '''Calculate cost by binning the index hour with pd.cut. Modifies <df> in place

    Reads the hour from the index of <df> and <energy_kwh> from its columns;
    writes <cost_cents>.
    '''
    cents_per_kwh = pd.cut(
        x=df.index.hour,
        bins=[0, 7, 17, 24],
        # Left-closed bins [0, 7), [7, 17), [17, 24) reproduce apply_tariff.
        # pd.cut's default right-closed bins would price hour 7 at 12 and
        # hour 17 at 20.
        right=False,
        labels=[12, 20, 28]
    ).astype(int)  # category labels -> plain integers so they can be multiplied
    df['cost_cents'] = cents_per_kwh * df.energy_kwh
    
apply_tariff_cut(df)

650 µs ± 1.53 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [16]:
%%timeit
def apply_tariff_np(df):
    '''Calculate cost with NumPy binning and lookup. Modifies <df> in place

    Reads the hour from the index of <df> and <energy_kwh> from its columns;
    writes <cost_cents>.
    '''
    prices = np.array([12, 20, 28])
    # digitize gives 0 for hour < 7, 1 for 7 <= hour < 17, 2 for 17 <= hour < 24,
    # i.e. a position in <prices>.
    bins = np.digitize(df.index.hour.values, bins=[7, 17, 24])
    # Item assignment creates the column when it is missing. Attribute
    # assignment (df.cost_cents = ...) only works for an existing column.
    df['cost_cents'] = prices[bins] * df.energy_kwh.values
    
apply_tariff_np(df)

315 µs ± 655 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


General performance hierarchy (fastest to slowest):
- vectorized ops
- apply()
- itertuples()
- iterrows()
- element-wise

In [40]:
# Small frame with a datetime column next to list- and set-valued columns.
df = pd.DataFrame({
    'A': pd.to_datetime(['2020-01-01', '2020-01-02']),
    'B': [[0, 1, 2], [3, 4, 5]],
    'C': [{0, 1, 2}, {3, 5, 8}],
})
df

Unnamed: 0,A,B,C
0,2020-01-01,"[0, 1, 2]","{0, 1, 2}"
1,2020-01-02,"[3, 4, 5]","{8, 3, 5}"


In [41]:
df.dtypes

A    datetime64[ns]
B            object
C            object
dtype: object

In [42]:
type(df.B[0])

list

In [43]:
# Typical save
df.to_csv('test.csv', index=False)

# Using HDFStore
# The with-block closes the file even if the write raises.
with pd.HDFStore('test.h5') as data_store:
    data_store['test'] = df

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['B', 'C'], dtype='object')]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [45]:
# Read the CSV back with default settings (no parse_dates, no converters).
from_csv = pd.read_csv('test.csv')
from_csv.head()

Unnamed: 0,A,B,C
0,2020-01-01,"[0, 1, 2]","{0, 1, 2}"
1,2020-01-02,"[3, 4, 5]","{8, 3, 5}"


In [46]:
from_csv.dtypes # A no longer datetime

A    object
B    object
C    object
dtype: object

In [47]:
type(from_csv.B[0])  # B and C serialized as strings

str

In [49]:
# Open read-only; the with-block closes the file even if the read raises.
# NOTE(review): the PyTables warning emitted on save says the object columns
# are pickled, so only open .h5 files that come from a trusted source.
with pd.HDFStore('test.h5', mode='r') as data_store:
    from_h5 = data_store['test']

In [50]:
from_h5.dtypes  # datetime preserved

A    datetime64[ns]
B            object
C            object
dtype: object

In [51]:
type(from_h5.B[0]) # serialized types preserved

list