Adapted from the Real Python tutorial <a href="https://realpython.com/fast-flexible-pandas/">Fast, Flexible, Easy and Intuitive: How to Speed Up Your pandas Projects</a>.

In [1]:
import pandas as pd

# Show the installed pandas version (the recorded run displays '1.3.5').
pd.__version__

'1.3.5'

In [2]:
# Load the demand profile CSV (path is relative to the notebook's working
# directory) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='../data/demand_profile.csv')
df.head(n=5)

Unnamed: 0,date_time,energy_kwh
0,1/1/13 0:00,0.586
1,1/1/13 1:00,0.58
2,1/1/13 2:00,0.572
3,1/1/13 3:00,0.596
4,1/1/13 4:00,0.592


In [3]:
# Inspect the column dtypes; in the recorded output date_time is `object`,
# i.e. it has not been parsed into datetime64 yet.
df.dtypes

date_time      object
energy_kwh    float64
dtype: object

In [4]:
# Look at the value in row 0, column 0 (the date_time column) to confirm
# that the timestamps are stored as plain Python strings.
type(df.iat[0, 0])

str

In [5]:
%%timeit
df['date_time2'] = pd.to_datetime(df.date_time)
df.date_time.dtype

401 ms ± 1.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# Preview the frame again: the parsed date_time2 column now sits next to the
# original date_time strings.
df.head()

Unnamed: 0,date_time,energy_kwh,date_time2
0,1/1/13 0:00,0.586,2013-01-01 00:00:00
1,1/1/13 1:00,0.58,2013-01-01 01:00:00
2,1/1/13 2:00,0.572,2013-01-01 02:00:00
3,1/1/13 3:00,0.596,2013-01-01 03:00:00
4,1/1/13 4:00,0.592,2013-01-01 04:00:00


In [12]:
%%timeit
df['date_time'] = pd.to_datetime(df.date_time, format='%d/%m/%y %H:%M')

4.69 ms ± 10.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [8]:
def apply_tariff(kwh, hour):
    '''Return the cost of <kwh> of energy used during <hour>.

    The per-kWh rate is chosen from the hour of day:
        0 <= hour < 7    -> 12
        7 <= hour < 17   -> 20
        17 <= hour < 24  -> 28

    Raises ValueError when <hour> falls outside [0, 24).
    '''
    # Reject anything outside a single day up front.
    if not (0 <= hour < 24):
        raise ValueError(f'Invalid hour: {hour}')
    # From here on hour is known to be in [0, 24); test the upper bounds
    # in ascending order.
    if hour < 7:
        return 12 * kwh
    if hour < 17:
        return 20 * kwh
    return 28 * kwh

In [14]:
%%timeit
# BAD! Deliberately naive baseline: a plain Python loop over row positions,
# kept only so its timing can be compared with the other approaches.
def apply_tariff_loop(df):
    '''Calculate cost in loop. Modifies <df> in place

    Walks every row position, reads energy_kwh and the hour of date_time
    through df.iloc[i], prices them with apply_tariff, and finally stores
    the collected values in the 'cost_cents' column of <df>.
    Returns nothing.
    '''
    energy_cost_list = []
    for i in range(len(df)):
        # df.iloc[i] is evaluated twice per iteration, once for each field.
        energy_used = df.iloc[i]['energy_kwh']
        hour = df.iloc[i]['date_time'].hour
        energy_cost = apply_tariff(energy_used, hour)
        energy_cost_list.append(energy_cost)
    # Single column assignment after the loop, from the accumulated list.
    df['cost_cents'] = energy_cost_list

apply_tariff_loop(df)

1.21 s ± 6.53 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
%%timeit
def apply_tariff_iterrows(df):
    '''Calculate cost row by row using DataFrame.iterrows. Modifies <df> in place

    Reads energy_kwh and the hour of date_time from each yielded row, prices
    them with apply_tariff, and stores the collected values in the
    'cost_cents' column of <df>. Returns nothing.
    '''
    costs = []
    # The index label `i` yielded by iterrows() is not used; only the row is.
    for i, row in df.iterrows():
        energy = row.energy_kwh
        hour = row.date_time.hour
        cost = apply_tariff(energy, hour)
        costs.append(cost)
    df['cost_cents'] = costs
    
apply_tariff_iterrows(df)

312 ms ± 1.75 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
%%timeit
def apply_tariff_apply(df):
    '''Calculate cost with a row-wise DataFrame.apply. Modifies <df> in place

    Each row's energy_kwh and date_time hour are passed to apply_tariff and
    the results are stored in the 'cost_cents' column of <df>.
    Returns nothing.
    '''
    # axis=1 makes apply call the lambda once per row rather than per column.
    df['cost_cents'] = df.apply(
        lambda row: apply_tariff(row.energy_kwh, row.date_time.hour),
        axis=1)
    
apply_tariff_apply(df)

159 ms ± 1.04 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
