# Testing CPI with arrays of dates

###### 10-22-18

In [1]:
import cpi
import pandas as pd
import numpy as np

from datetime import date

Using `pandas.date_range()` we can create two weekly `DatetimeIndex` ranges, and using `numpy.random.randint()` we can create a set of corresponding _income_ values.

We use `freq = 'W'` so that each range holds 3000 different dates but only ~690 different months, as an example of data that could be found _in the wild_.

In [2]:
# Two weekly date ranges of equal length: `first_date` ends by 2018-01-01 and
# `second_date` starts from 1930-01-01. They become the "to" and "from" dates.
first_date = pd.date_range(end='2018-01-01', periods=3000, freq='W')

second_date = pd.date_range(start='1930-01-01', periods=3000, freq='W')

# Seed NumPy's global RNG so the sample incomes are identical on every
# Restart & Run All; without a seed each run produces a different table.
np.random.seed(0)
incomes = np.random.randint(low=1500, high=200_000, size=3000)

From there we can construct our working dataframe

In [3]:
# Start from an empty frame that only declares the three working columns.
df = pd.DataFrame(
    columns=[
        "date_from",
        "date_to",
        "incomes",
    ]
)

In [4]:
# Populate the working frame: the 1930-based range is the source date,
# the range ending in 2018 is the target date.
df = df.assign(
    date_from=second_date,
    date_to=first_date,
    incomes=incomes,
)

df.head()

Unnamed: 0,date_from,date_to,incomes
0,1930-01-05,1960-07-10,76223
1,1930-01-12,1960-07-17,168785
2,1930-01-19,1960-07-24,131969
3,1930-01-26,1960-07-31,36297
4,1930-02-02,1960-08-07,172199


***
`CPI` works in a simple fashion:
1. Look for `year_or_month` for when the values are from and retrieve its ___source_index___.
2. Look for `to` for the ___target_index___ to inflate the values _to_.
3. `return (value * target_index) / float(source_index)`

Because the conversion is so simple, `CPI` lends itself to being used when `value` is a pandas series or numpy array.


The goal is to be able to provide a pandas series or numpy array with dates as well.

Here's an example of a work-around:

In [5]:
# source_index: the CPI value for the month each income was observed in.
dates_from = df['date_from'].astype(str).str[:7]  # "YYYY-MM" key for every row

# Call cpi.get() once per distinct month (first day of that month),
# rather than once per row.
cpi_values_source = {
    year_month: cpi.get(date(int(year_month[:4]), int(year_month[5:]), 1))
    for year_month in dates_from.unique()
}

# Spread the per-month lookups back onto every row.
source_index = dates_from.map(cpi_values_source)

In [6]:
# target_index: the CPI value for the month each income is inflated to.
dates_to = df['date_to'].astype(str).str[:7]  # "YYYY-MM" key for every row

# One cpi.get() call per distinct month, keyed by its "YYYY-MM" string.
cpi_values_target = {
    year_month: cpi.get(date(*map(int, year_month.split("-")), 1))
    for year_month in dates_to.unique()
}

# Spread the per-month lookups back onto every row.
target_index = dates_to.map(cpi_values_target)

In [7]:
# Same formula CPI uses, applied to whole columns: value * target_index / source_index.
df['inflated'] = df['incomes'].mul(target_index).div(source_index)

df.head()

Unnamed: 0,date_from,date_to,incomes,inflated
0,1930-01-05,1960-07-10,76223,131941.567251
1,1930-01-12,1960-07-17,168785,292165.847953
2,1930-01-19,1960-07-24,131969,228437.567251
3,1930-01-26,1960-07-31,36297,62829.894737
4,1930-02-02,1960-08-07,172199,299828.847059


***
The value here is that even though there are 6000 weekly dates in total (3000 source dates plus 3000 target dates), they fall in only around $6000 / 4 = 1500$ different months, so you should only need to call `cpi.get()` about 1500 times, not 6000.

A much more common case would be to have a set of weekly observations and to _inflate_ them all to the most up-to-date index.

In [8]:
# Refresh the cpi package's data before checking which month is the latest
# (presumably this downloads the newest release — confirm in the cpi docs).
cpi.update()

In [9]:
# Show the most recent month cpi reports; the cells that follow hard-code
# this value (2018-09-01) as the month to inflate to.
cpi.LATEST_MONTH

datetime.date(2018, 9, 1)

In [10]:
# Weekly observation dates (3000 periods, freq='W') ending by 2018-01-01.
date_from = pd.date_range(end='2018-01-01', periods=3000, freq='W')

# Seed NumPy's global RNG so the sample incomes are identical on every
# Restart & Run All; without a seed each run produces a different table.
np.random.seed(1)
incomes = np.random.randint(low=1500, high=200_000, size=3000)

# Build the frame in one step instead of filling an empty frame column by
# column; the scalar `date_to` string is broadcast to every row.
df = pd.DataFrame(
    {
        'date_from': date_from,
        'date_to': "2018-09-01",
        'incomes': incomes,
    },
    columns=['date_from', 'date_to', 'incomes'],
)

df.head()

Unnamed: 0,date_from,date_to,incomes
0,1960-07-10,2018-09-01,161370
1,1960-07-17,2018-09-01,188067
2,1960-07-24,2018-09-01,117156
3,1960-07-31,2018-09-01,104511
4,1960-08-07,2018-09-01,160086


Currently, the example from `CPI` documentation for working with pandas uses the `.apply()` method.

In [11]:
# The CPI README example inflates by calendar year, so extract the year of
# each observation date into its own column first.
df['YEAR'] = df['date_from'].dt.year

In [12]:
%%timeit
# Baseline from the CPI README: a row-wise apply that calls cpi.inflate once
# per row with that row's income and year. No `to` argument is passed, so the
# target is whatever cpi.inflate defaults to (presumably the latest year —
# confirm in the cpi docs).
df['ADJUSTED'] = df.apply(lambda x: cpi.inflate(x['incomes'], x['YEAR']), axis=1)

130 ms ± 5.16 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [13]:
# Preview the year-based adjustment next to the original incomes.
df.loc[:, ['YEAR', 'incomes', 'ADJUSTED']].head()

Unnamed: 0,YEAR,incomes,ADJUSTED
0,1960,161370,1336318.0
1,1960,188067,1557398.0
2,1960,117156,970178.3
3,1960,104511,865464.1
4,1960,160086,1325685.0


Using this _new_ method, not only can we do this for each month (not just year) but it is also a bit faster.

In [14]:
%%timeit
# source_index
cpi_values_source = {}

dates_from = df['date_from'].astype(str).str[:7] # 1234-56

for item in dates_from.unique():
    # retrieve all values and store them in a dict()
    y_m = str(item).split("-")
    y_m = [int(y_m[0]), int(y_m[1])] # year and month
    target_date = date(y_m[0], y_m[1], 1)
    cpi_values_source[item] = cpi.get(target_date)
    
# Map those values to another series
source_index = dates_from.map(cpi_values_source)
target_index = cpi.get(date(2018,9,1))

df['ADJUSTED_2'] = df['incomes'] * target_index / source_index

25.2 ms ± 1.28 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
# Preview the month-based adjustment next to the original incomes.
df.loc[:, ['date_from', 'incomes', 'ADJUSTED_2']].head()

Unnamed: 0,date_from,incomes,ADJUSTED_2
0,1960-07-10,161370,1376219.0
1,1960-07-17,188067,1603900.0
2,1960-07-24,117156,999146.7
3,1960-07-31,104511,891305.8
4,1960-08-07,160086,1365269.0
