In [1]:
'''
Transform the product_categories.csv data
Calculate the average daily values.
- Diff the (cumulative) values provided by the BCC
- Divide by number of days in the period

'''

'\nTransform the product_categories.csv data\nCalculate the average daily values.\n- Diff the (cumulative) values provided by the BCC\n- Divide by number of days in the period\n\n'

In [2]:
import pandas as pd

In [3]:
# Read product_categories.csv (the original note says this file is created by bcc_etl.py)
pc = pd.read_csv('../../etl_data/product_categories.csv')

In [4]:
# Show the loaded frame as this cell's output
pc

Unnamed: 0,Total Failed,Total Tested,Tested Batches,Date,Category,Failed Batches,Percent Failed,Percent of Failures,Percent Tested
0,3164,20797,10415,2018-11-13,Flower,1029,9.879981,32.522124,50.079338
1,3164,20797,6464,2018-11-13,Inhalable\n(cartridges waxes etc.),1054,16.305693,33.312263,31.081406
2,3164,20797,3918,2018-11-13,Other\n(edibles tinctures topicals etc.),1081,27.590607,34.165613,18.839256
3,3164,20797,20797,2018-11-13,Total,3164,15.213733,100.000000,100.000000
4,4679,46503,23347,2019-04-29,Flower,1627,6.968775,34.772387,50.205363
...,...,...,...,...,...,...,...,...,...
247,3310,22096,22096,2018-11-26,Total,3310,14.980087,100.000000,100.000000
248,4589,44017,22277,2019-04-15,Flower,1589,7.132917,34.626280,50.609992
249,4589,44017,14150,2019-04-15,Inhalable\n(cartridges waxes etc.),1598,11.293286,34.822401,32.146671
250,4589,44017,7590,2019-04-15,Other\n(edibles tinctures topicals etc.),1402,18.471673,30.551318,17.243338


In [5]:
# Columns the daily-average calculation actually uses
keep_cols = ['Date', 'Category', 'Tested Batches', 'Failed Batches']

# 'Total' rows are an aggregate of the other categories; exclude them while
# narrowing to the needed columns in a single selection
is_total_row = pc['Category'].eq('Total')
pc = pc.loc[~is_total_row, keep_cols]

# Display
pc

Unnamed: 0,Date,Category,Tested Batches,Failed Batches
0,2018-11-13,Flower,10415,1029
1,2018-11-13,Inhalable\n(cartridges waxes etc.),6464,1054
2,2018-11-13,Other\n(edibles tinctures topicals etc.),3918,1081
4,2019-04-29,Flower,23347,1627
5,2019-04-29,Inhalable\n(cartridges waxes etc.),15138,1645
...,...,...,...,...
245,2018-11-26,Inhalable\n(cartridges waxes etc.),6928,1082
246,2018-11-26,Other\n(edibles tinctures topicals etc.),4145,1141
248,2019-04-15,Flower,22277,1589
249,2019-04-15,Inhalable\n(cartridges waxes etc.),14150,1598


In [6]:
# Reshape so that each date is a single row, with one column per
# (metric, category) pair, ordered by date.
# Missing (Date, Category) combinations are deliberately NOT filled with 0,
# so the integer cast is expected to fail if any are present.
pc_pivot = (
    pc.pivot_table(index='Date', columns='Category')
      .astype('int')
      .sort_index()
)

# Carry the dates as a real datetime column too, so that .diff() can later
# produce the number of days between consecutive rows
pc_pivot['Date'] = pd.to_datetime(pc_pivot.index)

In [7]:
# Show the pivoted frame: one row per date, one column per (metric, category) pair
pc_pivot

Unnamed: 0_level_0,Failed Batches,Failed Batches,Failed Batches,Tested Batches,Tested Batches,Tested Batches,Date
Category,Flower,Inhalable\n(cartridges waxes etc.),Other\n(edibles tinctures topicals etc.),Flower,Inhalable\n(cartridges waxes etc.),Other\n(edibles tinctures topicals etc.),Unnamed: 7_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2018-10-09,853,915,915,7993,4991,3065,2018-10-09
2018-10-15,918,942,944,8446,5208,3215,2018-10-15
2018-10-22,933,966,969,8978,5483,3372,2018-10-22
2018-10-29,968,1006,1011,9426,5858,3579,2018-10-29
2018-11-05,991,1026,1036,9829,6138,3740,2018-11-05
...,...,...,...,...,...,...,...
2020-01-13,2382,2144,1548,43558,31303,14798,2020-01-13
2020-01-20,2401,2170,1553,44108,31715,14970,2020-01-20
2020-01-27,2422,2187,1555,44738,32220,15150,2020-01-27
2020-02-03,2454,2211,1560,45485,32831,15416,2020-02-03


In [8]:
# Take the difference of each row with the previous row. This turns the
# cumulative counts into per-period counts, and the 'Date' column into the
# length of each period (a Timedelta).
# The first row has no predecessor, so it is all NaN/NaT and is dropped.
# .copy() makes delta_pc an independent frame, so the column assignments
# below do not write into a slice of the diff result (SettingWithCopyWarning).
delta_pc = pc_pivot.diff().iloc[1:].copy()

# diff() leaves the counts as floats; convert them back to ints.
# The 'Date' column is left alone: it now holds Timedeltas, not counts.
keep_cols = ['Failed Batches', 'Tested Batches']
for col in keep_cols:
    delta_pc[col] = delta_pc[col].astype('int')

delta_pc

Unnamed: 0_level_0,Failed Batches,Failed Batches,Failed Batches,Tested Batches,Tested Batches,Tested Batches,Date
Category,Flower,Inhalable\n(cartridges waxes etc.),Other\n(edibles tinctures topicals etc.),Flower,Inhalable\n(cartridges waxes etc.),Other\n(edibles tinctures topicals etc.),Unnamed: 7_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2018-10-15,65,27,29,453,217,150,6 days
2018-10-22,15,24,25,532,275,157,7 days
2018-10-29,35,40,42,448,375,207,7 days
2018-11-05,23,20,25,403,280,161,7 days
2018-11-13,38,28,45,586,326,178,8 days
...,...,...,...,...,...,...,...
2020-01-13,22,10,4,468,422,147,7 days
2020-01-20,19,26,5,550,412,172,7 days
2020-01-27,21,17,2,630,505,180,7 days
2020-02-03,32,24,5,747,611,266,7 days


In [9]:
# 'Days in Period': the diffed 'Date' column holds Timedeltas, but the
# per-day averages need a plain number to divide by. Dividing a Timedelta by
# a one-day Timedelta gives the length in days, which is then cast to int.
oneday = pd.Timedelta(days=1)
period_length_in_days = delta_pc['Date'] / oneday
delta_pc['Days in Period'] = period_length_in_days.astype('int')
delta_pc['Days in Period']

Date
2018-10-15    6
2018-10-22    7
2018-10-29    7
2018-11-05    7
2018-11-13    8
             ..
2020-01-13    7
2020-01-20    7
2020-01-27    7
2020-02-03    7
2020-02-10    7
Name: Days in Period, Length: 62, dtype: int64

In [10]:
# Average per day = count in the period / number of days in the period.
# The result is a new frame; delta_pc itself is not modified.
keep_cols = ['Failed Batches', 'Tested Batches']
days_in_period = delta_pc['Days in Period']

# axis=0 aligns the divisor with the rows (dates), so every count column in
# a row is divided by that row's period length
avg_daily_product_categories = delta_pc[keep_cols].div(days_in_period, axis=0)

# Keep the period length alongside the averages for reference
avg_daily_product_categories['Days in Period'] = days_in_period
avg_daily_product_categories

Unnamed: 0_level_0,Failed Batches,Failed Batches,Failed Batches,Tested Batches,Tested Batches,Tested Batches,Days in Period
Category,Flower,Inhalable\n(cartridges waxes etc.),Other\n(edibles tinctures topicals etc.),Flower,Inhalable\n(cartridges waxes etc.),Other\n(edibles tinctures topicals etc.),Unnamed: 7_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2018-10-15,10.833333,4.500000,4.833333,75.500000,36.166667,25.000000,6
2018-10-22,2.142857,3.428571,3.571429,76.000000,39.285714,22.428571,7
2018-10-29,5.000000,5.714286,6.000000,64.000000,53.571429,29.571429,7
2018-11-05,3.285714,2.857143,3.571429,57.571429,40.000000,23.000000,7
2018-11-13,4.750000,3.500000,5.625000,73.250000,40.750000,22.250000,8
...,...,...,...,...,...,...,...
2020-01-13,3.142857,1.428571,0.571429,66.857143,60.285714,21.000000,7
2020-01-20,2.714286,3.714286,0.714286,78.571429,58.857143,24.571429,7
2020-01-27,3.000000,2.428571,0.285714,90.000000,72.142857,25.714286,7
2020-02-03,4.571429,3.428571,0.714286,106.714286,87.285714,38.000000,7


In [11]:
# Stack (un-pivot) to get data in standard form for Tableau ingestion:
# one row per (Date, Category), one column per metric.
# The metric columns are selected by name rather than by "everything except
# the last column", so the result does not depend on where 'Days in Period'
# (or any column added later) happens to sit in the frame.
metric_cols = ['Failed Batches', 'Tested Batches']
pc_perday = avg_daily_product_categories[metric_cols].stack()
pc_perday

Unnamed: 0_level_0,Unnamed: 1_level_0,Failed Batches,Tested Batches
Date,Category,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-10-15,Flower,10.833333,75.500000
2018-10-15,Inhalable\n(cartridges waxes etc.),4.500000,36.166667
2018-10-15,Other\n(edibles tinctures topicals etc.),4.833333,25.000000
2018-10-22,Flower,2.142857,76.000000
2018-10-22,Inhalable\n(cartridges waxes etc.),3.428571,39.285714
...,...,...,...
2020-02-03,Inhalable\n(cartridges waxes etc.),3.428571,87.285714
2020-02-03,Other\n(edibles tinctures topicals etc.),0.714286,38.000000
2020-02-10,Flower,5.000000,93.285714
2020-02-10,Inhalable\n(cartridges waxes etc.),1.428571,84.714286


In [12]:
# Copy the two index levels into ordinary columns so they are still present
# when the frame is written out without its index.
# Index level 0 holds the date and level 1 holds the category.
for level_position, column_name in enumerate(['Date', 'Category']):
    pc_perday[column_name] = pc_perday.index.get_level_values(level_position)
pc_perday

Unnamed: 0_level_0,Unnamed: 1_level_0,Failed Batches,Tested Batches,Date,Category
Date,Category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-10-15,Flower,10.833333,75.500000,2018-10-15,Flower
2018-10-15,Inhalable\n(cartridges waxes etc.),4.500000,36.166667,2018-10-15,Inhalable\n(cartridges waxes etc.)
2018-10-15,Other\n(edibles tinctures topicals etc.),4.833333,25.000000,2018-10-15,Other\n(edibles tinctures topicals etc.)
2018-10-22,Flower,2.142857,76.000000,2018-10-22,Flower
2018-10-22,Inhalable\n(cartridges waxes etc.),3.428571,39.285714,2018-10-22,Inhalable\n(cartridges waxes etc.)
...,...,...,...,...,...
2020-02-03,Inhalable\n(cartridges waxes etc.),3.428571,87.285714,2020-02-03,Inhalable\n(cartridges waxes etc.)
2020-02-03,Other\n(edibles tinctures topicals etc.),0.714286,38.000000,2020-02-03,Other\n(edibles tinctures topicals etc.)
2020-02-10,Flower,5.000000,93.285714,2020-02-10,Flower
2020-02-10,Inhalable\n(cartridges waxes etc.),1.428571,84.714286,2020-02-10,Inhalable\n(cartridges waxes etc.)


In [13]:
# Feature Engineering: 'Fail Rate' is failed batches as a percentage of
# tested batches for the same date and category
failed_per_day = pc_perday['Failed Batches']
tested_per_day = pc_perday['Tested Batches']
pc_perday['Fail Rate'] = failed_per_day.mul(100).div(tested_per_day)
pc_perday

Unnamed: 0_level_0,Unnamed: 1_level_0,Failed Batches,Tested Batches,Date,Category,Fail Rate
Date,Category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-10-15,Flower,10.833333,75.500000,2018-10-15,Flower,14.348786
2018-10-15,Inhalable\n(cartridges waxes etc.),4.500000,36.166667,2018-10-15,Inhalable\n(cartridges waxes etc.),12.442396
2018-10-15,Other\n(edibles tinctures topicals etc.),4.833333,25.000000,2018-10-15,Other\n(edibles tinctures topicals etc.),19.333333
2018-10-22,Flower,2.142857,76.000000,2018-10-22,Flower,2.819549
2018-10-22,Inhalable\n(cartridges waxes etc.),3.428571,39.285714,2018-10-22,Inhalable\n(cartridges waxes etc.),8.727273
...,...,...,...,...,...,...
2020-02-03,Inhalable\n(cartridges waxes etc.),3.428571,87.285714,2020-02-03,Inhalable\n(cartridges waxes etc.),3.927987
2020-02-03,Other\n(edibles tinctures topicals etc.),0.714286,38.000000,2020-02-03,Other\n(edibles tinctures topicals etc.),1.879699
2020-02-10,Flower,5.000000,93.285714,2020-02-10,Flower,5.359877
2020-02-10,Inhalable\n(cartridges waxes etc.),1.428571,84.714286,2020-02-10,Inhalable\n(cartridges waxes etc.),1.686341


In [14]:
# Write the per-day table to the ETL data folder. The index is omitted
# because 'Date' and 'Category' are already present as regular columns.
pc_perday_csv_path = '../../etl_data/pc_perday.csv'
print('Saving pc_perday.csv...')
pc_perday.to_csv(pc_perday_csv_path, index=False)
print('Done saving pc_perday.')

Saving pc_perday.csv...
Done saving pc_perday.
