In [1]:
'''
Transform the fail_categories.csv data
Calculate the average daily values.
- Diff the (cumulative) values provided by the BCC
- Divide by number of days in the period

'''

'\nTransform the fail_categories.csv data\nCalculate the average daily values.\n- Diff the (cumulative) values provided by the BCC\n- Divide by number of days in the period\n\n'

In [2]:
import pandas as pd

In [3]:
# Load the failure-category table that bcc_etl.py writes to the etl_data folder
fc = pd.read_csv(filepath_or_buffer='../../etl_data/fail_categories.csv')

In [4]:
# Restrict the frame to the columns this notebook actually works with
keep_cols = [
    'Date',
    'Total Failed',
    'Tested Batches',
    'Failed Batches',
    'Failure Reason',
]
fc = fc.loc[:, keep_cols]
fc

Unnamed: 0,Date,Total Failed,Tested Batches,Failed Batches,Failure Reason
0,2018-11-13,3375,20797,2043,Label Claims
1,2018-11-13,3375,20797,685,Pesticides
2,2018-11-13,3375,20797,380,Microbial Impurities
3,2018-11-13,3375,20797,171,Residual Solvents
4,2018-11-13,3375,20797,35,Homogeneity
...,...,...,...,...,...
736,2019-04-15,4699,44017,8,Water Activity
737,2019-04-15,4699,44017,26,Cannabinoids
738,2019-04-15,4699,44017,1,Mycotoxins
739,2019-04-15,4699,44017,2,Injurious to Human Health


In [5]:
# Reshape so every report date is a single row, with one column per
# (value column, failure reason) pair.
fc_pivot = (
    fc.pivot_table(index='Date', columns='Failure Reason')
    .fillna(value=0)  # a reason with no entry on a date is treated as a count of 0
    .astype('int')
    .sort_index()
)

# Keep the dates as a datetime column too: .diff() on it later yields the
# length of each reporting period.
fc_pivot['Date'] = pd.to_datetime(fc_pivot.index)

In [6]:
# 'Tested Batches' is a per-date figure, yet the pivot spread it over one
# sub-column per failure reason. Take the row-wise maximum of those
# sub-columns and store it back as a single column; pop() removes the
# per-reason sub-columns in the same step.
fc_pivot['Tested Batches'] = fc_pivot.pop('Tested Batches').max(axis=1)

In [7]:
# 'Total Failed' is a per-date figure, yet the pivot spread it over one
# sub-column per failure reason. Take the row-wise maximum of those
# sub-columns and store it back as a single column; pop() removes the
# per-reason sub-columns in the same step.
fc_pivot['Total Failed'] = fc_pivot.pop('Total Failed').max(axis=1)

In [8]:
# Inspect the pivoted frame now that 'Tested Batches' and 'Total Failed'
# have each been collapsed to a single column. Values are still cumulative
# here; the following cells difference them.
fc_pivot

Unnamed: 0_level_0,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Date,Tested Batches,Total Failed
Failure Reason,Cannabinoids,Foreign Material,Heavy Metals,Homogeneity,Injurious to Human Health,Label Claims,Microbial Impurities,Moisture,Mycotoxins,Pesticides,Residual Solvents,Total,Water Activity,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
2018-10-09,0,10,0,35,0,1751,201,40,0,602,130,2769,0,2018-10-09,16049,2769
2018-10-15,0,10,0,35,0,1823,217,45,0,632,137,2899,0,2018-10-15,16869,2899
2018-10-22,0,10,0,35,0,1867,224,45,0,645,145,2971,0,2018-10-22,17833,2971
2018-10-29,0,10,0,35,0,1953,233,45,0,663,154,3093,0,2018-10-29,18863,3093
2018-11-05,0,10,0,35,0,1979,249,50,0,672,167,3162,0,2018-11-05,19707,3162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-13,74,25,531,36,2,2379,910,71,10,1777,360,6223,48,2020-01-13,89659,6223
2020-01-20,77,26,541,36,2,2379,929,71,10,1790,361,6273,51,2020-01-20,90793,6273
2020-01-27,79,26,552,36,2,2379,940,71,10,1804,362,6314,53,2020-01-27,92108,6314
2020-02-03,81,27,573,36,2,2379,955,71,10,1822,366,6377,55,2020-02-03,93732,6377


In [9]:
# Difference each row with the previous row to turn the cumulative totals
# into per-period counts. The datetime 'Date' column becomes a Timedelta
# holding the length of each period.
# The first row has no predecessor and is all NaN/NaT, so it is dropped.
# .copy() makes delta_fc an independent frame, so the column assignments
# below never write into a slice of the diff result.
delta_fc = fc_pivot.diff().iloc[1:].copy()

# .diff() produced floats; the count columns hold whole numbers, so cast
# them back to int. 'Date' is deliberately left as a Timedelta.
keep_cols = ['Failed Batches', 'Tested Batches', 'Total Failed']
for col in keep_cols:
    delta_fc[col] = delta_fc[col].astype('int')

delta_fc

Unnamed: 0_level_0,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Date,Tested Batches,Total Failed
Failure Reason,Cannabinoids,Foreign Material,Heavy Metals,Homogeneity,Injurious to Human Health,Label Claims,Microbial Impurities,Moisture,Mycotoxins,Pesticides,Residual Solvents,Total,Water Activity,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
2018-10-15,0,0,0,0,0,72,16,5,0,30,7,130,0,6 days,820,130
2018-10-22,0,0,0,0,0,44,7,0,0,13,8,72,0,7 days,964,72
2018-10-29,0,0,0,0,0,86,9,0,0,18,9,122,0,7 days,1030,122
2018-11-05,0,0,0,0,0,26,16,5,0,9,13,69,0,7 days,844,69
2018-11-13,0,0,0,0,0,64,131,1,0,13,4,213,0,8 days,1090,213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-13,1,0,5,0,0,0,18,0,2,9,0,38,3,7 days,1037,38
2020-01-20,3,1,10,0,0,0,19,0,0,13,1,50,3,7 days,1134,50
2020-01-27,2,0,11,0,0,0,11,0,0,14,1,41,2,7 days,1315,41
2020-02-03,2,1,21,0,0,0,15,0,0,18,4,63,2,7 days,1624,63


In [10]:
# Express each period's length as a plain integer number of days.
# Dividing the Timedelta column by a one-day Timedelta gives a unitless
# day count, which can then be used as an ordinary numeric divisor.
oneday = pd.Timedelta(days=1)
days_in_period = (delta_fc['Date'] / oneday).astype('int')
delta_fc['Days in Period'] = days_in_period
delta_fc['Days in Period']

Date
2018-10-15    6
2018-10-22    7
2018-10-29    7
2018-11-05    7
2018-11-13    8
             ..
2020-01-13    7
2020-01-20    7
2020-01-27    7
2020-02-03    7
2020-02-10    7
Name: Days in Period, Length: 62, dtype: int64

In [11]:
# Build a new frame, avg_daily_failure_categories, holding the average
# daily value of every count column: per-period count / days in the period.
keep_cols = ['Failed Batches', 'Tested Batches', 'Total Failed']
period_counts = delta_fc[keep_cols]
period_lengths = delta_fc['Days in Period']

# axis=0 aligns the divisor with the row index, so each row is divided by
# its own period length.
avg_daily_failure_categories = period_counts.div(period_lengths, axis=0)

# Carry the period length along for reference
avg_daily_failure_categories['Days in Period'] = period_lengths
avg_daily_failure_categories

Unnamed: 0_level_0,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Tested Batches,Total Failed,Days in Period
Failure Reason,Cannabinoids,Foreign Material,Heavy Metals,Homogeneity,Injurious to Human Health,Label Claims,Microbial Impurities,Moisture,Mycotoxins,Pesticides,Residual Solvents,Total,Water Activity,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
2018-10-15,0.000000,0.000000,0.000000,0.0,0.0,12.000000,2.666667,0.833333,0.000000,5.000000,1.166667,21.666667,0.000000,136.666667,21.666667,6
2018-10-22,0.000000,0.000000,0.000000,0.0,0.0,6.285714,1.000000,0.000000,0.000000,1.857143,1.142857,10.285714,0.000000,137.714286,10.285714,7
2018-10-29,0.000000,0.000000,0.000000,0.0,0.0,12.285714,1.285714,0.000000,0.000000,2.571429,1.285714,17.428571,0.000000,147.142857,17.428571,7
2018-11-05,0.000000,0.000000,0.000000,0.0,0.0,3.714286,2.285714,0.714286,0.000000,1.285714,1.857143,9.857143,0.000000,120.571429,9.857143,7
2018-11-13,0.000000,0.000000,0.000000,0.0,0.0,8.000000,16.375000,0.125000,0.000000,1.625000,0.500000,26.625000,0.000000,136.250000,26.625000,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-13,0.142857,0.000000,0.714286,0.0,0.0,0.000000,2.571429,0.000000,0.285714,1.285714,0.000000,5.428571,0.428571,148.142857,5.428571,7
2020-01-20,0.428571,0.142857,1.428571,0.0,0.0,0.000000,2.714286,0.000000,0.000000,1.857143,0.142857,7.142857,0.428571,162.000000,7.142857,7
2020-01-27,0.285714,0.000000,1.571429,0.0,0.0,0.000000,1.571429,0.000000,0.000000,2.000000,0.142857,5.857143,0.285714,187.857143,5.857143,7
2020-02-03,0.285714,0.142857,3.000000,0.0,0.0,0.000000,2.142857,0.000000,0.000000,2.571429,0.571429,9.000000,0.285714,232.000000,9.000000,7


In [13]:
# Show only the per-date totals (no per-reason breakdown).
keep_cols = ['Tested Batches', 'Total Failed', 'Days in Period']

# These columns carry no failure reason, so the 'Failure Reason' column
# level is empty for all of them. Drop that level from the displayed frame
# to get flat column labels. avg_daily_failure_categories itself is not
# modified.
avg_daily_failure_categories[keep_cols].droplevel('Failure Reason', axis=1)

Unnamed: 0_level_0,Tested Batches,Total Failed,Days in Period
Failure Reason,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
2018-10-15,136.666667,21.666667,6
2018-10-22,137.714286,10.285714,7
2018-10-29,147.142857,17.428571,7
2018-11-05,120.571429,9.857143,7
2018-11-13,136.250000,26.625000,8
...,...,...,...
2020-01-13,148.142857,5.428571,7
2020-01-20,162.000000,7.142857,7
2020-01-27,187.857143,5.857143,7
2020-02-03,232.000000,9.000000,7


In [14]:
'''
Lots to do: 
- feature engineering
- structure properly for Tableau
- save to disk
'''

'\nLots to do: \n- feature engineering\n- structure properly for Tableau\n- save to disk\n'