In [1]:
'''
Transform the fail_categories.csv data
Calculate the average daily values.
- Diff the (cumulative) values provided by the BCC
- Divide by number of days in the period

'''

'\nTransform the fail_categories.csv data\nCalculate the average daily values.\n- Diff the (cumulative) values provided by the BCC\n- Divide by number of days in the period\n\n'

In [2]:
import pandas as pd

In [3]:
# Load fail_categories.csv, the file produced by bcc_etl.py
fc = pd.read_csv(filepath_or_buffer='../../etl_data/fail_categories.csv')

In [4]:
# Restrict the frame to the columns this notebook works with
keep_cols = ['Date', 'Total Failed', 'Tested Batches', 'Failed Batches', 'Failure Reason']
fc = fc.loc[:, keep_cols]
fc

Unnamed: 0,Date,Total Failed,Tested Batches,Failed Batches,Failure Reason
0,2018-11-13,3375,20797,2043,Label Claims
1,2018-11-13,3375,20797,685,Pesticides
2,2018-11-13,3375,20797,380,Microbial Impurities
3,2018-11-13,3375,20797,171,Residual Solvents
4,2018-11-13,3375,20797,35,Homogeneity
...,...,...,...,...,...
736,2019-04-15,4699,44017,8,Water Activity
737,2019-04-15,4699,44017,26,Cannabinoids
738,2019-04-15,4699,44017,1,Mycotoxins
739,2019-04-15,4699,44017,2,Injurious to Human Health


In [5]:
# Reshape so that each date occupies a single row, with one column per
# (metric, 'Failure Reason') pair.
# NOTE(review): the 0-fill below is applied to every metric, so a reason that is
# missing on a date also gets 0 for 'Tested Batches' and 'Total Failed' - confirm
# the later .diff() is not distorted on the first date a reason shows up.
fc_pivot = (
    fc.pivot_table(index='Date', columns='Failure Reason')
    .fillna(value=0)  # a missing (date, reason) combination is treated as a count of 0
    .astype('int')
    .sort_index()
)
# .diff() needs a datetime 'Date' column in order to calculate num_days
fc_pivot['Date'] = pd.to_datetime(fc_pivot.index)

In [6]:
# # 'Tested Batches' does not apply to each 'Failure Reason'
# # Collapse to a single column
# tested_batches = fc_pivot['Tested Batches'].max(axis=1)
# del fc_pivot['Tested Batches']
# fc_pivot['Tested Batches'] = tested_batches

In [7]:
# # 'Total Failed' does not apply to each 'Failure Reason'
# # Collapse to a single column
# total_failed = fc_pivot['Total Failed'].max(axis=1)
# del fc_pivot['Total Failed']
# fc_pivot['Total Failed'] = total_failed

In [8]:
# Show the pivoted df: one row per date, with a column for every
# (metric, 'Failure Reason') pair plus the datetime 'Date' column
fc_pivot

Unnamed: 0_level_0,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,...,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Date
Failure Reason,Cannabinoids,Foreign Material,Heavy Metals,Homogeneity,Injurious to Human Health,Label Claims,Microbial Impurities,Moisture,Mycotoxins,Pesticides,...,Injurious to Human Health,Label Claims,Microbial Impurities,Moisture,Mycotoxins,Pesticides,Residual Solvents,Total,Water Activity,Unnamed: 21_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-10-09,0,10,0,35,0,1751,201,40,0,602,...,0,2769,2769,2769,0,2769,2769,2769,0,2018-10-09
2018-10-15,0,10,0,35,0,1823,217,45,0,632,...,0,2899,2899,2899,0,2899,2899,2899,0,2018-10-15
2018-10-22,0,10,0,35,0,1867,224,45,0,645,...,0,2971,2971,2971,0,2971,2971,2971,0,2018-10-22
2018-10-29,0,10,0,35,0,1953,233,45,0,663,...,0,3093,3093,3093,0,3093,3093,3093,0,2018-10-29
2018-11-05,0,10,0,35,0,1979,249,50,0,672,...,0,3162,3162,3162,0,3162,3162,3162,0,2018-11-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-13,74,25,531,36,2,2379,910,71,10,1777,...,6223,6223,6223,6223,6223,6223,6223,6223,6223,2020-01-13
2020-01-20,77,26,541,36,2,2379,929,71,10,1790,...,6273,6273,6273,6273,6273,6273,6273,6273,6273,2020-01-20
2020-01-27,79,26,552,36,2,2379,940,71,10,1804,...,6314,6314,6314,6314,6314,6314,6314,6314,6314,2020-01-27
2020-02-03,81,27,573,36,2,2379,955,71,10,1822,...,6377,6377,6377,6377,6377,6377,6377,6377,6377,2020-02-03


In [9]:
# Take the difference of each row with the previous row.
# The BCC values are cumulative, so the row-to-row difference is the amount
# added during each reporting period; the 'Date' column becomes a Timedelta
# holding the length of that period.
# The first row has nothing to diff against (all NaNs), so it is dropped.
# .copy() makes delta_fc an independent frame: columns are assigned into it
# below and in later cells, and writing into a bare .iloc slice is what
# triggers pandas' SettingWithCopyWarning.
delta_fc = fc_pivot.diff().iloc[1:].copy()

# Convert to int's where appropriate
# (.diff() returns floats for the count columns; 'Date' is left as a Timedelta)
keep_cols = ['Failed Batches', 'Tested Batches', 'Total Failed']
for col in keep_cols:
    delta_fc[col] = delta_fc[col].astype('int')

delta_fc

Unnamed: 0_level_0,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,...,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Date
Failure Reason,Cannabinoids,Foreign Material,Heavy Metals,Homogeneity,Injurious to Human Health,Label Claims,Microbial Impurities,Moisture,Mycotoxins,Pesticides,...,Injurious to Human Health,Label Claims,Microbial Impurities,Moisture,Mycotoxins,Pesticides,Residual Solvents,Total,Water Activity,Unnamed: 21_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-10-15,0,0,0,0,0,72,16,5,0,30,...,0,130,130,130,0,130,130,130,0,6 days
2018-10-22,0,0,0,0,0,44,7,0,0,13,...,0,72,72,72,0,72,72,72,0,7 days
2018-10-29,0,0,0,0,0,86,9,0,0,18,...,0,122,122,122,0,122,122,122,0,7 days
2018-11-05,0,0,0,0,0,26,16,5,0,9,...,0,69,69,69,0,69,69,69,0,7 days
2018-11-13,0,0,0,0,0,64,131,1,0,13,...,0,213,213,213,0,213,213,213,0,8 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-13,1,0,5,0,0,0,18,0,2,9,...,38,38,38,38,38,38,38,38,38,7 days
2020-01-20,3,1,10,0,0,0,19,0,0,13,...,50,50,50,50,50,50,50,50,50,7 days
2020-01-27,2,0,11,0,0,0,11,0,0,14,...,41,41,41,41,41,41,41,41,41,7 days
2020-02-03,2,1,21,0,0,0,15,0,0,18,...,63,63,63,63,63,63,63,63,63,7 days


In [10]:
# Express the length of each period as a whole number of days.
# After .diff() the 'Date' column holds Timedeltas; dividing by a one-day
# Timedelta turns them into plain numbers that other columns can be divided by.
oneday = pd.Timedelta(days=1)
days_in_period = delta_fc['Date'] / oneday
delta_fc['Days in Period'] = days_in_period.astype('int')
delta_fc['Days in Period']

Date
2018-10-15    6
2018-10-22    7
2018-10-29    7
2018-11-05    7
2018-11-13    8
             ..
2020-01-13    7
2020-01-20    7
2020-01-27    7
2020-02-03    7
2020-02-10    7
Name: Days in Period, Length: 62, dtype: int64

In [11]:
# Turn the per-period amounts into per-day averages: every metric column is
# divided, row by row, by the length of that row's period in days.
# The result is stored in a new df, avg_daily_failure_categories.
keep_cols = ['Failed Batches', 'Tested Batches', 'Total Failed']
period_amounts = delta_fc[keep_cols]
avg_daily_failure_categories = period_amounts.div(delta_fc['Days in Period'], axis='index')
# Carry the period length along in the new df
avg_daily_failure_categories['Days in Period'] = delta_fc['Days in Period']
avg_daily_failure_categories

Unnamed: 0_level_0,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,...,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Days in Period
Failure Reason,Cannabinoids,Foreign Material,Heavy Metals,Homogeneity,Injurious to Human Health,Label Claims,Microbial Impurities,Moisture,Mycotoxins,Pesticides,...,Injurious to Human Health,Label Claims,Microbial Impurities,Moisture,Mycotoxins,Pesticides,Residual Solvents,Total,Water Activity,Unnamed: 21_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-10-15,0.000000,0.000000,0.000000,0.0,0.0,12.000000,2.666667,0.833333,0.000000,5.000000,...,0.000000,21.666667,21.666667,21.666667,0.000000,21.666667,21.666667,21.666667,0.000000,6
2018-10-22,0.000000,0.000000,0.000000,0.0,0.0,6.285714,1.000000,0.000000,0.000000,1.857143,...,0.000000,10.285714,10.285714,10.285714,0.000000,10.285714,10.285714,10.285714,0.000000,7
2018-10-29,0.000000,0.000000,0.000000,0.0,0.0,12.285714,1.285714,0.000000,0.000000,2.571429,...,0.000000,17.428571,17.428571,17.428571,0.000000,17.428571,17.428571,17.428571,0.000000,7
2018-11-05,0.000000,0.000000,0.000000,0.0,0.0,3.714286,2.285714,0.714286,0.000000,1.285714,...,0.000000,9.857143,9.857143,9.857143,0.000000,9.857143,9.857143,9.857143,0.000000,7
2018-11-13,0.000000,0.000000,0.000000,0.0,0.0,8.000000,16.375000,0.125000,0.000000,1.625000,...,0.000000,26.625000,26.625000,26.625000,0.000000,26.625000,26.625000,26.625000,0.000000,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-13,0.142857,0.000000,0.714286,0.0,0.0,0.000000,2.571429,0.000000,0.285714,1.285714,...,5.428571,5.428571,5.428571,5.428571,5.428571,5.428571,5.428571,5.428571,5.428571,7
2020-01-20,0.428571,0.142857,1.428571,0.0,0.0,0.000000,2.714286,0.000000,0.000000,1.857143,...,7.142857,7.142857,7.142857,7.142857,7.142857,7.142857,7.142857,7.142857,7.142857,7
2020-01-27,0.285714,0.000000,1.571429,0.0,0.0,0.000000,1.571429,0.000000,0.000000,2.000000,...,5.857143,5.857143,5.857143,5.857143,5.857143,5.857143,5.857143,5.857143,5.857143,7
2020-02-03,0.285714,0.142857,3.000000,0.0,0.0,0.000000,2.142857,0.000000,0.000000,2.571429,...,9.000000,9.000000,9.000000,9.000000,9.000000,9.000000,9.000000,9.000000,9.000000,7


In [12]:
# keep_cols = ['Tested Batches', 'Total Failed', 'Days in Period']
# avg_daily_failure_categories[keep_cols] 
# # Remove 'Failure Reason' index, which is not applicable for this data

In [13]:
# Leave out the trailing 'Days in Period' column, then move the
# 'Failure Reason' column level into the row index so that every
# (date, reason) pair gets its own row
metric_columns = avg_daily_failure_categories.columns[:-1]
fc_perday = avg_daily_failure_categories[metric_columns].stack()
fc_perday

Unnamed: 0_level_0,Unnamed: 1_level_0,Failed Batches,Tested Batches,Total Failed
Date,Failure Reason,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-10-15,Cannabinoids,0.000000,0.000000,0.000000
2018-10-15,Foreign Material,0.000000,136.666667,21.666667
2018-10-15,Heavy Metals,0.000000,0.000000,0.000000
2018-10-15,Homogeneity,0.000000,136.666667,21.666667
2018-10-15,Injurious to Human Health,0.000000,0.000000,0.000000
...,...,...,...,...
2020-02-10,Mycotoxins,0.000000,207.000000,6.714286
2020-02-10,Pesticides,1.571429,207.000000,6.714286
2020-02-10,Residual Solvents,0.000000,207.000000,6.714286
2020-02-10,Total,6.714286,207.000000,6.714286


In [14]:
# Inspect the last row of the stacked frame
fc_perday.iloc[-1]

Failed Batches      0.285714
Tested Batches    207.000000
Total Failed        6.714286
Name: (2020-02-10, Water Activity), dtype: float64

In [15]:
# Level 0 of the stacked index holds the dates
fc_perday.index.get_level_values(0)

Index(['2018-10-15', '2018-10-15', '2018-10-15', '2018-10-15', '2018-10-15',
       '2018-10-15', '2018-10-15', '2018-10-15', '2018-10-15', '2018-10-15',
       ...
       '2020-02-10', '2020-02-10', '2020-02-10', '2020-02-10', '2020-02-10',
       '2020-02-10', '2020-02-10', '2020-02-10', '2020-02-10', '2020-02-10'],
      dtype='object', name='Date', length=806)

In [16]:
# Level 1 of the stacked index holds the failure reasons
fc_perday.index.get_level_values(1)

Index(['Cannabinoids', 'Foreign Material', 'Heavy Metals', 'Homogeneity',
       'Injurious to Human Health', 'Label Claims', 'Microbial Impurities',
       'Moisture', 'Mycotoxins', 'Pesticides',
       ...
       'Homogeneity', 'Injurious to Human Health', 'Label Claims',
       'Microbial Impurities', 'Moisture', 'Mycotoxins', 'Pesticides',
       'Residual Solvents', 'Total', 'Water Activity'],
      dtype='object', name='Failure Reason', length=806)

In [17]:
# Copy both index levels into ordinary columns
# (level 0 -> 'Date', level 1 -> 'Failure Reason')
for level_number, column_name in enumerate(['Date', 'Failure Reason']):
    fc_perday[column_name] = fc_perday.index.get_level_values(level_number)
fc_perday

Unnamed: 0_level_0,Unnamed: 1_level_0,Failed Batches,Tested Batches,Total Failed,Date,Failure Reason
Date,Failure Reason,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-10-15,Cannabinoids,0.000000,0.000000,0.000000,2018-10-15,Cannabinoids
2018-10-15,Foreign Material,0.000000,136.666667,21.666667,2018-10-15,Foreign Material
2018-10-15,Heavy Metals,0.000000,0.000000,0.000000,2018-10-15,Heavy Metals
2018-10-15,Homogeneity,0.000000,136.666667,21.666667,2018-10-15,Homogeneity
2018-10-15,Injurious to Human Health,0.000000,0.000000,0.000000,2018-10-15,Injurious to Human Health
...,...,...,...,...,...,...
2020-02-10,Mycotoxins,0.000000,207.000000,6.714286,2020-02-10,Mycotoxins
2020-02-10,Pesticides,1.571429,207.000000,6.714286,2020-02-10,Pesticides
2020-02-10,Residual Solvents,0.000000,207.000000,6.714286,2020-02-10,Residual Solvents
2020-02-10,Total,6.714286,207.000000,6.714286,2020-02-10,Total


In [18]:
# Drop the rows whose 'Failure Reason' is 'Total'
# ('Total' should not show up as a 'Failure Reason' in Tableau)
is_real_reason = fc_perday['Failure Reason'].ne('Total')
fc_perday = fc_perday[is_real_reason].copy()

In [19]:
# Keep only the rows whose 'Total Failed' is non-zero;
# the zero rows are an artifact from pivot -> unstack
has_total_failed = fc_perday['Total Failed'].ne(0)
fc_perday = fc_perday[has_total_failed].copy()

In [20]:
# Fail rate per (date, reason) row: failed batches relative to tested batches
fc_perday['Fail Rate'] = fc_perday['Failed Batches'].div(fc_perday['Tested Batches'])
fc_perday

Unnamed: 0_level_0,Unnamed: 1_level_0,Failed Batches,Tested Batches,Total Failed,Date,Failure Reason,Fail Rate
Date,Failure Reason,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-10-15,Foreign Material,0.000000,136.666667,21.666667,2018-10-15,Foreign Material,0.000000
2018-10-15,Homogeneity,0.000000,136.666667,21.666667,2018-10-15,Homogeneity,0.000000
2018-10-15,Label Claims,12.000000,136.666667,21.666667,2018-10-15,Label Claims,0.087805
2018-10-15,Microbial Impurities,2.666667,136.666667,21.666667,2018-10-15,Microbial Impurities,0.019512
2018-10-15,Moisture,0.833333,136.666667,21.666667,2018-10-15,Moisture,0.006098
...,...,...,...,...,...,...,...
2020-02-10,Moisture,0.000000,207.000000,6.714286,2020-02-10,Moisture,0.000000
2020-02-10,Mycotoxins,0.000000,207.000000,6.714286,2020-02-10,Mycotoxins,0.000000
2020-02-10,Pesticides,1.571429,207.000000,6.714286,2020-02-10,Pesticides,0.007591
2020-02-10,Residual Solvents,0.000000,207.000000,6.714286,2020-02-10,Residual Solvents,0.000000


In [21]:
# Write the per-day table to disk without the (Date, Failure Reason) index;
# both values are already present as regular columns
print('Saving fc_perday.csv...')
fc_perday.to_csv('../../etl_data/fc_perday.csv', index=False)
print('Done saving fc_perday.')

Saving fc_perday.csv...
Done saving fc_perday.


In [22]:
# NOTE(review): "Save to disk" looks already covered by the earlier cell that
# writes fc_perday.csv - confirm and prune this list.
'''
To do:
- Save to disk
- Create Tableau dashboard
'''

'\nTo do:\n- Save to disk\n- Create Tableau dashboard\n'

In [23]:
# NOTE(review): the "save to disk" item looks already covered by the earlier
# cell that writes fc_perday.csv - confirm which of these items are still open.
'''
Lots to do: 
- feature engineering
- structure properly for Tableau
- save to disk
'''

'\nLots to do: \n- feature engineering\n- structure properly for Tableau\n- save to disk\n'