In [1]:
'''
Transform the fail_categories.csv data
Calculate the average daily values.
- Diff the (cumulative) values provided by the BCC
- Divide by number of days in the period

'''

'\nTransform the fail_categories.csv data\nCalculate the average daily values.\n- Diff the (cumulative) values provided by the BCC\n- Divide by number of days in the period\n\n'

In [2]:
import pandas as pd

In [3]:
# Load the failure-category table that bcc_etl.py writes out
fc = pd.read_csv(filepath_or_buffer='../../etl_data/fail_categories.csv')

In [4]:
# Restrict the frame to the columns this notebook works with;
# anything else that came along in the CSV is discarded here.
keep_cols = ['Date', 'Total Failed', 'Tested Batches', 'Failed Batches', 'Failure Reason']
fc = fc.loc[:, keep_cols]
fc

Unnamed: 0,Date,Total Failed,Tested Batches,Failed Batches,Failure Reason
0,2018-11-13,3325,20797,2043,Label Claims
1,2018-11-13,3325,20797,685,Pesticides
2,2018-11-13,3325,20797,280,Microbial Impurities
3,2018-11-13,3325,20797,171,Residual Solvents
4,2018-11-13,3325,20797,35,Homogeneity
5,2018-11-13,3325,20797,10,Foreign Material
6,2018-11-13,3325,20797,51,Moisture
7,2018-11-13,3325,20797,3375,Total
8,2019-04-29,4798,46503,2379,Label Claims
9,2019-04-29,4798,46503,1237,Pesticides


In [5]:
# Reshape to one row per date, with a (measure, 'Failure Reason') column for every
# combination. Reasons with no row on a given date come out as NaN; they are
# treated as 0, after which everything is cast to int and ordered by date.
fc_pivot = (
    fc.pivot_table(index='Date', columns='Failure Reason')
    .fillna(value=0)
    .astype('int')
    .sort_index()
)

# .diff() only operates on columns, so mirror the index into a datetime 'Date'
# column; its row-to-row difference later gives the length of each period.
fc_pivot['Date'] = pd.to_datetime(fc_pivot.index)

In [6]:
# # (Disabled) 'Tested Batches' is a per-date total; it does not vary by 'Failure Reason'
# # Collapse to a single column
# tested_batches = fc_pivot['Tested Batches'].max(axis=1)
# del fc_pivot['Tested Batches']
# fc_pivot['Tested Batches'] = tested_batches

In [7]:
# # (Disabled) 'Total Failed' is a per-date total; it does not vary by 'Failure Reason'
# # Collapse to a single column
# total_failed = fc_pivot['Total Failed'].max(axis=1)
# del fc_pivot['Total Failed']
# fc_pivot['Total Failed'] = total_failed

In [8]:
# Show the pivoted frame of cumulative values (one row per date,
# one column per measure / 'Failure Reason' pair, plus the 'Date' column)
fc_pivot

Unnamed: 0_level_0,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,...,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Date
Failure Reason,Cannabinoids,Foreign Material,Heavy Metals,Homogeneity,Injurious to Human Health,Label Claims,Microbial Impurities,Moisture,Mycotoxins,Pesticides,...,Injurious to Human Health,Label Claims,Microbial Impurities,Moisture,Mycotoxins,Pesticides,Residual Solvents,Total,Water Activity,Unnamed: 21_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-10-09,0,10,0,35,0,1751,201,40,0,602,...,0,2769,2769,2769,0,2769,2769,2769,0,2018-10-09
2018-10-15,0,10,0,35,0,1823,217,45,0,632,...,0,2899,2899,2899,0,2899,2899,2899,0,2018-10-15
2018-10-22,0,10,0,35,0,1867,224,45,0,645,...,0,2971,2971,2971,0,2971,2971,2971,0,2018-10-22
2018-10-29,0,10,0,35,0,1953,233,45,0,663,...,0,3093,3093,3093,0,3093,3093,3093,0,2018-10-29
2018-11-05,0,10,0,35,0,1979,249,50,0,672,...,0,3162,3162,3162,0,3162,3162,3162,0,2018-11-05
2018-11-13,0,10,0,35,0,2043,280,51,0,685,...,0,3325,3325,3325,0,3325,3325,3325,0,2018-11-13
2018-11-19,0,11,0,35,0,2097,291,51,0,697,...,0,3362,3362,3362,0,3362,3362,3362,0,2018-11-19
2018-11-26,0,11,0,35,0,2118,304,51,0,714,...,0,3419,3419,3419,0,3419,3419,3419,0,2018-11-26
2018-12-03,0,12,0,35,0,2135,317,51,0,739,...,0,3483,3483,3483,0,3483,3483,3483,0,2018-12-03
2018-12-10,0,12,0,35,0,2173,322,53,0,762,...,0,3558,3558,3558,0,3558,3558,3558,0,2018-12-10


In [9]:
# Turn the cumulative values into per-period changes: each row minus the previous row.
# The datetime 'Date' column becomes the length of the period (a Timedelta).
delta_fc = fc_pivot.diff()
# Drop first row, which is all NaNs. Take a real copy so the column assignments
# below write to this frame and do not trigger SettingWithCopyWarning.
delta_fc = delta_fc.iloc[1:].copy()

# 'Tested Batches' and 'Total Failed' are per-date totals that the pivot repeats
# under every 'Failure Reason', with 0 filled in wherever a reason had no row for
# a date. A plain diff therefore reports the *entire cumulative total* as the
# change for the first period in which a reason shows up (total - 0), which
# inflates these two measures and distorts any rate derived from them.
# Rebuild them from the per-date total instead (row-wise max across reasons):
# reasons reported on the date get the true period change, the others stay 0.
for col in ['Tested Batches', 'Total Failed']:
    totals = fc_pivot[col]
    period_change = totals.max(axis=1).diff().iloc[1:]
    reported = totals.iloc[1:].ne(0)
    delta_fc[col] = reported.astype('int').mul(period_change, axis=0)

# Convert to int's where appropriate (diff() produces floats because of the NaN row)
keep_cols = ['Failed Batches', 'Tested Batches', 'Total Failed']
for col in keep_cols:
    delta_fc[col] = delta_fc[col].astype('int')

delta_fc

Unnamed: 0_level_0,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,...,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Date
Failure Reason,Cannabinoids,Foreign Material,Heavy Metals,Homogeneity,Injurious to Human Health,Label Claims,Microbial Impurities,Moisture,Mycotoxins,Pesticides,...,Injurious to Human Health,Label Claims,Microbial Impurities,Moisture,Mycotoxins,Pesticides,Residual Solvents,Total,Water Activity,Unnamed: 21_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-10-15,0,0,0,0,0,72,16,5,0,30,...,0,130,130,130,0,130,130,130,0,6 days
2018-10-22,0,0,0,0,0,44,7,0,0,13,...,0,72,72,72,0,72,72,72,0,7 days
2018-10-29,0,0,0,0,0,86,9,0,0,18,...,0,122,122,122,0,122,122,122,0,7 days
2018-11-05,0,0,0,0,0,26,16,5,0,9,...,0,69,69,69,0,69,69,69,0,7 days
2018-11-13,0,0,0,0,0,64,31,1,0,13,...,0,163,163,163,0,163,163,163,0,8 days
2018-11-19,0,1,0,0,0,54,11,0,0,12,...,0,37,37,37,0,37,37,37,0,6 days
2018-11-26,0,0,0,0,0,21,13,0,0,17,...,0,57,57,57,0,57,57,57,0,7 days
2018-12-03,0,1,0,0,0,17,13,0,0,25,...,0,64,64,64,0,64,64,64,0,7 days
2018-12-10,0,0,0,0,0,38,5,2,0,23,...,0,75,75,75,0,75,75,75,0,7 days
2018-12-17,0,0,1,1,0,62,7,3,0,23,...,0,108,108,108,0,108,108,108,0,7 days


In [10]:
# 'Date' holds the period length as a Timedelta after the diff. Dividing by a
# one-day Timedelta turns it into a plain number of days, which is then stored
# as an int so the other columns can be divided by it.
oneday = pd.Timedelta(days=1)
delta_fc['Days in Period'] = (delta_fc['Date'] / oneday).astype('int')
delta_fc['Days in Period']

Date
2018-10-15     6
2018-10-22     7
2018-10-29     7
2018-11-05     7
2018-11-13     8
2018-11-19     6
2018-11-26     7
2018-12-03     7
2018-12-10     7
2018-12-17     7
2018-12-26     9
2019-01-08    13
2019-01-22    14
2019-02-04    13
2019-02-18    14
2019-03-04    14
2019-03-18    14
2019-04-02    15
2019-04-08     6
2019-04-15     7
2019-04-22     7
2019-04-29     7
2019-05-06     7
2019-05-13     7
2019-05-20     7
2019-05-27     7
2019-06-03     7
2019-06-10     7
2019-06-17     7
2019-06-24     7
              ..
2019-09-23     7
2019-09-30     7
2019-10-07     7
2019-10-14     7
2019-10-21     7
2019-10-28     7
2019-11-04     7
2019-11-11     7
2019-11-25    14
2019-12-02     7
2019-12-09     7
2019-12-16     7
2019-12-23     7
2019-12-30     7
2020-01-06     7
2020-01-13     7
2020-01-20     7
2020-01-27     7
2020-02-03     7
2020-02-10     7
2020-02-17     7
2020-02-24     7
2020-03-02     7
2020-03-09     7
2020-03-16     7
2020-03-23     7
2020-03-30     7
2020-04-0

In [11]:
# Average per day: divide every per-period change by the length of its period.
# The result is a new frame; the period length is carried along for reference.
keep_cols = ['Failed Batches', 'Tested Batches', 'Total Failed']
days_in_period = delta_fc['Days in Period']
avg_daily_failure_categories = delta_fc[keep_cols].div(days_in_period, axis='index')
avg_daily_failure_categories['Days in Period'] = days_in_period
avg_daily_failure_categories

Unnamed: 0_level_0,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,...,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Total Failed,Days in Period
Failure Reason,Cannabinoids,Foreign Material,Heavy Metals,Homogeneity,Injurious to Human Health,Label Claims,Microbial Impurities,Moisture,Mycotoxins,Pesticides,...,Injurious to Human Health,Label Claims,Microbial Impurities,Moisture,Mycotoxins,Pesticides,Residual Solvents,Total,Water Activity,Unnamed: 21_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-10-15,0.000000,0.000000,0.000000,0.000000,0.000000,12.000000,2.666667,0.833333,0.000000,5.000000,...,0.000000,21.666667,21.666667,21.666667,0.000000,21.666667,21.666667,21.666667,0.000000,6
2018-10-22,0.000000,0.000000,0.000000,0.000000,0.000000,6.285714,1.000000,0.000000,0.000000,1.857143,...,0.000000,10.285714,10.285714,10.285714,0.000000,10.285714,10.285714,10.285714,0.000000,7
2018-10-29,0.000000,0.000000,0.000000,0.000000,0.000000,12.285714,1.285714,0.000000,0.000000,2.571429,...,0.000000,17.428571,17.428571,17.428571,0.000000,17.428571,17.428571,17.428571,0.000000,7
2018-11-05,0.000000,0.000000,0.000000,0.000000,0.000000,3.714286,2.285714,0.714286,0.000000,1.285714,...,0.000000,9.857143,9.857143,9.857143,0.000000,9.857143,9.857143,9.857143,0.000000,7
2018-11-13,0.000000,0.000000,0.000000,0.000000,0.000000,8.000000,3.875000,0.125000,0.000000,1.625000,...,0.000000,20.375000,20.375000,20.375000,0.000000,20.375000,20.375000,20.375000,0.000000,8
2018-11-19,0.000000,0.166667,0.000000,0.000000,0.000000,9.000000,1.833333,0.000000,0.000000,2.000000,...,0.000000,6.166667,6.166667,6.166667,0.000000,6.166667,6.166667,6.166667,0.000000,6
2018-11-26,0.000000,0.000000,0.000000,0.000000,0.000000,3.000000,1.857143,0.000000,0.000000,2.428571,...,0.000000,8.142857,8.142857,8.142857,0.000000,8.142857,8.142857,8.142857,0.000000,7
2018-12-03,0.000000,0.142857,0.000000,0.000000,0.000000,2.428571,1.857143,0.000000,0.000000,3.571429,...,0.000000,9.142857,9.142857,9.142857,0.000000,9.142857,9.142857,9.142857,0.000000,7
2018-12-10,0.000000,0.000000,0.000000,0.000000,0.000000,5.428571,0.714286,0.285714,0.000000,3.285714,...,0.000000,10.714286,10.714286,10.714286,0.000000,10.714286,10.714286,10.714286,0.000000,7
2018-12-17,0.000000,0.000000,0.142857,0.142857,0.000000,8.857143,1.000000,0.428571,0.000000,3.285714,...,0.000000,15.428571,15.428571,15.428571,0.000000,15.428571,15.428571,15.428571,0.000000,7


In [12]:
# keep_cols = ['Tested Batches', 'Total Failed', 'Days in Period']
# avg_daily_failure_categories[keep_cols] 
# # Remove 'Failure Reason' index, which is not applicable for this data

In [13]:
# Reshape wide -> long: move the 'Failure Reason' column level into the index so
# each row is one (Date, Failure Reason) pair with the three measures as columns.
# The measures are selected by name rather than by position, so the result does
# not depend on 'Days in Period' happening to be the last column.
value_cols = ['Failed Batches', 'Tested Batches', 'Total Failed']
fc_perday = avg_daily_failure_categories[value_cols].stack()
fc_perday

Unnamed: 0_level_0,Unnamed: 1_level_0,Failed Batches,Tested Batches,Total Failed
Date,Failure Reason,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-10-15,Cannabinoids,0.000000,0.000000,0.000000
2018-10-15,Foreign Material,0.000000,136.666667,21.666667
2018-10-15,Heavy Metals,0.000000,0.000000,0.000000
2018-10-15,Homogeneity,0.000000,136.666667,21.666667
2018-10-15,Injurious to Human Health,0.000000,0.000000,0.000000
2018-10-15,Label Claims,12.000000,136.666667,21.666667
2018-10-15,Microbial Impurities,2.666667,136.666667,21.666667
2018-10-15,Moisture,0.833333,136.666667,21.666667
2018-10-15,Mycotoxins,0.000000,0.000000,0.000000
2018-10-15,Pesticides,5.000000,136.666667,21.666667


In [14]:
# Spot check: look at the last (Date, Failure Reason) row of the stacked frame
fc_perday.iloc[-1]

Failed Batches      0.142857
Tested Batches    235.285714
Total Failed        4.571429
Name: (2020-04-20, Water Activity), dtype: float64

In [15]:
# Inspect index level 0 ('Date'), which is copied into a regular column below
fc_perday.index.get_level_values(0)

Index(['2018-10-15', '2018-10-15', '2018-10-15', '2018-10-15', '2018-10-15',
       '2018-10-15', '2018-10-15', '2018-10-15', '2018-10-15', '2018-10-15',
       ...
       '2020-04-20', '2020-04-20', '2020-04-20', '2020-04-20', '2020-04-20',
       '2020-04-20', '2020-04-20', '2020-04-20', '2020-04-20', '2020-04-20'],
      dtype='object', name='Date', length=936)

In [16]:
# Inspect index level 1 ('Failure Reason'), which is copied into a regular column below
fc_perday.index.get_level_values(1)

Index(['Cannabinoids', 'Foreign Material', 'Heavy Metals', 'Homogeneity',
       'Injurious to Human Health', 'Label Claims', 'Microbial Impurities',
       'Moisture', 'Mycotoxins', 'Pesticides',
       ...
       'Homogeneity', 'Injurious to Human Health', 'Label Claims',
       'Microbial Impurities', 'Moisture', 'Mycotoxins', 'Pesticides',
       'Residual Solvents', 'Total', 'Water Activity'],
      dtype='object', name='Failure Reason', length=936)

In [17]:
# Duplicate both index levels as ordinary columns so they are still present
# when the frame is later written out without its index.
fc_perday = fc_perday.assign(**{
    'Date': fc_perday.index.get_level_values(0),
    'Failure Reason': fc_perday.index.get_level_values(1),
})
fc_perday

Unnamed: 0_level_0,Unnamed: 1_level_0,Failed Batches,Tested Batches,Total Failed,Date,Failure Reason
Date,Failure Reason,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-10-15,Cannabinoids,0.000000,0.000000,0.000000,2018-10-15,Cannabinoids
2018-10-15,Foreign Material,0.000000,136.666667,21.666667,2018-10-15,Foreign Material
2018-10-15,Heavy Metals,0.000000,0.000000,0.000000,2018-10-15,Heavy Metals
2018-10-15,Homogeneity,0.000000,136.666667,21.666667,2018-10-15,Homogeneity
2018-10-15,Injurious to Human Health,0.000000,0.000000,0.000000,2018-10-15,Injurious to Human Health
2018-10-15,Label Claims,12.000000,136.666667,21.666667,2018-10-15,Label Claims
2018-10-15,Microbial Impurities,2.666667,136.666667,21.666667,2018-10-15,Microbial Impurities
2018-10-15,Moisture,0.833333,136.666667,21.666667,2018-10-15,Moisture
2018-10-15,Mycotoxins,0.000000,0.000000,0.000000,2018-10-15,Mycotoxins
2018-10-15,Pesticides,5.000000,136.666667,21.666667,2018-10-15,Pesticides


In [18]:
# 'Total' is an aggregate, not an actual failure reason; it should not show up
# as a 'Failure Reason' value in Tableau, so those rows are dropped.
is_total = fc_perday['Failure Reason'] == 'Total'
fc_perday = fc_perday.loc[~is_total].copy()

In [19]:
# Keep only rows whose 'Total Failed' value is non-zero. Zero rows are the
# placeholders created when the pivot zero-filled reasons that had no data
# for a date and the frame was then stacked back into long form.
has_total_failed = fc_perday['Total Failed'].ne(0)
fc_perday = fc_perday.loc[has_total_failed].copy()

In [20]:
# Feature Engineering: 'Fail Rate'
# Failed batches for the reason as a percentage of the batches tested in the same period.
fc_perday['Fail Rate'] = fc_perday['Failed Batches'].mul(100).div(fc_perday['Tested Batches'])
fc_perday

Unnamed: 0_level_0,Unnamed: 1_level_0,Failed Batches,Tested Batches,Total Failed,Date,Failure Reason,Fail Rate
Date,Failure Reason,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-10-15,Foreign Material,0.000000,136.666667,21.666667,2018-10-15,Foreign Material,0.000000
2018-10-15,Homogeneity,0.000000,136.666667,21.666667,2018-10-15,Homogeneity,0.000000
2018-10-15,Label Claims,12.000000,136.666667,21.666667,2018-10-15,Label Claims,8.780488
2018-10-15,Microbial Impurities,2.666667,136.666667,21.666667,2018-10-15,Microbial Impurities,1.951220
2018-10-15,Moisture,0.833333,136.666667,21.666667,2018-10-15,Moisture,0.609756
2018-10-15,Pesticides,5.000000,136.666667,21.666667,2018-10-15,Pesticides,3.658537
2018-10-15,Residual Solvents,1.166667,136.666667,21.666667,2018-10-15,Residual Solvents,0.853659
2018-10-22,Foreign Material,0.000000,137.714286,10.285714,2018-10-22,Foreign Material,0.000000
2018-10-22,Homogeneity,0.000000,137.714286,10.285714,2018-10-22,Homogeneity,0.000000
2018-10-22,Label Claims,6.285714,137.714286,10.285714,2018-10-22,Label Claims,4.564315


In [21]:
# Write the long-format per-day table next to the other ETL outputs.
# The index is left out: 'Date' and 'Failure Reason' already exist as columns.
print('Saving fc_perday.csv...')
fc_perday.to_csv('../../etl_data/fc_perday.csv', index=False)
print('Done saving fc_perday.')

Saving fc_perday.csv...
Done saving fc_perday.
