In [1]:
import xlrd
import pandas as pd

In [2]:
def dateFromFilename(file='20200120_report.xlsx'):
    date = file.replace('_report.xlsx', '').replace('_update.xlsx', '')
    return date

In [3]:
def openBCCfile(file = '20200120_report.xlsx'):
    date = dateFromFilename(file)
    data_dir = '~/projects/bcc_weekly_reports/data/'
    loc = (data_dir + file)

    print('Opening {}'.format(loc))
    wb = xlrd.open_workbook(loc)
    sheet = wb.sheet_by_name('COA Stats')
#     if '_update' in file:
#         sheet = wb.sheet_by_index(0)
#     else:
#         sheet = wb.sheet_by_index(2)
    print('Rows: {}, Cols: {}'.format(sheet.ncols, sheet.nrows))
    
    return wb, sheet

wb, sheet = openBCCfile(file = '20200120_report.xlsx')

Opening ~/projects/bcc_weekly_reports/data/20200120_report.xlsx
Rows: 4, Cols: 24


In [4]:
def importCOAdata(table='totals', file = '20200120_report.xlsx'):
    '''
    Return a df of one of the 3 tables found in the COA weekly report
    '''
    # Which table to import
    if table == 'totals':
        rows = range(1,3)
    elif table == 'product_category':
        rows = range(4,9)
    elif table == 'fail_category':
        rows = range(9,sheet.nrows)

    # Create dataframe from table
    data = []
    for row in rows:
        data.append(sheet.row_values(row))
    data_df = pd.DataFrame(data=data[1:], columns=data[0], dtype='int')
    
    # Create date column and make dtype=datetime
    date = dateFromFilename(file)
    data_df['Date'] = pd.to_datetime(date)
    
    return data_df


In [5]:
totals = importCOAdata(table='totals')
totals

Unnamed: 0,Certificates of Analysis Received,Tested Batches,Unnamed: 3,Failed Batches,Date
0,90793,90793,,6124,2020-01-20


In [6]:
product_categories = importCOAdata(table='product_category')
product_categories

Unnamed: 0,Tested Batches By Category,Tested Batches,Unnamed: 3,Failed Batches By Category,Date
0,Flower:,44108,,2401,2020-01-20
1,"Inhalable\n(cartridges, waxes, etc.):",31715,,2170,2020-01-20
2,"Other\n(edibles, tinctures, topicals, etc.):",14970,,1553,2020-01-20
3,Total:,90793,,6124,2020-01-20


In [7]:
fail_categories = importCOAdata(table='fail_category')
fail_categories

Unnamed: 0,*Reasons For Failure,Unnamed: 2,Failed Batches By Category,Unnamed: 4,Date
0,Label Claims:,,2379.0,,2020-01-20
1,Pesticides:,,1790.0,,2020-01-20
2,Microbial Impurities:,,929.0,,2020-01-20
3,Residual Solvents:,,361.0,,2020-01-20
4,Homogeneity:,,36.0,,2020-01-20
5,Foreign Material:,,26.0,,2020-01-20
6,Moisture:,,71.0,,2020-01-20
7,Heavy Metals:,,541.0,,2020-01-20
8,Water Activity:,,51.0,,2020-01-20
9,Cannabinoids:,,77.0,,2020-01-20


In [8]:
from os import listdir
data_dir2 = '../../data/'
files = [f for f in listdir(data_dir2) if '.xlsx' in f]

In [9]:
dates=[]
for file in files: 
    dates.append(file.replace('_report.xlsx', '').replace('_update.xlsx',''))
dates = sorted(dates)

In [10]:
dates[:5]

['20181009', '20181015', '20181022', '20181029', '20181105']

In [11]:
totals = pd.DataFrame()
product_categories = pd.DataFrame()
fail_categories = pd.DataFrame()

for file in files:
    # Read file, append rows to dataframes, save
    
    # Read file
    wb, sheet = openBCCfile(file=file)
    # Read tables
    totals = totals.append(importCOAdata(table='totals', file=file))
    product_categories = product_categories.append(importCOAdata(table='product_category', file=file))
    fail_categories = fail_categories.append(importCOAdata(table='fail_category', file=file))

Opening ~/projects/bcc_weekly_reports/data/20181113_report.xlsx
Rows: 4, Cols: 19
Opening ~/projects/bcc_weekly_reports/data/20190429_report.xlsx
Rows: 4, Cols: 24
Opening ~/projects/bcc_weekly_reports/data/20181203_report.xlsx
Rows: 4, Cols: 19
Opening ~/projects/bcc_weekly_reports/data/20190610_report.xlsx
Rows: 4, Cols: 24
Opening ~/projects/bcc_weekly_reports/data/20190527_report.xlsx
Rows: 4, Cols: 24
Opening ~/projects/bcc_weekly_reports/data/20191209_report.xlsx
Rows: 4, Cols: 24
Opening ~/projects/bcc_weekly_reports/data/20191007_report.xlsx
Rows: 4, Cols: 24
Opening ~/projects/bcc_weekly_reports/data/20190729_report.xlsx
Rows: 4, Cols: 24
Opening ~/projects/bcc_weekly_reports/data/20190923_report.xlsx
Rows: 4, Cols: 24
Opening ~/projects/bcc_weekly_reports/data/20190603_report.xlsx
Rows: 4, Cols: 24
Opening ~/projects/bcc_weekly_reports/data/20181029_report.xlsx
Rows: 4, Cols: 19
Opening ~/projects/bcc_weekly_reports/data/20181210_report.xlsx
Rows: 4, Cols: 19
Opening ~/projec

In [12]:
# Remove empty columns
del totals['']
del product_categories['']
del fail_categories['']

In [13]:
totals.to_csv(path_or_buf='../../etl_data/totals.csv', index=False)
product_categories.to_csv(path_or_buf='../../etl_data/product_categories.csv', index=False)
fail_categories.to_csv(path_or_buf='../../etl_data/fail_categories.csv', index=False)

In [14]:
# Clean up totals df
del totals['Certificates of Analysis Received'] # This column is identical to 'Tested Batches'. Don't need both
totals['Percent Failed'] = totals['Failed Batches'] / totals['Tested Batches']

In [15]:
totals.head()

Unnamed: 0,Tested Batches,Failed Batches,Date,Percent Failed
0,20797,3164,2018-11-13,0.152137
0,46503,4679,2019-04-29,0.100617
0,23864,3373,2018-12-03,0.141343
0,52471,4906,2019-06-10,0.093499
0,50811,4831,2019-05-27,0.095078


In [16]:
# Clean up product_categories df

# Rename Columns
product_categories['Category'] = product_categories['Tested Batches By Category']
del product_categories['Tested Batches By Category'] 
product_categories['Failed Batches'] = product_categories['Failed Batches By Category']
del product_categories['Failed Batches By Category']

# Some values has ',' in them and are thus 'objects'
# Convert these to ints
def sanitize(data):
    valid = '1234567890.' #valid characters for a float
    try: 
        ''.join(filter(lambda char: char in valid, data))
    except: 
        return data
    else: 
        return int(''.join(filter(lambda char: char in valid, data)))
product_categories['Failed Batches'] = product_categories['Failed Batches'].apply(sanitize)
product_categories['Failed Batches'] = product_categories['Failed Batches'].astype('int')

# Remove ':' from Category strings
def colonRemove(data):
    try: 
        data.replace(':','')
    except: 
        return data
    else: 
        return data.replace(':','')
product_categories['Category'] = product_categories['Category'].apply(colonRemove)


# Calculated columns
product_categories['Percent Failed'] = product_categories['Failed Batches'] / product_categories['Tested Batches']


product_categories.head()

Unnamed: 0,Tested Batches,Date,Category,Failed Batches,Percent Failed
0,10415,2018-11-13,Flower,1029.0,0.0988
1,6464,2018-11-13,"Inhalable\n(cartridges, waxes, etc.)",1054.0,0.163057
2,3918,2018-11-13,"Other\n(edibles, tinctures, topicals, etc.)",1081.0,0.275906
3,20797,2018-11-13,Total,3164.0,0.152137
0,23347,2019-04-29,Flower,1627.0,0.069688


In [18]:
product_categories.dtypes

Tested Batches             int64
Date              datetime64[ns]
Category                  object
Failed Batches           float64
Percent Failed           float64
dtype: object

In [28]:
# new_cols = ['Flower']
pc_pivot = product_categories.pivot(index='Date', columns='Category')
# May need to swap levels of hierarchical indeces if Tableau doesn't like this version
# pc_pivot.swaplevel(axis=1)
pc_pivot.head()

Unnamed: 0_level_0,Tested Batches,Tested Batches,Tested Batches,Tested Batches,Failed Batches,Failed Batches,Failed Batches,Failed Batches,Percent Failed,Percent Failed,Percent Failed,Percent Failed
Category,Flower,"Inhalable\n(cartridges, waxes, etc.)","Other\n(edibles, tinctures, topicals, etc.)",Total,Flower,"Inhalable\n(cartridges, waxes, etc.)","Other\n(edibles, tinctures, topicals, etc.)",Total,Flower,"Inhalable\n(cartridges, waxes, etc.)","Other\n(edibles, tinctures, topicals, etc.)",Total
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2018-10-09,7993,4991,3065,16050,853.0,915.0,915.0,2684.0,0.106718,0.18333,0.298532,0.167227
2018-10-15,8446,5208,3215,16869,918.0,942.0,944.0,2804.0,0.108691,0.180876,0.293624,0.166222
2018-10-22,8978,5483,3372,17833,933.0,966.0,969.0,2868.0,0.103921,0.176181,0.287367,0.160825
2018-10-29,9426,5858,3579,18863,968.0,1006.0,1011.0,2985.0,0.102695,0.171731,0.282481,0.158246
2018-11-05,9829,6138,3740,19707,991.0,1026.0,1036.0,3053.0,0.100824,0.167155,0.277005,0.15492


In [32]:
# Clean up fail_categories df
fail_categories['Failed Batches'] = fail_categories['Failed Batches By Category']
del fail_categories['Failed Batches By Category']
fail_categories['Failure Mode'] = fail_categories['*Reasons For Failure']
del fail_categories['*Reasons For Failure']

fail_categories.dtypes

Date              datetime64[ns]
Failed Batches            object
Failure Mode              object
dtype: object

In [33]:
fail_categories.head()

Unnamed: 0,Date,Failed Batches,Failure Mode
0,2018-11-13,2043,Label Claims:
1,2018-11-13,685,Pesticides:
2,2018-11-13,380,Microbial Impurities:
3,2018-11-13,171,Residual Solvents:
4,2018-11-13,35,Homogeneity:


In [None]:
fail_categories['Failure Mode'] = fail_categories['Failure Reason'].apply(colonRemove)