In [1]:
import xlrd
import pandas as pd

In [2]:
def dateFromFilename(file='20200120_report.xlsx'):
    date = file.replace('_report.xlsx', '').replace('_update.xlsx', '')
    return date

In [3]:
def openBCCfile(file = '20200120_report.xlsx'):
    date = dateFromFilename(file)
    data_dir = '~/projects/bcc_weekly_reports/data/'
    loc = (data_dir + file)

    print('Opening {}'.format(loc))
    wb = xlrd.open_workbook(loc)
    sheet = wb.sheet_by_name('COA Stats')
#     if '_update' in file:
#         sheet = wb.sheet_by_index(0)
#     else:
#         sheet = wb.sheet_by_index(2)
    print('Rows: {}, Cols: {}'.format(sheet.ncols, sheet.nrows))
    
    return wb, sheet

wb, sheet = openBCCfile(file = '20200120_report.xlsx')

Opening ~/projects/bcc_weekly_reports/data/20200120_report.xlsx
Rows: 4, Cols: 24


In [4]:
def importCOAdata(table='totals', file = '20200120_report.xlsx'):
    '''
    Return a df of one of the 3 tables found in the COA weekly report
    '''
    # Which table to import
    if table == 'totals':
        rows = range(1,3)
    elif table == 'product_category':
        rows = range(4,9)
    elif table == 'fail_category':
        rows = range(9,sheet.nrows)

    # Create dataframe from table
    data = []
    for row in rows:
        data.append(sheet.row_values(row))
    data_df = pd.DataFrame(data=data[1:], columns=data[0], dtype='int')
    
    # Create date column and make dtype=datetime
    date = dateFromFilename(file)
    data_df['date'] = pd.to_datetime(date)
    
    return data_df


In [5]:
totals = importCOAdata(table='totals')
totals

Unnamed: 0,Certificates of Analysis Received,Tested Batches,Unnamed: 3,Failed Batches,date
0,90793,90793,,6124,2020-01-20


In [6]:
product_categories = importCOAdata(table='product_category')
product_categories

Unnamed: 0,Tested Batches By Category,Tested Batches,Unnamed: 3,Failed Batches By Category,date
0,Flower:,44108,,2401,2020-01-20
1,"Inhalable\n(cartridges, waxes, etc.):",31715,,2170,2020-01-20
2,"Other\n(edibles, tinctures, topicals, etc.):",14970,,1553,2020-01-20
3,Total:,90793,,6124,2020-01-20


In [7]:
fail_categories = importCOAdata(table='fail_category')
fail_categories

Unnamed: 0,*Reasons For Failure,Unnamed: 2,Failed Batches By Category,Unnamed: 4,date
0,Label Claims:,,2379.0,,2020-01-20
1,Pesticides:,,1790.0,,2020-01-20
2,Microbial Impurities:,,929.0,,2020-01-20
3,Residual Solvents:,,361.0,,2020-01-20
4,Homogeneity:,,36.0,,2020-01-20
5,Foreign Material:,,26.0,,2020-01-20
6,Moisture:,,71.0,,2020-01-20
7,Heavy Metals:,,541.0,,2020-01-20
8,Water Activity:,,51.0,,2020-01-20
9,Cannabinoids:,,77.0,,2020-01-20


In [8]:
from os import listdir
data_dir2 = '../../data/'
files = [f for f in listdir(data_dir2) if '.xlsx' in f]

In [9]:
dates=[]
for file in files: 
    dates.append(file.replace('_report.xlsx', '').replace('_update.xlsx',''))
dates = sorted(dates)

In [10]:
dates[:5]

['20181009', '20181015', '20181022', '20181029', '20181105']

In [19]:
totals = pd.DataFrame()
product_categories = pd.DataFrame()
fail_categories = pd.DataFrame()

for file in files:
    # Read file, append rows to dataframes, save
    
    # Read file
    wb, sheet = openBCCfile(file=file)
    # Read tables
    totals = totals.append(importCOAdata(table='totals', file=file))
    product_categories = product_categories.append(importCOAdata(table='product_category', file=file))
    fail_categories = fail_categories.append(importCOAdata(table='fail_category', file=file))

Opening ~/projects/bcc_weekly_reports/data/20181113_report.xlsx
Rows: 4, Cols: 19
Opening ~/projects/bcc_weekly_reports/data/20190429_report.xlsx
Rows: 4, Cols: 24
Opening ~/projects/bcc_weekly_reports/data/20181203_report.xlsx
Rows: 4, Cols: 19
Opening ~/projects/bcc_weekly_reports/data/20190610_report.xlsx
Rows: 4, Cols: 24
Opening ~/projects/bcc_weekly_reports/data/20190527_report.xlsx
Rows: 4, Cols: 24
Opening ~/projects/bcc_weekly_reports/data/20191209_report.xlsx
Rows: 4, Cols: 24
Opening ~/projects/bcc_weekly_reports/data/20191007_report.xlsx
Rows: 4, Cols: 24
Opening ~/projects/bcc_weekly_reports/data/20190729_report.xlsx
Rows: 4, Cols: 24
Opening ~/projects/bcc_weekly_reports/data/20190923_report.xlsx
Rows: 4, Cols: 24
Opening ~/projects/bcc_weekly_reports/data/20190603_report.xlsx
Rows: 4, Cols: 24
Opening ~/projects/bcc_weekly_reports/data/20181029_report.xlsx
Rows: 4, Cols: 19
Opening ~/projects/bcc_weekly_reports/data/20181210_report.xlsx
Rows: 4, Cols: 19
Opening ~/projec

In [20]:
len(fail_categories)

762

In [21]:
totals.head()

Unnamed: 0,Certificates of Analysis Received,Tested Batches,Unnamed: 3,Failed Batches,date
0,20797,20797,,3164,2018-11-13
0,46503,46503,,4679,2019-04-29
0,23864,23864,,3373,2018-12-03
0,52471,52471,,4906,2019-06-10
0,50811,50811,,4831,2019-05-27


In [22]:
del totals['']
del product_categories['']
del fail_categories['']

totals.to_csv(path_or_buf='../../etl_data/totals.csv', index=False)
product_categories.to_csv(path_or_buf='../../etl_data/product_categories.csv', index=False)
fail_categories.to_csv(path_or_buf='../../etl_data/fail_categories.csv', index=False)

In [23]:
totals.columns

Index(['Certificates of Analysis Received', 'Tested Batches', 'Failed Batches',
       'date'],
      dtype='object')

In [24]:
product_categories.columns

Index(['Tested Batches By Category', 'Tested Batches',
       'Failed Batches By Category', 'date'],
      dtype='object')

In [25]:
product_categories

Unnamed: 0,Tested Batches By Category,Tested Batches,Failed Batches By Category,date
0,Flower:,10415,1029,2018-11-13
1,"Inhalable\n(cartridges, waxes, etc.):",6464,1054,2018-11-13
2,"Other\n(edibles, tinctures, topicals, etc.):",3918,1081,2018-11-13
3,Total:,20797,3164,2018-11-13
0,Flower:,23347,1627,2019-04-29
...,...,...,...,...
3,Total:,22096,3310,2018-11-26
0,Flower:,22277,1589,2019-04-15
1,"Inhalable\n(cartridges, waxes, etc.):",14150,1598,2019-04-15
2,"Other\n(edibles, tinctures, topicals, etc.):",7590,1402,2019-04-15
