# Auxiliary notebook to produce markdown tables for README

In [1]:
###############################################################################

## Load packages (5s)
!pip install -q jupyternotify
!pip install -q nest_asyncio
!pip install -q tabulate

from IPython.core.interactiveshell import InteractiveShell
import jupyternotify
import nest_asyncio
import numpy as np
import pandas as pd
import warnings

# Display the value of every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Register the jupyternotify magics so cells can use %%notify.
get_ipython().register_magics(jupyternotify.JupyterNotifyMagics)
# Disabled: automatic notifications (the 30 is presumably a threshold in
# seconds for long-running cells -- confirm against jupyternotify docs).
# %autonotify -a 30
# Patch asyncio so an event loop can be run from inside the notebook's own loop.
nest_asyncio.apply()
# NOTE: this silences *all* warnings for the session, not just pandas ones.
warnings.filterwarnings("ignore") # For extraneous pd warnings

def mysummary(df):
    """Build a per-column overview of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarize.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with three columns:
        ``'Type'`` (the column dtype), ``'Missing Entries'`` (count of NA
        values) and ``'First Row of Data'`` (the first row's value rendered
        as a string, or ``''`` when ``df`` has no rows).
    """
    if len(df) > 0:
        # A row selected with .iloc is already a Series indexed by column
        # name, so no transpose is needed before stringifying it.
        first_row = df.iloc[0].astype('str')
    else:
        # df.iloc[0] would raise IndexError on an empty frame; fall back to
        # blank placeholders so the summary can still be produced.
        first_row = pd.Series('', index=df.columns, dtype='object')
    return pd.DataFrame({
        'Type': df.dtypes,
        'Missing Entries': df.isna().sum(),
        'First Row of Data': first_row})

<IPython.core.display.Javascript object>

In [2]:
%%time
%%notify

## Load data (2min)
%time downsample = pd.read_parquet('downsample.parquet')
%time df = pd.read_parquet('data.parquet')

## Compute summary stats (2min)
df_dtypes = df.dtypes
df_0 = df.iloc[0].T.astype('str')
%time df_isna = df.isna().sum() # (30s)
%time df_describe = df.describe(include='all', percentiles=[0.0001,.9999])
df_describe = df_describe.T.fillna('') # (1min 30s)
df_dtypes_isna_0 = pd.concat([df_dtypes, df_isna, df_0], axis=1)
df_dtypes_isna_0.columns = ['dtype','null count','iloc[0]']
df_stats = pd.concat([df_dtypes_isna_0, df_describe], axis=1)

## Summarize datafiles stats (3min)
%time len_raw = len(pd.read_csv('raw.csv',dtype='object',usecols=[0])) # (1min)
%time df = pd.read_parquet('data.parquet') # (2min)
len_data = len(df)
len_nonanomalous = len(df) - df.Anomalous.sum()
len_downsample = len(pd.read_parquet('downsample.parquet'))

CPU times: user 2.24 s, sys: 535 ms, total: 2.77 s
Wall time: 1.77 s
CPU times: user 59.6 s, sys: 40.8 s, total: 1min 40s
Wall time: 2min 14s
CPU times: user 20.6 s, sys: 11.7 s, total: 32.3 s
Wall time: 43.2 s
CPU times: user 1min 35s, sys: 12.6 s, total: 1min 48s
Wall time: 1min 59s
CPU times: user 47.6 s, sys: 6.41 s, total: 54.1 s
Wall time: 1min 5s
CPU times: user 1min 21s, sys: 1min 13s, total: 2min 34s
Wall time: 3min 23s


<IPython.core.display.Javascript object>

CPU times: user 5min 11s, sys: 2min 26s, total: 7min 37s
Wall time: 9min 31s


In [3]:
## Convert summary stats to markdown
def to_markdown(dtypes, columns, floatfmt='.0f'):
    """Render the rows of ``df_stats`` for the given dtypes as a markdown table.

    Reads the notebook-level ``df_stats`` frame.

    dtypes   -- dtype names (as strings) selecting which rows to keep.
    columns  -- statistic columns to show after 'dtype', 'count', 'null count'.
    floatfmt -- float format forwarded to ``DataFrame.to_markdown``.
    """
    row_mask = df_stats['dtype'].astype('str').isin(dtypes)
    shown_columns = ['dtype', 'count', 'null count'] + columns
    selected = df_stats.loc[row_mask, shown_columns]
    return selected.to_markdown(floatfmt=floatfmt)
# Datetime columns.
datetime_table = to_markdown(
    ['datetime64[ns]'],
    ['unique','top','freq','first','last'])
print(datetime_table)
print()

# Categorical-like columns.
categorical_table = to_markdown(
    ['object','bool'],
    ['unique','top','freq'])
print(categorical_table)
print()

# Numeric columns: integer-like dtypes are shown with no decimals, float64
# with one. Both share the same statistic columns.
numeric_stats = ['mean','std','min','0.01%','99.99%','max']
print(to_markdown(['int16','int32','float32'], numeric_stats))
float64_table = to_markdown(['float64'], numeric_stats, floatfmt='.1f')
# Drop the header and separator lines so these rows continue the table above.
float64_rows = float64_table.split('\n',2)[2]
print(float64_rows)


## Convert datafiles stats to markdown
filesizes = !! du -h raw.csv data.parquet downsample.parquet
filesizes = [n.split('\t') for n in filesizes]
files = pd.DataFrame(index=[
    'Raw data','Cleaned data','(nonanomalous)','5% Downsample'])
files['filename'] = [
    filesizes[0][1],filesizes[1][1],'',filesizes[2][1]]
files['filesize'] = [
    filesizes[0][0],filesizes[1][0],'',filesizes[2][0]]
files['number of rows'] = [
    len_raw,len_data,len_nonanomalous,len_downsample]
print()
print(files.to_markdown())

|      | dtype          |    count |   null count |   unique | top                 |   freq | first               | last                |
|:-----|:---------------|---------:|-------------:|---------:|:--------------------|-------:|:--------------------|:--------------------|
| Date | datetime64[ns] | 19883991 |            0 |     2157 | 2017-12-22 00:00:00 |  16674 | 2012-01-03 00:00:00 | 2020-10-30 00:00:00 |

|                 | dtype   |    count |   null count |   unique | top                          |     freq |
|:----------------|:--------|---------:|-------------:|---------:|:-----------------------------|---------:|
| Invoice         | object  | 19883991 |            0 | 19883991 | 124889500022                 |        1 |
| Store_num       | object  | 19883991 |            0 |     2495 | 2633                         |   169816 |
| Store           | object  | 19883991 |            0 |     2613 | Hy-Vee #3 / Bdi / Des Moines |   169816 |
| Address         | object  | 19804064 |