In [1]:
import os, math, subprocess
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# some settings for displaying Pandas results
# pd.set_option('display.width', 2000)
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.precision', 4)
# pd.set_option('display.max_colwidth', None)  # None disables truncation; the old -1 sentinel was deprecated in pandas 1.0

# Statistic report table

In [None]:
def _count_with_pct(count, total):
    """Format ``count`` and its share of ``total`` as ``"<count> (<pct>%)"``."""
    # An empty frame has total == 0; report 0% instead of dividing by zero.
    pct = count * 100.0 / total if total else 0.0
    return "{} ({:0.2f}%)".format(count, pct)


def exploring_stats(pdf_input, nsample=10, random_state=None):
    """Build a per-column exploration report for a DataFrame.

    The shape of the input is printed, then one report row is produced for
    every column of ``pdf_input``.

    Parameters
    ----------
    pdf_input : pandas.DataFrame
        Frame to profile.
    nsample : int, default 10
        Number of ``sample_<i>`` columns in the report. When the frame has
        fewer rows than ``nsample`` the remaining sample columns are NaN.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the sampled rows can be made
        reproducible.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with the columns ``sub_type`` (dtype),
        ``n_distinct``, ``n_miss``, ``n_negative``, ``n_zeros`` (each a
        ``"<count> (<pct>%)"`` string), the ``describe()`` statistics
        (``0.0`` for columns that ``describe()`` does not cover) and
        ``sample_0 .. sample_<nsample-1>``.
    """
    total_records, total_columns = pdf_input.shape
    print(f"Total {total_records} records, {total_columns} columns")

    columns = list(pdf_input.columns)

    # One pass over the columns collects all count-based indicators.
    ls_ndist, ls_nmiss, ls_zeros, ls_neg = [], [], [], []
    for cname in columns:
        column = pdf_input[cname]
        ls_ndist.append(_count_with_pct(column.nunique(), total_records))
        ls_nmiss.append(_count_with_pct(column.isnull().sum(), total_records))

        # Best effort: a column that cannot be compared with 0 counts as
        # having no zeros.
        try:
            nzeros = (column == 0).sum()
        except Exception:
            nzeros = 0
        ls_zeros.append(_count_with_pct(nzeros, total_records))

        # Best effort: a column that cannot be cast to float (e.g. free
        # text) counts as having no negative values.
        try:
            nneg = (column.astype("float") < 0).sum()
        except Exception:
            nneg = 0
        ls_neg.append(_count_with_pct(nneg, total_records))

    data = {
        "name": columns,
        "sub_type": list(pdf_input.dtypes),
        "n_distinct": ls_ndist,
        "n_miss": ls_nmiss,
        "n_zeros": ls_zeros,
        "n_negative": ls_neg,
    }

    # Descriptive statistics. describe() only covers some columns (and
    # raises ValueError on a frame without columns), so every expected
    # statistic falls back to 0.0 where no value is available.
    stat_names = ["25%", "50%", "75%", "count", "max", "mean", "min", "std"]
    try:
        pdf_stats = pdf_input.describe().transpose()
    except ValueError:
        pdf_stats = pd.DataFrame()
    for stat in stat_names:
        has_stat = stat in pdf_stats.columns
        data[stat] = [
            pdf_stats.loc[cname, stat]
            if has_stat and cname in pdf_stats.index
            else 0.0
            for cname in columns
        ]

    # Random example rows, transposed so that each sample becomes a column.
    # Never request more rows than exist; missing samples are padded with
    # NaN so the report always has the same set of sample columns.
    sample_names = ["sample_{}".format(i) for i in range(nsample)]
    n_take = min(nsample, total_records)
    if n_take > 0:
        pdf_rows = pdf_input.sample(n=n_take, random_state=random_state)
    else:
        pdf_rows = pdf_input.head(0)
    pdf_sample = pdf_rows.transpose()
    pdf_sample.columns = sample_names[:max(n_take, 0)]
    pdf_sample = pdf_sample.reindex(columns=sample_names)

    # Assemble the report in a fixed column order.
    col_ordered = (
        ["sub_type", "n_distinct", "n_miss", "n_negative", "n_zeros"]
        + stat_names
        + sample_names
    )
    pdf_data = pd.DataFrame(data).set_index("name")
    pdf_data = pd.concat([pdf_data, pdf_sample], axis=1)
    pdf_data = pdf_data[col_ordered]

    return pdf_data

In [2]:
def export_rp_wb(input_dfs_dir, output_dir='Outputs/data_statistic_report.xlsx', li_names=None):
    """Write one ``exploring_stats`` report per dataset into an Excel workbook.

    This single definition replaces two same-named functions, the second of
    which silently shadowed the first. It keeps the signature of the
    definition that was actually in effect and adds the optional
    ``li_names`` argument of the shadowed one.

    Parameters
    ----------
    input_dfs_dir : iterable
        Datasets to report on. Each item is either an already loaded
        ``pandas.DataFrame`` or a path to a CSV file, which is read with
        ``pd.read_csv``.
    output_dir : str or path-like
        Path of the workbook file to create (a file path, despite the name).
        Its parent folder is created when missing.
    li_names : sequence of str, optional
        Sheet names, one per dataset. Pass it by keyword. When omitted, a
        CSV path uses its file name without extension and a DataFrame uses
        ``df_<position>``.

    Raises
    ------
    ValueError
        If ``li_names`` is given but its length differs from the number of
        datasets.
    """
    max_sheet_name = 31  # Excel rejects longer worksheet names

    sources = list(input_dfs_dir)
    if li_names is None:
        li_names = [
            "df_{}".format(pos)
            if isinstance(src, pd.DataFrame)
            # splitext returns (root, ext); only the root is the sheet name.
            else os.path.splitext(os.path.basename(os.fspath(src)))[0]
            for pos, src in enumerate(sources)
        ]
    else:
        li_names = list(li_names)
        if len(li_names) != len(sources):
            raise ValueError(
                "li_names has {} entries but {} datasets were given".format(
                    len(li_names), len(sources)
                )
            )

    # The default target lives in 'Outputs/', which may not exist yet.
    if isinstance(output_dir, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(output_dir))
        if parent:
            os.makedirs(parent, exist_ok=True)

    with pd.ExcelWriter(output_dir) as writer:
        for src, name in zip(sources, li_names):
            df = src if isinstance(src, pd.DataFrame) else pd.read_csv(src)
            print(name, end=": ")
            rp = exploring_stats(df)
            # The report's index holds the variable names, so it has to be
            # written; without it the rows in the sheet are unlabelled.
            rp.to_excel(
                writer,
                sheet_name=str(name)[:max_sheet_name],
                index=True,
                index_label="name",
            )

# Dataprep report

In [None]:
from dataprep.eda import create_report
# create_report(df).show_browser()