# Overview
* calculate summary statistics for the paper
* create visualizations for the paper

# Dependencies

In [1]:
# general
import os
import pathlib

# data
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa

# viz
import matplotlib.pyplot as plt

# config
import config_scrub as cs

# scrub
import scrub_tools as st

# Get Config

In [2]:
# pull the per-year column-name mapping from the scrub config
dict_col_names = cs.dict_col_names

# applicable years: every key of the mapping except the "default" entry
ls_years = [key for key in dict_col_names if key != "default"]

# Get Data
* loop over years
* for each year, read in parquet
* in df_YYYY, add column for the year
* append to giant df
* save giant df as parquet

In [3]:
# create diabetes dataset if it doesn't already exist
# o/w just read it in
#
# result: df_all holds the rows of every year in ls_years (each tagged with a
# 'year' column) with all rows containing a null dropped; it is cached at
# output_filename so later runs skip the rebuild

output_filename = "./clean/diabetes.parquet"

if not os.path.exists(output_filename):

    # log
    print("output_filename does not exist so creating")

    # get each year's data
    ls_dfs = []
    for year in ls_years:

        # get df
        df_YYYY = pd.read_parquet(cs.get_parquet_filename(year))

        # add year column
        df_YYYY['year'] = year

        # append
        ls_dfs.append(df_YYYY)

    # create giant df
    df_all = pd.concat(ls_dfs)

    # drop nulls (any row with a null in any column is removed)
    df_all = df_all.dropna()

    # make sure the target folder exists, otherwise the write below fails
    pathlib.Path(output_filename).parent.mkdir(parents=True, exist_ok=True)

    # save to parquet; write to output_filename rather than a second
    # hard-coded copy of the path so the check, write and read stay in sync
    tbl_all = pa.Table.from_pandas(df_all)
    pq.write_table(tbl_all, output_filename)

else:

    # log
    print("output_filename does exist so just loading")

    # get df
    df_all = pd.read_parquet(output_filename)

output_filename does not exist so creating


# Summarize

In [4]:
# report the dataset's dimensions: rows first, then columns
n_records, n_columns = df_all.shape
print(f"number of records: {n_records}")
print(f"number of columns: {n_columns}")

number of records: 5941780
number of columns: 19


In [5]:
# overall % diabetes
# one row per distinct 'diabetes' value, with its record count
df_grp_diabetes = df_all.groupby(['diabetes']).size().reset_index(name='counts')
count_total = df_grp_diabetes['counts'].sum()
# share of all records in each group; plain column arithmetic instead of a
# row-by-row apply (same values, and it also works on an empty frame)
df_grp_diabetes['diabetes_pct'] = df_grp_diabetes['counts'] / count_total

In [6]:
# display the per-group counts and their share of all records
df_grp_diabetes

Unnamed: 0,diabetes,counts,diabetes_pct
0,no,5043345,0.848794
1,yes,898435,0.151206


# Viz

In [7]:
# planned graph grid: one subplot per (variable, year) pair
# x (across) = year
# y (down) = var

In [8]:
# x_cnt = len(df_all.columns)
# y_cnt = len(ls_years)
# fig, axs = plt.subplots(x_cnt, y_cnt)

# # test
# x = np.linspace(0, 2 * np.pi, 400)
# y = np.sin(x ** 2)

# for ax in axs.flat:
#     ax.plot(x, y)

In [9]:
# viz each col: save one chart per column of df_all under ./viz/diabetes/
#   * float64 / int64 columns   -> histogram plus an avg/min/max footnote
#   * category / object columns -> bar chart of value counts
#   * columns of any other dtype are skipped
pathlib.Path("./viz/diabetes/").mkdir(parents=True, exist_ok=True)
for col in df_all.columns:

    # classify the column once; drives both the chart type and the footnote
    col_dtype = str(df_all[col].dtype)
    is_numeric = col_dtype in ["float64", "int64"]
    is_categorical = col_dtype in ["category", "object"]
    if not (is_numeric or is_categorical):
        continue

    # set up output
    viz_title = f"{col}"
    viz_filename = f"./viz/diabetes/{viz_title}.png"

    if is_numeric:
        # get ax
        ax = df_all[col].hist()

        # build footnote
        col_min = round(df_all[col].min(), 2)
        col_max = round(df_all[col].max(), 2)
        col_avg = round(df_all[col].mean(), 2)
        footnote = f"\n* avg = {col_avg}, min = {col_min}, max = {col_max}"
        plt.figtext(0.01, 0.01, footnote, horizontalalignment="left")
    else:
        # get ax
        ax = df_all[col].value_counts().plot(kind="bar")

    # save
    ax.set_title(viz_title)
    ax.set_xlabel(col)
    ax.set_ylabel("counts")
    ax.figure.tight_layout()
    ax.figure.savefig(viz_filename)
    plt.clf()  # clear the entire figure so the next column starts blank

# release the now-empty figure so it is not left open (and rendered as an
# empty figure) once every chart has been written to disk
plt.close()

<Figure size 432x288 with 0 Axes>