# Initial Imports & Setup

Use Python Data Analysis Libraries

In [2]:
# Data Science Modules
import numpy as np
import pandas as pd
import altair as alt
from selenium import webdriver

# Python Basics
import glob as glob
import shutil
import os
import datetime as datetime

# Importing Data into Dataframe

Create pandas dataframe from folder of CSV files.

Maps columns to friendlier names and data for faceting.

Assumes:

* **./files/names.csv**: CSV File with Provider/Clinic Names, etc. Names is used to determine active providers/clinics. (So if there's no name, it wont get evaluated later -- say, for example, after Everson Closes.
* **./files/metrics.csv**: CSV File with Metrics information (Display Name, Category, Target, etc.)
* **./data/(Zero-Padded Date at beginning of CSVs from Meridos).csv**: Files from Automatic weekly report from Meridios

In [3]:
# Read Lookup files into dataframes for mapping
names = pd.read_csv('./files/names.csv', index_col='MeridiosName')
metrics = pd.read_csv('./files/metrics.csv', index_col='MeridiosMetric')

# Blank dataframe for eventual output.
df = pd.DataFrame()

# Read in all the data files and append to dataframe
files = glob.glob('./data/*.csv')
for file in files:
    filedf = pd.read_csv(file,
                        usecols=["NAME","Metricname","SeenNum","SeenDenom"])
    # Prettier Names but we'll drop these eventually.
    filedf = filedf.rename(columns={'NAME': 'MeridiosName',
                                    'Metricname': 'MeridiosMetric',
                                    'SeenNum': 'Numerator', 
                                    'SeenDenom': 'Denominator'})

    # Add Lookup Columns.
    filedf['Name'] = filedf.MeridiosName.map(names.Name)
    filedf['Type'] = filedf.MeridiosName.map(names.Type)
    filedf['Clinic'] = filedf.MeridiosName.map(names.Clinic)
    filedf['Metric'] = filedf.MeridiosMetric.map(metrics.Metric)
    
    # Who are we kidding with precision? Round to Basis points
    # Not actually percentage (but rather than * 100 leaving for formatting
    # in visualization.)
    filedf['Percentage'] = round(filedf['Numerator'] / filedf['Denominator'], 4)

    # Zero Padded Dates with Dots. Like: "03.15.2018" We're using the filename
    # as the source of the date column because exports haven't always 
    # had the date-time data correct. 
    filename_parts = file[7:-4].split(' ')
    if (len(str(filename_parts[0])) is 10):
        filedf['Date'] = datetime.datetime.strptime(filename_parts[0], "%m.%d.%Y")

        # Now delete the 'raw' columns from import
        filedf.drop("MeridiosName", axis=1, inplace=True)
        filedf.drop("MeridiosMetric", axis=1, inplace=True)
        filedf.drop("Numerator", axis=1, inplace=True)
        filedf.drop("Denominator", axis=1, inplace=True)
        
        # We're only appending data that had Zero Padded Date at 
        # beginning of filename.
        df = df.append(filedf) 
    else:
        print("CSV Data File doesn't have Zero-Padded Date at beginning.")
        
# We can inspect dataframe different ways        
#df.dtypes
#df.info()
#df.describe()
#df.columns
#df.Metric.unique()
df.head(20)

Unnamed: 0,Name,Type,Clinic,Metric,Percentage,Date
0,Steven Alexander,Individual,LFM,AAA,0.381,2018-04-23
1,Steven Alexander,Individual,LFM,,0.2639,2018-04-23
2,Steven Alexander,Individual,LFM,,0.0823,2018-04-23
3,Steven Alexander,Individual,LFM,Chlamydia,,2018-04-23
4,Steven Alexander,Individual,LFM,Colorectal Screen,0.7109,2018-04-23
5,Steven Alexander,Individual,LFM,DM Care Gaps,0.9747,2018-04-23
6,Steven Alexander,Individual,LFM,DM with Statin,0.8588,2018-04-23
7,Steven Alexander,Individual,LFM,DM with ACE or ARB,0.6986,2018-04-23
8,Steven Alexander,Individual,LFM,DM Eye Exam,0.5696,2018-04-23
9,Steven Alexander,Individual,LFM,DM Foot Exam,0.8924,2018-04-23


## Validations:

* Do new CSV files include new MeridiosName, New Metric items? (Has the underlying report changed? Such as when edited provider name.)
* Should we keep a list of already imported data and do some message to clarify after new data is available?
* Percentages under 0% and over 100% are report errors

Currently no validations. But probably should automate checks that new data from report is accurate. 

# Altair Graphs

In [10]:
def core_clinic_charts(metric, clinic):
    chart = alt.Chart(df[(df['Metric']==metric) & 
                         (df['Type']=='Individual') & 
                         (df['Clinic']==clinic)]
    ).mark_line(
    ).encode(
        alt.X('Date:T', title=""),
        alt.Y('Percentage:Q', axis=alt.Axis(format="%")),
        color=alt.Color('Name:N', legend=None),
        tooltip='Name:N',
    ).properties(
        title=str(metric) + ' at ' + str(clinic)
    ).facet(
        column="Clinic:N",
    )
    return chart

#for clinic in df.Clinic.unique():

clinic = 'NCFP'
for metric in df.Metric.unique():
    chart = core_clinic_charts(metric, clinic)
    foldername = str(clinic).replace(" ", "_")
    if not os.path.exists("./altair-output/" + foldername):
        os.makedirs("./altair-output/" + foldername)
    chart.save("./altair-output/" + foldername + '/'+ str(metric).replace(" ", "_") + ".html")
    


# Matplotlib Graphs

These graphs are a first attempt to make individual folders. Roughly 1200 pictures from dataframe. 

Creates ./output/ folder with SVG files and an index.html file. 

In [22]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager

# Matplotlib Specifics
# %matplotlib inline
plt.style.use('fivethirtyeight')
plt.rcParams["svg.fonttype"] = "none"
plt.rcParams["font.family"] = "Bitstream Vera Sans"
plt.rcParams["figure.titleweight"] = 'Bold'
plt.rcParams['figure.titlesize'] = 'xx-large'

for name in df.Name.unique():
#name = 'NCFP'

    foldername = name.replace(" ", "_")

    Provider = df[df['Name'] == name]
    Provider.dropna(inplace=True)

    for number, metric in enumerate(Provider.Metric.unique()):
        ProviderMetric = Provider[Provider['Metric'] == metric]
        ax = ProviderMetric.plot(x="Date", y="Percentage", legend=False, ylim=(0,100), figsize=(6,6))
        ax.set_xlabel("")

        # Let's Do ever third tick mark (quarters)
        ax.set_xticks(ax.get_xticks()[::3])
        fig = ax.get_figure()
        fig.patch.set_facecolor('none')
        ax.patch.set_facecolor('white')
        ax.patch.set_alpha(0.8)
        fig.suptitle(metric, fontsize=20, fontweight='bold')
        fig.tight_layout(pad=2)

        metricfilename = metric.replace(" ", "_")
        if not os.path.exists("./matplotlib-output/" + foldername):
            os.makedirs("./matplotlib-output/" + foldername)
        fig.savefig("./matplotlib-output/" + foldername + "/" + metricfilename + ".svg", facecolor=fig.get_facecolor(), edgecolor='none')
        plt.close(fig)

    shutil.copyfile('./files/index.html', './matplotlib-output/' + foldername + '/index.html') 


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


AttributeError: 'float' object has no attribute 'replace'