# Imports and Constants

In [54]:
import csv
from pandas import DataFrame, read_csv
import pandas as pd
import matplotlib.pyplot as plt
from textwrap import wrap
import numpy as np
from __future__ import print_function
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})

In [55]:
# Destination of the CSV written by refine_dataset(); relative, so it is
# resolved against the kernel's working directory.
output_file_path = "output.csv"
# Raw input CSV that is read into `dataSet` in the "Data Analysis Execution" section.
input_file_path = "Inpatient_Prospective_Payment_System__IPPS__Provider_Summary_for_the_Top_100_Diagnosis-Related_Groups__DRG__-_FY2011.csv"

In [56]:
# Column names used to index the data set.
# NOTE: several names carry leading/trailing spaces (e.g. ' Total Discharges ').
# Keep them byte-for-byte: presumably they mirror the raw CSV header exactly
# (TODO confirm against the input file), so trimming them would break lookups.
drgDef = 'DRG Definition'
providerId = 'Provider Id'
providerName = 'Provider Name'
providerStreet = 'Provider Street Address'
providerCity= 'Provider City'
providerState = 'Provider State'
providerZip = 'Provider Zip Code'
hospRef = 'Hospital Referral Region Description'
totalDischarges = ' Total Discharges '
avgCovered = ' Average Covered Charges '
avgTotal = ' Average Total Payments '
avgMedicare = 'Average Medicare Payments'

In [57]:
# Every column-name constant gathered into a single list, one entry per line.
header_list = [
    drgDef,
    providerId,
    providerName,
    providerStreet,
    providerCity,
    providerState,
    providerZip,
    hospRef,
    totalDischarges,
    avgCovered,
    avgTotal,
    avgMedicare,
]

# Data Refining

In [43]:
def refine_dataset(dataSet, output_path=None):
    """
        Drops every row that has a missing value and writes the result to a
        CSV file

        The DataFrame passed in is left untouched; a new DataFrame is returned.

        :param DataFrame dataSet: The DataFrame to refine
        :param str output_path: Where the refined CSV is written. When omitted,
            the notebook-level ``output_file_path`` is used
        :return: refined dataSet (rows containing any missing value removed)
        :rtype: DataFrame
    """
    # dropna() returns a new DataFrame instead of modifying the original, so
    # the result must be captured; otherwise the incomplete rows are kept.
    refined = dataSet.dropna(how='any')

    if output_path is None:
        # Looked up at call time so a changed notebook setting is honoured
        output_path = output_file_path

    # Write updated dataset to file (slow on the full data set)
    refined.to_csv(output_path)

    return refined

# NOTE: `dataSet` is created by the read_csv cell in the "Data Analysis Execution"
# section further down the notebook; that cell has to be run before this line.
dataSet = refine_dataset(dataSet)

# Basic Data Analysis

In [44]:
def descriptive_analysis(dataSet):
    """
        Prints summary counts for a dataSet: first the total number of
        records, then, for every column, how many times each distinct value
        occurs in that column

        :param DataFrame dataSet: The DataFrame to print stats for
    """
    record_count = len(dataSet)
    print("\nNumber of records: " + str(record_count) + "\n")

    # Walk the column labels one at a time
    for column in dataSet.columns:
        # Size of each group = number of rows sharing a value in this column
        occurrences = dataSet.groupby(column).size()
        print(occurrences)
        print("\n")

# Print the record count and the per-column value frequencies of the data set.
descriptive_analysis(dataSet)


Number of records: 163065

DRG Definition
039 - EXTRACRANIAL PROCEDURES W/O CC/MCC                                      1079
057 - DEGENERATIVE NERVOUS SYSTEM DISORDERS W/O MCC                           1201
064 - INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION W MCC                    1659
065 - INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION W CC                     2269
066 - INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION W/O CC/MCC               1806
069 - TRANSIENT ISCHEMIA                                                      1962
074 - CRANIAL & PERIPHERAL NERVE DISORDERS W/O MCC                             979
101 - SEIZURES W/O MCC                                                        1593
149 - DYSEQUILIBRIUM                                                           988
176 - PULMONARY EMBOLISM W/O MCC                                              1396
177 - RESPIRATORY INFECTIONS & INFLAMMATIONS W MCC                            1894
178 - RESPIRATORY INFECTIONS & INFLAMMATIONS

# Data Plotting Methods

Plot a pie chart of the value distribution for a given column header

In [45]:
def plot_pie_distribution_by_header(dataSet, header):
    """
        Takes dataSet and plots distribution of given header

        :param DataFrame dataSet: The DataFrame to be plotted
    """
    %matplotlib notebook

    fig, axis = plt.subplots()
    
    title = "Distribution of " + header
    
    dataSet[header].value_counts().plot(ax=axis, kind='pie')
    plt.ylabel("")
    #plt.tight_layout()
    plt.title(title, y=1.02)
    axis.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
    plt.savefig('Visualisations/distribution.png')
    plt.show()

Plot a bar graph of the value distribution for a given column header

In [62]:
def plot_bar_distribution_by_header(dataSet, header):
    """
        Takes dataSet and plots distribution of given header

        :param DataFrame dataSet: The DataFrame to be plotted
    """
    %matplotlib notebook

    fig, axis = plt.subplots()
    
    title = "Distribution of " + header
    
    filename = 'Visualisations/distribution_of_' + header.replace(" ", "_") + '.png'
    
    dataSet[header].value_counts().plot(ax=axis, kind='bar')
    plt.ylabel("")
    #plt.tight_layout()
    plt.title(title, y=1.02)  # Equal aspect ratio ensures that pie is drawn as a circle.
    plt.savefig(filename)
    plt.show()

# Data Analysis Execution

In [None]:
# Load the raw CSV; header=0 takes the column names from the first row.
# NOTE: cells earlier in the notebook (refine_dataset / descriptive_analysis
# calls) use `dataSet`, so this cell has to be run before them.
dataSet = pd.read_csv(input_file_path, header=0)

In [63]:
# Bar chart of the number of rows per provider state.
plot_bar_distribution_by_header(dataSet, providerState)

<IPython.core.display.Javascript object>