In [16]:
%matplotlib inline
import loader
import random
import sys
import numpy as np
import matplotlib.pyplot as plt
file = '2012-Consolidated-stripped.csv'

In [20]:
def reservoir_sample(iterator, k):
    """
    Basic reservoir sample. Takes a target sample amount
    """
    # fill the reservoir to start
    iterator = iter(iterator)
    result = [next(iterator) for _ in range(k)]
    n = k
    for item in iterator:
        n += 1
        s = random.randint(0, n)
        if s < k:
            result[s] = item
    return result


def get_sample_size(len1, len2, percent):
    return int(min(len1, len2) * percent)

In [21]:
reload(loader)
rows = loader.load_raw(file)

In [22]:
# split dataset by chronic / not_chronic
chron = []
notchron = []

for pid in rows:
    person = rows[pid]
    person_data = person.info
    chron_bool = False
    for code in person.info:
        if 'chronic_'in code and not chron_bool:
                if person_data[code] is 1:
                    chron_bool = True
                elif person_data[code] is -9 or person_data[code] is -8 or person_data[code] is -7:
                    rows[pid].chronic = -1
            
    if chron_bool:
        chron.append(pid)
    else:
        notchron.append(pid)

In [37]:
# sample helpers
def sample_two(l1, l2, pct):
    ssize = get_sample_size(len(l1), len(l2), pct)
    s1 = reservoir_sample(l1, ssize)
    s2 = reservoir_sample(l2, ssize)
    return s1, s2

def sample_one(single, pct):
    ssize = get_sample_size(len(single), sys.maxint, pct)
    sample = reservoir_sample(single, ssize)
    return sample

def get_spending(id_list, rows):
    return [rows[pid].info['spending_dist_total'] for pid in id_list]

def get_subsidized(id_list, rows):
    return [rows[pid].info['spending_pay_medicaid'] + rows[pid].info['spending_pay_medicare'] for pid in id_list]

In [34]:
# create samples of chronic / not chronic
schron, snotchron = sample_two(chron, notchron, 0.7)

In [39]:
# get average total spending for both groups
schron_spend = get_spending(schron, rows)
snotchron_spend = get_spending(snotchron, rows)
print 'chronic total spending', np.average(schron_spend), np.std(schron_spend)
print 'not chronic total spending', np.average(snotchron_spend), np.std(snotchron_spend)

chronic total spending 6502.75560734 16205.4853535
not chronic total spending 1484.78260025 8802.73067834


In [40]:
# get average subsidized spending
schron_sspend = get_subsidized(schron, rows)
snotchron_sspend = get_subsidized(snotchron, rows)
print 'chronic sub spending', np.average(schron_sspend), np.std(schron_sspend)
print 'not chronic sub spending', np.average(snotchron_sspend), np.std(snotchron_sspend)

chronic sub spending 2982.13593553 10700.5406053
not chronic sub spending 438.716962812 5515.1383859


In [42]:
# get total visits
schron_office = [rows[pid].info['service_office'] for pid in schron]
snotchron_office = [rows[pid].info['service_office'] for pid in snotchron]

# get cost per visit
schron_cpv = [rows[pid].info['spending_dist_office']/rows[pid].info['service_office'] for pid in schron if rows[pid].info['service_office']>0]
snotchron_cpv = [rows[pid].info['spending_dist_office']/rows[pid].info['service_office'] for pid in snotchron if rows[pid].info['service_office']>0]

In [43]:
print 'chronic office visits', np.average(schron_office), np.std(schron_office)
print 'not chronic office visits', np.average(snotchron_office), np.std(snotchron_office)

print 'chronic cost per visit', np.average(schron_cpv), np.std(schron_cpv)
print 'not chronic cost per visit', np.average(snotchron_cpv), np.std(snotchron_cpv)

chronic office visits 6.40023303233 11.6200499013
not chronic office visits 2.133799398 5.57106481946
chronic cost per visit 4.17861107685 36.9440740481
not chronic cost per visit 2.00597393193 21.9697919906


In [67]:
# split dataset between chronic conditions
hbp = []
coronary = []
myocardial = []
stroke = []
diabetes = []
asthma = []
arthritis = []
cancer = []

hbp2 = []
multiple = []

diseases = {
    'chronic_hbp': hbp, 
    'chronic_coronary': coronary, 
    'chronic_myocardial': myocardial, 
    'chronic_stroke': stroke,
    'chronic_diabetes': diabetes,
    'chronic_asthma': asthma,
    'chronic_arthritis': arthritis, 
    'chronic_cancer': cancer
}

for pid in rows:
    person = rows[pid]
    person_data = person.info
    has_chronic = False
    for code in person.info:
        if 'chronic_hbp2' in code and person_data[code] is 1:
            hbp2.append(pid)
            continue
        if code in diseases and person_data[code] is 1: 
            diseases[code].append(pid)
            if not has_chronic:
                has_chronic = True
            else:
                multiple.append(pid)
                

# This boolean controls whether the multiple condition group is mutually exclusive from individual
separate = False
if (separate):
    # get people who only have that condition
    set_multiple = set(multiple)
    for disease in diseases:
        setify = set(diseases[disease])
        diseases[disease] = list(setify.difference(set_multiple))

    # get people who only have hbp and who were diagnosed twice
    set_hbp = set(hbp)
    set_hbp2 = set(hbp2)
    hbp2 = list(set_hbp.intersection(set_hbp2))   

In [68]:
# create sub samples of each condition
shbp = sample_one(hbp, 0.7)
scoronary = sample_one(coronary, 0.7)
smyocardial = sample_one(myocardial, 0.7)
sstroke = sample_one(stroke, 0.7)
sdiabetes = sample_one(diabetes, 0.7)
sasthma = sample_one(asthma, 0.7)
sarthritis = sample_one(arthritis, 0.7)
scancer = sample_one(cancer, 0.7)

shbp2 = sample_one(hbp2, 0.7)
smultiple = sample_one(multiple, 0.7)

In [69]:
# get subsidized spending for each condition (filtering out multiple)
shbp_ssp = get_subsidized(shbp, rows)
scoronary_ssp = get_subsidized(scoronary, rows)
smyocardial_ssp = get_subsidized(smyocardial, rows)
sstroke_ssp = get_subsidized(sstroke, rows)
sdiabetes_ssp = get_subsidized(sdiabetes, rows)
sasthma_ssp = get_subsidized(sasthma, rows)
sarthritis_ssp = get_subsidized(sarthritis, rows)
scancer_ssp = get_subsidized(scancer, rows)

shbp2_ssp = get_subsidized(shbp2, rows)
smultiple_ssp = get_subsidized(smultiple, rows)

print 'hbp', np.average(shbp_ssp)
print 'coronary', np.average(scoronary_ssp)
print 'myocardial', np.average(smyocardial_ssp)
print 'stroke', np.average(sstroke_ssp)
print 'diabetes', np.average(sdiabetes_ssp)
print 'asthma', np.average(sasthma_ssp)
print 'arthritis', np.average(sarthritis_ssp)
print 'cancer', np.average(scancer_ssp)

print 'hbp2', np.average(shbp2_ssp)
print 'multiple', np.average(smultiple_ssp)

hbp 3815.92092575
coronary 9163.708159
myocardial 9263.43962848
stroke 9965.27910448
diabetes 5709.73974026
asthma 2542.19746933
arthritis 4703.25316165
cancer 4941.55381727
hbp2 4254.85748219
multiple 6860.04389907


In [None]:
# diabetes is unexpectedly the 4th highest single conditions -- let's take a closer look