In [1]:
import loader
import random
import numpy as np
file = '2012-Consolidated-stripped.csv'

In [2]:
def reservoir_sample(iterator, k):
    """
    Basic reservoir sample. Takes a target sample amount
    """
    # fill the reservoir to start
    iterator = iter(iterator)
    result = [next(iterator) for _ in range(k)]
    n = k
    for item in iterator:
        n += 1
        s = random.randint(0, n)
        if s < k:
            result[s] = item
    return result

def get_sample_size(len1, len2, percent):
    return int(min(len1, len2) * percent)

In [3]:
reload(loader)
rows = loader.load_raw(file)

In [20]:
chronic = []
not_chronic = []
asdf = []

for pid in rows:
    person = rows[pid]
    person_data = person.info
    chron = False
    all_pop = True
    for code in person.info:
        if 'chronic_'in code and not chron:
                if person_data[code] is 1:
                    chron = True
                elif person_data[code] is -9 or person_data[code] is -8 or person_data[code] is -7:
                    rows[pid].chronic = -1
            
    if chron:
        chronic.append(pid)
    else:
        not_chronic.append(pid)
    
    person.age = person_data['demo_age']
    
    if 'spending_dist_total' in person.info:
        person.spend = person_data['spending_dist_total']
    else:
        person.spend = 0
    
    if 'service_office' in person.info:
        person.office = person_data['service_office']
    else:
        person.office = 0
        
    if 'spending_dist_office' in person.info:
        person.officesp = person_data['spending_dist_office']
    else:
        person.officesp = 0


In [6]:
# calculate sample size
ssize = get_sample_size(len(chronic), len(not_chronic), 0.70)

# sample chronic and non-chronic
schron = reservoir_sample(chronic, ssize)
snot = reservoir_sample(not_chronic, ssize)

In [7]:
# get spend for each
schron_spend = [rows[pid].spend for pid in schron]
snot_spend = [rows[pid].spend for pid in snot]

In [8]:
print(np.average(schron_spend))
print(np.average(snot_spend))

6223.79531993
1570.16758909


In [9]:
# get office visit for each
schron_office = [rows[pid].office for pid in schron]
snot_office = [rows[pid].office for pid in snot]

In [10]:
print(np.average(schron_office))
print(np.average(snot_office))

6.32003107098
2.09175648121


In [17]:
# get cost per visit for each
schron_cpv = [rows[pid].officesp/rows[pid].office for pid in schron if rows[pid].office>0]
snot_cpv = [rows[pid].officesp/rows[pid].office for pid in snot if rows[pid].office>0]

In [18]:
print(np.average(schron_cpv))
print(np.average(snot_office))

3.7121267969
2.09175648121


In [35]:
# breakdown in to age groups
chron_young, chron_mid, chron_old = [],[],[]
for pid in chronic:
    if rows[pid].age>18 and rows[pid].age<45:
        chron_young.append(pid)
    elif rows[pid].age>=45 and rows[pid].age<65:
        chron_mid.append(pid)
    elif rows[pid].age>=65:
        chron_old.append(pid)

notchron_young, notchron_mid, notchron_old = [],[],[]
for pid in not_chronic:
    if rows[pid].age>18 and rows[pid].age<45:
        notchron_young.append(pid)
    elif rows[pid].age>=45 and rows[pid].age<65:
        notchron_mid.append(pid)
    elif rows[pid].age>=65:
        notchron_old.append(pid)  

In [37]:
# sample for each age group
def sample_age_group(chron, notchron):
    ssize_age = get_sample_size(len(chron), len(notchron), 0.7)
    schron_age = reservoir_sample(chron, ssize_age)
    snotchron_age = reservoir_sample(notchron, ssize_age)
    return schron_age, snotchron_age

schron_young, snotchron_young = sample_age_group(chron_young, notchron_young)
schron_mid, snotchron_mid = sample_age_group(chron_mid, notchron_mid)
schron_old, snotchron_old = sample_age_group(chron_old, notchron_old)


In [38]:
print len(schron_young), len(schron_mid), len(schron_old)

2538 2408 320


In [39]:
# gspending
chron_young_sp = [rows[pid].spend for pid in schron_young]
chron_mid_sp = [rows[pid].spend for pid in schron_mid]
chron_old_sp = [rows[pid].spend for pid in schron_old]
notchron_young_sp = [rows[pid].spend for pid in snotchron_young]
notchron_mid_sp = [rows[pid].spend for pid in snotchron_mid]
notchron_old_sp = [rows[pid].spend for pid in snotchron_old]

In [40]:
print np.average(chron_young_sp), np.average(notchron_young_sp)
print np.average(chron_mid_sp), np.average(notchron_mid_sp)
print np.average(chron_old_sp), np.average(notchron_old_sp)

3769.04373522 1436.51576044
6919.00996678 1724.77034884
8813.55625 2799.609375


In [31]:
chron_young_ov = [rows[pid].office for pid in chronic if rows[pid].age>18 and rows[pid].age < 45]
chron_mid_ov = [rows[pid].office for pid in chronic if rows[pid].age>=45 and rows[pid].age < 65]
chron_old_ov = [rows[pid].office for pid in chronic if rows[pid].age>=65]
notchron_young_ov = [rows[pid].office for pid in not_chronic if rows[pid].age>18 and rows[pid].age < 45]
notchron_mid_ov = [rows[pid].office for pid in not_chronic if rows[pid].age>=45 and rows[pid].age < 65]
notchron_old_ov = [rows[pid].office for pid in not_chronic if rows[pid].age>=65]

In [32]:
print np.average(chron_young_ov), np.average(notchron_young_ov)
print np.average(chron_mid_ov), np.average(notchron_mid_ov)
print np.average(chron_old_ov), np.average(notchron_old_ov)

4.50523993381 1.86236261016
6.47518588968 2.3254867771
9.26730869339 3.75545851528
