In [1]:
import os
import pandas as pd
from tqdm import tqdm

In [2]:
basedir = '../../data/processed_dat/_run_output'
res_dirs = os.listdir(basedir)
res_dirs

['nonburn_0.01_y2010',
 'nonburn_0.01_y2000',
 'burned_100_0.01_y2000',
 'nonburn_0.001_y2000',
 'nonburn_0.02_y1990',
 'burned_100_0.01_y1990',
 'burned_100_0.01_y2010',
 'nonburn_0.02_y2000',
 'nonburn_0.01_y1990',
 'nonburn_0.02_y2010']

In [3]:
fs = os.listdir(os.path.join(basedir, 'nonburn_0.02_y2000'))
fs

['thumbnails',
 'stored_population.csv',
 'stored_household.csv',
 'yearly_summary.csv',
 'stored_sex_age_stats.csv']

In [11]:
fp = os.path.join(basedir, 'nonburn_0.02_y2000', 'stored_household.csv')
df = pd.read_csv(fp)
df['children'] = df['infant'] + df['school_age']

df.head()

Unnamed: 0,ind,time,household_id,member,household_founded,infant,school_age,adult,retired,household_size,children
0,0,0,0,0;1;2;3,0,1,1,2,0,4,2
1,1,0,1,4;5;6;7,0,0,2,1,1,4,2
2,2,0,2,8;9;12,0,0,0,1,2,3,0
3,3,0,3,13,0,0,0,1,0,1,0
4,4,0,4,14;15;16;17;18,0,1,2,2,0,5,3


In [12]:
households = list(set(df.household_id.tolist()))
len(households)

30282

In [13]:
all_times = list(set(df['time'].tolist()))
all_times

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [14]:
def process_a_household(df_hh):
    df_hh = df_hh.sort_values('time')
    ts = df_hh.time.tolist()
    df_hh = df_hh.set_index('time')
    pre_hhsize = df_hh.loc[ts[0]]['household_size']
    changed_info = []
    if ts[0]>0:
        #print(ts[0])
        changed_info.append((ts[0], 'new-household', pre_hhsize))
    hh_change = []
    for t in ts[1:]:
        row = df_hh.loc[t]
        cur_hhsize = row['household_size']
        hhsize_change = float(cur_hhsize - pre_hhsize)
        hh_change.append(hhsize_change)
        pre_hhsize = cur_hhsize
    for t,c in zip(ts[1:], hh_change):
        if c!=0:
            if c>0:
                changed_info.append((t, 'size-up', c))
            else:
                changed_info.append((t, 'size-down', c))
    #print(changed_info)
    return changed_info

In [15]:
def analyse_household_size_dynamic(df):
    households = list(set(df.household_id.tolist()))
    all_times = list(set(df['time'].tolist()))
    hh_events = None
    for hh in tqdm(households, desc='processing households'):
        df_hh = df[df['household_id']==hh]
        changes = process_a_household(df_hh)
        if len(changes)>0:
            df_changes = pd.DataFrame.from_records(changes)
            df_changes['hh_id'] = hh
            if not(hh_events is None):
                hh_events = hh_events.append(df_changes)
            else:
                hh_events = df_changes
    hh_events = hh_events.reset_index(drop=True)
    hh_events = hh_events.rename(columns={0: 'time', 1:'event', 2:'size'})
    
    time_sum = {}
    for t in all_times[1:]:
        #print(t)
        events_on_t = hh_events[hh_events['time']==t]
        size_up = len(events_on_t[events_on_t['event']=='size-up'])
        size_down = len(events_on_t[events_on_t['event']=='size-down'])
        new_hh = len(events_on_t[events_on_t['event']=='new-household'])
        time_sum[t] = {'size_up':size_up, 'size_down':size_down, 'new_household':new_hh, 'sub_total':size_up+size_down+new_hh}
    time_sum_df = pd.DataFrame.from_dict(time_sum, orient='index')
    time_sum_df.index.name = 'time'
    time_sum_df = time_sum_df.reset_index()
    return time_sum_df

simple version with same meaning

$$
R = \big\{ household ~|~ at~least~one~member~with~age~ \geq 65 \big\} 
$$

$$
A = \big\{ household ~|~ at~least~one~member~with~age~in~between~  \geq 18 ~and~ < 65 \big\} 
$$

$$
C = \big\{ household ~|~ at~least~one~member~with~age~ < 18 \big\} 
$$

$$
type~I = R \setminus \big( A \cup C \big) 
$$

$$
type~II = A \setminus \big( A \cup C \big) 
$$

$$
type~III = R \cap A \setminus C
$$

$$
type~IV = R \cap C \setminus A
$$

$$
type~V = A \cap C \setminus R 
$$

$$
type~VI = R \cap A \cap C
$$

In [16]:
def analyse_household_structure(df, y):
    df_y = df[df['time']==y]
    household_type_counts = { ty:0 for ty in range(8) }

    #age_groups = ['children', 'adult', 'retired']
    for i in range(len(df_y)):
        row = df_y.iloc[i]
        #print(row)
        c = row['children']
        a = row['adult']
        r = row['retired']
        if r==0 and a==0 and c>0: # only children household, not exist by rules 
            household_type_counts[0]+=1
        elif r>0 and a==0 and c==0:
            household_type_counts[1]+=1
        elif r==0 and a>0 and c==0:
            household_type_counts[2]+=1
        elif r>0 and a>0 and c==0:
            household_type_counts[3]+=1
        elif r>0 and a==0 and c>0:
            household_type_counts[4]+=1
        elif r==0 and a>0 and c>0:
            household_type_counts[5]+=1
        elif r>0 and a>0 and c>0:
            household_type_counts[6]+=1
        else:
            # all == 0, a no-people household, not possible to be exist, just for safely capture the else statement
            household_type_counts[7]+=1
    return household_type_counts

def analyse_household_structure_dynamic(df, ys=None):
    if ys is None:
        all_times = list(set(df['time'].tolist()))
    else:
        all_times = ys
    hs_dy = {}
    for y in tqdm(all_times, desc='time'):
        strc = analyse_household_structure(df, y)
        strc2 = { 'type_{}'.format(str(ty)):v for ty,v in strc.items() }
        hs_dy[y] = strc2
    hs_dynamic = pd.DataFrame.from_dict(hs_dy, orient='index')
    hs_dynamic.index.name = 'time'
    hs_dynamic = hs_dynamic.reset_index()
    return hs_dynamic

In [17]:
analyse_household_structure_dynamic(df)

time: 100%|██████████| 21/21 [01:37<00:00,  5.44s/it]


Unnamed: 0,time,type_0,type_1,type_2,type_3,type_4,type_5,type_6,type_7
0,0,1284,35,4893,2564,627,6455,2524,0
1,1,1162,84,5624,2669,634,6394,2465,0
2,2,1054,142,6390,2736,635,6309,2400,0
3,3,932,227,7059,2836,655,6247,2273,0
4,4,827,323,7610,2974,653,6137,2204,0
5,5,723,418,8227,3089,664,6025,2146,0
6,6,622,578,8807,3181,642,5932,2070,0
7,7,528,756,9319,3299,662,5770,1961,0
8,8,447,974,9774,3453,656,5591,1830,0
9,9,360,1225,10262,3588,622,5431,1686,0


In [26]:
time_sum = {}
for t in all_times[1:]:
    #print(t)
    events_on_t = hh_events[hh_events['time']==t]
    size_up = len(events_on_t[events_on_t['event']=='size-up'])
    size_down = len(events_on_t[events_on_t['event']=='size-down'])
    new_hh = len(events_on_t[events_on_t['event']=='new-houshold'])
    time_sum[t] = {'size_up':size_up, 'size_down':size_down, 'new_household':new_hh, 'sub_total':size_up+size_down+new_hh}
time_sum_df = pd.DataFrame.from_dict(time_sum, orient='index')
time_sum_df

Unnamed: 0,size_up,size_down,new_household,sub_total
0,0,0,0,0
1,439,898,389,1726
2,425,939,372,1736
3,451,869,348,1668
4,449,888,354,1691
5,447,847,336,1630
6,466,808,329,1603
7,491,825,325,1641
8,444,810,319,1573
9,491,856,341,1688


In [14]:
len(hh_events)

33334

In [16]:
len(hh_events[hh_events['event']=='size-up'])

9881

In [17]:
len(hh_events[hh_events['event']=='size-down'])

16929

In [19]:
len(hh_events[hh_events['event']=='new-household'])

6524

In [19]:
basedir = '../../data/processed_dat/_run_output'
res_dirs = os.listdir(basedir)
res_dirs = [ d for d in res_dirs if 's' in d ]
res_dirs

['nonburn_0.005_s366725_y2000',
 'nonburn_0.005_s767514_y2000',
 'nonburn_0.005_s183236_y2000',
 'nonburn_0.005_s477025_y2000',
 'nonburn_0.005_s739784_y2000',
 'nonburn_0.005_s997948_y2000',
 'nonburn_0.005_s808225_y2000',
 'nonburn_0.005_s481850_y2000',
 'nonburn_0.005_s152745_y2000',
 'nonburn_0.005_s412125_y2000']

In [20]:
dfs = []

for di in res_dirs:
    fp = os.path.join(basedir, di, 'stored_household.csv')
    df = pd.read_csv(fp)
    df['children'] = df['infant'] + df['school_age']
    df_res = analyse_household_structure_dynamic(df)
    dfs.append(df_res)

time: 100%|██████████| 21/21 [00:21<00:00,  1.17s/it]
time: 100%|██████████| 21/21 [00:21<00:00,  1.14s/it]
time: 100%|██████████| 21/21 [00:21<00:00,  1.15s/it]
time: 100%|██████████| 21/21 [00:21<00:00,  1.16s/it]
time: 100%|██████████| 21/21 [00:21<00:00,  1.17s/it]
time: 100%|██████████| 21/21 [00:21<00:00,  1.13s/it]
time: 100%|██████████| 21/21 [00:21<00:00,  1.17s/it]
time: 100%|██████████| 21/21 [00:22<00:00,  1.19s/it]
time: 100%|██████████| 21/21 [00:21<00:00,  1.24s/it]
time: 100%|██████████| 21/21 [00:22<00:00,  1.21s/it]


In [28]:
for df_res in dfs:
    last = []
    for t in range(0,21):
        last_i = df_res.iloc[t]['type_0']
        last.append(last_i)
    print(last)

[305, 280, 251, 227, 203, 173, 151, 133, 108, 88, 67, 47, 38, 21, 15, 8, 0, 0, 0, 0, 0]
[333, 297, 257, 227, 200, 177, 156, 132, 107, 85, 76, 63, 43, 34, 22, 9, 0, 0, 0, 0, 0]
[323, 292, 260, 228, 198, 173, 150, 136, 116, 98, 75, 61, 46, 33, 19, 8, 0, 0, 0, 0, 0]
[357, 329, 293, 265, 234, 211, 186, 156, 129, 108, 94, 70, 54, 38, 19, 12, 0, 0, 0, 0, 0]
[305, 277, 252, 227, 199, 177, 150, 124, 108, 84, 62, 47, 39, 33, 22, 10, 0, 0, 0, 0, 0]
[356, 322, 292, 260, 224, 203, 176, 145, 121, 100, 86, 71, 47, 29, 17, 6, 0, 0, 0, 0, 0]
[331, 304, 284, 256, 221, 194, 162, 141, 121, 95, 82, 63, 45, 35, 19, 10, 0, 0, 0, 0, 0]
[313, 276, 247, 221, 197, 171, 151, 126, 97, 84, 71, 55, 42, 29, 20, 12, 0, 0, 0, 0, 0]
[331, 302, 259, 228, 196, 169, 148, 126, 106, 86, 66, 52, 44, 32, 19, 9, 0, 0, 0, 0, 0]
[334, 304, 264, 242, 216, 187, 162, 136, 123, 102, 76, 58, 40, 24, 18, 8, 0, 0, 0, 0, 0]
