# Assumptions

- Population counts are taken at week 40 of each season.
- The population counts by age do not change significantly between the time the population count is taken and the age-breakpoint date used for sampling.
- 0-year-olds represent only people who are more than 6 months old.


In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime, date
%matplotlib inline


# Week number at which a season starts (population counts are taken at
# week 40 of each season).
start_week = 40
# NOTE(review): presumably the start week of the 2009 pandemic -- confirm.
pan_start = 15

# Date on which age is defined for each season, keyed by season label
# ('2009Pan' is the 2009 pandemic). For example, a breakpoint of 2007-01-01
# means that all individuals must be > 6 months old on or before 2007-01-01.
inferred_beakpoints = {
    '2008': '2008-01-01',
    '2009': '2008-12-01',
    '2009Pan': '2009-05-01',
    '2010': '2009-12-01',
    '2011': '2011-01-01',
    '2012': '2011-09-01',
    '2013': '2012-09-01',
    '2014': '2013-09-01',
    '2015': '2014-09-01',
    '2016': '2015-09-01',
    '2017': '2016-09-01',
    '2018': '2017-09-01',
}


# Population counts by age; the code below reads its Year, Age and MESA_pop
# columns.
eligible_demo = pd.read_csv('../raw_data/demography_by_age.csv')

# Seasons 2008-2018, plus 2009.5 which is the key used for the 2009 pandemic.
all_seasons = list(range(2008, 2019)) + [2009.5]

# Accumulator keyed by (birth year, season). It is seeded with a float zero
# because fractional populations are added to it cell by cell; seeding it with
# an integer zero would make those additions rely on silent int -> float
# upcasting, which pandas has deprecated.
index = pd.MultiIndex.from_product([range(1890, 2019), all_seasons],
                                   names=['Birth_year', 'Season'])
converted_df = pd.DataFrame(0.0,
                            index=index,
                            columns=['Population'])

# Table keyed by (age, season); it starts with no columns and is filled in
# by the code below.
index2 = pd.MultiIndex.from_product([set(eligible_demo.Age), all_seasons],
                                    names=['Age', 'Season'])
extended_df = pd.DataFrame(index=index2)

# Length, in days, of the six-month window used for the age-0 group.
_SIX_MONTHS_DAYS = 183


def _split_by_birth_year(age, breakpoint_day):
    """Split an age group between the two calendar years it can be born in.

    Parameters
    ----------
    age
        Age in years on ``breakpoint_day``.
    breakpoint_day : str
        Date, formatted as ``%Y-%m-%d``, on which age is defined.

    Returns
    -------
    tuple
        ``(birth_year_1, birth_year_2, frac_1)`` where ``birth_year_1`` is the
        breakpoint year minus ``age``, ``birth_year_2`` is one year earlier,
        and ``frac_1`` is the fraction of the group assigned to
        ``birth_year_1`` (the rest belongs to ``birth_year_2``).
    """
    start = datetime.strptime(breakpoint_day, '%Y-%m-%d')
    year = start.year
    day_of_year = start.timetuple().tm_yday  # 1-based ordinal day
    days_in_year = (date(year, 12, 31) - date(year, 1, 1)).days + 1

    if age != 0:
        frac_1 = day_of_year / days_in_year
    else:
        # If the age is 0 then the eligible cohort only consists of
        # children > 6 months old.
        frac_1 = (day_of_year - _SIX_MONTHS_DAYS) / _SIX_MONTHS_DAYS

    # A breakpoint early in the year gives a negative age-0 value; clamp it.
    frac_1 = max(frac_1, 0)
    return int(year - age), int(year - age - 1), frac_1


def _record_cohort(age, population, season, breakpoint_day):
    """Record one (age, season) population count in both output tables.

    Adds the birth-year split of ``population`` to the module-level
    ``converted_df`` and stores the split itself in the module-level
    ``extended_df``. Both frames are modified in place.

    Parameters
    ----------
    age
        Age of the group.
    population
        Population count of the group.
    season
        Season key used in the second index level of both tables.
    breakpoint_day : str
        Date, formatted as ``%Y-%m-%d``, on which age is defined.
    """
    birth_year_1, birth_year_2, frac_1 = _split_by_birth_year(age, breakpoint_day)
    pop_1 = frac_1 * population
    pop_2 = population - pop_1

    converted_df.loc[(birth_year_1, season), 'Population'] += pop_1
    converted_df.loc[(birth_year_1, season), 'a1_pop'] = pop_1
    converted_df.loc[(birth_year_1, season), 'a1'] = age

    converted_df.loc[(birth_year_2, season), 'Population'] += pop_2
    converted_df.loc[(birth_year_2, season), 'a2_pop'] = pop_2
    converted_df.loc[(birth_year_2, season), 'a2'] = age

    extended_df.loc[(age, season), 'Population'] = population
    extended_df.loc[(age, season), 'y1'] = birth_year_1
    extended_df.loc[(age, season), 'y2'] = birth_year_2
    extended_df.loc[(age, season), 'f1'] = frac_1
    extended_df.loc[(age, season), 'f2'] = (1 - frac_1)


# Regular seasons: every row after 2007 uses the breakpoint of its own year.
for _, row in eligible_demo.iterrows():
    if row.Year > 2007:
        _record_cohort(row.Age, row.MESA_pop, row.Year,
                       inferred_beakpoints[str(int(row.Year))])


# Add in pandemic, we assume that the age distribution does not change
# between 2008-2009 and the pandemic. The 2009 counts are therefore reused
# with the pandemic breakpoint and stored under season 2009.5.
for _, row in eligible_demo.iterrows():
    if row.Year == 2009:
        _record_cohort(row.Age, row.MESA_pop, 2009.5,
                       inferred_beakpoints['2009Pan'])

# Keep only the (birth year, season) cells that received some population,
# restricted to birth years from 1918 onwards.
has_population = converted_df.Population > 0
born_1918_or_later = converted_df.index.get_level_values('Birth_year') >= 1918
# Take an explicit copy: columns are assigned on the result just below, and
# assigning onto a boolean-mask selection triggers SettingWithCopyWarning.
converted_df = converted_df[has_population & born_1918_or_later].copy()

# f1 is the share of each cell's population recorded in a1_pop; f2 is the
# remainder.
# NOTE(review): cells that never received an a1 contribution have NaN in
# a1_pop, so f1 and f2 are NaN there -- confirm this is intended.
converted_df['f1'] = converted_df.a1_pop / converted_df.Population
converted_df['f2'] = 1.0 - converted_df.f1


In [2]:
# Order the birth-year table by birth year and then season, and write both
# tables out as CSV files.
converted_df = converted_df.sort_values(by=['Birth_year', 'Season'])
for frame, out_path in ((converted_df, '../data/demography_by_birth_year.csv'),
                        (extended_df, '../data/demography_by_age_extended.csv')):
    frame.to_csv(out_path)