In [1]:
import pandas as pd
import numpy as np

In [2]:
# An interval of 0 switches the notebook's periodic autosave off
%autosave 0

Autosave disabled


<center>
**Read in the files and format to match the requirements of the projection-generating script**
<br></br>
<br></br>
</center>
Some notes on the source files:
- The mortality projections file comes from the NRS (National Records of Scotland) in .xml format. Convert it to Excel (.xlsx) first; the code below reads its `Mortality_assumptions` sheet.
- The Uist population file must first be reshaped from wide to long format so that it has just three columns: `Age`, `Gender` and `Pop_Count` (the population count).

In [3]:
#baseline Uist population (long format: one row per age/gender group)
df_uist = pd.read_excel('Data/2016_Uist_data.xlsx')
df_mort = (
    pd.read_excel('Data/sc_ppz_opendata2016.xlsx', sheet_name='Mortality_assumptions')
    #swap the numeric sex codes for descriptive labels; any other code becomes NaN
    .assign(Sex=lambda frame: frame['Sex'].map({1: 'Male', 2: 'Female'}))
)

In [4]:
#key every mortality row by gender+age (e.g. 'Male0', 'Female45') so that a rate can be
#fetched with a single hashed .loc lookup; once they are part of the key, the separate
#Sex and Age columns are no longer needed
df_mort = (
    df_mort
    .set_index(df_mort['Sex'] + df_mort['Age'].astype('str').str.strip())
    .drop(columns=['Sex', 'Age'])
)

In [5]:
#recode the year columns to their integer start year, assuming the originals are of
#the format '2016-2017' (-> 2016).
#str() makes the cell safe to re-run: labels that were already recoded to four-digit
#integers pass through unchanged instead of raising a TypeError on slicing
orig_cols = df_mort.columns.values
new_cols = [int(str(x)[0:4]) for x in orig_cols]
mapper = dict(zip(orig_cols, new_cols))
df_mort = df_mort.rename(columns=mapper)

In [6]:
#keep the mortality rates as whole numbers for simplicity (zero decimal places)
df_mort = df_mort.round(decimals=0)

In [7]:
#give the Uist data the same gender+age key as df_mort so that the two line up.
#The key is kept both as a column (read row-by-row in _survival_vector) and as the
#index (used for label lookups such as .loc['Male90+'])
df_uist = (
    df_uist
    .assign(index_col=df_uist['Gender'] + df_uist['Age'].astype('str').str.strip())
    .set_index('index_col', drop=False)
)

In [8]:
def _survival_vector(row, year, old_age):
    """
    Helper function to probabilistically determine
    how many of the starter population survive given
    that year's survival rates for their age/gender
    """
    dice = np.random.randint(0, 100000, int(row['Pop_Count']))

    if (row['index_col'] == 'Female90+') | (row['index_col'] == 'Male90+'):
        mort = old_age
    else:
        mort = df_mort.loc[row['index_col'], year]

    success = list(dice > mort)
    return success.count(True)

<center>Population Pyramid Code</center>

In [9]:
def population_pyramid(df_pop, df_mort, years, reps, old_age_mortality, **kwargs):
    """
    Using the information on population structure in baseline year
    project the natural decline of that structure forward given
    number of years and old age (90+) mortality, keeping the
    age/gender breakdown of every projected year.

    No births are added: the youngest age group is empty after the
    first projected year.

    Parameters:
    - df_pop: baseline population with 'Age', 'Gender', 'Pop_Count' and
      'index_col' columns, indexed by the gender+age key. Within each
      gender the rows must be sorted by ascending age with the
      'Male90+' / 'Female90+' row last, because ageing is done by
      shifting the rows down by one.
    - df_mort: not read directly here; the mortality rates are looked
      up inside _survival_vector.
    - years: number of years to project beyond the starting year.
    - reps: number of times to repeat the projection.
    - old_age_mortality: 90+ mortality as a value between 0 and 1.

    The function takes an optional parameter:
    - 'starting_year' which can be used if the data structure file
    is updated before the mortality projection is re-based.
    The default value is 2016.

    Returns a DataFrame with one row per repetition, year, age and
    gender holding 'Year', 'Age', 'Gender', 'Pop_Count_Projection' and
    'SIM', plus the baseline count ('Pop_Count_Reference') and the other
    df_pop columns merged in for comparison.
    """

    #check if any optional parameters had been passed
    starting_year = kwargs.get('starting_year', 2016)
    #old age mortality is supplied as a value between 0 and 1; rescale it to the
    #per-100,000 scale that the survival draw works on
    old_age = 100000 * old_age_mortality

    sim = [] #repetition number of each row
    x = [] #years
    y = [] #projected population at a given year
    age = []
    gender = []

    for i in range(reps):
        #split the original DF into male and female for shifts to work correctly
        df_male = df_pop[df_pop['Gender'] == 'Male'].copy()
        df_female = df_pop[df_pop['Gender'] == 'Female'].copy()

        #Baseline rows in the same order every projected year is appended in
        #(females first, then males). Taking the counts AND the labels from this
        #one frame keeps them aligned even if df_pop is not pre-sorted, and the
        #row count is measured rather than assumed to be 2 x 91
        baseline = pd.concat([df_female, df_male])
        rows_per_year = len(baseline)

        #add the starting points at the beginning of each rep
        x += [starting_year] * rows_per_year
        y += baseline['Pop_Count'].tolist()
        age += baseline['Age'].tolist() * (years + 1)
        gender += baseline['Gender'].tolist() * (years + 1)
        sim += [i] * (years + 1) * rows_per_year

        for j in range(starting_year + 1, starting_year + years + 1):
            #df_male/female is the "working population" that is being overwritten each loop

            #1) run survive function on the population from the previous (j-1) year
            #that we created by overwriting the "working population" in the previous loop
            survived_male = df_male.apply(
                _survival_vector, axis=1, args=(j, old_age))
            survived_female = df_female.apply(
                _survival_vector, axis=1, args=(j, old_age))

            #2) save the 90+ survivors from j-1 before the shift pushes them off the end
            male_90 = survived_male.loc['Male90+']
            female_90 = survived_female.loc['Female90+']

            #3) age the population of j-1 (survived_male) and call it survived_male_new;
            #the youngest group has nobody ageing into it, hence the fill with 0
            survived_male_new = survived_male.shift(1).fillna(0)
            survived_female_new = survived_female.shift(1).fillna(0)

            #4) add the (j-1) survivors of the 90+ bracket to the aged 89 survivors who
            #are now the new 90+
            survived_male_new.loc['Male90+'] += male_90
            survived_female_new.loc['Female90+'] += female_90

            #5) add the j population to the list for export; females first to match
            #the baseline order
            x += [j] * rows_per_year
            y.extend(survived_female_new.tolist())
            y.extend(survived_male_new.tolist())

            #6) overwrite the "working population" with survivors from j
            #(assignment aligns on the gender+age index)
            df_male.loc[:, 'Pop_Count'] = survived_male_new
            df_female.loc[:, 'Pop_Count'] = survived_female_new

    pyramid_df = pd.DataFrame(data={'Year': x, 'Age': age, 'Gender': gender,
                                    'Pop_Count': y, 'SIM': sim})

    #add reference population for comparison
    pyramid_df = pyramid_df.merge(df_pop, how='left', on=['Age', 'Gender'],
                                  suffixes=('_Projection', '_Reference'))

    return pyramid_df

<center>Population Projection Code</center>

In [10]:
def population_projection(df_pop, df_mort, years, reps, old_age_mortality, **kwargs):
    """
    Project the total size of the baseline population forward in time.

    Every repetition starts from df_pop and, for each projected year,
    draws the survivors of every age/gender group, moves them up one
    age group and pools the people reaching 90+ with those already
    there. Only the yearly grand total is recorded.

    Parameters:
    - df_pop: baseline population with 'Gender', 'Pop_Count' and
      'index_col' columns, indexed by the gender+age key; each gender
      sorted by ascending age with its 90+ row last.
    - df_mort: not read directly here; the mortality rates are looked
      up inside _survival_vector.
    - years: number of years to project beyond the starting year.
    - reps: number of repetitions (one line each in the visualisation).
    - old_age_mortality: 90+ mortality as a value between 0 and 1.

    Optional keyword:
    - 'starting_year': baseline year of df_pop, for when the population
      file is updated before the mortality projection is re-based.
      Defaults to 2016.

    Returns a DataFrame with columns 'Year', 'Pop_Count' and 'SIM'.
    """
    first_year = kwargs.get('starting_year', 2016)
    #rescale the 0-1 old age mortality to the per-100,000 scale of the survival draw
    top_bracket_rate = 100000 * old_age_mortality

    def _one_year_on(cohort, top_key, year):
        #survivors of each age group, drawn against the rates for `year`
        alive = cohort.apply(_survival_vector, axis=1, args=(year, top_bracket_rate))
        #remember the top bracket's survivors before the shift drops them
        staying_on_top = alive.loc[top_key]
        #everybody moves up one age group; nobody enters the youngest one
        older = alive.shift(1).fillna(0)
        #new top bracket = those who just aged into it + those who were already in it
        older.loc[top_key] += staying_on_top
        return older

    year_axis = []
    totals = []
    run_ids = []

    for run in range(reps):
        #work on copies, one per gender, so df_pop itself is never overwritten
        men = df_pop[df_pop['Gender'] == 'Male'].copy()
        women = df_pop[df_pop['Gender'] == 'Female'].copy()

        #the baseline year opens every repetition
        year_axis.append(first_year)
        totals.append(df_pop['Pop_Count'].sum())
        run_ids.append(run)

        #range() stops one short of its upper limit, hence the + 1
        for year in range(first_year + 1, first_year + years + 1):
            men_next = _one_year_on(men, 'Male90+', year)
            women_next = _one_year_on(women, 'Female90+', year)

            year_axis.append(year)
            totals.append(men_next.sum() + women_next.sum())
            run_ids.append(run)

            #this year's result is the population next year's draw starts from
            men.loc[:, 'Pop_Count'] = men_next
            women.loc[:, 'Pop_Count'] = women_next

    return pd.DataFrame(data={'Year': year_axis, 'Pop_Count': totals, 'SIM': run_ids})

<center>Run the population pyramid projection and save to .csv</center>

In [11]:
#seed NumPy's global generator (the one _survival_vector draws from) so that the
#exported pyramid is identical on every Restart & Run All
np.random.seed(2016)
#20 projected years, a single repetition, 35% old age (90+) mortality
data = population_pyramid(df_uist, df_mort, 20, 1, 0.35)
#NOTE(review): the inputs are read from 'Data/' but the results go to 'data/' -
#confirm that both resolve as intended on a case-sensitive file system
data.to_csv('data/pop_pyramid.csv', index=False)

<center>Run the projection function to create low, medium and high variants</center>

In [12]:
#seed NumPy's global generator (the one _survival_vector draws from) so that the
#three variants are identical on every Restart & Run All
np.random.seed(2016)

#20 projected years and 100 repetitions per variant; the variants differ only in the
#old age (90+) mortality they assume
low = population_projection(df_uist, df_mort, 20, 100, 0.2)
medium = population_projection(df_uist, df_mort, 20, 100, 0.35)
high = population_projection(df_uist, df_mort, 20, 100, 0.5)

#add identifiers to the projections and concatenate
low['variant'] = 'Low'
medium['variant'] = 'Medium'
high['variant'] = 'High'

df_final = pd.concat([low, medium, high], ignore_index=True)
#NOTE(review): the inputs are read from 'Data/' but the results go to 'data/' -
#confirm that both resolve as intended on a case-sensitive file system
df_final.to_csv('data/pop_projection.csv', index=False)