In [1]:
import pandas as pd
import numpy as np

In [2]:
%autosave 0

Autosave disabled


<center>
<b>Read in the files and format them to match the requirements of the projection-generating script</b>
<br></br>
<br></br>
</center>
Some notes on the source files:
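- The mortality projections file comes from the NRS (National Records of Scotland) in .xml format. Convert it to Excel first; the notebook reads the `Mortality_assumptions` sheet of `Data/sc_ppz_opendata2016.xlsx`.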
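- The Uist population file needs to be re-shaped first from wide to long format so that it has just three columns: `Age`, `Gender` and `Pop_Count` (the population count). Keep the rows ordered by age within each gender, because the projection code ages the population by shifting the counts down one row.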

In [10]:
#load the Uist population counts and the NRS mortality assumptions
df_uist = pd.read_excel('Data/2016_Uist_data.xlsx')
df_mort = pd.read_excel(
    'Data/sc_ppz_opendata2016.xlsx',
    sheet_name='Mortality_assumptions',
)
#swap the numeric sex codes (1, 2) for descriptive labels;
#any other code ends up as NaN
df_mort = df_mort.assign(Sex=df_mort['Sex'].map({1: 'Male', 2: 'Female'}))

In [11]:
#index the mortality table by a combined sex+age key (e.g. 'Male12') so that
#a rate can be looked up by label; Sex and Age are dropped once they have
#been folded into the key
df_mort = (
    df_mort
    .set_index(df_mort['Sex'] + df_mort['Age'].astype('str').str.strip())
    .drop(columns=['Sex', 'Age'])
)

In [12]:
#recode the year columns to their integer starting year, assuming the
#originals are of the format '2016-2017'
#going through str() keeps the cell re-runnable: labels that are already
#integers (after a first run, or parsed as numbers by Excel) stay unchanged
#instead of raising a TypeError on the slice
orig_cols = df_mort.columns.values
new_cols = [int(str(x)[0:4]) for x in orig_cols]
mapper = dict(zip(orig_cols, new_cols))
df_mort = df_mort.rename(columns=mapper)

In [13]:
#for simplicity keep the mortality rates as whole numbers (no decimals)
df_mort = df_mort.round(decimals=0)

In [14]:
#give the Uist dataset the same gender+age key as the mortality table so the
#two line up; the key is kept both as the index and as the 'index_col' column
df_uist = (
    df_uist
    .assign(index_col=df_uist['Gender'] + df_uist['Age'].astype('str').str.strip())
    .drop(columns=['Gender', 'Age'])
    .set_index('index_col', drop=False)
)

<center>
<b>Projection code</b>
</center>

In [109]:
def population_projection(df_pop, df_mort, years, reps, old_age_mortality, **kwargs):
    """
    Monte-Carlo projection of a population structure.

    Starting from the population structure in the baseline year, every
    repetition moves the population forward one year at a time: the number
    of survivors of each sex/age group is drawn at random from that group's
    mortality rate, and the survivors then move up one age group. Survivors
    of the open-ended 90+ groups stay in their 90+ group. Births and
    migration are not modelled, so the youngest group of each sex is empty
    after the first projected year.

    Parameters
    ----------
    df_pop : pandas.DataFrame
        Baseline population, one row per sex/age group, indexed by the
        group key (e.g. 'Female12'). Needs an 'index_col' column holding
        the same key and a 'Pop_Count' column. Rows must be ordered by
        ascending age within each sex and include 'Female90+' and
        'Male90+', because ageing is done by shifting the counts down one
        row within each sex. The frame is not modified.
    df_mort : pandas.DataFrame
        Mortality rates on a per-100,000 scale, indexed by the same group
        keys, with one column per year label.
    years : int
        Number of years to project forward.
    reps : int
        Number of independent simulation runs.
    old_age_mortality : float
        Mortality of the 90+ groups as a value between 0 and 1; it is
        applied in every projected year.
    **kwargs
        'starting_year' (default 2016): baseline year of ``df_pop``; can be
        used if the data structure file is updated before the mortality
        projection is re-based.

    Returns
    -------
    pandas.DataFrame
        Columns 'Year', 'Pop_Count' (total population) and 'SIM' (run
        number); one row per run and year, baseline year included.

    Raises
    ------
    KeyError
        If a group key or year label is missing from ``df_mort``, or the
        90+ groups are missing from ``df_pop``.
    """

    #check if the optional parameter had been passed
    starting_year = kwargs.get('starting_year', 2016)
    #the old age mortality is supplied as a value between 0 and 1;
    #rescale it to the per-100,000 scale of the simulated dice
    old_age = 100000 * old_age_mortality
    open_ended_groups = ('Female90+', 'Male90+')
    #sex of every row = its key minus the trailing age ('Male12' -> 'Male');
    #used to keep the ageing step from spilling over from one sex to the other
    sex_of_row = df_pop['index_col'].str.rstrip('0123456789+').values

    def _survival_vector(row, year):
        """
        Helper function to probabilistically determine how many
        of the row's population survive given the rates of `year`
        """
        dice = np.random.randint(0, 100000, int(row['Pop_Count']))
        if row['index_col'] in open_ended_groups:
            mort = old_age
        else:
            #NOTE(review): the year columns are labelled with the starting
            #year of each 'YYYY-YYYY' interval, so `year - 1` may be the
            #column that covers this step - confirm against the NRS file
            mort = df_mort.loc[row['index_col'], year]

        #dice is uniform on 0..99999, so exactly `mort` outcomes lie below
        #`mort`; '>=' therefore gives a survival chance of 1 - mort/100000
        #(with '>' a mortality of 0 could still kill)
        return int(np.count_nonzero(dice >= mort))

    sim = [] #used for generating lines in the visualisation
    x = [] #years
    y = [] #total sum of projected population
    baseline_total = df_pop['Pop_Count'].sum()

    for i in range(reps):
        df = df_pop.copy()
        #add the starting points at the beginning of each rep
        x.append(starting_year)
        y.append(baseline_total)
        sim.append(i)

        for j in range(starting_year + 1, starting_year + years + 1):
            #iterate over years
            survived_pop = df.apply(_survival_vector, axis=1, args=(j,))
            #save the new 90+ survivors, they stay in their open-ended group
            male_90 = survived_pop.loc['Male90+']
            female_90 = survived_pop.loc['Female90+']

            #append the year and its projected population totals to the list
            x.append(j)
            y.append(survived_pop.sum())
            sim.append(i)

            #age everybody by one year: shift the survivors down one row
            #*within each sex* so that the 90+ survivors of the first sex
            #block do not land in the youngest row of the next block (where
            #they would be counted a second time); the youngest row gets 0
            aged = survived_pop.groupby(sex_of_row, sort=False).shift(1).fillna(0)
            df['Pop_Count'] = aged
            #add the 90+ survivors back to the 90+ group
            df.loc['Male90+', 'Pop_Count'] += male_90
            df.loc['Female90+', 'Pop_Count'] += female_90

    d3_df = pd.DataFrame(data={'Year':x, 'Pop_Count':y, 'SIM':sim})

    return d3_df

<center>Run the projection function to create low, medium and high variants</center>

In [112]:
#seed numpy's global random generator (the one the projection draws its dice
#from) so that Restart & Run All reproduces the same simulated paths
np.random.seed(2016)
#project 20 years ahead with 100 simulation runs per variant; the variants
#differ only in the assumed 90+ mortality
low = population_projection(df_uist, df_mort, 20, 100, 0.2)
medium = population_projection(df_uist, df_mort, 20, 100, 0.3)
high = population_projection(df_uist, df_mort, 20, 100, 0.45)

In [113]:
#tag every projection with the name of its variant ahead of concatenation
low = low.assign(variant='Low')
medium = medium.assign(variant='Medium')
high = high.assign(variant='High')

In [114]:
#stack the three variants into a single table with a fresh running index
df_final = pd.concat((low, medium, high), axis=0, ignore_index=True)

In [116]:
#export the combined projections, leaving out the row index
df_final.to_csv(path_or_buf='final.csv', index=False)