# Python Notebook for Calculation of Expected and SIR Values

## Preliminaries

Import Dependencies

In [1]:
import pandas as pd

Set Province

In [2]:
province = "CATANDUANES"

Load CSV files

In [87]:
# load cases dataset
df_cases = pd.read_csv(f'../01_data/01_processed/00_case_data/{province}_case_data.csv')
df_cases['Date'] = pd.to_datetime(df_cases['Date'])

# load population dataset
df_pop = pd.read_csv(f'../01_data/01_processed/01_population_data/{province}_simulated_population.csv')

In [88]:
df_cases.head()

Unnamed: 0,Date,Municipality,NewCases,Deaths,Recoveries,d_cases,n
0,2020-06-06,SAN MIGUEL,1,0,0,1,1
1,2020-06-07,SAN MIGUEL,0,0,0,0,1
2,2020-06-08,SAN MIGUEL,0,0,0,0,1
3,2020-06-09,SAN MIGUEL,0,0,0,0,1
4,2020-06-10,SAN MIGUEL,0,0,0,0,1


In [89]:
df_pop

Unnamed: 0,Municipality,2020,2021,2022
0,BAGAMANOC,11086,11239,11393
1,BARAS,13484,13274,13064
2,BATO,21748,21593,21438
3,CARAMORAN,32114,32114,32114
4,GIGMOTO,8712,8712,8712
5,PANDAN,21473,21157,20841
6,PANGANIBAN,9713,9713,9713
7,SAN ANDRES,38480,38480,38480
8,SAN MIGUEL,15680,15458,15235
9,VIGA,22869,22458,22047


Set date range to be used

In [90]:
start_date = pd.to_datetime('2021-01-01')
end_date = pd.to_datetime('2022-12-31')

(start_date, end_date)

(Timestamp('2021-01-01 00:00:00'), Timestamp('2022-12-31 00:00:00'))

Filter dataframe to only use select dates

In [91]:
df_cases_filtered = df_cases[(df_cases["Date"] >= start_date) & (df_cases["Date"] <= end_date)]
df_cases_filtered

Unnamed: 0,Date,Municipality,NewCases,Deaths,Recoveries,d_cases,n
209,2021-01-01,SAN MIGUEL,0,0,0,0,0
210,2021-01-02,SAN MIGUEL,0,0,0,0,0
211,2021-01-03,SAN MIGUEL,0,0,0,0,0
212,2021-01-04,SAN MIGUEL,0,0,0,0,0
213,2021-01-05,SAN MIGUEL,0,0,0,0,0
...,...,...,...,...,...,...,...
14054,2022-12-27,VIGA,0,0,0,0,0
14055,2022-12-28,VIGA,0,0,0,0,0
14056,2022-12-29,VIGA,0,0,0,0,0
14057,2022-12-30,VIGA,0,0,0,0,0


## Formula

The SIR value was calculated by Satorra & Tebé (2024) using this formla:
$$
    SIR_{it} = Y_{it}/E_{i}
$$
Where $Y_{it}$ is the observed number of cases in the $i$-th ABS (spatial division) and $E_i$ is the expected number of cases if it behaved like the whole population for the entire period.

The expected number of cases $E_i$ were calculated using indirect standardization with this formula:
$$
    E_i=\sum_{j=1}^J r_jN_j
$$
Where $N_j$ is the population in the $j$-th sex-age stratum of the area and $r_j$ is the average rate for the whole period in the same stratum and:
$$
    r_j=\frac{\sum_{i=1}^n\sum_{t=1}^T Y_{ijt}}{\sum_{i=1}^n\sum_{t=1}^T N_{ijt}}
$$

## Simplification of Formula

Since there is no available data for population by age and sex for each municipality, simplify the formula for $E_i$ to only take into account the expected number of cases for the entire population of the municipality:
$$
    E_i=rN_i
$$
Where $N_i$ is the total population of the $i$-th area and $r$ is the average rate for the whole period in the entire area.
[TO ADD: FORMULA]

## Calculation of Average Rate $r$

In [92]:
r = 0

year_range = range(int(start_date.year), int(end_date.year)+1)

# calculate rate per year
for year in year_range:
    # get sum per year
    r += (df_cases_filtered[df_cases_filtered['Date'].dt.year == year]['n']/ df_pop[f'{year}'].sum()).sum()

    #print((df_cases_filtered[df_cases_filtered['Date'].dt.year == year]['n']/ df_pop[f'{year}'].sum()).sum())
    

#r = r / len(year_range)
r

0.1585127054024255

Create new DataFrame to calcute for expected number of cases

In [93]:
df_exp = pd.DataFrame(columns=['Municipality', 'exp'])


for mun in df_pop['Municipality'].unique():
    # calculate expected number of cases, E_i = r*N_i
    N_i = 0

    for year in year_range:
        N_i += df_pop[df_pop['Municipality'] == mun][f'{year}']
    
    # expected number of cases is equal to the average number of cases per day per municipality
    E_i = (r * (N_i/ len(year_range))) / len(pd.date_range(start=start_date, end=end_date))

    # add expected value to row
    df_exp.loc[-1] = [mun, E_i.iloc[0]]
    df_exp.index = df_exp.index + 1 
    df_exp = df_exp.sort_index()


df_exp
        


Unnamed: 0,Municipality,exp
0,VIRAC,16.307157
1,VIGA,4.831923
2,SAN MIGUEL,3.33235
3,SAN ANDRES,8.355574
4,PANGANIBAN,2.109088
5,PANDAN,4.559737
6,GIGMOTO,1.89173
7,CARAMORAN,6.973256
8,BATO,4.671891
9,BARAS,2.859526


Combine results with DataFrame

In [94]:
df_cases_filtered

Unnamed: 0,Date,Municipality,NewCases,Deaths,Recoveries,d_cases,n
209,2021-01-01,SAN MIGUEL,0,0,0,0,0
210,2021-01-02,SAN MIGUEL,0,0,0,0,0
211,2021-01-03,SAN MIGUEL,0,0,0,0,0
212,2021-01-04,SAN MIGUEL,0,0,0,0,0
213,2021-01-05,SAN MIGUEL,0,0,0,0,0
...,...,...,...,...,...,...,...
14054,2022-12-27,VIGA,0,0,0,0,0
14055,2022-12-28,VIGA,0,0,0,0,0
14056,2022-12-29,VIGA,0,0,0,0,0
14057,2022-12-30,VIGA,0,0,0,0,0


In [95]:
df_cases_filtered_exp = pd.merge(df_cases_filtered, df_exp, on='Municipality')
df_cases_filtered_exp

Unnamed: 0,Date,Municipality,NewCases,Deaths,Recoveries,d_cases,n,exp
0,2021-01-01,SAN MIGUEL,0,0,0,0,0,3.332350
1,2021-01-02,SAN MIGUEL,0,0,0,0,0,3.332350
2,2021-01-03,SAN MIGUEL,0,0,0,0,0,3.332350
3,2021-01-04,SAN MIGUEL,0,0,0,0,0,3.332350
4,2021-01-05,SAN MIGUEL,0,0,0,0,0,3.332350
...,...,...,...,...,...,...,...,...
8025,2022-12-27,VIGA,0,0,0,0,0,4.831923
8026,2022-12-28,VIGA,0,0,0,0,0,4.831923
8027,2022-12-29,VIGA,0,0,0,0,0,4.831923
8028,2022-12-30,VIGA,0,0,0,0,0,4.831923


Export to New CSV File

In [None]:
df_cases_filtered_exp.to_csv(f"../01_data/01_processed/00_case_data/{province}_case_data_exp.csv",  index=False)