# Processing Dataset using PGSC Code

## Preliminaries

Import Dependencies

In [1]:
import pandas as pd
import os, re

Set Province

In [2]:
province = "Catanduanes"

Set Start and End Dates

In [3]:
start_date = pd.to_datetime('2021-01-01')
end_date = pd.to_datetime('2022-12-31')

(start_date, end_date)

(Timestamp('2021-01-01 00:00:00'), Timestamp('2022-12-31 00:00:00'))

## Get the PSGC Codes

Get PSGC Code for Province

In [7]:
# load csv file
df_psgc_prov = pd.read_csv('../01_data/00_raw/02_psgc_codes/PH_Adm2_ProvDists.csv')

# get code for province
prov_psgc = df_psgc_prov.loc[df_psgc_prov['adm2_en'] == province].iloc[0]['adm2_psgc']
prov_psgc

# delete dataframe
del df_psgc_prov

Get PSGC Codes for Municipalities

In [102]:
# load csv file
df_psgc_mun = pd.read_csv('../01_data/00_raw/02_psgc_codes/PH_Adm3_MuniCities.csv')

# filter dataframe
df_psgc_mun = df_psgc_mun[df_psgc_mun['adm2_psgc'] == prov_psgc]

# set to length of 10 digits
#mun_pgsc = mun_pgsc.apply(lambda x: str(x).zfill(10))

df_psgc_mun['psgc_clean'] = df_psgc_mun['adm3_psgc'].apply(lambda x: str(x).zfill(10))
mun_pgsc = df_psgc_mun['psgc_clean']

mun_pgsc.values

array(['0502001000', '0502002000', '0502003000', '0502004000',
       '0502005000', '0502006000', '0502007000', '0502008000',
       '0502009000', '0502010000', '0502011000'], dtype=object)

In [103]:
#502001000 in mun_pgsc.values
df_psgc_mun

Unnamed: 0,adm1_psgc,adm2_psgc,adm3_psgc,adm3_en,geo_level,len_crs,area_crs,len_km,area_km2,psgc_clean
557,500000000,502000000,502001000,Bagamanoc,Mun,69510,68263960,69,68.0,502001000
558,500000000,502000000,502002000,Baras,Mun,73441,71126568,73,71.0,502002000
559,500000000,502000000,502003000,Bato,Mun,57344,51954499,57,51.0,502003000
560,500000000,502000000,502004000,Caramoran,Mun,120928,281463181,120,281.0,502004000
561,500000000,502000000,502005000,Gigmoto,Mun,80885,106033936,80,106.0,502005000
562,500000000,502000000,502006000,Pandan,Mun,72753,106047281,72,106.0,502006000
563,500000000,502000000,502007000,Panganiban,Mun,74678,50585420,74,50.0,502007000
564,500000000,502000000,502008000,San Andres,Mun,85557,179142572,85,179.0,502008000
565,500000000,502000000,502009000,San Miguel,Mun,59751,242829264,59,242.0,502009000
566,500000000,502000000,502010000,Viga,Mun,90041,168222453,90,168.0,502010000


## Filter Case Information by Province

Combine all CSV files into one DataFrame

In [21]:
# initialize an empty DataFrame
df_cases = pd.DataFrame()

# get all paths for each CSV file
dir = os.listdir("../01_data/00_raw/00_case_information")

# append each CSV file to DataFrame
for csv_file in dir:
    path = f'../01_data/00_raw/00_case_information/{csv_file}'

    # check if dataframe is empty
    if (df_cases.empty):
        #print(csv_file)
        df_cases = pd.read_csv(path)
    else:
        df_cases = pd.concat([df_cases, pd.read_csv(path)])

  df_cases = pd.read_csv(path)
  df_cases = pd.concat([df_cases, pd.read_csv(path)])


In [22]:
#verify number of cases
df_cases.shape

(4136488, 24)

In [80]:
df_cases[(df_cases['ProvRes'] == "CATANDUANES")]['psgc_clean'].unique()

array(['0502009000', '0502001000', '0502011000', '0502004000', None,
       '0502005000', '0502002000', '0502007000', '0502006000',
       '0502003000', '0502008000', '0502010000'], dtype=object)

Create column with cleaned PSGC

In [62]:
def clean_psgc(x):
    res_list = re.findall('[0-9]+', str(x))

    try:
        res = res_list[0]

        ## add zero after province
        if len(res) == 9:
            res = res[:2] + '0' + res[2:]
        elif len(res) == 10:
            res = res[:3] + '0' + res[3:]

        return res
    except IndexError:
        return None

df_cases['psgc_clean'] = df_cases['CityMuniPSGC'].apply(clean_psgc)

df_cases

Unnamed: 0,CaseCode,Age,AgeGroup,Sex,DateSpecimen,DateResultRelease,DateRepConf,DateDied,DateRecover,RemovalType,...,CityMuniPSGC,BarangayRes,BarangayPSGC,HealthStatus,Quarantined,DateOnset,Pregnanttab,ValidationStatus,this is just here to include the folders to git repo,psgc_clean
0,C404174,38.0,35 to 39,FEMALE,,2020-01-30,2020-01-30,,,RECOVERED,...,PH074610000,,,RECOVERED,NO,2020-01-21,NO,"Removal Type is ""Recovered"", but no Recovered ...",,0704610000
1,C462688,44.0,40 to 44,MALE,,2020-01-30,2020-02-03,2020-02-01,,DIED,...,PH074610000,,,DIED,NO,2020-01-18,,,,0704610000
2,C387710,60.0,60 to 64,FEMALE,2020-01-23,2020-01-30,2020-02-05,,2020-01-31,RECOVERED,...,PH071233000,,,RECOVERED,NO,2020-01-21,NO,Case has Admitting Facility but is not Admitte...,,0701233000
3,C377460,49.0,45 to 49,MALE,,,2020-03-06,,,RECOVERED,...,PH041028000,,,RECOVERED,NO,,,Case has Admitting Facility but is not Admitte...,,0401028000
4,C498051,63.0,60 to 64,MALE,2020-03-05,,2020-03-06,2020-03-11,,DIED,...,PH045805000,,,DIED,NO,,,Age or Birthdate is Invalid\nCase has Lab Resu...,,0405805000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136483,C27437726,55.0,55 to 59,FEMALE,12/28/2023,12/28/2023,12/31/2023,,,RECOVERED,...,PH064501000,MANDALAGAN,PH064501050,RECOVERED,,12/28/2023,,"Removal Type is ""Recovered"", but no Recovered ...",,0604501000
136484,C86805412,35.0,35 to 39,MALE,12/28/2023,12/28/2023,12/31/2023,,,RECOVERED,...,PH137501000,BARANGAY 168,PH137501168,RECOVERED,,12/27/2023,,"Removal Type is ""Recovered"", but no Recovered ...",,1307501000
136485,C6347838,56.0,55 to 59,MALE,12/29/2023,12/29/2023,12/31/2023,,,RECOVERED,...,PH137603000,TUNASAN,PH137603008,RECOVERED,,,,"Health Status is ""Recovered"", but no Date Reco...",,1307603000
136486,C80918199,65.0,65 to 69,FEMALE,12/27/2023,12/28/2023,12/31/2023,,,RECOVERED,...,PH143213000,DILAG,PH143213013,RECOVERED,,,,"Health Status is ""Recovered"", but no Date Reco...",,1403213000


Filter by PSGC 

In [97]:
df_cases_filtered = df_cases[df_cases['psgc_clean'].isin(mun_pgsc)]

#mun_pgsc

# note: some case data don't have the psgc code so it's less than the total number of cases if using only province
df_cases_filtered.shape

(3999, 25)

## Aggregate Data

Aggregate Data for Each Municipality

In [125]:
# create initial empty DataFrame
df_aggregated = pd.DataFrame()

def aggregate_data(df, mun):
    # for each unique dates, count the number of new cases, deaths, and recoveries
    cases = df.groupby("DateRepConf").size()
    cases.index.name = "Date"

    deaths = df[df["RemovalType"] == "DIED"].groupby("DateRepRem").size()
    deaths.index.name = "Date"

    recoveries = df[df["RemovalType"] == "RECOVERED"].groupby("DateRepRem").size()
    recoveries.index.name = "Date"

    # create a new DataFrame for aggregated data
    new_df = pd.DataFrame()

    data = {
        "NewCases" : cases,
        "Deaths" : deaths,
        "Recoveries" : recoveries
    }

    new_df = pd.concat(data, axis = 1)

    # add information
    #clean_mun = re.sub(' \(.*\)', '', mun)
    new_df["PSGC"] = mun

    new_df["Municipality"] = df_psgc_mun[df_psgc_mun['psgc_clean'] == mun]['adm3_en'].values[0]

    # sort by date
    new_df = new_df.reset_index()
    new_df["Date"] = pd.to_datetime(new_df.Date, format='mixed')

    new_df = new_df.sort_values(by="Date")
    new_df = new_df.fillna(0)

    return new_df


for mun in mun_pgsc:
    df_mun = aggregate_data(df_cases_filtered[df_cases_filtered["psgc_clean"] == mun], mun)
    df_aggregated = pd.concat([df_aggregated, df_mun], ignore_index=True)


# reorganize columns
#df_aggregated = df_aggregated.iloc[:, [4, 0, 1, 2, 3]]

#df_aggregated = df_aggregated.iloc[:, [4, 0, 1, 2, 3]]
#df_aggregated =df_aggregated.iloc[:, [6, 3, 4, 5, 0, 1, 2]]

# reformat date
#df_aggregated["Date"] = pd.to_datetime(df_aggregated.Date, format='mixed')

df_aggregated


Unnamed: 0,Date,NewCases,Deaths,Recoveries,PSGC,Municipality
0,2020-06-22,1.0,0.0,0.0,0502001000,Bagamanoc
1,2020-07-21,0.0,0.0,1.0,0502001000,Bagamanoc
2,2020-07-25,1.0,0.0,0.0,0502001000,Bagamanoc
3,2020-08-09,1.0,0.0,0.0,0502001000,Bagamanoc
4,2020-08-16,0.0,0.0,1.0,0502001000,Bagamanoc
...,...,...,...,...,...,...
2525,2023-12-24,2.0,0.0,0.0,0502011000,Virac
2526,2023-12-25,0.0,0.0,2.0,0502011000,Virac
2527,2023-12-31,2.0,0.0,0.0,0502011000,Virac
2528,2024-01-01,0.0,0.0,2.0,0502011000,Virac


## Compute for Cumulative Cases for each Municipality

Compute for change in cases per row

In [126]:
df_aggregated["d_cases"] = df_aggregated["NewCases"] - df_aggregated["Deaths"] - df_aggregated["Recoveries"]

df_aggregated

Unnamed: 0,Date,NewCases,Deaths,Recoveries,PSGC,Municipality,d_cases
0,2020-06-22,1.0,0.0,0.0,0502001000,Bagamanoc,1.0
1,2020-07-21,0.0,0.0,1.0,0502001000,Bagamanoc,-1.0
2,2020-07-25,1.0,0.0,0.0,0502001000,Bagamanoc,1.0
3,2020-08-09,1.0,0.0,0.0,0502001000,Bagamanoc,1.0
4,2020-08-16,0.0,0.0,1.0,0502001000,Bagamanoc,-1.0
...,...,...,...,...,...,...,...
2525,2023-12-24,2.0,0.0,0.0,0502011000,Virac,2.0
2526,2023-12-25,0.0,0.0,2.0,0502011000,Virac,-2.0
2527,2023-12-31,2.0,0.0,0.0,0502011000,Virac,2.0
2528,2024-01-01,0.0,0.0,2.0,0502011000,Virac,-2.0


Filter Data Frame to only use certain dates

In [None]:
#df_cases_filtered_date = df_aggregated[(df_aggregated["Date"] >= start_date) & (df_aggregated["Date"] <= end_date)]
#df_cases_filtered_date

Unnamed: 0,Date,NewCases,Deaths,Recoveries,PSGC,Municipality,d_cases
10,2021-03-04,1.0,0.0,0.0,0502001000,Bagamanoc,1.0
11,2021-03-28,0.0,0.0,1.0,0502001000,Bagamanoc,-1.0
12,2021-04-19,1.0,0.0,0.0,0502001000,Bagamanoc,1.0
13,2021-04-25,4.0,0.0,0.0,0502001000,Bagamanoc,4.0
14,2021-04-27,2.0,0.0,0.0,0502001000,Bagamanoc,2.0
...,...,...,...,...,...,...,...
2463,2022-11-04,0.0,0.0,1.0,0502011000,Virac,-1.0
2464,2022-11-25,1.0,0.0,0.0,0502011000,Virac,1.0
2465,2022-11-28,1.0,0.0,0.0,0502011000,Virac,1.0
2466,2022-12-04,0.0,0.0,1.0,0502011000,Virac,-1.0


Generate Date Range Series for Entire Data

In [119]:
# create series of date from min to max
date_range = pd.date_range(start=start_date, end=end_date).to_frame(name="Date")

date_range

Unnamed: 0,Date
2021-01-01,2021-01-01
2021-01-02,2021-01-02
2021-01-03,2021-01-03
2021-01-04,2021-01-04
2021-01-05,2021-01-05
...,...
2022-12-27,2022-12-27
2022-12-28,2022-12-28
2022-12-29,2022-12-29
2022-12-30,2022-12-30


Calculate cumulative cases per municipality

In [None]:
df_cumsum = pd.DataFrame()

def calculate_cumsum(df, mun):
    # dataframe with all dates included
    df_extended = pd.DataFrame()
    df_extended["Date"] = date_range

    # merge with existing dataframe
    df_extended = pd.merge(df_extended, df, how='outer', on='Date')

    # add additional data
    df_extended['PSGC'] = mun
    df_extended["Municipality"] = df_psgc_mun[df_psgc_mun['psgc_clean'] == mun]['adm3_en'].values[0]

    df_extended = df_extended.fillna(0)

    # compute cumulative data
    df_extended["n"] = df_extended["d_cases"].cumsum()

    return df_extended

for mun in mun_pgsc:
    df_mun = calculate_cumsum(df_aggregated[df_aggregated["PSGC"]==mun], mun)
    df_cumsum = pd.concat([df_cumsum, df_mun])

df_cumsum

Unnamed: 0,Date,NewCases,Deaths,Recoveries,PSGC,Municipality,d_cases,n
0,2021-01-01,0.0,0.0,0.0,0502001000,Bagamanoc,0.0,0.0
1,2021-01-02,0.0,0.0,0.0,0502001000,Bagamanoc,0.0,0.0
2,2021-01-03,0.0,0.0,0.0,0502001000,Bagamanoc,0.0,0.0
3,2021-01-04,0.0,0.0,0.0,0502001000,Bagamanoc,0.0,0.0
4,2021-01-05,0.0,0.0,0.0,0502001000,Bagamanoc,0.0,0.0
...,...,...,...,...,...,...,...,...
725,2022-12-27,0.0,0.0,0.0,0502011000,Virac,0.0,-7.0
726,2022-12-28,0.0,0.0,0.0,0502011000,Virac,0.0,-7.0
727,2022-12-29,0.0,0.0,0.0,0502011000,Virac,0.0,-7.0
728,2022-12-30,0.0,0.0,0.0,0502011000,Virac,0.0,-7.0


## Compute Expected Values

Load simulated population data

In [141]:
# load population dataset
df_pop = pd.read_csv(f'../01_data/01_processed/01_population_data/{province}_simulated_population.csv')

def clean_psgc(x):
    x = str(x)
    if len(x) == 9:
        return '0' + x
    else:
        return x

df_pop['PSGC'] = df_pop['PSGC'].apply(clean_psgc)

df_pop

Unnamed: 0,Municipality,PSGC,2020,2021,2022
0,BAGAMANOC,502001000,11086,11239,11393
1,BARAS,502002000,13484,13274,13064
2,BATO,502003000,21748,21593,21438
3,CARAMORAN,502004000,32114,32114,32114
4,GIGMOTO,502005000,8712,8712,8712
5,PANDAN,502006000,21473,21157,20841
6,PANGANIBAN,502007000,9713,9713,9713
7,SAN ANDRES,502008000,38480,38480,38480
8,SAN MIGUEL,502009000,15680,15458,15235
9,VIGA,502010000,22869,22458,22047


### Calcurate Average Case rate $r$

Where $r$ is the average case rate for an area for the entire period:

In [142]:
r = 0

year_range = range(int(start_date.year), int(end_date.year)+1)

# calculate rate per year
for year in year_range:
    # get sum per year
    r += (df_cumsum[df_cumsum['Date'].dt.year == year]['n']/ df_pop[f'{year}'].sum()).sum()
    
r

0.11776288880803139

### Calculate expected number of cases per municipality

Create new DataFrame for expected number of cases

In [143]:
df_exp = pd.DataFrame(columns=['PSGC', 'exp'])


for mun in df_pop['PSGC'].unique():
    # calculate expected number of cases, E_i = r*N_i
    N_i = 0

    for year in year_range:
        N_i += df_pop[df_pop['PSGC'] == mun][f'{year}']
    
    # expected number of cases is equal to the average number of cases per day per municipality
    E_i = (r * (N_i/ len(year_range))) / len(pd.date_range(start=start_date, end=end_date))

    # add expected value to row
    df_exp.loc[-1] = [mun, E_i.iloc[0]]
    df_exp.index = df_exp.index + 1 
    df_exp = df_exp.sort_index()


df_exp

Unnamed: 0,PSGC,exp
0,502011000,12.114978
1,502010000,3.589752
2,502009000,2.475682
3,502008000,6.207556
4,502007000,1.566892
5,502006000,3.387538
6,502005000,1.405411
7,502004000,5.180599
8,502003000,3.470859
9,502002000,2.12441


Combine results with dataset

In [144]:
df_cumsum_exp = pd.merge(df_cumsum, df_exp, on='PSGC')
df_cumsum_exp

Unnamed: 0,Date,NewCases,Deaths,Recoveries,PSGC,Municipality,d_cases,n,exp
0,2021-01-01,0.0,0.0,0.0,0502001000,Bagamanoc,0.0,0.0,1.825486
1,2021-01-02,0.0,0.0,0.0,0502001000,Bagamanoc,0.0,0.0,1.825486
2,2021-01-03,0.0,0.0,0.0,0502001000,Bagamanoc,0.0,0.0,1.825486
3,2021-01-04,0.0,0.0,0.0,0502001000,Bagamanoc,0.0,0.0,1.825486
4,2021-01-05,0.0,0.0,0.0,0502001000,Bagamanoc,0.0,0.0,1.825486
...,...,...,...,...,...,...,...,...,...
8025,2022-12-27,0.0,0.0,0.0,0502011000,Virac,0.0,-7.0,12.114978
8026,2022-12-28,0.0,0.0,0.0,0502011000,Virac,0.0,-7.0,12.114978
8027,2022-12-29,0.0,0.0,0.0,0502011000,Virac,0.0,-7.0,12.114978
8028,2022-12-30,0.0,0.0,0.0,0502011000,Virac,0.0,-7.0,12.114978


## Export Data to CSV file

Reorder Columns

In [146]:
df_cumsum_exp = df_cumsum_exp.iloc[:, [5, 4, 0, 1, 2, 3, 6, 7, 8]]
df_cumsum_exp

Unnamed: 0,Municipality,PSGC,Date,NewCases,Deaths,Recoveries,d_cases,n,exp
0,Bagamanoc,0502001000,2021-01-01,0.0,0.0,0.0,0.0,0.0,1.825486
1,Bagamanoc,0502001000,2021-01-02,0.0,0.0,0.0,0.0,0.0,1.825486
2,Bagamanoc,0502001000,2021-01-03,0.0,0.0,0.0,0.0,0.0,1.825486
3,Bagamanoc,0502001000,2021-01-04,0.0,0.0,0.0,0.0,0.0,1.825486
4,Bagamanoc,0502001000,2021-01-05,0.0,0.0,0.0,0.0,0.0,1.825486
...,...,...,...,...,...,...,...,...,...
8025,Virac,0502011000,2022-12-27,0.0,0.0,0.0,0.0,-7.0,12.114978
8026,Virac,0502011000,2022-12-28,0.0,0.0,0.0,0.0,-7.0,12.114978
8027,Virac,0502011000,2022-12-29,0.0,0.0,0.0,0.0,-7.0,12.114978
8028,Virac,0502011000,2022-12-30,0.0,0.0,0.0,0.0,-7.0,12.114978


In [147]:
df_cumsum_exp.to_csv(f"../01_data/01_processed/00_case_data/{province}_case_data_psgc.csv",  index=False)