# Processing Dataset using PGSC Code

## Preliminaries

Import Dependencies

In [1]:
import pandas as pd
import os, re

# for helper functions
import sys
sys.path.append('../06_helper_functions')
import helpers

Set Province

In [2]:
province = "Catanduanes"

Set Start and End Dates

In [3]:
start_date = pd.to_datetime('2021-01-01')
end_date = pd.to_datetime('2022-12-31')

(start_date, end_date)

(Timestamp('2021-01-01 00:00:00'), Timestamp('2022-12-31 00:00:00'))

## Get the PSGC Codes

Get PSGC Code for Province

In [4]:
prov_psgc = helpers.get_prov_code(province)

prov_psgc

502000000

Get PSGC Codes for Municipalities

In [18]:
mun_pgsc = helpers.get_mun_codes(int(prov_psgc))

mun_pgsc = mun_pgsc.astype(object).apply(lambda x: str(x).zfill(10))
mun_pgsc

557    0502001000
558    0502002000
559    0502003000
560    0502004000
561    0502005000
562    0502006000
563    0502007000
564    0502008000
565    0502009000
566    0502010000
567    0502011000
Name: adm3_psgc, dtype: object

## Filter Case Information by Province

Combine all CSV files into one DataFrame

In [6]:
# initialize an empty DataFrame
df_cases = pd.DataFrame()

# get all paths for each CSV file
dir = os.listdir("../01_data/00_raw/00_case_information")

# append each CSV file to DataFrame
for csv_file in dir:
    path = f'../01_data/00_raw/00_case_information/{csv_file}'

    # check if dataframe is empty
    if (df_cases.empty):
        #print(csv_file)
        df_cases = pd.read_csv(path)
    else:
        df_cases = pd.concat([df_cases, pd.read_csv(path)])

  df_cases = pd.read_csv(path)
  df_cases = pd.concat([df_cases, pd.read_csv(path)])


In [52]:
#verify number of cases
df_cases.shape

(4136488, 23)

In [53]:
df_cases.head()

Unnamed: 0,CaseCode,Age,AgeGroup,Sex,DateSpecimen,DateResultRelease,DateRepConf,DateDied,DateRecover,RemovalType,...,ProvRes,CityMunRes,CityMuniPSGC,BarangayRes,BarangayPSGC,HealthStatus,Quarantined,DateOnset,Pregnanttab,ValidationStatus
0,C404174,38.0,35 to 39,FEMALE,,2020-01-30,2020-01-30,,,RECOVERED,...,NEGROS ORIENTAL,DUMAGUETE CITY (CAPITAL),PH074610000,,,RECOVERED,NO,2020-01-21,NO,"Removal Type is ""Recovered"", but no Recovered ..."
1,C462688,44.0,40 to 44,MALE,,2020-01-30,2020-02-03,2020-02-01,,DIED,...,NEGROS ORIENTAL,DUMAGUETE CITY (CAPITAL),PH074610000,,,DIED,NO,2020-01-18,,
2,C387710,60.0,60 to 64,FEMALE,2020-01-23,2020-01-30,2020-02-05,,2020-01-31,RECOVERED,...,BOHOL,PANGLAO,PH071233000,,,RECOVERED,NO,2020-01-21,NO,Case has Admitting Facility but is not Admitte...
3,C377460,49.0,45 to 49,MALE,,,2020-03-06,,,RECOVERED,...,BATANGAS,SANTO TOMAS,PH041028000,,,RECOVERED,NO,,,Case has Admitting Facility but is not Admitte...
4,C498051,63.0,60 to 64,MALE,2020-03-05,,2020-03-06,2020-03-11,,DIED,...,RIZAL,CAINTA,PH045805000,,,DIED,NO,,,Age or Birthdate is Invalid\nCase has Lab Resu...


Create column with cleaned PSGC

In [7]:
def clean_psgc(x):
    res_list = re.findall('[0-9]+', str(x))

    try:
        res = res_list[0]

        ## add zero after province
        if len(res) == 9:
            res = res[:2] + '0' + res[2:]
        elif len(res) == 10:
            res = res[:3] + '0' + res[3:]

        return res
    except IndexError:
        return None

# clean psgc code for uniformity, since the format in the DOH case data is different from the PSGC shapefile
df_cases['psgc_clean'] = df_cases['CityMuniPSGC'].apply(clean_psgc)

df_cases.head(10)

Unnamed: 0,CaseCode,Age,AgeGroup,Sex,DateSpecimen,DateResultRelease,DateRepConf,DateDied,DateRecover,RemovalType,...,CityMunRes,CityMuniPSGC,BarangayRes,BarangayPSGC,HealthStatus,Quarantined,DateOnset,Pregnanttab,ValidationStatus,psgc_clean
0,C404174,38.0,35 to 39,FEMALE,,2020-01-30,2020-01-30,,,RECOVERED,...,DUMAGUETE CITY (CAPITAL),PH074610000,,,RECOVERED,NO,2020-01-21,NO,"Removal Type is ""Recovered"", but no Recovered ...",704610000.0
1,C462688,44.0,40 to 44,MALE,,2020-01-30,2020-02-03,2020-02-01,,DIED,...,DUMAGUETE CITY (CAPITAL),PH074610000,,,DIED,NO,2020-01-18,,,704610000.0
2,C387710,60.0,60 to 64,FEMALE,2020-01-23,2020-01-30,2020-02-05,,2020-01-31,RECOVERED,...,PANGLAO,PH071233000,,,RECOVERED,NO,2020-01-21,NO,Case has Admitting Facility but is not Admitte...,701233000.0
3,C377460,49.0,45 to 49,MALE,,,2020-03-06,,,RECOVERED,...,SANTO TOMAS,PH041028000,,,RECOVERED,NO,,,Case has Admitting Facility but is not Admitte...,401028000.0
4,C498051,63.0,60 to 64,MALE,2020-03-05,,2020-03-06,2020-03-11,,DIED,...,CAINTA,PH045805000,,,DIED,NO,,,Age or Birthdate is Invalid\nCase has Lab Resu...,405805000.0
5,C130591,58.0,55 to 59,FEMALE,2020-03-06,2020-03-07,2020-03-07,2020-03-12,,DIED,...,CAINTA,PH045805000,,,DIED,NO,,NO,Age or Birthdate is Invalid,405805000.0
6,C178743,39.0,35 to 39,MALE,2020-03-06,2020-03-08,2020-03-08,,2020-03-21,RECOVERED,...,CITY OF MAKATI,PH137602000,,,RECOVERED,YES,2020-03-03,,Age or Birthdate is Invalid,1307602000.0
7,C440075,33.0,30 to 34,MALE,2020-03-06,2020-03-08,2020-03-08,,2020-04-05,RECOVERED,...,,,,,RECOVERED,YES,2020-03-01,,Age or Birthdate is Invalid,
8,C202135,57.0,55 to 59,MALE,2020-03-06,2020-03-08,2020-03-08,,2020-03-23,RECOVERED,...,,,,,RECOVERED,NO,,,Age or Birthdate is Invalid,
9,C557002,86.0,80+,MALE,2020-03-06,2020-03-08,2020-03-08,2020-03-14,,DIED,...,CITY OF MARIKINA,PH137402000,,,DIED,NO,2020-03-01,,Age or Birthdate is Invalid,1307402000.0


Filter by PSGC 

In [19]:
df_cases_filtered = df_cases[df_cases['psgc_clean'].isin(mun_pgsc)]

# note: some case data don't have the psgc code so it's less than the total number of cases if using only province
df_cases_filtered.shape

(3999, 24)

In [20]:
df_cases_filtered.head(10)

Unnamed: 0,CaseCode,Age,AgeGroup,Sex,DateSpecimen,DateResultRelease,DateRepConf,DateDied,DateRecover,RemovalType,...,CityMunRes,CityMuniPSGC,BarangayRes,BarangayPSGC,HealthStatus,Quarantined,DateOnset,Pregnanttab,ValidationStatus,psgc_clean
19761,C781369,50.0,50 to 54,MALE,2020-05-30,,2020-06-06,,2020-07-07,RECOVERED,...,SAN MIGUEL,PH052009000,,,RECOVERED,YES,,,Age or Birthdate is Invalid\nCase has Admittin...,502009000
29313,C636837,47.0,45 to 49,MALE,,,2020-06-22,,2020-07-08,RECOVERED,...,BAGAMANOC,PH052001000,,,RECOVERED,YES,,,Age or Birthdate is Invalid,502001000
43311,C951557,1.0,0 to 4,MALE,2020-07-02,2020-07-04,2020-07-06,,2020-07-23,RECOVERED,...,VIRAC (CAPITAL),PH052011000,,,RECOVERED,NO,2020-06-28,,Age or Birthdate is Invalid,502011000
45859,C639154,21.0,20 to 24,MALE,2020-06-30,2020-07-05,2020-07-08,,,RECOVERED,...,CARAMORAN,PH052004000,,,RECOVERED,NO,2020-06-30,,"Removal Type is ""Recovered"", but no Recovered ...",502004000
49871,C619176,22.0,20 to 24,FEMALE,2020-07-06,2020-07-07,2020-07-10,,2020-07-23,RECOVERED,...,VIRAC (CAPITAL),PH052011000,,,RECOVERED,YES,,NO,Age or Birthdate is Invalid,502011000
50112,C274073,20.0,20 to 24,FEMALE,2020-07-06,2020-07-07,2020-07-10,,2020-07-23,RECOVERED,...,VIRAC (CAPITAL),PH052011000,,,RECOVERED,YES,,NO,Age or Birthdate is Invalid,502011000
52797,C700452,2.0,0 to 4,MALE,2020-07-07,2020-07-09,2020-07-12,,2020-07-23,RECOVERED,...,VIRAC (CAPITAL),PH052011000,,,RECOVERED,NO,2020-07-04,,Age or Birthdate is Invalid\nDate Admitted is ...,502011000
57704,C215410,23.0,20 to 24,FEMALE,2020-07-11,2020-07-14,2020-07-16,,,RECOVERED,...,VIRAC (CAPITAL),PH052011000,,,RECOVERED,YES,2020-07-02,NO,"Age or Birthdate is Invalid\nRemoval Type is ""...",502011000
62794,C386011,23.0,20 to 24,MALE,2020-07-16,2020-07-17,2020-07-19,,,RECOVERED,...,GIGMOTO,PH052005000,,,RECOVERED,YES,2020-07-12,,"Removal Type is ""Recovered"", but no Recovered ...",502005000
62898,C445116,61.0,60 to 64,MALE,2020-07-15,2020-07-17,2020-07-19,,2020-08-01,RECOVERED,...,VIRAC (CAPITAL),PH052011000,,,RECOVERED,NO,,,Age or Birthdate is Invalid,502011000


## Aggregate Data

Aggregate Data for Each Municipality

In [59]:
# create initial empty DataFrame
df_aggregated = pd.DataFrame()

def aggregate_data(df, psgc):
    # for each unique dates, count the number of new cases, deaths, and recoveries
    cases = df.groupby("DateRepConf").size()
    cases.index.name = "Date"

    deaths = df[df["RemovalType"] == "DIED"].groupby("DateRepRem").size()
    deaths.index.name = "Date"

    recoveries = df[df["RemovalType"] == "RECOVERED"].groupby("DateRepRem").size()
    recoveries.index.name = "Date"

    # create a new DataFrame for aggregated data
    new_df = pd.DataFrame()

    data = {
        "NewCases" : cases,
        "Deaths" : deaths,
        "Recoveries" : recoveries
    }

    new_df = pd.concat(data, axis = 1)

    # add information
    new_df["PSGC"] = psgc

    new_df["Municipality"] = df_psgc_mun[df_psgc_mun['psgc_clean'] == psgc]['adm3_en'].values[0]

    # sort by date
    new_df = new_df.reset_index()
    new_df["Date"] = pd.to_datetime(new_df.Date, format='mixed')

    new_df = new_df.sort_values(by="Date")
    new_df = new_df.fillna(0)

    return new_df


for psgc in mun_pgsc:
    df_mun = aggregate_data(df_cases_filtered[df_cases_filtered["psgc_clean"] == psgc], psgc)
    df_aggregated = pd.concat([df_aggregated, df_mun], ignore_index=True)


df_aggregated.head(10)


Unnamed: 0,Date,NewCases,Deaths,Recoveries,PSGC,Municipality
0,2020-06-22,1.0,0.0,0.0,502001000,Bagamanoc
1,2020-07-21,0.0,0.0,1.0,502001000,Bagamanoc
2,2020-07-25,1.0,0.0,0.0,502001000,Bagamanoc
3,2020-08-09,1.0,0.0,0.0,502001000,Bagamanoc
4,2020-08-16,0.0,0.0,1.0,502001000,Bagamanoc
5,2020-08-25,0.0,0.0,1.0,502001000,Bagamanoc
6,2020-10-19,1.0,0.0,0.0,502001000,Bagamanoc
7,2020-10-20,1.0,0.0,0.0,502001000,Bagamanoc
8,2020-11-01,0.0,0.0,1.0,502001000,Bagamanoc
9,2020-11-08,0.0,0.0,1.0,502001000,Bagamanoc


## Compute for Cumulative Cases for each Municipality

Compute for change in cases per row

In [60]:
df_aggregated["d_cases"] = df_aggregated["NewCases"] - df_aggregated["Deaths"] - df_aggregated["Recoveries"]

df_aggregated.head(10)

Unnamed: 0,Date,NewCases,Deaths,Recoveries,PSGC,Municipality,d_cases
0,2020-06-22,1.0,0.0,0.0,502001000,Bagamanoc,1.0
1,2020-07-21,0.0,0.0,1.0,502001000,Bagamanoc,-1.0
2,2020-07-25,1.0,0.0,0.0,502001000,Bagamanoc,1.0
3,2020-08-09,1.0,0.0,0.0,502001000,Bagamanoc,1.0
4,2020-08-16,0.0,0.0,1.0,502001000,Bagamanoc,-1.0
5,2020-08-25,0.0,0.0,1.0,502001000,Bagamanoc,-1.0
6,2020-10-19,1.0,0.0,0.0,502001000,Bagamanoc,1.0
7,2020-10-20,1.0,0.0,0.0,502001000,Bagamanoc,1.0
8,2020-11-01,0.0,0.0,1.0,502001000,Bagamanoc,-1.0
9,2020-11-08,0.0,0.0,1.0,502001000,Bagamanoc,-1.0


Generate Date Range Series for Entire Data

In [61]:
# create series of date from min to max
date_range = pd.date_range(start=df_aggregated['Date'].min(), end=df_aggregated['Date'].max()).to_frame(name="Date")

date_range

Unnamed: 0,Date
2020-06-06,2020-06-06
2020-06-07,2020-06-07
2020-06-08,2020-06-08
2020-06-09,2020-06-09
2020-06-10,2020-06-10
...,...
2024-01-04,2024-01-04
2024-01-05,2024-01-05
2024-01-06,2024-01-06
2024-01-07,2024-01-07


Calculate cumulative cases per municipality

In [63]:
df_cumsum = pd.DataFrame()

def calculate_cumsum(df, psgc):
    # dataframe with all dates included
    df_extended = pd.DataFrame()
    df_extended["Date"] = date_range

    # merge with existing dataframe
    df_extended = pd.merge(df_extended, df, how='outer', on='Date')

    # add additional data
    df_extended['PSGC'] = psgc
    df_extended["Municipality"] = df_psgc_mun[df_psgc_mun['psgc_clean'] == psgc]['adm3_en'].values[0]

    df_extended = df_extended.fillna(0)

    # compute cumulative data
    df_extended["n"] = df_extended["d_cases"].cumsum()

    return df_extended

for psgc in mun_pgsc:
    df_mun = calculate_cumsum(df_aggregated[df_aggregated["PSGC"] == psgc], psgc)
    df_cumsum = pd.concat([df_cumsum, df_mun])

df_cumsum.head(10)

Unnamed: 0,Date,NewCases,Deaths,Recoveries,PSGC,Municipality,d_cases,n
0,2020-06-06,0.0,0.0,0.0,502001000,Bagamanoc,0.0,0.0
1,2020-06-07,0.0,0.0,0.0,502001000,Bagamanoc,0.0,0.0
2,2020-06-08,0.0,0.0,0.0,502001000,Bagamanoc,0.0,0.0
3,2020-06-09,0.0,0.0,0.0,502001000,Bagamanoc,0.0,0.0
4,2020-06-10,0.0,0.0,0.0,502001000,Bagamanoc,0.0,0.0
5,2020-06-11,0.0,0.0,0.0,502001000,Bagamanoc,0.0,0.0
6,2020-06-12,0.0,0.0,0.0,502001000,Bagamanoc,0.0,0.0
7,2020-06-13,0.0,0.0,0.0,502001000,Bagamanoc,0.0,0.0
8,2020-06-14,0.0,0.0,0.0,502001000,Bagamanoc,0.0,0.0
9,2020-06-15,0.0,0.0,0.0,502001000,Bagamanoc,0.0,0.0


Filter Data Frame by Date

In [64]:
df_cases_filtered_date = df_cumsum[(df_cumsum["Date"] >= start_date) & (df_cumsum["Date"] <= end_date)]
df_cases_filtered_date

Unnamed: 0,Date,NewCases,Deaths,Recoveries,PSGC,Municipality,d_cases,n
209,2021-01-01,0.0,0.0,0.0,0502001000,Bagamanoc,0.0,0.0
210,2021-01-02,0.0,0.0,0.0,0502001000,Bagamanoc,0.0,0.0
211,2021-01-03,0.0,0.0,0.0,0502001000,Bagamanoc,0.0,0.0
212,2021-01-04,0.0,0.0,0.0,0502001000,Bagamanoc,0.0,0.0
213,2021-01-05,0.0,0.0,0.0,0502001000,Bagamanoc,0.0,0.0
...,...,...,...,...,...,...,...,...
934,2022-12-27,0.0,0.0,0.0,0502011000,Virac,0.0,2.0
935,2022-12-28,0.0,0.0,0.0,0502011000,Virac,0.0,2.0
936,2022-12-29,0.0,0.0,0.0,0502011000,Virac,0.0,2.0
937,2022-12-30,0.0,0.0,0.0,0502011000,Virac,0.0,2.0


## Compute Expected Values

Load population data

In [67]:
# load population dataset
df_pop = pd.read_csv(f'../01_data/01_processed/01_population_data/{province}_population.csv')

df_pop.head(10)

Unnamed: 0,Municipality,PSGC,2020,2021,2022
0,BAGAMANOC,502001000,11086,11239,11393
1,BARAS,502002000,13484,13274,13064
2,BATO,502003000,21748,21593,21438
3,CARAMORAN,502004000,32114,32114,32114
4,GIGMOTO,502005000,8712,8712,8712
5,PANDAN,502006000,21473,21157,20841
6,PANGANIBAN,502007000,9713,9713,9713
7,SAN ANDRES,502008000,38480,38480,38480
8,SAN MIGUEL,502009000,15680,15458,15235
9,VIGA,502010000,22869,22458,22047


### Calcurate Average Case rate $r$

Where $r$ is the average case rate for an area for the entire period:

In [68]:
r = 0

year_range = range(int(2021), int(2022)+1)

# calculate rate per year
for year in year_range:
    # get sum per year
    r += (df_cumsum[df_cumsum['Date'].dt.year == year]['n']/ df_pop[f'{year}'].sum()).sum()
    
r

0.1585127054024255

### Calculate expected number of cases per municipality

Create new DataFrame for expected number of cases

In [69]:
df_exp = pd.DataFrame(columns=['PSGC', 'exp'])


for mun in df_pop['PSGC'].unique():
    # calculate expected number of cases, E_i = r*N_i
    N_i = 0

    for year in year_range:
        N_i += df_pop[df_pop['PSGC'] == mun][f'{year}']
    
    # expected number of cases is equal to the average number of cases per day per municipality
    E_i = (r * (N_i/ len(year_range))) / len(pd.date_range(start=start_date, end=end_date))

    # add expected value to row
    df_exp.loc[-1] = [mun, E_i.iloc[0]]
    df_exp.index = df_exp.index + 1 
    df_exp = df_exp.sort_index()


df_exp

Unnamed: 0,PSGC,exp
0,502011000.0,16.307157
1,502010000.0,4.831923
2,502009000.0,3.33235
3,502008000.0,8.355574
4,502007000.0,2.109088
5,502006000.0,4.559737
6,502005000.0,1.89173
7,502004000.0,6.973256
8,502003000.0,4.671891
9,502002000.0,2.859526


Combine results with dataset

In [71]:
# clean psgc code for uniformity
df_exp['PSGC_old'] = df_exp['PSGC']
df_exp['PSGC'] = df_exp['PSGC_old'].astype(int).astype(str)
df_exp['PSGC'] = df_exp['PSGC'].apply(lambda x: str(x).zfill(10))

df_exp

Unnamed: 0,PSGC,exp,PSGC_old
0,502011000,16.307157,502011000.0
1,502010000,4.831923,502010000.0
2,502009000,3.33235,502009000.0
3,502008000,8.355574,502008000.0
4,502007000,2.109088,502007000.0
5,502006000,4.559737,502006000.0
6,502005000,1.89173,502005000.0
7,502004000,6.973256,502004000.0
8,502003000,4.671891,502003000.0
9,502002000,2.859526,502002000.0


In [77]:
# merge data frames with computed expected values
df_cumsum_exp = pd.merge(df_cases_filtered_date, df_exp_cleaned, on='PSGC')
df_cumsum_exp

Unnamed: 0,Date,NewCases,Deaths,Recoveries,PSGC,Municipality,d_cases,n,exp
0,2021-01-01,0.0,0.0,0.0,0502001000,Bagamanoc,0.0,0.0,2.457164
1,2021-01-02,0.0,0.0,0.0,0502001000,Bagamanoc,0.0,0.0,2.457164
2,2021-01-03,0.0,0.0,0.0,0502001000,Bagamanoc,0.0,0.0,2.457164
3,2021-01-04,0.0,0.0,0.0,0502001000,Bagamanoc,0.0,0.0,2.457164
4,2021-01-05,0.0,0.0,0.0,0502001000,Bagamanoc,0.0,0.0,2.457164
...,...,...,...,...,...,...,...,...,...
8025,2022-12-27,0.0,0.0,0.0,0502011000,Virac,0.0,2.0,16.307157
8026,2022-12-28,0.0,0.0,0.0,0502011000,Virac,0.0,2.0,16.307157
8027,2022-12-29,0.0,0.0,0.0,0502011000,Virac,0.0,2.0,16.307157
8028,2022-12-30,0.0,0.0,0.0,0502011000,Virac,0.0,2.0,16.307157


## Export Data to CSV file

Reorder Columns

In [78]:
# this is just for organization and for better readability
df_cumsum_exp = df_cumsum_exp.iloc[:, [5, 4, 0, 1, 2, 3, 6, 7, 8]]
df_cumsum_exp

Unnamed: 0,Municipality,PSGC,Date,NewCases,Deaths,Recoveries,d_cases,n,exp
0,Bagamanoc,0502001000,2021-01-01,0.0,0.0,0.0,0.0,0.0,2.457164
1,Bagamanoc,0502001000,2021-01-02,0.0,0.0,0.0,0.0,0.0,2.457164
2,Bagamanoc,0502001000,2021-01-03,0.0,0.0,0.0,0.0,0.0,2.457164
3,Bagamanoc,0502001000,2021-01-04,0.0,0.0,0.0,0.0,0.0,2.457164
4,Bagamanoc,0502001000,2021-01-05,0.0,0.0,0.0,0.0,0.0,2.457164
...,...,...,...,...,...,...,...,...,...
8025,Virac,0502011000,2022-12-27,0.0,0.0,0.0,0.0,2.0,16.307157
8026,Virac,0502011000,2022-12-28,0.0,0.0,0.0,0.0,2.0,16.307157
8027,Virac,0502011000,2022-12-29,0.0,0.0,0.0,0.0,2.0,16.307157
8028,Virac,0502011000,2022-12-30,0.0,0.0,0.0,0.0,2.0,16.307157


In [79]:
df_cumsum_exp.to_csv(f"../01_data/01_processed/00_case_data/{province}_case_data_psgc.csv",  index=False)