# Processing Dataset using PSGC Code

## Preliminaries

Import Dependencies

In [1]:
import pandas as pd
import os, re

Set Province

In [2]:
# Province to process. The PSGC lookup compares this value against the
# `adm2_en` column of the province table, so the spelling must match exactly.
province = "Catanduanes"

Set Start and End Dates

In [3]:
# Analysis window as pandas Timestamps (both set to midnight of the given day).
start_date, end_date = (
    pd.to_datetime(day) for day in ('2021-01-01', '2022-12-31')
)

(start_date, end_date)

(Timestamp('2021-01-01 00:00:00'), Timestamp('2022-12-31 00:00:00'))

## Get the PSGC Codes

Get PSGC Code for Province

In [7]:
# load the province-level PSGC lookup table
df_psgc_prov = pd.read_csv('../01_data/00_raw/02_psgc_codes/PH_Adm2_ProvDists.csv')

# get code for province; fail with a clear message instead of an opaque
# IndexError when the configured name is not present in the table
prov_matches = df_psgc_prov.loc[df_psgc_prov['adm2_en'] == province, 'adm2_psgc']
if prov_matches.empty:
    raise ValueError(f"Province {province!r} not found in column 'adm2_en' of PH_Adm2_ProvDists.csv")
prov_psgc = prov_matches.iloc[0]

# the lookup table is no longer needed once the code has been extracted
del df_psgc_prov, prov_matches

# last expression of the cell, so the notebook actually displays the code
prov_psgc

Get PSGC Codes for Municipalities

In [102]:
# load the municipality-level PSGC lookup table
df_psgc_mun = pd.read_csv('../01_data/00_raw/02_psgc_codes/PH_Adm3_MuniCities.csv')

# keep only the municipalities of the selected province; .copy() makes the
# result an independent frame so the column assignment below does not write
# to a slice of the full table (avoids SettingWithCopyWarning)
df_psgc_mun = df_psgc_mun[df_psgc_mun['adm2_psgc'] == prov_psgc].copy()

# left-pad the municipality code with zeros to a fixed width of 10 characters
# (the displayed adm3_psgc values have 9 digits, so this restores a leading zero)
df_psgc_mun['psgc_clean'] = df_psgc_mun['adm3_psgc'].astype(str).str.zfill(10)
mun_pgsc = df_psgc_mun['psgc_clean']

mun_pgsc.values

array(['0502001000', '0502002000', '0502003000', '0502004000',
       '0502005000', '0502006000', '0502007000', '0502008000',
       '0502009000', '0502010000', '0502011000'], dtype=object)

In [103]:
# Show the municipality lookup rows kept for the selected province,
# including the zero-padded `psgc_clean` column.
df_psgc_mun

Unnamed: 0,adm1_psgc,adm2_psgc,adm3_psgc,adm3_en,geo_level,len_crs,area_crs,len_km,area_km2,psgc_clean
557,500000000,502000000,502001000,Bagamanoc,Mun,69510,68263960,69,68.0,502001000
558,500000000,502000000,502002000,Baras,Mun,73441,71126568,73,71.0,502002000
559,500000000,502000000,502003000,Bato,Mun,57344,51954499,57,51.0,502003000
560,500000000,502000000,502004000,Caramoran,Mun,120928,281463181,120,281.0,502004000
561,500000000,502000000,502005000,Gigmoto,Mun,80885,106033936,80,106.0,502005000
562,500000000,502000000,502006000,Pandan,Mun,72753,106047281,72,106.0,502006000
563,500000000,502000000,502007000,Panganiban,Mun,74678,50585420,74,50.0,502007000
564,500000000,502000000,502008000,San Andres,Mun,85557,179142572,85,179.0,502008000
565,500000000,502000000,502009000,San Miguel,Mun,59751,242829264,59,242.0,502009000
566,500000000,502000000,502010000,Viga,Mun,90041,168222453,90,168.0,502010000


## Filter Case Information by Province

Combine all CSV files into one DataFrame

In [21]:
# directory holding the raw case-information extracts
case_dir = "../01_data/00_raw/00_case_information"

# Only pick up real CSV extracts, in sorted order so the row order is
# reproducible (os.listdir returns entries in arbitrary order).
# NOTE(review): the combined frame previously carried a column named
# "this is just here to include the folders to git repo", which suggests a
# non-CSV placeholder file in this folder was being parsed — confirm.
csv_files = sorted(
    name for name in os.listdir(case_dir) if name.lower().endswith('.csv')
)
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {case_dir}")

# Read each file exactly once and concatenate in a single call instead of
# growing the frame inside the loop. Index labels are kept as read, so they
# restart at 0 for every file.
df_cases = pd.concat(
    [pd.read_csv(os.path.join(case_dir, name)) for name in csv_files]
)


In [22]:
# Verify the number of case records (rows) and columns in the combined frame.
df_cases.shape

(4136488, 24)

In [80]:
# Sanity check: which cleaned municipality codes occur among the cases whose
# province of residence is the selected province?
# NOTE(review): this reads the 'psgc_clean' column, which is only added to
# df_cases by the "Create column with cleaned PSGC" cell further down — on a
# fresh kernel that cell has to run first (or this check should move below it).
# ProvRes appears to hold upper-case names (the previous hard-coded literal was
# "CATANDUANES"), so the configured province is upper-cased for the comparison.
df_cases.loc[df_cases['ProvRes'] == province.upper(), 'psgc_clean'].unique()

array(['0502009000', '0502001000', '0502011000', '0502004000', None,
       '0502005000', '0502002000', '0502007000', '0502006000',
       '0502003000', '0502008000', '0502010000'], dtype=object)

Create column with cleaned PSGC

In [62]:
def clean_psgc(x):
    """Return the first run of digits in ``x`` with a '0' inserted after the leading prefix.

    ``x`` is converted with ``str()`` first, so any value is accepted.

    * 9-digit run  -> '0' inserted after the first 2 digits (10 characters out).
    * 10-digit run -> '0' inserted after the first 3 digits (11 characters out).
    * any other length -> the digit run is returned unchanged.

    Returns ``None`` when ``x`` contains no digits at all (e.g. NaN / None).

    NOTE(review): the 10-digit case produces an 11-character code, which will
    not match the 10-character municipality codes — confirm this is intended.
    """
    found = re.search('[0-9]+', str(x))
    if found is None:
        return None

    digits = found.group(0)

    # position at which the extra zero goes, keyed by the length of the digit run
    insert_at = {9: 2, 10: 3}.get(len(digits))
    if insert_at is None:
        return digits

    return digits[:insert_at] + '0' + digits[insert_at:]

# add the cleaned municipality code for every case record as a new column
df_cases['psgc_clean'] = df_cases['CityMuniPSGC'].map(clean_psgc)

df_cases

Unnamed: 0,CaseCode,Age,AgeGroup,Sex,DateSpecimen,DateResultRelease,DateRepConf,DateDied,DateRecover,RemovalType,...,CityMuniPSGC,BarangayRes,BarangayPSGC,HealthStatus,Quarantined,DateOnset,Pregnanttab,ValidationStatus,this is just here to include the folders to git repo,psgc_clean
0,C404174,38.0,35 to 39,FEMALE,,2020-01-30,2020-01-30,,,RECOVERED,...,PH074610000,,,RECOVERED,NO,2020-01-21,NO,"Removal Type is ""Recovered"", but no Recovered ...",,0704610000
1,C462688,44.0,40 to 44,MALE,,2020-01-30,2020-02-03,2020-02-01,,DIED,...,PH074610000,,,DIED,NO,2020-01-18,,,,0704610000
2,C387710,60.0,60 to 64,FEMALE,2020-01-23,2020-01-30,2020-02-05,,2020-01-31,RECOVERED,...,PH071233000,,,RECOVERED,NO,2020-01-21,NO,Case has Admitting Facility but is not Admitte...,,0701233000
3,C377460,49.0,45 to 49,MALE,,,2020-03-06,,,RECOVERED,...,PH041028000,,,RECOVERED,NO,,,Case has Admitting Facility but is not Admitte...,,0401028000
4,C498051,63.0,60 to 64,MALE,2020-03-05,,2020-03-06,2020-03-11,,DIED,...,PH045805000,,,DIED,NO,,,Age or Birthdate is Invalid\nCase has Lab Resu...,,0405805000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136483,C27437726,55.0,55 to 59,FEMALE,12/28/2023,12/28/2023,12/31/2023,,,RECOVERED,...,PH064501000,MANDALAGAN,PH064501050,RECOVERED,,12/28/2023,,"Removal Type is ""Recovered"", but no Recovered ...",,0604501000
136484,C86805412,35.0,35 to 39,MALE,12/28/2023,12/28/2023,12/31/2023,,,RECOVERED,...,PH137501000,BARANGAY 168,PH137501168,RECOVERED,,12/27/2023,,"Removal Type is ""Recovered"", but no Recovered ...",,1307501000
136485,C6347838,56.0,55 to 59,MALE,12/29/2023,12/29/2023,12/31/2023,,,RECOVERED,...,PH137603000,TUNASAN,PH137603008,RECOVERED,,,,"Health Status is ""Recovered"", but no Date Reco...",,1307603000
136486,C80918199,65.0,65 to 69,FEMALE,12/27/2023,12/28/2023,12/31/2023,,,RECOVERED,...,PH143213000,DILAG,PH143213013,RECOVERED,,,,"Health Status is ""Recovered"", but no Date Reco...",,1403213000


Filter by PSGC 

In [97]:
# keep only the cases whose cleaned municipality code is one of the
# selected province's municipality codes
in_province = df_cases['psgc_clean'].isin(mun_pgsc)
df_cases_filtered = df_cases.loc[in_province]

# note: some case records have no PSGC code, so this count is lower than the
# number of cases obtained when filtering on the province name alone
df_cases_filtered.shape

(3999, 25)

Combine with Municipality Names

In [106]:
# Attach the municipality name (adm3_en) to every filtered case.
# validate='many_to_one' makes pandas raise a MergeError if a psgc_clean value
# appears more than once in the lookup table, so the left join can never
# silently duplicate case rows.
df_cases_filtered_with_mun = df_cases_filtered.merge(
    df_psgc_mun[['psgc_clean', 'adm3_en']],
    on='psgc_clean',
    how='left',
    validate='many_to_one',
)

df_cases_filtered_with_mun

Unnamed: 0,CaseCode,Age,AgeGroup,Sex,DateSpecimen,DateResultRelease,DateRepConf,DateDied,DateRecover,RemovalType,...,BarangayRes,BarangayPSGC,HealthStatus,Quarantined,DateOnset,Pregnanttab,ValidationStatus,this is just here to include the folders to git repo,psgc_clean,adm3_en
0,C781369,50.0,50 to 54,MALE,2020-05-30,,2020-06-06,,2020-07-07,RECOVERED,...,,,RECOVERED,YES,,,Age or Birthdate is Invalid\nCase has Admittin...,,0502009000,San Miguel
1,C636837,47.0,45 to 49,MALE,,,2020-06-22,,2020-07-08,RECOVERED,...,,,RECOVERED,YES,,,Age or Birthdate is Invalid,,0502001000,Bagamanoc
2,C951557,1.0,0 to 4,MALE,2020-07-02,2020-07-04,2020-07-06,,2020-07-23,RECOVERED,...,,,RECOVERED,NO,2020-06-28,,Age or Birthdate is Invalid,,0502011000,Virac
3,C639154,21.0,20 to 24,MALE,2020-06-30,2020-07-05,2020-07-08,,,RECOVERED,...,,,RECOVERED,NO,2020-06-30,,"Removal Type is ""Recovered"", but no Recovered ...",,0502004000,Caramoran
4,C619176,22.0,20 to 24,FEMALE,2020-07-06,2020-07-07,2020-07-10,,2020-07-23,RECOVERED,...,,,RECOVERED,YES,,NO,Age or Birthdate is Invalid,,0502011000,Virac
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3994,C71566037,69.0,65 to 69,MALE,12/21/2023,12/22/2023,12/24/2023,,,RECOVERED,...,PALTA SALVACION,PH052011042,RECOVERED,,12/19/2023,,"Removal Type is ""Recovered"", but no Recovered ...",,0502011000,Virac
3995,C62913016,38.0,35 to 39,MALE,12/27/2023,12/28/2023,12/30/2023,,,RECOVERED,...,DIVINO ROSTRO (POB.),PH052008017,RECOVERED,,12/22/2023,,"Health Status is ""Recovered"", but no Date Reco...",,0502008000,San Andres
3996,C47629021,69.0,65 to 69,FEMALE,12/27/2023,12/28/2023,12/30/2023,,,RECOVERED,...,RIZAL,PH052010018,RECOVERED,,12/24/2023,,"Health Status is ""Recovered"", but no Date Reco...",,0502010000,Viga
3997,C2258759,83.0,80+,FEMALE,12/28/2023,12/29/2023,12/31/2023,,,RECOVERED,...,CALATAGAN PROPER,PH052011011,RECOVERED,,12/26/2023,,"Removal Type is ""Recovered"", but no Recovered ...",,0502011000,Virac


Only Keep Necessary Columns