# Processing Dataset using PGSC Code

## Preliminaries

Import Dependencies

In [1]:
import pandas as pd
import os, re

Set Province

In [2]:
province = "Quezon"

Set Start and End Dates

In [3]:
start_date = pd.to_datetime('2021-01-01')
end_date = pd.to_datetime('2022-12-31')

(start_date, end_date)

(Timestamp('2021-01-01 00:00:00'), Timestamp('2022-12-31 00:00:00'))

## Get the PSGC Codes

Get PSGC Code for Province

In [4]:
# load csv file
df_psgc_prov = pd.read_csv('../01_data/00_raw/02_psgc_codes/PH_Adm2_ProvDists.csv')

# get code for province
prov_psgc = df_psgc_prov.loc[df_psgc_prov['adm2_en'] == province].iloc[0]['adm2_psgc']
prov_psgc

# delete dataframe
del df_psgc_prov

Get PSGC Codes for Municipalities

In [5]:
# load csv file
df_psgc_mun = pd.read_csv('../01_data/00_raw/02_psgc_codes/PH_Adm3_MuniCities.csv')

# filter dataframe
df_psgc_mun = df_psgc_mun[df_psgc_mun['adm2_psgc'] == prov_psgc]

# set to length of 10 digits
#mun_pgsc = mun_pgsc.apply(lambda x: str(x).zfill(10))

df_psgc_mun['psgc_clean'] = df_psgc_mun['adm3_psgc'].apply(lambda x: str(x).zfill(10))
mun_pgsc = df_psgc_mun['psgc_clean']

mun_pgsc.values

array(['0405601000', '0405602000', '0405603000', '0405605000',
       '0405606000', '0405607000', '0405608000', '0405610000',
       '0405615000', '0405616000', '0405617000', '0405618000',
       '0405619000', '0405620000', '0405621000', '0405622000',
       '0405623000', '0405625000', '0405627000', '0405628000',
       '0405629000', '0405630000', '0405631000', '0405632000',
       '0405633000', '0405634000', '0405635000', '0405636000',
       '0405637000', '0405638000', '0405639000', '0405640000',
       '0405641000', '0405642000', '0405644000', '0405645000',
       '0405646000', '0405647000', '0405648000', '0405649000'],
      dtype=object)

In [None]:
#502001000 in mun_pgsc.values
df_psgc_mun

## Filter Case Information by Province

Combine all CSV files into one DataFrame

In [7]:
# initialize an empty DataFrame
df_cases = pd.DataFrame()

# get all paths for each CSV file
dir = os.listdir("../01_data/00_raw/00_case_information")

# append each CSV file to DataFrame
for csv_file in dir:
    path = f'../01_data/00_raw/00_case_information/{csv_file}'

    # check if dataframe is empty
    if (df_cases.empty):
        #print(csv_file)
        df_cases = pd.read_csv(path)
    else:
        df_cases = pd.concat([df_cases, pd.read_csv(path)])

  df_cases = pd.read_csv(path)
  df_cases = pd.concat([df_cases, pd.read_csv(path)])


In [8]:
#verify number of cases
df_cases.shape

(4136488, 23)

In [9]:
df_cases.head()

Unnamed: 0,CaseCode,Age,AgeGroup,Sex,DateSpecimen,DateResultRelease,DateRepConf,DateDied,DateRecover,RemovalType,...,ProvRes,CityMunRes,CityMuniPSGC,BarangayRes,BarangayPSGC,HealthStatus,Quarantined,DateOnset,Pregnanttab,ValidationStatus
0,C404174,38.0,35 to 39,FEMALE,,2020-01-30,2020-01-30,,,RECOVERED,...,NEGROS ORIENTAL,DUMAGUETE CITY (CAPITAL),PH074610000,,,RECOVERED,NO,2020-01-21,NO,"Removal Type is ""Recovered"", but no Recovered ..."
1,C462688,44.0,40 to 44,MALE,,2020-01-30,2020-02-03,2020-02-01,,DIED,...,NEGROS ORIENTAL,DUMAGUETE CITY (CAPITAL),PH074610000,,,DIED,NO,2020-01-18,,
2,C387710,60.0,60 to 64,FEMALE,2020-01-23,2020-01-30,2020-02-05,,2020-01-31,RECOVERED,...,BOHOL,PANGLAO,PH071233000,,,RECOVERED,NO,2020-01-21,NO,Case has Admitting Facility but is not Admitte...
3,C377460,49.0,45 to 49,MALE,,,2020-03-06,,,RECOVERED,...,BATANGAS,SANTO TOMAS,PH041028000,,,RECOVERED,NO,,,Case has Admitting Facility but is not Admitte...
4,C498051,63.0,60 to 64,MALE,2020-03-05,,2020-03-06,2020-03-11,,DIED,...,RIZAL,CAINTA,PH045805000,,,DIED,NO,,,Age or Birthdate is Invalid\nCase has Lab Resu...


Create column with cleaned PSGC

In [10]:
def clean_psgc(x):
    res_list = re.findall('[0-9]+', str(x))

    try:
        res = res_list[0]

        ## add zero after province
        if len(res) == 9:
            res = res[:2] + '0' + res[2:]
        elif len(res) == 10:
            res = res[:3] + '0' + res[3:]

        return res
    except IndexError:
        return None

df_cases['psgc_clean'] = df_cases['CityMuniPSGC'].apply(clean_psgc)

df_cases

Unnamed: 0,CaseCode,Age,AgeGroup,Sex,DateSpecimen,DateResultRelease,DateRepConf,DateDied,DateRecover,RemovalType,...,CityMunRes,CityMuniPSGC,BarangayRes,BarangayPSGC,HealthStatus,Quarantined,DateOnset,Pregnanttab,ValidationStatus,psgc_clean
0,C404174,38.0,35 to 39,FEMALE,,2020-01-30,2020-01-30,,,RECOVERED,...,DUMAGUETE CITY (CAPITAL),PH074610000,,,RECOVERED,NO,2020-01-21,NO,"Removal Type is ""Recovered"", but no Recovered ...",0704610000
1,C462688,44.0,40 to 44,MALE,,2020-01-30,2020-02-03,2020-02-01,,DIED,...,DUMAGUETE CITY (CAPITAL),PH074610000,,,DIED,NO,2020-01-18,,,0704610000
2,C387710,60.0,60 to 64,FEMALE,2020-01-23,2020-01-30,2020-02-05,,2020-01-31,RECOVERED,...,PANGLAO,PH071233000,,,RECOVERED,NO,2020-01-21,NO,Case has Admitting Facility but is not Admitte...,0701233000
3,C377460,49.0,45 to 49,MALE,,,2020-03-06,,,RECOVERED,...,SANTO TOMAS,PH041028000,,,RECOVERED,NO,,,Case has Admitting Facility but is not Admitte...,0401028000
4,C498051,63.0,60 to 64,MALE,2020-03-05,,2020-03-06,2020-03-11,,DIED,...,CAINTA,PH045805000,,,DIED,NO,,,Age or Birthdate is Invalid\nCase has Lab Resu...,0405805000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136483,C27437726,55.0,55 to 59,FEMALE,12/28/2023,12/28/2023,12/31/2023,,,RECOVERED,...,BACOLOD CITY (CAPITAL),PH064501000,MANDALAGAN,PH064501050,RECOVERED,,12/28/2023,,"Removal Type is ""Recovered"", but no Recovered ...",0604501000
136484,C86805412,35.0,35 to 39,MALE,12/28/2023,12/28/2023,12/31/2023,,,RECOVERED,...,CALOOCAN CITY,PH137501000,BARANGAY 168,PH137501168,RECOVERED,,12/27/2023,,"Removal Type is ""Recovered"", but no Recovered ...",1307501000
136485,C6347838,56.0,55 to 59,MALE,12/29/2023,12/29/2023,12/31/2023,,,RECOVERED,...,CITY OF MUNTINLUPA,PH137603000,TUNASAN,PH137603008,RECOVERED,,,,"Health Status is ""Recovered"", but no Date Reco...",1307603000
136486,C80918199,65.0,65 to 69,FEMALE,12/27/2023,12/28/2023,12/31/2023,,,RECOVERED,...,CITY OF TABUK (CAPITAL),PH143213000,DILAG,PH143213013,RECOVERED,,,,"Health Status is ""Recovered"", but no Date Reco...",1403213000


Filter by PSGC 

In [11]:
df_cases_filtered = df_cases[df_cases['psgc_clean'].isin(mun_pgsc)]

#mun_pgsc

# note: some case data don't have the psgc code so it's less than the total number of cases if using only province
df_cases_filtered.shape

(32047, 24)

In [12]:
df_cases_filtered.head(10)

Unnamed: 0,CaseCode,Age,AgeGroup,Sex,DateSpecimen,DateResultRelease,DateRepConf,DateDied,DateRecover,RemovalType,...,CityMunRes,CityMuniPSGC,BarangayRes,BarangayPSGC,HealthStatus,Quarantined,DateOnset,Pregnanttab,ValidationStatus,psgc_clean
487,C225418,27.0,25 to 29,MALE,2020-03-18,2020-03-23,2020-03-24,,2020-04-08,RECOVERED,...,SARIAYA,PH045645000,,,RECOVERED,YES,2020-03-16,,Age or Birthdate is Invalid,405645000
1319,C298913,36.0,35 to 39,MALE,,,2020-03-29,,,RECOVERED,...,CITY OF TAYABAS,PH045647000,,,RECOVERED,YES,,,"Removal Type is ""Recovered"", but no Recovered ...",405647000
1628,C450673,69.0,65 to 69,MALE,2020-03-20,2020-03-25,2020-03-31,,2020-05-02,RECOVERED,...,SAMPALOC,PH045639000,,,RECOVERED,NO,2020-03-17,,Age or Birthdate is Invalid,405639000
1928,C189191,55.0,55 to 59,MALE,2020-03-26,2020-03-28,2020-03-31,2020-04-04,,DIED,...,CANDELARIA,PH045608000,,,DIED,NO,2020-03-18,,Age or Birthdate is Invalid,405608000
2135,C737892,23.0,20 to 24,FEMALE,2020-03-26,2020-03-29,2020-04-01,,2020-04-28,RECOVERED,...,SARIAYA,PH045645000,,,RECOVERED,NO,2020-03-21,NO,Age or Birthdate is Invalid,405645000
2808,C101015,79.0,75 to 79,MALE,2020-03-28,2020-04-01,2020-04-03,2020-04-06,,DIED,...,UNISAN,PH045649000,,,DIED,NO,2020-03-25,,Age or Birthdate is Invalid\nCase has Admittin...,405649000
3446,C337958,35.0,35 to 39,FEMALE,,,2020-04-06,,,RECOVERED,...,CITY OF TAYABAS,PH045647000,,,RECOVERED,YES,2020-03-21,NO,"Health Status is ""Recovered"", but no Date Reco...",405647000
3594,C273307,31.0,30 to 34,FEMALE,2020-04-03,,2020-04-08,,2020-05-10,RECOVERED,...,UNISAN,PH045649000,,,RECOVERED,NO,2020-04-01,NO,Age or Birthdate is Invalid,405649000
3635,C527987,66.0,65 to 69,MALE,2020-04-03,2020-04-06,2020-04-08,2020-04-03,,DIED,...,INFANTA,PH045620000,,,DIED,NO,2020-03-30,,Case has Admitting Facility but is not Admitte...,405620000
3697,C103706,74.0,70 to 74,FEMALE,2020-04-02,2020-04-07,2020-04-09,2020-04-11,,DIED,...,TIAONG,PH045648000,,,DIED,NO,2020-04-01,NO,Age or Birthdate is Invalid\nCase has Admittin...,405648000


## Aggregate Data

Aggregate Data for Each Municipality

In [13]:
# create initial empty DataFrame
df_aggregated = pd.DataFrame()

def aggregate_data(df, mun):
    # for each unique dates, count the number of new cases, deaths, and recoveries
    cases = df.groupby("DateRepConf").size()
    cases.index.name = "Date"

    deaths = df[df["RemovalType"] == "DIED"].groupby("DateRepRem").size()
    deaths.index.name = "Date"

    recoveries = df[df["RemovalType"] == "RECOVERED"].groupby("DateRepRem").size()
    recoveries.index.name = "Date"

    # create a new DataFrame for aggregated data
    new_df = pd.DataFrame()

    data = {
        "NewCases" : cases,
        "Deaths" : deaths,
        "Recoveries" : recoveries
    }

    new_df = pd.concat(data, axis = 1)

    # add information
    #clean_mun = re.sub(' \(.*\)', '', mun)
    new_df["PSGC"] = mun

    new_df["Municipality"] = df_psgc_mun[df_psgc_mun['psgc_clean'] == mun]['adm3_en'].values[0]

    # sort by date
    new_df = new_df.reset_index()
    new_df["Date"] = pd.to_datetime(new_df.Date, format='mixed')

    new_df = new_df.sort_values(by="Date")
    new_df = new_df.fillna(0)

    return new_df


for mun in mun_pgsc:
    df_mun = aggregate_data(df_cases_filtered[df_cases_filtered["psgc_clean"] == mun], mun)
    df_aggregated = pd.concat([df_aggregated, df_mun], ignore_index=True)


# reorganize columns
#df_aggregated = df_aggregated.iloc[:, [4, 0, 1, 2, 3]]

#df_aggregated = df_aggregated.iloc[:, [4, 0, 1, 2, 3]]
#df_aggregated =df_aggregated.iloc[:, [6, 3, 4, 5, 0, 1, 2]]

# reformat date
#df_aggregated["Date"] = pd.to_datetime(df_aggregated.Date, format='mixed')

df_aggregated


Unnamed: 0,Date,NewCases,Deaths,Recoveries,PSGC,Municipality
0,2020-07-15,1.0,0.0,0.0,0405601000,Agdangan
1,2020-07-17,2.0,0.0,0.0,0405601000,Agdangan
2,2020-07-21,2.0,0.0,0.0,0405601000,Agdangan
3,2020-07-30,0.0,0.0,1.0,0405601000,Agdangan
4,2020-08-09,0.0,0.0,1.0,0405601000,Agdangan
...,...,...,...,...,...,...
14600,2023-11-17,0.0,0.0,2.0,0405649000,Unisan
14601,2023-12-20,1.0,0.0,0.0,0405649000,Unisan
14602,2023-12-24,1.0,0.0,0.0,0405649000,Unisan
14603,2023-12-29,0.0,0.0,1.0,0405649000,Unisan


## Compute for Cumulative Cases for each Municipality

Compute for change in cases per row

In [14]:
df_aggregated["d_cases"] = df_aggregated["NewCases"] - df_aggregated["Deaths"] - df_aggregated["Recoveries"]

df_aggregated

Unnamed: 0,Date,NewCases,Deaths,Recoveries,PSGC,Municipality,d_cases
0,2020-07-15,1.0,0.0,0.0,0405601000,Agdangan,1.0
1,2020-07-17,2.0,0.0,0.0,0405601000,Agdangan,2.0
2,2020-07-21,2.0,0.0,0.0,0405601000,Agdangan,2.0
3,2020-07-30,0.0,0.0,1.0,0405601000,Agdangan,-1.0
4,2020-08-09,0.0,0.0,1.0,0405601000,Agdangan,-1.0
...,...,...,...,...,...,...,...
14600,2023-11-17,0.0,0.0,2.0,0405649000,Unisan,-2.0
14601,2023-12-20,1.0,0.0,0.0,0405649000,Unisan,1.0
14602,2023-12-24,1.0,0.0,0.0,0405649000,Unisan,1.0
14603,2023-12-29,0.0,0.0,1.0,0405649000,Unisan,-1.0


Filter Data Frame to only use certain dates

In [14]:
#df_cases_filtered_date = df_aggregated[(df_aggregated["Date"] >= start_date) & (df_aggregated["Date"] <= end_date)]
#df_cases_filtered_date

Generate Date Range Series for Entire Data

In [15]:
# create series of date from min to max
date_range = pd.date_range(start=df_aggregated['Date'].min(), end=df_aggregated['Date'].max()).to_frame(name="Date")

date_range

Unnamed: 0,Date
2020-03-24,2020-03-24
2020-03-25,2020-03-25
2020-03-26,2020-03-26
2020-03-27,2020-03-27
2020-03-28,2020-03-28
...,...
2024-01-15,2024-01-15
2024-01-16,2024-01-16
2024-01-17,2024-01-17
2024-01-18,2024-01-18


Calculate cumulative cases per municipality

In [16]:
df_cumsum = pd.DataFrame()

def calculate_cumsum(df, mun):
    # dataframe with all dates included
    df_extended = pd.DataFrame()
    df_extended["Date"] = date_range

    # merge with existing dataframe
    df_extended = pd.merge(df_extended, df, how='outer', on='Date')

    # add additional data
    df_extended['PSGC'] = mun
    df_extended["Municipality"] = df_psgc_mun[df_psgc_mun['psgc_clean'] == mun]['adm3_en'].values[0]

    df_extended = df_extended.fillna(0)

    # compute cumulative data
    df_extended["n"] = df_extended["d_cases"].cumsum()

    return df_extended

for mun in mun_pgsc:
    df_mun = calculate_cumsum(df_aggregated[df_aggregated["PSGC"]==mun], mun)
    df_cumsum = pd.concat([df_cumsum, df_mun])

df_cumsum

Unnamed: 0,Date,NewCases,Deaths,Recoveries,PSGC,Municipality,d_cases,n
0,2020-03-24,0.0,0.0,0.0,0405601000,Agdangan,0.0,0.0
1,2020-03-25,0.0,0.0,0.0,0405601000,Agdangan,0.0,0.0
2,2020-03-26,0.0,0.0,0.0,0405601000,Agdangan,0.0,0.0
3,2020-03-27,0.0,0.0,0.0,0405601000,Agdangan,0.0,0.0
4,2020-03-28,0.0,0.0,0.0,0405601000,Agdangan,0.0,0.0
...,...,...,...,...,...,...,...,...
1392,2024-01-15,0.0,0.0,0.0,0405649000,Unisan,0.0,0.0
1393,2024-01-16,0.0,0.0,0.0,0405649000,Unisan,0.0,0.0
1394,2024-01-17,0.0,0.0,0.0,0405649000,Unisan,0.0,0.0
1395,2024-01-18,0.0,0.0,0.0,0405649000,Unisan,0.0,0.0


Filter Data Frame by Date

In [17]:
end_date

Timestamp('2022-12-31 00:00:00')

In [18]:
df_cases_filtered_date = df_cumsum[(df_cumsum["Date"] >= start_date) & (df_cumsum["Date"] <= end_date)]
df_cases_filtered_date

Unnamed: 0,Date,NewCases,Deaths,Recoveries,PSGC,Municipality,d_cases,n
283,2021-01-01,0.0,0.0,0.0,0405601000,Agdangan,0.0,1.0
284,2021-01-02,0.0,0.0,0.0,0405601000,Agdangan,0.0,1.0
285,2021-01-03,0.0,0.0,0.0,0405601000,Agdangan,0.0,1.0
286,2021-01-04,0.0,0.0,0.0,0405601000,Agdangan,0.0,1.0
287,2021-01-05,0.0,0.0,0.0,0405601000,Agdangan,0.0,1.0
...,...,...,...,...,...,...,...,...
1008,2022-12-27,0.0,0.0,0.0,0405649000,Unisan,0.0,3.0
1009,2022-12-28,0.0,0.0,0.0,0405649000,Unisan,0.0,3.0
1010,2022-12-29,0.0,0.0,0.0,0405649000,Unisan,0.0,3.0
1011,2022-12-30,0.0,0.0,0.0,0405649000,Unisan,0.0,3.0


In [None]:
df_cases_filtered_date['Municipality'].unique()

array(['Agdangan', 'Alabat', 'Atimonan', 'Buenavista', 'Burdeos',
       'Calauag', 'Candelaria', 'Catanauan', 'Dolores', 'General Luna',
       'General Nakar', 'Guinayangan', 'Gumaca', 'Infanta', 'Jomalig',
       'Lopez', 'Lucban', 'Macalelon', 'Mauban', 'Mulanay',
       'Padre Burgos', 'Pagbilao', 'Panukulan', 'Patnanungan', 'Perez',
       'Pitogo', 'Plaridel', 'Polillo', 'Quezon', 'Real', 'Sampaloc',
       'San Andres', 'San Antonio', 'San Francisco', 'San Narciso',
       'Sariaya', 'Tagkawayan', 'City of Tayabas', 'Tiaong', 'Unisan'],
      dtype=object)

## Compute Expected Values

Load simulated population data

In [65]:
# load population dataset
df_pop = pd.read_csv(f'../01_data/01_processed/01_population_data/{province}_simulated_population.csv')

#def clean_psgc(x):
##    x = str(x)
#    if len(x) == 9:
#        return '0' + x
#    else:
#        return x

#df_pop['PSGC'] = df_pop['PSGC'].apply(clean_psgc)

df_pop

Unnamed: 0,Municipality,PSGC,2021,2022
0,Agdangan,405601000,12851,12764
1,Alabat,405602000,15630,15936
2,Atimonan,405603000,63432,70698
3,Buenavista,405605000,30047,30160
4,Burdeos,405606000,26760,24644
5,Calauag,405607000,73139,71809
6,Candelaria,405608000,117434,137881
7,Catanauan,405610000,71073,72752
8,Dolores,405615000,28891,32514
9,General Luna,405616000,26494,24804


### Calcurate Average Case rate $r$

Where $r$ is the average case rate for an area for the entire period:

In [66]:
r = 0

year_range = range(int(2021), int(2022)+1)

# calculate rate per year
for year in year_range:
    # get sum per year
    r += (df_cumsum[df_cumsum['Date'].dt.year == year]['n']/ df_pop[f'{year}'].sum()).sum()
    
r

0.15385684031350924

### Calculate expected number of cases per municipality

Create new DataFrame for expected number of cases

In [95]:
df_exp = pd.DataFrame(columns=['PSGC', 'exp'])


for mun in df_pop['PSGC'].unique():
    # calculate expected number of cases, E_i = r*N_i
    N_i = 0

    for year in year_range:
        N_i += df_pop[df_pop['PSGC'] == mun][f'{year}']
    
    # expected number of cases is equal to the average number of cases per day per municipality
    E_i = (r * (N_i/ len(year_range))) / len(pd.date_range(start=start_date, end=end_date))

    # add expected value to row
    df_exp.loc[-1] = [mun, E_i.iloc[0]]
    df_exp.index = df_exp.index + 1 
    df_exp = df_exp.sort_index()


df_exp

Unnamed: 0,PSGC,exp
0,405649000.0,5.514819
1,405648000.0,21.706144
2,405647000.0,22.386908
3,405646000.0,11.260424
4,405645000.0,31.399441
5,405644000.0,10.213776
6,405642000.0,13.021979
7,405641000.0,7.309043
8,405640000.0,7.541093
9,405639000.0,2.931078


Combine results with dataset

In [104]:
df_exp_cleaned = df_exp.copy()

def clean_psgc2(x):
    res_list = re.findall('[0-9]+', str(x))

    try:
        res = res_list[0]

        ## add zero after province
        if len(res) == 9:
            res = '0' + res[0:]
        elif len(res) == 10:
            res = res[:2] + '0' + res[2:]

        return res
    except IndexError:
        return None

# clean psgc code
df_exp_cleaned['PSGC'] = df_exp['PSGC'].apply(clean_psgc2)

df_exp_cleaned

Unnamed: 0,PSGC,exp
0,405649000,5.514819
1,405648000,21.706144
2,405647000,22.386908
3,405646000,11.260424
4,405645000,31.399441
5,405644000,10.213776
6,405642000,13.021979
7,405641000,7.309043
8,405640000,7.541093
9,405639000,2.931078


In [105]:
df_cases_filtered_date

Unnamed: 0,Date,NewCases,Deaths,Recoveries,PSGC,Municipality,d_cases,n
283,2021-01-01,0.0,0.0,0.0,0405601000,Agdangan,0.0,1.0
284,2021-01-02,0.0,0.0,0.0,0405601000,Agdangan,0.0,1.0
285,2021-01-03,0.0,0.0,0.0,0405601000,Agdangan,0.0,1.0
286,2021-01-04,0.0,0.0,0.0,0405601000,Agdangan,0.0,1.0
287,2021-01-05,0.0,0.0,0.0,0405601000,Agdangan,0.0,1.0
...,...,...,...,...,...,...,...,...
1008,2022-12-27,0.0,0.0,0.0,0405649000,Unisan,0.0,3.0
1009,2022-12-28,0.0,0.0,0.0,0405649000,Unisan,0.0,3.0
1010,2022-12-29,0.0,0.0,0.0,0405649000,Unisan,0.0,3.0
1011,2022-12-30,0.0,0.0,0.0,0405649000,Unisan,0.0,3.0


In [107]:
df_cumsum_exp = pd.merge(df_cases_filtered_date, df_exp_cleaned, on='PSGC')
df_cumsum_exp

Unnamed: 0,Date,NewCases,Deaths,Recoveries,PSGC,Municipality,d_cases,n,exp
0,2021-01-01,0.0,0.0,0.0,0405601000,Agdangan,0.0,1.0,2.699344
1,2021-01-02,0.0,0.0,0.0,0405601000,Agdangan,0.0,1.0,2.699344
2,2021-01-03,0.0,0.0,0.0,0405601000,Agdangan,0.0,1.0,2.699344
3,2021-01-04,0.0,0.0,0.0,0405601000,Agdangan,0.0,1.0,2.699344
4,2021-01-05,0.0,0.0,0.0,0405601000,Agdangan,0.0,1.0,2.699344
...,...,...,...,...,...,...,...,...,...
29195,2022-12-27,0.0,0.0,0.0,0405649000,Unisan,0.0,3.0,5.514819
29196,2022-12-28,0.0,0.0,0.0,0405649000,Unisan,0.0,3.0,5.514819
29197,2022-12-29,0.0,0.0,0.0,0405649000,Unisan,0.0,3.0,5.514819
29198,2022-12-30,0.0,0.0,0.0,0405649000,Unisan,0.0,3.0,5.514819


## Export Data to CSV file

Reorder Columns

In [108]:
df_cumsum_exp = df_cumsum_exp.iloc[:, [5, 4, 0, 1, 2, 3, 6, 7, 8]]
df_cumsum_exp

Unnamed: 0,Municipality,PSGC,Date,NewCases,Deaths,Recoveries,d_cases,n,exp
0,Agdangan,0405601000,2021-01-01,0.0,0.0,0.0,0.0,1.0,2.699344
1,Agdangan,0405601000,2021-01-02,0.0,0.0,0.0,0.0,1.0,2.699344
2,Agdangan,0405601000,2021-01-03,0.0,0.0,0.0,0.0,1.0,2.699344
3,Agdangan,0405601000,2021-01-04,0.0,0.0,0.0,0.0,1.0,2.699344
4,Agdangan,0405601000,2021-01-05,0.0,0.0,0.0,0.0,1.0,2.699344
...,...,...,...,...,...,...,...,...,...
29195,Unisan,0405649000,2022-12-27,0.0,0.0,0.0,0.0,3.0,5.514819
29196,Unisan,0405649000,2022-12-28,0.0,0.0,0.0,0.0,3.0,5.514819
29197,Unisan,0405649000,2022-12-29,0.0,0.0,0.0,0.0,3.0,5.514819
29198,Unisan,0405649000,2022-12-30,0.0,0.0,0.0,0.0,3.0,5.514819


In [109]:
df_cumsum_exp.to_csv(f"../01_data/01_processed/00_case_data/{province}_case_data_psgc.csv",  index=False)