# Aggregate Case Information by Date and Municiaplity

## Preliminaries

Import dependencies

In [1]:
import pandas as pd
import re
import numpy as np

Set Province to be used

In [2]:
province = "CATANDUANES"

Read from CSV File

In [3]:
# read from csv file
df = pd.read_csv(f"filtered_cases/{province}_case_information.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,CaseCode,Age,AgeGroup,Sex,DateSpecimen,DateResultRelease,DateRepConf,DateDied,DateRecover,...,ProvRes,CityMunRes,CityMuniPSGC,BarangayRes,BarangayPSGC,HealthStatus,Quarantined,DateOnset,Pregnanttab,ValidationStatus
0,19761,C781369,50.0,50 to 54,MALE,2020-05-30,,2020-06-06,,2020-07-07,...,CATANDUANES,SAN MIGUEL,PH052009000,,,RECOVERED,YES,,,Age or Birthdate is Invalid\nCase has Admittin...
1,29313,C636837,47.0,45 to 49,MALE,,,2020-06-22,,2020-07-08,...,CATANDUANES,BAGAMANOC,PH052001000,,,RECOVERED,YES,,,Age or Birthdate is Invalid
2,43311,C951557,1.0,0 to 4,MALE,2020-07-02,2020-07-04,2020-07-06,,2020-07-23,...,CATANDUANES,VIRAC (CAPITAL),PH052011000,,,RECOVERED,NO,2020-06-28,,Age or Birthdate is Invalid
3,45859,C639154,21.0,20 to 24,MALE,2020-06-30,2020-07-05,2020-07-08,,,...,CATANDUANES,CARAMORAN,PH052004000,,,RECOVERED,NO,2020-06-30,,"Removal Type is ""Recovered"", but no Recovered ..."
4,49871,C619176,22.0,20 to 24,FEMALE,2020-07-06,2020-07-07,2020-07-10,,2020-07-23,...,CATANDUANES,VIRAC (CAPITAL),PH052011000,,,RECOVERED,YES,,NO,Age or Birthdate is Invalid


In [21]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Age
count,4258.0,4249.0
mean,509856.041099,39.900447
std,260555.186166,20.402246
min,333.0,0.0
25%,307701.5,25.0
50%,568280.0,37.0
75%,718138.75,54.0
max,999644.0,107.0


Remove rows with no Municipality

In [27]:
df.dropna(subset='CityMunRes').describe()

Unnamed: 0.1,Unnamed: 0,Age
count,4055.0,4049.0
mean,493348.352898,39.891331
std,251336.037496,20.373987
min,1032.0,0.0
25%,295296.0,25.0
50%,549353.0,37.0
75%,706109.5,54.0
max,999644.0,107.0


## Aggregating Data

### List down columns
- Date : date recorded
- Municipality : municipality of case
- NewCases : number of new cases
- Deaths : number of deaths
- Recoveries : number of recoveries

Get List of Municipalities

In [30]:
mun_list = df["CityMunRes"].unique()

# remove nan
mun_list = mun_list[~pd.isnull(mun_list)]

# remove parenthesis
#mun_list = [re.sub(' \(.*\)', '', x) for x in mun_list]

mun_list

array(['SAN MIGUEL', 'BAGAMANOC', 'VIRAC (CAPITAL)', 'CARAMORAN',
       'GIGMOTO', 'BARAS', 'PANGANIBAN (PAYO)', 'PANDAN', 'BATO',
       'SAN ANDRES (CALOLBON)', 'VIGA'], dtype=object)

Parse Data by Municipality

In [31]:
# load municipality location data
locations = pd.read_csv(f"location_data/filtered_data/{province}.csv")

# set municipality name to uppercase
locations["Municipality/City"] = locations["Municipality/City"].apply(lambda x: x.upper())

locations = locations.set_index("Municipality/City")

locations

Unnamed: 0_level_0,Unnamed: 0.1,Unnamed: 0,Province,Latitude,Longitude
Municipality/City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BAGAMANOC,401,401,Catanduanes,13.939637,124.28783
BARAS,402,402,Catanduanes,13.659104,124.370468
BATO,403,403,Catanduanes,13.608036,124.297787
CARAMORAN,404,404,Catanduanes,13.983835,124.133872
GIGMOTO,405,405,Catanduanes,13.778669,124.391173
PANDAN,406,406,Catanduanes,14.049227,124.170211
PANGANIBAN,407,407,Catanduanes,13.908167,124.30082
SAN ANDRES,408,408,Catanduanes,13.598915,124.096969
SAN MIGUEL,409,409,Catanduanes,13.640329,124.303109
VIGA,410,410,Catanduanes,13.871531,124.30873


Create DataFrame for aggregated data

Use DateRepRem instead for consistency

In [33]:
df_aggregated = pd.DataFrame()
#df_aggregated

for mun in mun_list:
    print(mun)

    # for each unique dates, count the number of new cases
    cases = df[df['CityMunRes'] == mun].groupby("DateRepConf").size()
    cases.index.name = "Date"
    #print(cases)
    print(cases.describe())

    # count number of deaths
    deaths = df[(df['CityMunRes'] == mun) & (df["RemovalType"] == "DIED")].groupby("DateRepRem").size()
    deaths.index.name = "Date"

    # count number of recoveries
    recoveries = df[(df['CityMunRes'] == mun) & (df["RemovalType"] == "RECOVERED")].groupby("DateRepRem").size()
    recoveries.index.name = "Date"

    # create new DataFrame for data
    new_df = pd.DataFrame()

    data = {
        "NewCases" : cases,
        "Deaths" : deaths,
        "Recoveries" : recoveries
    }

    new_df = pd.concat(data, axis = 1)

    #print(new_df)
    clean_mun = re.sub(' \(.*\)', '', mun)

    #new_df.index.name = "Date"
    #new_df["NewCases"] = cases
    #new_df["Deaths"] = deaths
    #new_df["Recoveries"] = recoveries
    new_df["Municipality"] = clean_mun
    new_df["Latitude"] = locations.loc[clean_mun]["Latitude"]
    new_df["Longitude"] = locations.loc[clean_mun]["Longitude"]
    new_df["Date"] = new_df.index

    new_df = new_df.fillna(0)

    # append to aggregated dataframe
    df_aggregated = pd.concat([df_aggregated, new_df], ignore_index=True)

    #mun_dict[mun] = new_df

    #print(new_df)

# reorganize columns
df_aggregated =df_aggregated.iloc[:, [6, 3, 4, 5, 0, 1, 2]]

# reformat date
df_aggregated["Date"] = pd.to_datetime(df_aggregated.Date, format='mixed')

df_aggregated


SAN MIGUEL
count    106.000000
mean       1.962264
std        1.406952
min        1.000000
25%        1.000000
50%        1.000000
75%        3.000000
max        9.000000
dtype: float64
BAGAMANOC
count    91.000000
mean      2.329670
std       1.598992
min       1.000000
25%       1.000000
50%       2.000000
75%       3.500000
max       8.000000
dtype: float64
VIRAC (CAPITAL)
count    390.000000
mean       3.705128
std        4.263521
min        1.000000
25%        1.000000
50%        2.000000
75%        4.000000
max       25.000000
dtype: float64
CARAMORAN
count    138.000000
mean       2.594203
std        2.973454
min        1.000000
25%        1.000000
50%        1.000000
75%        3.000000
max       18.000000
dtype: float64
GIGMOTO
count    84.000000
mean      1.785714
std       1.456778
min       1.000000
25%       1.000000
50%       1.000000
75%       2.000000
max       9.000000
dtype: float64
BARAS
count    116.000000
mean       1.491379
std        1.008621
min        1.000000


Unnamed: 0,Date,Municipality,Latitude,Longitude,NewCases,Deaths,Recoveries
0,2023-04-20,SAN MIGUEL,13.640329,124.303109,2.0,0.0,0.0
1,2023-04-28,SAN MIGUEL,13.640329,124.303109,1.0,0.0,2.0
2,2023-05-07,SAN MIGUEL,13.640329,124.303109,1.0,0.0,0.0
3,2023-05-14,SAN MIGUEL,13.640329,124.303109,4.0,0.0,0.0
4,2023-05-19,SAN MIGUEL,13.640329,124.303109,2.0,0.0,0.0
...,...,...,...,...,...,...,...
2570,2022-05-16,VIGA,13.871531,124.308730,0.0,0.0,1.0
2571,2022-06-10,VIGA,13.871531,124.308730,0.0,0.0,1.0
2572,2022-07-13,VIGA,13.871531,124.308730,0.0,0.0,1.0
2573,2022-07-19,VIGA,13.871531,124.308730,0.0,0.0,1.0


In [34]:
df_aggregated.sum(numeric_only=True)

Latitude       35381.021811
Longitude     319935.346525
NewCases        4055.000000
Deaths            38.000000
Recoveries      4015.000000
dtype: float64

Output aggregated data to CSV files

In [35]:
#for mun in mun_dict:
#    mun_dict[mun].to_csv(f"aggregated_data/{province}/{mun}.csv")

df_aggregated.to_csv(f"aggregated_data/{province}.csv")