# Aggregate Case Information by Date and Municipality

## Preliminaries

Import dependencies

In [2]:
from pathlib import Path

import pandas as pd

Set Province to be used

In [3]:
# Province to process: selects the input file
# filtered_cases/<province>_case_information.csv and the output folder
# aggregated_data/<province>/.
province = "CATANDUANES"

Read from CSV File

In [4]:
# Load the filtered case records for the selected province and preview them.
case_file = f"filtered_cases/{province}_case_information.csv"
df = pd.read_csv(case_file)
df.head()

Unnamed: 0.1,Unnamed: 0,CaseCode,Age,AgeGroup,Sex,DateSpecimen,DateResultRelease,DateRepConf,DateDied,DateRecover,...,ProvRes,CityMunRes,CityMuniPSGC,BarangayRes,BarangayPSGC,HealthStatus,Quarantined,DateOnset,Pregnanttab,ValidationStatus
0,19761,C781369,50.0,50 to 54,MALE,2020-05-30,,2020-06-06,,2020-07-07,...,CATANDUANES,SAN MIGUEL,PH052009000,,,RECOVERED,YES,,,Age or Birthdate is Invalid\nCase has Admittin...
1,29313,C636837,47.0,45 to 49,MALE,,,2020-06-22,,2020-07-08,...,CATANDUANES,BAGAMANOC,PH052001000,,,RECOVERED,YES,,,Age or Birthdate is Invalid
2,43311,C951557,1.0,0 to 4,MALE,2020-07-02,2020-07-04,2020-07-06,,2020-07-23,...,CATANDUANES,VIRAC (CAPITAL),PH052011000,,,RECOVERED,NO,2020-06-28,,Age or Birthdate is Invalid
3,45859,C639154,21.0,20 to 24,MALE,2020-06-30,2020-07-05,2020-07-08,,,...,CATANDUANES,CARAMORAN,PH052004000,,,RECOVERED,NO,2020-06-30,,"Removal Type is ""Recovered"", but no Recovered ..."
4,49871,C619176,22.0,20 to 24,FEMALE,2020-07-06,2020-07-07,2020-07-10,,2020-07-23,...,CATANDUANES,VIRAC (CAPITAL),PH052011000,,,RECOVERED,YES,,NO,Age or Birthdate is Invalid


## Aggregating Data

### List down columns
- Date : date recorded (stored as the index of each table)
- Municipality : municipality of case (one table is built per municipality)
- NewCases : number of new cases
- Deaths : number of deaths
- Recoveries : number of recoveries

In [5]:
# Count columns produced for every date in each municipality's table.
col_names = [
    "NewCases",
    "Deaths",
    "Recoveries",
]

Create new DataFrame for data

In [6]:
# Empty frame carrying only the count columns, displayed to preview the layout.
# NOTE(review): df_aggregated is not referenced by any later cell visible here.
df_aggregated = pd.DataFrame(columns = col_names)
df_aggregated

Unnamed: 0,NewCases,Deaths,Recoveries


Get List of Municipalities

In [7]:
# Distinct values of the residence-municipality column. Rows with no
# municipality recorded contribute a NaN entry to this array.
mun_list = df["CityMunRes"].unique()
mun_list

array(['SAN MIGUEL', 'BAGAMANOC', 'VIRAC (CAPITAL)', 'CARAMORAN', nan,
       'GIGMOTO', 'BARAS', 'PANGANIBAN (PAYO)', 'PANDAN', 'BATO',
       'SAN ANDRES (CALOLBON)', 'VIGA'], dtype=object)

Parse Data by Municipality

In [None]:
"""
mun_dict = {}

for mun in mun_list:
    #print(mun)

    # for each unique dates, count the number of new cases
    cases = df.groupby("DateRepConf").size()
    #print(cases)

    ########################################

    # count number of deaths

    # for deaths with death date
    deaths = df[df["RemovalType"] == "DIED"].groupby("DateDied").size()
    #print(deaths)

    deaths_dict = deaths.to_dict()
    # for deaths with no death date, use DateRepRem
    for index, row in df[df["RemovalType"] == "DIED"].iterrows():

        # check if value is nan
        if row["DateDied"] !=  row["DateDied"]:
            # add date to deaths
            new_date = row["DateRepRem"]

            # if already in count
            if new_date in deaths_dict:
                deaths_dict[new_date] += 1

            #if not yet in count
            else:
                deaths_dict[new_date] = 1

    deaths_combined = pd.Series(deaths_dict)


    ########################################
     
    # count number of recoveries
    recoveries =  df[df["RemovalType"] == "RECOVERED"].groupby("DateRecover").size()

    #for recoveries with no DateRecover, use DateRepRem
    recoveries_dict = recoveries.to_dict()
    # for deaths with no death date, use DateRepRem
    for index, row in df[df["RemovalType"] == "RECOVERED"].iterrows():

        # check if value is nan
        if row["DateRecover"] !=  row["DateRecover"]:
            # add date to deaths
            new_date = row["DateRepRem"]

            # if already in count
            if new_date in recoveries_dict:
                recoveries_dict[new_date] += 1

            #if not yet in count
            else:
                recoveries_dict[new_date] = 1

    recoveries_combined = pd.Series(recoveries_dict)
    
    # add all values to DataFrame
    new_df = pd.DataFrame(columns = col_names)
    new_df.index.name = "Date"

    new_df["NewCases"] = cases
    new_df["Deaths"] = deaths_combined
    new_df["Recoveries"] = recoveries_combined

    new_df = new_df.fillna(0)

    #print(new_df)

    mun_dict[mun] = new_df

"""

Use DateRepRem for both deaths and recoveries (instead of DateDied / DateRecover) for consistency

In [None]:
# Build one table of daily counts per municipality.
# mun_dict maps each entry of mun_list (including NaN for cases with no
# municipality recorded) to a DataFrame indexed by date with the columns
# listed in col_names.
mun_dict = {}

for mun in mun_list:
    # Restrict to this municipality's rows. A missing municipality shows up in
    # mun_list as NaN, which never compares equal to anything (itself
    # included), so it needs an explicit isna() mask.
    if pd.isna(mun):
        mun_mask = df["CityMunRes"].isna()
    else:
        mun_mask = df["CityMunRes"] == mun
    df_mun = df[mun_mask]

    # new cases per confirmation-report date
    cases = df_mun.groupby("DateRepConf").size()

    # deaths and recoveries per removal-report date
    deaths = df_mun[df_mun["RemovalType"] == "DIED"].groupby("DateRepRem").size()
    recoveries = df_mun[df_mun["RemovalType"] == "RECOVERED"].groupby("DateRepRem").size()

    # Outer-join the three series on date so that a death or recovery reported
    # on a day without any new case is kept instead of being dropped.
    new_df = (
        pd.concat([cases, deaths, recoveries], axis=1, keys=col_names)
        .fillna(0)      # dates missing from a series mean zero events
        .astype(int)    # counts are whole numbers
        .sort_index()   # ISO date strings sort chronologically
    )
    new_df.index.name = "Date"

    mun_dict[mun] = new_df


DateRepRem
2020-09-14    1
2020-10-02    1
2021-01-28    1
2021-04-07    2
2021-05-16    1
2021-05-22    1
2021-05-28    1
2021-05-31    1
2021-07-01    1
2021-07-17    1
2021-07-30    1
2021-07-31    1
2021-08-18    1
2021-09-23    1
2021-09-28    1
2021-10-13    1
2021-10-30    1
2021-11-03    1
2021-11-05    1
2021-11-15    2
2021-11-26    1
2021-11-30    1
2021-12-04    2
2021-12-27    1
2022-01-08    1
2022-03-20    1
2022-08-16    1
2022-08-20    1
2022-08-22    1
2022-08-23    1
2022-08-29    1
2022-09-01    2
2022-09-04    1
2022-11-03    2
2022-11-17    1
dtype: int64
DateRepRem
2020-09-14    1
2020-10-02    1
2021-01-28    1
2021-04-07    2
2021-05-16    1
2021-05-22    1
2021-05-28    1
2021-05-31    1
2021-07-01    1
2021-07-17    1
2021-07-30    1
2021-07-31    1
2021-08-18    1
2021-09-23    1
2021-09-28    1
2021-10-13    1
2021-10-30    1
2021-11-03    1
2021-11-05    1
2021-11-15    2
2021-11-26    1
2021-11-30    1
2021-12-04    2
2021-12-27    1
2022-01-08    1
2022-

Output aggregated data to CSV files

In [83]:
# Write one CSV per municipality under aggregated_data/<province>/.
output_dir = Path("aggregated_data") / province

# to_csv does not create missing folders, so make sure the target exists first.
output_dir.mkdir(parents=True, exist_ok=True)

for mun, mun_df in mun_dict.items():
    # A NaN key (no municipality recorded) is formatted as "nan" -> nan.csv.
    mun_df.to_csv(output_dir / f"{mun}.csv")