# Python Notebook for Processing Dataset

## Preliminaries

Import Dependencies

In [23]:
import pandas as pd
import os, re

Set Province

In [24]:
# Province to extract: matched against the "ProvRes" column when filtering,
# and used as the prefix of the exported CSV file name.
province = "CATANDUANES"

## Filter Case Information by Province

Combine all CSV files into one DataFrame

In [None]:
# directory holding the raw case-information CSV exports
case_info_dir = "../01_data/00_raw/00_case_information"

# Only pick up CSV files, in sorted order, so that stray directory entries
# (e.g. checkpoint folders or OS metadata files) are skipped and the row
# order of the combined frame is reproducible between runs.
csv_files = sorted(
    name for name in os.listdir(case_info_dir) if name.lower().endswith(".csv")
)

# Read every file first and concatenate once; concatenating inside the loop
# re-copies the accumulated frame on every iteration.
case_frames = [
    pd.read_csv(os.path.join(case_info_dir, name)) for name in csv_files
]

# pd.concat raises on an empty list, so fall back to an empty frame when the
# directory holds no CSV files.
df_cases = pd.concat(case_frames) if case_frames else pd.DataFrame()


In [26]:
# sanity check: (rows, columns) of the combined frame, one row per case
df_cases.shape

(4136488, 23)

Filter by Province

In [27]:
# boolean mask of the rows whose province of residence matches the selection
in_province = df_cases['ProvRes'].eq(province)
df_cases_filtered = df_cases.loc[in_province]
df_cases_filtered.shape

(4258, 23)

Delete Initial DataFrame to free up memory

In [28]:
# the combined frame is no longer referenced; later cells use df_cases_filtered
del df_cases

## Aggregate Data

Drop rows with no listed municipality

In [29]:
# keep only the rows that name a municipality of residence
has_municipality = df_cases_filtered['CityMunRes'].notna()
df_cases_filtered = df_cases_filtered[has_municipality]
df_cases_filtered.shape

(4055, 23)

Get List of Municipalities

In [30]:
# distinct municipality names, in order of first appearance
mun_list = pd.unique(df_cases_filtered["CityMunRes"])

# Names may still carry a parenthesised suffix at this point;
# aggregate_data strips it when it builds each municipality's frame.

mun_list

array(['SAN MIGUEL', 'BAGAMANOC', 'VIRAC (CAPITAL)', 'CARAMORAN',
       'GIGMOTO', 'BARAS', 'PANGANIBAN (PAYO)', 'PANDAN', 'BATO',
       'SAN ANDRES (CALOLBON)', 'VIGA'], dtype=object)

Aggregate Data for each Municipality

In [31]:
# start with an empty frame; the per-municipality aggregates computed below
# end up in this variable
df_aggregated = pd.DataFrame()

def aggregate_data(df, mun):
    """Build daily counts of new cases, deaths and recoveries for one municipality.

    Parameters
    ----------
    df : pandas.DataFrame
        Case rows to aggregate. The columns read here are "DateRepConf",
        "DateRepRem" and "RemovalType".
    mun : str
        Municipality name. A parenthesised suffix is removed before the name
        is stored, e.g. "VIRAC (CAPITAL)" becomes "VIRAC".

    Returns
    -------
    pandas.DataFrame
        One row per date on which at least one event was counted, sorted by
        date, with columns "Date", "NewCases", "Deaths", "Recoveries" and
        "Municipality". A count is 0 where no event of that kind fell on
        the date.
    """

    def count_by_date(rows, date_col):
        # number of rows per distinct value of date_col, indexed by "Date"
        counts = rows.groupby(date_col).size()
        counts.index.name = "Date"
        return counts

    data = {
        "NewCases": count_by_date(df, "DateRepConf"),
        "Deaths": count_by_date(df[df["RemovalType"] == "DIED"], "DateRepRem"),
        "Recoveries": count_by_date(df[df["RemovalType"] == "RECOVERED"], "DateRepRem"),
    }

    # Align the three series on the union of their dates. Only the count
    # columns exist at this point, so filling here never touches the date
    # or municipality columns added below.
    new_df = pd.concat(data, axis=1).fillna(0)
    new_df.index.name = "Date"

    # raw string: "\(" is not a valid escape in a normal string literal
    new_df["Municipality"] = re.sub(r' \(.*\)', '', mun)

    # turn the date index into a real datetime column and sort by it
    new_df = new_df.reset_index()
    new_df["Date"] = pd.to_datetime(new_df.Date, format='mixed')

    return new_df.sort_values(by="Date")


# Aggregate each municipality separately, then concatenate once; growing the
# frame inside a loop re-copies everything accumulated so far on each pass.
municipality_frames = [
    aggregate_data(df_cases_filtered[df_cases_filtered["CityMunRes"] == mun], mun)
    for mun in mun_list
]
df_aggregated = pd.concat(municipality_frames, ignore_index=True)

# reorganize columns by name, so the order does not depend on the column
# positions produced by aggregate_data
df_aggregated = df_aggregated[["Municipality", "Date", "NewCases", "Deaths", "Recoveries"]]

df_aggregated


Unnamed: 0,Municipality,Date,NewCases,Deaths,Recoveries
0,SAN MIGUEL,2020-06-06,1.0,0.0,0.0
1,SAN MIGUEL,2020-07-16,0.0,0.0,1.0
2,SAN MIGUEL,2020-09-10,1.0,0.0,0.0
3,SAN MIGUEL,2020-09-20,0.0,0.0,1.0
4,SAN MIGUEL,2020-09-24,1.0,0.0,0.0
...,...,...,...,...,...
2570,VIGA,2023-06-09,0.0,0.0,1.0
2571,VIGA,2023-06-28,1.0,0.0,0.0
2572,VIGA,2023-07-07,0.0,0.0,1.0
2573,VIGA,2023-12-30,1.0,0.0,0.0


Delete df_cases_filtered to free up memory

In [11]:
# the per-case rows are no longer referenced; later cells use df_aggregated
del df_cases_filtered

## Compute Cumulative Cases for Each Municipality

Compute the net change in cases per row (new cases minus deaths and recoveries)

In [32]:
# net change per row: new cases, less the cases removed by death or recovery
net_change = (
    df_aggregated["NewCases"]
    .sub(df_aggregated["Deaths"])
    .sub(df_aggregated["Recoveries"])
)
df_aggregated["d_cases"] = net_change

df_aggregated

Unnamed: 0,Municipality,Date,NewCases,Deaths,Recoveries,d_cases
0,SAN MIGUEL,2020-06-06,1.0,0.0,0.0,1.0
1,SAN MIGUEL,2020-07-16,0.0,0.0,1.0,-1.0
2,SAN MIGUEL,2020-09-10,1.0,0.0,0.0,1.0
3,SAN MIGUEL,2020-09-20,0.0,0.0,1.0,-1.0
4,SAN MIGUEL,2020-09-24,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...
2570,VIGA,2023-06-09,0.0,0.0,1.0,-1.0
2571,VIGA,2023-06-28,1.0,0.0,0.0,1.0
2572,VIGA,2023-07-07,0.0,0.0,1.0,-1.0
2573,VIGA,2023-12-30,1.0,0.0,0.0,1.0


Get Date Range for Entire Data

In [33]:
# first and last date present in the aggregated data
date_bounds = df_aggregated["Date"]
min_date, max_date = date_bounds.min(), date_bounds.max()

# one row per calendar day between the two bounds (inclusive), exposed as a
# frame with a single "Date" column
date_range = pd.date_range(start=min_date, end=max_date).to_frame(name="Date")

date_range

Unnamed: 0,Date
2020-06-06,2020-06-06
2020-06-07,2020-06-07
2020-06-08,2020-06-08
2020-06-09,2020-06-09
2020-06-10,2020-06-10
...,...
2024-01-04,2024-01-04
2024-01-05,2024-01-05
2024-01-06,2024-01-06
2024-01-07,2024-01-07


Calculate the running total of net case changes per municipality

In [40]:
# start with an empty frame; the per-municipality results computed below end
# up in this variable
df_cumsum = pd.DataFrame()
# rebuild the list from the aggregated data, whose "Municipality" values have
# already had any parenthesised suffix removed by aggregate_data
mun_list = df_aggregated["Municipality"].unique()

def calculate_cumsum(df, mun, dates=None):
    """Expand one municipality's rows to every date and add a running total.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a single municipality. Must contain a datetime "Date"
        column and a "d_cases" column.
    mun : str
        Municipality name written into the "Municipality" column of every
        row of the result.
    dates : optional
        The dates to expand to: either datetime-like values or a DataFrame
        with a "Date" column. When omitted, the notebook-level `date_range`
        frame is used.

    Returns
    -------
    pandas.DataFrame
        One row per date, in chronological order. Value columns are 0 on
        dates that have no row in `df`, and "n" holds the cumulative sum of
        "d_cases".
    """
    if dates is None:
        # fall back to the frame built in the date-range cell of the notebook
        dates = date_range
    if isinstance(dates, pd.DataFrame):
        dates = dates["Date"]

    # dataframe with all dates included
    df_extended = pd.DataFrame({"Date": pd.DatetimeIndex(dates)})

    # merge with existing dataframe
    df_extended = pd.merge(df_extended, df, how='outer', on='Date')

    # add additional data; only the value columns are zero-filled, so the
    # date and name columns keep their dtypes
    df_extended['Municipality'] = mun
    value_cols = [
        col for col in df_extended.columns if col not in ("Date", "Municipality")
    ]
    df_extended = df_extended.fillna(dict.fromkeys(value_cols, 0))

    # the running total is only meaningful in chronological order, so do not
    # rely on the row order returned by the merge
    df_extended = df_extended.sort_values("Date", kind="stable").reset_index(drop=True)

    # compute cumulative data
    df_extended["n"] = df_extended["d_cases"].cumsum()

    return df_extended

# Compute each municipality's frame, then concatenate once; growing the frame
# inside a loop re-copies everything accumulated so far on each pass.
cumsum_frames = [
    calculate_cumsum(df_aggregated[df_aggregated["Municipality"] == mun], mun)
    for mun in mun_list
]

# pd.concat raises on an empty list, so keep an empty frame in that case
df_cumsum = pd.concat(cumsum_frames) if cumsum_frames else pd.DataFrame()

df_cumsum

Unnamed: 0,Date,Municipality,NewCases,Deaths,Recoveries,d_cases,n
0,2020-06-06,SAN MIGUEL,1.0,0.0,0.0,1.0,1.0
1,2020-06-07,SAN MIGUEL,0.0,0.0,0.0,0.0,1.0
2,2020-06-08,SAN MIGUEL,0.0,0.0,0.0,0.0,1.0
3,2020-06-09,SAN MIGUEL,0.0,0.0,0.0,0.0,1.0
4,2020-06-10,SAN MIGUEL,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
1307,2024-01-04,VIGA,0.0,0.0,0.0,0.0,1.0
1308,2024-01-05,VIGA,0.0,0.0,0.0,0.0,1.0
1309,2024-01-06,VIGA,0.0,0.0,0.0,0.0,1.0
1310,2024-01-07,VIGA,0.0,0.0,1.0,-1.0,0.0


## Export Data to CSV file

In [None]:
# destination file, named after the selected province
output_path = f"../01_data/01_processed/00_case_data/{province}_case_data.csv"

# write without the index column
df_cumsum.to_csv(output_path, index=False)