# Python Notebook for Processing Dataset

## Preliminaries

Import Dependencies

In [3]:
import pandas as pd
import os, re

Set Province

In [1]:
# province of residence to keep; compared against the `ProvRes` column and
# also used to build the input/output file names further down
province = "CATANDUANES"

Set Start and End Date for model

In [4]:
# first and last day of the modelling window; both ends are treated as
# inclusive by the date filter and the calendar built later in the notebook
start_date = pd.to_datetime('2021-01-01')
end_date = pd.to_datetime('2022-12-31')

(start_date, end_date)

(Timestamp('2021-01-01 00:00:00'), Timestamp('2022-12-31 00:00:00'))

## Filter Case Information by Province

Combine all CSV files into one DataFrame

In [5]:
# folder that holds the raw case-information CSV extracts
case_dir = "../01_data/00_raw/00_case_information"

# pick up only CSV files, in sorted order, so the result does not depend on
# the arbitrary ordering of os.listdir or on stray non-CSV entries
csv_files = sorted(
    name for name in os.listdir(case_dir) if name.lower().endswith(".csv")
)

if not csv_files:
    raise FileNotFoundError(f"no CSV files found in {case_dir}")

# read every file exactly once and stack them with a single concat;
# row labels of the individual files are kept as-is (no ignore_index)
df_cases = pd.concat(
    [pd.read_csv(os.path.join(case_dir, name)) for name in csv_files]
)


In [6]:
# sanity check: (rows, columns) of the combined case table
df_cases.shape

(4136488, 24)

Filter by Province

In [7]:
# keep only the rows whose province of residence is the selected province
df_cases_filtered = df_cases.loc[df_cases['ProvRes'].eq(province)]
df_cases_filtered.shape

(4258, 24)

Delete Initial DataFrame to free up memory

In [8]:
# the full case table is no longer referenced below; drop the name so the
# memory can be reclaimed
del df_cases

## Aggregate Data

Drop rows with no listed municipality

In [9]:
# discard rows where the municipality of residence is missing
df_cases_filtered = df_cases_filtered[df_cases_filtered['CityMunRes'].notna()]
df_cases_filtered.shape

(4055, 24)

Get List of Municipalities

In [10]:
# distinct municipality labels exactly as they appear in `CityMunRes`
mun_list = df_cases_filtered["CityMunRes"].unique()

# the labels are deliberately left untouched here (some carry a parenthesised
# suffix) because they are used to select rows; `aggregate_data` strips the
# suffix when it writes the `Municipality` column

mun_list

array(['SAN MIGUEL', 'BAGAMANOC', 'VIRAC (CAPITAL)', 'CARAMORAN',
       'GIGMOTO', 'BARAS', 'PANGANIBAN (PAYO)', 'PANDAN', 'BATO',
       'SAN ANDRES (CALOLBON)', 'VIGA'], dtype=object)

Aggregate Data for each Municipality

In [11]:
# start from an empty DataFrame; the per-municipality results are
# concatenated onto it by the loop below
df_aggregated = pd.DataFrame()

def aggregate_data(df, mun):
    """Count new cases, deaths and recoveries per date for one municipality.

    Parameters
    ----------
    df : pandas.DataFrame
        Case-level rows. Must contain the columns ``DateRepConf``,
        ``DateRepRem`` and ``RemovalType``.
    mun : str
        Municipality label. A parenthesised suffix preceded by a space
        (for example ``" (CAPITAL)"``) is removed before the label is stored.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date``, ``NewCases``, ``Deaths``, ``Recoveries`` and
        ``Municipality``, sorted by ``Date``. Dates on which one of the
        counts has no rows get 0 for that count.
    """
    def count_per_date(rows, date_col):
        # number of rows for each distinct value of `date_col`
        counts = rows.groupby(date_col).size()
        counts.index.name = "Date"
        return counts

    removal = df["RemovalType"]

    data = {
        "NewCases" : count_per_date(df, "DateRepConf"),
        "Deaths" : count_per_date(df[removal == "DIED"], "DateRepRem"),
        "Recoveries" : count_per_date(df[removal == "RECOVERED"], "DateRepRem"),
    }

    # line the three counts up on their shared "Date" index
    new_df = pd.concat(data, axis = 1)

    # store the municipality name without its parenthesised suffix
    # (raw string: "\(" is not a valid escape in a normal string literal)
    new_df["Municipality"] = re.sub(r' \(.*\)', '', mun)

    # turn the date index into a real datetime column and order the rows by it
    new_df = new_df.reset_index()
    new_df["Date"] = pd.to_datetime(new_df["Date"], format='mixed')
    new_df = new_df.sort_values(by="Date")

    # a date missing from one of the counts means zero events of that kind
    return new_df.fillna(0)


# aggregate every municipality on its own, then stack the results with one
# concat instead of re-copying a growing DataFrame on each iteration
df_aggregated = pd.concat(
    [
        aggregate_data(df_cases_filtered[df_cases_filtered["CityMunRes"] == mun], mun)
        for mun in mun_list
    ],
    ignore_index=True,
)

# put the municipality first; selecting by name keeps this correct even if
# the column positions produced by aggregate_data ever change
df_aggregated = df_aggregated[["Municipality", "Date", "NewCases", "Deaths", "Recoveries"]]

df_aggregated


Unnamed: 0,Municipality,Date,NewCases,Deaths,Recoveries
0,SAN MIGUEL,2020-06-06,1.0,0.0,0.0
1,SAN MIGUEL,2020-07-16,0.0,0.0,1.0
2,SAN MIGUEL,2020-09-10,1.0,0.0,0.0
3,SAN MIGUEL,2020-09-20,0.0,0.0,1.0
4,SAN MIGUEL,2020-09-24,1.0,0.0,0.0
...,...,...,...,...,...
2570,VIGA,2023-06-09,0.0,0.0,1.0
2571,VIGA,2023-06-28,1.0,0.0,0.0
2572,VIGA,2023-07-07,0.0,0.0,1.0
2573,VIGA,2023-12-30,1.0,0.0,0.0


Delete df_cases_filtered to free up memory

In [12]:
# the case-level rows are not referenced again below; drop the name so the
# memory can be reclaimed
del df_cases_filtered

## Compute for Cumulative Cases for each Municipality

Compute for change in cases per row

In [13]:
# net change per row: new cases minus the cases removed by death or recovery
df_aggregated["d_cases"] = (
    df_aggregated["NewCases"]
    .sub(df_aggregated["Deaths"])
    .sub(df_aggregated["Recoveries"])
)

df_aggregated

Unnamed: 0,Municipality,Date,NewCases,Deaths,Recoveries,d_cases
0,SAN MIGUEL,2020-06-06,1.0,0.0,0.0,1.0
1,SAN MIGUEL,2020-07-16,0.0,0.0,1.0,-1.0
2,SAN MIGUEL,2020-09-10,1.0,0.0,0.0,1.0
3,SAN MIGUEL,2020-09-20,0.0,0.0,1.0,-1.0
4,SAN MIGUEL,2020-09-24,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...
2570,VIGA,2023-06-09,0.0,0.0,1.0,-1.0
2571,VIGA,2023-06-28,1.0,0.0,0.0,1.0
2572,VIGA,2023-07-07,0.0,0.0,1.0,-1.0
2573,VIGA,2023-12-30,1.0,0.0,0.0,1.0


Filter Data Frame to only use certain dates

In [17]:
# keep the rows dated inside the modelling window (both ends included)
df_cases_filtered_date = df_aggregated[df_aggregated["Date"].between(start_date, end_date)]
df_cases_filtered_date

Unnamed: 0,Municipality,Date,NewCases,Deaths,Recoveries,d_cases
8,SAN MIGUEL,2021-01-14,1.0,0.0,0.0,1.0
9,SAN MIGUEL,2021-01-24,0.0,0.0,1.0,-1.0
10,SAN MIGUEL,2021-02-01,1.0,0.0,0.0,1.0
11,SAN MIGUEL,2021-02-21,0.0,0.0,1.0,-1.0
12,SAN MIGUEL,2021-04-05,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...
2561,VIGA,2022-08-10,1.0,0.0,0.0,1.0
2562,VIGA,2022-08-20,1.0,0.0,1.0,0.0
2563,VIGA,2022-09-01,0.0,1.0,1.0,-2.0
2564,VIGA,2022-10-10,1.0,0.0,0.0,1.0


Delete df_aggregated

In [23]:
# only the date-filtered frame is used from here on
del df_aggregated

Generate Date Range Series for Entire Data

In [18]:
# one row per calendar day from start_date to end_date (both included),
# held as a single-column frame named "Date" that is labelled by the same dates
date_range = pd.date_range(start_date, end_date, freq="D").to_frame(name="Date")

date_range

Unnamed: 0,Date
2021-01-01,2021-01-01
2021-01-02,2021-01-02
2021-01-03,2021-01-03
2021-01-04,2021-01-04
2021-01-05,2021-01-05
...,...
2022-12-27,2022-12-27
2022-12-28,2022-12-28
2022-12-29,2022-12-29
2022-12-30,2022-12-30


Calculate cumulative cases per municipality

In [22]:
# start from an empty DataFrame; the per-municipality results are
# concatenated onto it by the loop below
df_cumsum = pd.DataFrame()

# municipalities that have at least one row inside the modelling window;
# these are the cleaned names written by aggregate_data
mun_list = df_cases_filtered_date["Municipality"].unique()

def calculate_cumsum(df, mun):
    """Expand one municipality's rows to the full calendar and add a running total.

    Reads the module-level ``date_range`` frame for the list of calendar days.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a single municipality; must contain ``Date`` and ``d_cases``.
    mun : str
        Municipality name written into every row of the result.

    Returns
    -------
    pandas.DataFrame
        The calendar days merged with ``df`` on ``Date``, missing values
        replaced by 0, plus a column ``n`` holding the cumulative sum of
        ``d_cases``.
    """
    # calendar frame holding every day of the window
    calendar = pd.DataFrame()
    calendar["Date"] = date_range

    # attach the reported figures, label every row, and zero-fill the gaps
    filled = (
        calendar.merge(df, how='outer', on='Date')
        .assign(Municipality=mun)
        .fillna(0)
    )

    # running total of the daily net change
    filled["n"] = filled["d_cases"].cumsum()

    return filled

# run the per-municipality computation and stack the results with a single
# concat instead of re-copying a growing DataFrame on each iteration;
# every piece keeps its own row labels (no ignore_index)
df_cumsum = pd.concat(
    [
        calculate_cumsum(df_cases_filtered_date[df_cases_filtered_date["Municipality"]==mun], mun)
        for mun in mun_list
    ]
)

df_cumsum

Unnamed: 0,Date,Municipality,NewCases,Deaths,Recoveries,d_cases,n
0,2021-01-01,SAN MIGUEL,0.0,0.0,0.0,0.0,0.0
1,2021-01-02,SAN MIGUEL,0.0,0.0,0.0,0.0,0.0
2,2021-01-03,SAN MIGUEL,0.0,0.0,0.0,0.0,0.0
3,2021-01-04,SAN MIGUEL,0.0,0.0,0.0,0.0,0.0
4,2021-01-05,SAN MIGUEL,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
725,2022-12-27,VIGA,0.0,0.0,0.0,0.0,0.0
726,2022-12-28,VIGA,0.0,0.0,0.0,0.0,0.0
727,2022-12-29,VIGA,0.0,0.0,0.0,0.0,0.0
728,2022-12-30,VIGA,0.0,0.0,0.0,0.0,0.0


## Compute for expected number of cases

Load simulated population data

In [20]:
# load the simulated population table for the selected province;
# the cells below read its `Municipality` column and one column per year,
# named by the year as a string (e.g. '2021')
df_pop = pd.read_csv(f'../01_data/01_processed/01_population_data/{province}_simulated_population.csv')

### Calculate Average Case Rate $r$

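Where $r$ is the average case rate for an area for the entire period. As computed in the cell below, with $n_{i,t}$ the running case count of municipality $i$ on day $t$ and $N_{i,y}$ its simulated population in year $y$:

$$r = \sum_{y} \frac{\sum_{i} \sum_{t \in y} n_{i,t}}{\sum_{i} N_{i,y}}$$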

In [24]:
# calendar years touched by the modelling window (both ends included)
year_range = range(int(start_date.year), int(end_date.year)+1)

# accumulate, year by year, the running case counts of that year divided by
# the summed population column of the same year
r = 0
for year in year_range:
    r += (
        df_cumsum.loc[df_cumsum['Date'].dt.year == year, 'n']
        .div(df_pop[f'{year}'].sum())
        .sum()
    )

r

0.11776288880803139

### Calculate expected number of cases per municipality

Create new DataFrame for expected number of cases

In [25]:
# values that do not change from one municipality to the next
n_years = len(year_range)
n_days = len(pd.date_range(start=start_date, end=end_date))

exp_rows = []

for mun in df_pop['Municipality'].unique():
    # population rows of this municipality (the first matching row is used)
    pop_rows = df_pop[df_pop['Municipality'] == mun]

    # calculate expected number of cases, E_i = r*N_i
    N_i = 0
    for year in year_range:
        N_i += pop_rows[f'{year}'].iloc[0]

    # expected number of cases is equal to the average number of cases per day per municipality
    E_i = (r * (N_i / n_years)) / n_days

    exp_rows.append({'Municipality': mun, 'exp': E_i})

# build the table in one go; rows are listed last-municipality-first, which is
# the order the previous row-by-row insertion at the top produced
df_exp = pd.DataFrame(exp_rows[::-1], columns=['Municipality', 'exp'])

df_exp

Unnamed: 0,Municipality,exp
0,VIRAC,12.114978
1,VIGA,3.589752
2,SAN MIGUEL,2.475682
3,SAN ANDRES,6.207556
4,PANGANIBAN,1.566892
5,PANDAN,3.387538
6,GIGMOTO,1.405411
7,CARAMORAN,5.180599
8,BATO,3.470859
9,BARAS,2.12441


Combine results with dataset

In [26]:
# attach each municipality's expected value to all of its daily rows
df_cumsum_exp = df_cumsum.merge(df_exp, on='Municipality')
df_cumsum_exp

Unnamed: 0,Date,Municipality,NewCases,Deaths,Recoveries,d_cases,n,exp
0,2021-01-01,SAN MIGUEL,0.0,0.0,0.0,0.0,0.0,2.475682
1,2021-01-02,SAN MIGUEL,0.0,0.0,0.0,0.0,0.0,2.475682
2,2021-01-03,SAN MIGUEL,0.0,0.0,0.0,0.0,0.0,2.475682
3,2021-01-04,SAN MIGUEL,0.0,0.0,0.0,0.0,0.0,2.475682
4,2021-01-05,SAN MIGUEL,0.0,0.0,0.0,0.0,0.0,2.475682
...,...,...,...,...,...,...,...,...
8025,2022-12-27,VIGA,0.0,0.0,0.0,0.0,0.0,3.589752
8026,2022-12-28,VIGA,0.0,0.0,0.0,0.0,0.0,3.589752
8027,2022-12-29,VIGA,0.0,0.0,0.0,0.0,0.0,3.589752
8028,2022-12-30,VIGA,0.0,0.0,0.0,0.0,0.0,3.589752


## Export Data to CSV file

In [27]:
# write the final table for the selected province; index=False leaves the
# row labels out of the file
df_cumsum_exp.to_csv(f"../01_data/01_processed/00_case_data/{province}_case_data.csv",  index=False)