# Smoothing Notebook for COVID data

Authors (in alphabetical order): Frederic Poitevin, Joao Rodrigues, Andrea Scaiewicz

This notebook takes as input a CSV file generated by Data-Wrangler (see 00_processing) and produces a new CSV file with smoothed data.

In [None]:
import pathlib
import re

import numpy as np
import pandas as pd

### Read the raw data as a DataFrame

In [None]:
# Directory that holds the input CSV files (smoothed output is written here too)
csv_dir = pathlib.Path('..', 'output')

# Dataset to smooth; enable one of the commented alternatives to process another file
df_fpath = csv_dir.joinpath('Data_COVID-19_v2_bycountry.csv')
#df_fpath = csv_dir.joinpath('Data_COVID-19_v2_bystate.csv')
#df_fpath = csv_dir.joinpath('Data_COVID-19_v2.csv')   # combined

In [None]:
# Load the CSV selected by `df_fpath` into a DataFrame
df = pd.read_csv(df_fpath)

### Find which columns to smooth

Identify the columns that contain dates by matching the column names against a regular expression.

In [None]:
# Column names that start with a date such as 1/22/20 or 01/22/2020.
# The pattern is a raw string so that `\d` reaches the regex engine untouched;
# in a normal string literal `\d` is an invalid escape sequence and triggers a
# warning on recent Python versions.
date_regex = re.compile(r'\d{1,2}/\d{1,2}/\d{2,4}')
cols = df.columns
date_cols = [c for c in cols if date_regex.match(c)]  # names (not positions) of the date columns

### Smooth using Michael's favorite function

In [None]:
def smooth_window(data, window_length=3):
    """Trailing moving average, rescaled so its last point matches the input.

    Smoothing function implemented by Frederic Poitevin.

    Each output point is the sum of the current value and the
    ``window_length - 1`` values before it, divided by ``window_length``.
    Positions before the start of the series count as zero, so the first
    ``window_length - 1`` points are damped. The whole curve is then
    multiplied by one factor so that its final value equals the final value
    of ``data``.

    Parameters
    ----------
    data : pandas.Series or array-like
        One-dimensional sequence of numeric values.
    window_length : int, default 3
        Number of points in the trailing window. Must be at least 1; it may
        be larger than ``len(data)``.

    Returns
    -------
    numpy.ndarray
        float64 array with the same length as ``data``.

    Raises
    ------
    ValueError
        If ``window_length`` is smaller than 1.
    """
    if window_length < 1:
        raise ValueError(f'window_length must be >= 1, got {window_length}')

    # np.asarray accepts a Series as well as plain lists / arrays
    data_f = np.asarray(data, dtype=np.float64)
    n_points = len(data_f)
    if n_points == 0:
        # nothing to smooth (and no last value to rescale against)
        return data_f.copy()

    average = np.copy(data_f)

    # Add the series shifted right by 1 .. window_length-1 positions.
    # Shifts of n_points or more would only contribute zeros, so stop there.
    for i in range(1, min(window_length, n_points)):
        average[i:] += data_f[:-i]

    average /= window_length
    average += 1e-60  # to avoid division by 0
    # Rescale so the last smoothed value equals the last raw value.
    # Note: a final raw value of 0 makes this factor 0 and zeroes the output.
    average *= data_f[-1] / average[-1]
    return average

In [None]:
def smooth_dataset(dataframe, w):
    """Smooth the date columns of ``dataframe`` row by row and save them as CSV.

    Relies on the notebook-level names ``date_cols`` (columns to smooth),
    ``csv_dir`` (output directory) and ``df_fpath`` (input path, whose stem
    is reused for the output file name).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table containing every column listed in ``date_cols``.
    w : int
        Window length handed to ``smooth_window``.

    Returns
    -------
    pandas.DataFrame
        Copy of the input in which the date columns hold the smoothed
        values rounded to 3 decimals; the other columns are left as they were.
    """
    def _smooth_row(row):
        return smooth_window(row, window_length=w)

    # Work on float64 date columns so the broadcast result is not cast back
    float_dtypes = dict.fromkeys(date_cols, 'float64')
    frame_f = dataframe.astype(float_dtypes)

    # Apply the smoother to each row, keeping the original index and columns
    smoothed_block = frame_f[date_cols].apply(_smooth_row, axis=1, result_type='broadcast')
    df_smooth = frame_f.copy(deep=True)
    df_smooth[date_cols] = smoothed_block

    # Round only the data columns to a few decimal places
    precision = 3
    df_smooth = df_smooth.round(dict.fromkeys(date_cols, precision))

    # Output sits next to the input: <input stem>_smooth_<w>.csv
    output_fname = csv_dir / f'{df_fpath.stem}_smooth_{w}.csv'
    print(f'Saving smoothed CSV file to: {output_fname}')
    df_smooth.to_csv(output_fname, index=False)
    return df_smooth

Generate different CSV files with varying smoothing windows

In [None]:
# Write one smoothed CSV per window length
for window in (3, 5, 7):
    smooth_dataset(df, window)