In [None]:
import pandas as pd
import numpy as np

# Load data set

In [None]:
# URL of the OxCGRT_latest.csv file on the master branch of the
# OxCGRT/covid-policy-tracker GitHub repository (raw file, read directly by pandas)
LATEST_DATA_URL = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'

In [None]:
def load_dataset(url):
    """Load an OxCGRT-style CSV file into a DataFrame.

    Args:
        url: Anything ``pd.read_csv`` accepts: a URL, a local path or a file-like
            object. The data must contain ``Date`` and ``RegionName`` columns.

    Returns:
        A DataFrame where ``Date`` is parsed as datetimes, ``RegionName`` and
        ``RegionCode`` are read as strings, and missing ``RegionName`` values are
        replaced by the empty string. Malformed rows are skipped.
    """
    read_options = dict(parse_dates=['Date'],
                        encoding="ISO-8859-1",
                        dtype={"RegionName": str,
                               "RegionCode": str})
    try:
        # `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0;
        # `on_bad_lines="skip"` is its replacement and also drops malformed rows.
        latest_df = pd.read_csv(url, on_bad_lines="skip", **read_options)
    except TypeError:
        # pandas < 1.3 does not know `on_bad_lines`: fall back to the legacy flag
        latest_df = pd.read_csv(url, error_bad_lines=False, **read_options)
    # Country-level rows have no region: use "" rather than NaN
    latest_df["RegionName"] = latest_df["RegionName"].fillna("")
    return latest_df

In [None]:
latest_df = load_dataset(LATEST_DATA_URL)

In [None]:
latest_df.sample(3)

# Get NPIs

In [None]:
# NPI columns to extract from the loaded data set (C1 to C8, then H1, H2, H3 and H6)
NPI_COLUMNS = [
    "C1_School closing",
    "C2_Workplace closing",
    "C3_Cancel public events",
    "C4_Restrictions on gatherings",
    "C5_Close public transport",
    "C6_Stay at home requirements",
    "C7_Restrictions on internal movement",
    "C8_International travel controls",
    "H1_Public information campaigns",
    "H2_Testing policy",
    "H3_Contact tracing",
    "H6_Facial Coverings",
]

In [None]:
# Keep only the geo identifiers, the date and the NPI columns
npis_df = latest_df[["CountryName", "RegionName", "Date", *NPI_COLUMNS]]

In [None]:
npis_df.sample(3)

# Dates

In [None]:
start_date_str = "2020-08-01"
end_date_str = "2020-08-04"

In [None]:
# Parse both YYYY-MM-DD bounds into pandas timestamps
start_date, end_date = (pd.to_datetime(date_str, format='%Y-%m-%d')
                        for date_str in (start_date_str, end_date_str))

In [None]:
# NPIs inside the [start_date, end_date] window, both bounds included
actual_npis_df = npis_df[npis_df["Date"].between(start_date, end_date)]
actual_npis_df.sample(3)

In [None]:
# File name pattern: <start date>_<end date>_npis_example.csv
npis_file_name = f"{start_date:%Y-%m-%d}_{end_date:%Y-%m-%d}_npis_example.csv"
npis_file_name

In [None]:
# Save to file
# actual_npis_df.to_csv(npis_file_name, index=False)

# Get actual cases between these dates

In [None]:
# NOTE(review): NUM_PREV_DAYS_TO_INCLUDE is not referenced in the visible part of this notebook - confirm it is still needed
NUM_PREV_DAYS_TO_INCLUDE = 6
# Number of rows in the rolling-mean window; also the number of days of history
# loaded before start_date when computing the actual daily new cases
WINDOW_SIZE = 7

In [None]:
def get_actual_cases(df, start_date, end_date, window_size=None):
    """Compute actual daily new cases and their moving average for each geo.

    Args:
        df: DataFrame with ``CountryName``, ``RegionName``, ``Date`` and
            ``ConfirmedCases`` (cumulative count) columns. It is not modified.
        start_date: First date of interest (pandas timestamp).
        end_date: Last date of interest, inclusive.
        window_size: Number of rows in the moving-average window, and number of
            days of history kept before ``start_date``. Defaults to ``WINDOW_SIZE``.

    Returns:
        A new DataFrame sorted by ``GeoID`` and ``Date`` that covers
        ``start_date - window_size`` days to ``end_date`` and holds the input columns plus:
        ``GeoID``, ``ActualDailyNewCases`` (per-geo diff of ``ConfirmedCases``, with
        missing diffs set to 0) and ``ActualDailyNewCases7DMA`` (per-geo rolling
        mean, NaN until ``window_size`` rows are available).
    """
    if window_size is None:
        window_size = WINDOW_SIZE
    # Go back `window_size` days: one extra row is consumed by the diff, and the
    # rolling mean needs `window_size` daily values to be defined on start_date
    start_date_for_diff = start_date - pd.offsets.Day(window_size)
    in_range = (df["Date"] >= start_date_for_diff) & (df["Date"] <= end_date)
    # Work on an explicit copy so the columns added below never target a view of
    # the caller's frame (avoids SettingWithCopyWarning)
    actual_df = df.loc[in_range, ["CountryName", "RegionName", "Date", "ConfirmedCases"]].copy()
    # Add GeoID column that combines CountryName and RegionName for easier manipulation of data
    # np.where usage: if A then B else C
    # Note: an empty-string RegionName is not null, so it yields "<country> / "
    actual_df["GeoID"] = np.where(actual_df["RegionName"].isnull(),
                                  actual_df["CountryName"],
                                  actual_df["CountryName"] + ' / ' + actual_df["RegionName"])
    actual_df = actual_df.sort_values(by=["GeoID", "Date"])
    # Daily new cases = diff of the cumulative count; the first row of each geo
    # (and any row with a missing count) has no diff and is set to 0
    actual_df["ActualDailyNewCases"] = actual_df.groupby("GeoID")["ConfirmedCases"].diff().fillna(0)
    # Moving average per geo; dropping the GeoID index level realigns the result
    # with the rows of actual_df
    actual_df["ActualDailyNewCases7DMA"] = actual_df.groupby(
        "GeoID")['ActualDailyNewCases'].rolling(
        window_size, center=False).mean().reset_index(0, drop=True)
    return actual_df

In [None]:
actual_df = get_actual_cases(latest_df, start_date, end_date)

In [None]:
actual_df.head(12)

# Get historical data for 7 days moving average calculation
To compute the 7-day moving average of the predictions from the very first predicted day, we need the true daily new cases for the 7 days preceding the start date.

In [None]:
# Rows strictly before start_date, with the actual daily cases relabelled as
# "PredictedDailyNewCases" so they can be combined with a predictor's output
ma_df = (actual_df
         .loc[actual_df["Date"] < start_date,
              ["CountryName", "RegionName", "Date", "ActualDailyNewCases"]]
         .rename(columns={"ActualDailyNewCases": "PredictedDailyNewCases"}))
ma_df.head()

# Get predictions from submissions

In [None]:
def get_predictions_from_file(predictor_name, predictions_file, ma_df, window_size=None):
    """Load a predictor's predictions and compute their moving average.

    Args:
        predictor_name: Label stored in the ``PredictorName`` column of every row.
        predictions_file: Anything ``pd.read_csv`` accepts (path, URL, file-like
            object). Must contain ``CountryName``, ``RegionName``, ``Date`` and
            ``PredictedDailyNewCases`` columns.
        ma_df: History rows with the same four columns, placed before the
            predictions so the moving average is defined from the first predicted
            day. It is not modified.
        window_size: Number of rows in the moving-average window. Defaults to
            ``WINDOW_SIZE``.

    Returns:
        A new DataFrame with the history rows (``Prediction`` False) and the
        predicted rows (``Prediction`` True), sorted by ``GeoID`` and ``Date``,
        with ``PredictorName`` as first column and the extra columns ``GeoID`` and
        ``PredictedDailyNewCases7DMA``.
    """
    if window_size is None:
        window_size = WINDOW_SIZE
    # RegionName is read as a string, as load_dataset does
    read_options = dict(parse_dates=['Date'],
                        encoding="ISO-8859-1",
                        dtype={"RegionName": str})
    try:
        # `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0;
        # `on_bad_lines="skip"` is its replacement and also drops malformed rows.
        preds_df = pd.read_csv(predictions_file, on_bad_lines="skip", **read_options)
    except TypeError:
        # pandas < 1.3 does not know `on_bad_lines`: fall back to the legacy flag
        preds_df = pd.read_csv(predictions_file, error_bad_lines=False, **read_options)
    preds_df["RegionName"] = preds_df["RegionName"].fillna("")
    preds_df["PredictorName"] = predictor_name
    preds_df["Prediction"] = True

    # Put the true number of cases before start date in front of the predictions.
    # Work on a copy so the caller's frame does not gain the two extra columns.
    history_df = ma_df.copy()
    history_df["PredictorName"] = predictor_name
    history_df["Prediction"] = False
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    preds_df = pd.concat([history_df, preds_df], ignore_index=True)

    # Add GeoID column that combines CountryName and RegionName for easier manipulation of data
    # np.where usage: if A then B else C
    # Note: an empty-string RegionName is not null, so it yields "<country> / "
    preds_df["GeoID"] = np.where(preds_df["RegionName"].isnull(),
                                 preds_df["CountryName"],
                                 preds_df["CountryName"] + ' / ' + preds_df["RegionName"])
    preds_df = preds_df.sort_values(by=["GeoID", "Date"])
    # Moving average of PredictedDailyNewCases per geo; dropping the GeoID index
    # level realigns the result with the rows of preds_df
    preds_df["PredictedDailyNewCases7DMA"] = preds_df.groupby(
        "GeoID")['PredictedDailyNewCases'].rolling(
        window_size, center=False).mean().reset_index(0, drop=True)

    # Put PredictorName first
    preds_df = preds_df[["PredictorName"] + [col for col in preds_df.columns if col != "PredictorName"]]
    return preds_df

In [None]:
# File used to request the predictions
IP_FILE = "validation/data/2020-09-30_historical_ip.csv"
# Resulting predictions
predictions = {"Linear": "examples/predictors/linear/predictions/val_4_days.csv",
               "LSTM": "examples/predictors/lstm/predictions/2020-08-01_2020-08-04.csv"}

In [None]:
test_predictor_name = "Linear"
temp_df = get_predictions_from_file(test_predictor_name, predictions[test_predictor_name], ma_df.copy())
temp_df.head(12)

In [None]:
actual_df.head(8)

In [None]:
# actual_as_pred_df = actual_df.copy()
# actual_as_pred_df["PredictorName"] = "Ground truth"
# actual_as_pred_df["Prediction"] = False
# actual_as_pred_df = actual_as_pred_df.rename(columns={"ActualDailyNewCases": "PredictedDailyNewCases",
#                                                       "ActualDailyNewCases7DMA": "PredictedDailyNewCases7DMA"})
# actual_as_pred_df.head(8)

In [None]:
from validation.validation import validate_submission

# Build the ranking frame: for every predictor with a valid submission, left-join
# its predictions onto the actual cases. The per-predictor frames are collected in
# a list and concatenated once (DataFrame.append was removed in pandas 2.0).
merged_dfs = []
for predictor_name, predictions_file in predictions.items():
    errors = validate_submission(start_date_str, end_date_str, IP_FILE, predictions_file)
    if not errors:
        # Pass a copy, as the earlier test cell does, so the shared ma_df is never modified
        preds_df = get_predictions_from_file(predictor_name, predictions_file, ma_df.copy())
        merged_df = actual_df.merge(preds_df, on=['CountryName', 'RegionName', 'Date', 'GeoID'], how='left')
        merged_dfs.append(merged_df)
    else:
        print(f"Predictor {predictor_name} did not submit valid predictions! Please check its errors:")
        print(errors)
# Same result as before when no predictor is valid: an empty frame
ranking_df = pd.concat(merged_dfs) if merged_dfs else pd.DataFrame()

In [None]:
# Absolute error on the daily new cases
ranking_df['DiffDaily'] = ranking_df["ActualDailyNewCases"].sub(ranking_df["PredictedDailyNewCases"]).abs()

In [None]:
# Absolute error on the 7-day moving average of daily new cases
ranking_df['Diff7DMA'] = ranking_df["ActualDailyNewCases7DMA"].sub(ranking_df["PredictedDailyNewCases7DMA"]).abs()

In [None]:
# Compute the cumulative sum of 7DMA errors, per geo and per predictor, following the current row order.
# This runs on all merged rows, i.e. before the later cell that keeps only dates on or after start_date.
ranking_df['CumulDiff7DMA'] = ranking_df.groupby(["GeoID", "PredictorName"])['Diff7DMA'].cumsum()

In [None]:
# Keep only the rows dated on or after start_date (drops the history rows used for the moving average)
ranking_df = ranking_df[ranking_df["Date"] >= start_date]

In [None]:
# Sort by geo and date, then by 7-day moving average error (best predictor first).
# Reassigning instead of sorting in place avoids SettingWithCopyWarning on a
# frame obtained by boolean filtering.
ranking_df = ranking_df.sort_values(by=["CountryName", "RegionName", "Date", "Diff7DMA"])

In [None]:
# # Set true dailycases before start_date
# ranking_df["Prediction"] = False
# ranking_df["Prediction"][ranking_df["Date"] >= start_date] = True

In [None]:
ranking_df.head(4*2)

In [None]:
ranking_df[(ranking_df.CountryName == "United States") &
           (ranking_df.Date == '2020-08-02')]

In [None]:
# Save to file
# ranking_df.to_csv("/Users/m_754337/workspace/esp-demo/xprize/tests/fixtures/ranking.csv", index=False)

# Ranking

## Global

In [None]:
ranking_df.groupby('PredictorName').Diff7DMA.sum().sort_values()

## Countries

In [None]:
# Total 7DMA error per geo and predictor, best predictor first within each geo.
# Only the numeric column is selected: with pandas >= 2.0, selecting the group keys
# as well makes .sum() concatenate their strings and leaves them as both index
# levels and columns, which breaks the later reset_index().
countries_ranking_df = (ranking_df
                        .groupby(["CountryName", "RegionName", "PredictorName"])[["Diff7DMA"]]
                        .sum()
                        .sort_values(by=["CountryName", "RegionName", "Diff7DMA"]))


In [None]:
countries_ranking_df.head(12)

## Specific country

In [None]:
cr_df = countries_ranking_df.reset_index()

In [None]:
cr_df[(cr_df.CountryName == "Italy") & (cr_df.RegionName == "")]

In [None]:
ranking_df[ranking_df.CountryName == "Italy"]

## Specific country (group by)

In [None]:
ranking_df[(ranking_df.CountryName == "United States") & (ranking_df.RegionName == "")].groupby(["PredictorName"]).Diff7DMA.sum().sort_values()

## Specific region

In [None]:
cr_df[(cr_df.CountryName == "United States") & (cr_df.RegionName == "California")]

## Continent

In [None]:
NORTH_AMERICA = ["Canada", "United States", "Mexico"]

In [None]:
cr_df[(cr_df.CountryName.isin(NORTH_AMERICA)) & (cr_df.RegionName == "")].groupby('PredictorName').Diff7DMA.sum().sort_values().reset_index()

In [None]:
cr_df[(cr_df.CountryName.isin(NORTH_AMERICA)) & (cr_df.RegionName == "")]

# Plots

In [None]:
# Label of the dropdown entry that aggregates all geos in the plots below
ALL_GEO = "Overall"
# Geo whose traces are visible, and whose dropdown entry is active, when a figure is first shown
DEFAULT_GEO = ALL_GEO

## Prediction vs actual

In [None]:
# Predictor names present in the ranking; missing names are dropped because the
# left merge onto the actual cases leaves NaN where no prediction row matched
predictor_names = list(ranking_df.PredictorName.dropna().unique())
# Every geo present in the ranking
geoid_names = list(ranking_df.GeoID.unique())

In [None]:
# country_df = ranking_df[ranking_df.CountryName == "Italy"]
# country_df[country_df["PredictorName"] == 'Predictor #27']

## Filter by country

In [None]:
# pred_country_df = ranking_df[(ranking_df.CountryName == "India") &
#                         (ranking_df.PredictorName == "Predictor #27")]
# pred_country_df

In [None]:
# Predicted 7DMA summed over all geos, per predictor and date.
# Only the numeric column is selected: with pandas >= 2.0, also selecting GeoID and
# PredictorName makes .sum() concatenate their strings, and reset_index() then
# fails because PredictorName would exist both as index level and as column.
all_df = ranking_df.groupby(["PredictorName", "Date"])[["PredictedDailyNewCases7DMA"]].sum(). \
    sort_values(by=["PredictorName", "Date"]).reset_index()
all_df

In [None]:
import plotly.graph_objects as go

# Interactive figure: predicted vs actual 7-day average of daily new cases.
# A dropdown selects one geo (or ALL_GEO). Each button makes visible only the
# traces whose entry in geoid_plot_names equals the selected geo, so that list
# must receive exactly one entry per trace, in the order the traces are added.
fig = go.Figure(layout=dict(title=dict(text=f"{DEFAULT_GEO} Daily New Cases 7-day Average ",
                                       y=0.9,
                                       x=0.5,
                                       xanchor='center',
                                       yanchor='top'
                                       ),
                             plot_bgcolor='#f2f2f2',
                             xaxis_title="Date",
                             yaxis_title="Daily new cases 7-day average"
                             ))

# Keep track of trace visibility by geo ID name
geoid_plot_names = []

# Predicted 7DMA summed over all geos, per predictor and date.
# Only the numeric column is selected: with pandas >= 2.0, also selecting GeoID and
# PredictorName makes .sum() concatenate their strings and breaks reset_index().
all_df = ranking_df.groupby(["PredictorName", "Date"])[["PredictedDailyNewCases7DMA"]].sum(). \
    sort_values(by=["PredictorName", "Date"]).reset_index()

# Add 1 trace per predictor, for all geos
for predictor_name in predictor_names:
    all_geo_df = all_df[all_df.PredictorName == predictor_name]
    fig.add_trace(go.Scatter(x=all_geo_df.Date,
                             y=all_geo_df.PredictedDailyNewCases7DMA,
                             name=predictor_name,
                             visible=(ALL_GEO == DEFAULT_GEO))
                 )
    geoid_plot_names.append(ALL_GEO)

# Add 1 trace per predictor, per geo id
for predictor_name in predictor_names:
    for geoid_name in geoid_names:
        pred_geoid_df = ranking_df[(ranking_df.GeoID == geoid_name) &
                                   (ranking_df.PredictorName == predictor_name)]
        fig.add_trace(go.Scatter(x=pred_geoid_df.Date,
                                 y=pred_geoid_df.PredictedDailyNewCases7DMA,
                                 name=predictor_name,
                                 visible=(geoid_name == DEFAULT_GEO))
                     )
        geoid_plot_names.append(geoid_name)

# For each geo
# Add 1 trace for the true number of cases
for geoid_name in geoid_names:
    geo_actual_df = actual_df[(actual_df.GeoID == geoid_name) &
                                  (actual_df.Date >= start_date)]
    fig.add_trace(go.Scatter(x=geo_actual_df.Date,
                             y=geo_actual_df.ActualDailyNewCases7DMA,
                             name="Ground Truth",
                             visible= (geoid_name == DEFAULT_GEO),
                             line=dict(color='orange', width=4, dash='dash'))
                  )
    geoid_plot_names.append(geoid_name)

# Add 1 trace for the overall ground truth (actual 7DMA summed over all geos, per date)
overall_actual_df = actual_df[actual_df.Date >= start_date].groupby(["Date"])[["ActualDailyNewCases7DMA"]].sum(). \
    sort_values(by=["Date"]).reset_index()
fig.add_trace(go.Scatter(x=overall_actual_df.Date,
                         y=overall_actual_df.ActualDailyNewCases7DMA,
                         name="Ground Truth",
                         visible= (ALL_GEO == DEFAULT_GEO),
                         line=dict(color='orange', width=4, dash='dash'))
                  )
# This trace belongs to the "Overall" entry. Registering it under the leftover loop
# variable geoid_name would show it for the last geo and never for ALL_GEO.
geoid_plot_names.append(ALL_GEO)

# Format x axis
fig.update_xaxes(
dtick="D1",  # Means 1 day
tickformat="%d\n%b")

# Filter: one dropdown button per geo, toggling trace visibility and the title
buttons=[]
for geoid_name in ([ALL_GEO] + geoid_names):
    buttons.append(dict(method='update',
                        label=geoid_name,
                        args = [{'visible': [geoid_name==r for r in geoid_plot_names]},
                                {'title': f"{geoid_name} Daily New Cases 7-day Average "}]))
fig.update_layout(showlegend=True,
                  updatemenus=[{"buttons": buttons,
                                "direction": "down",
                                "active": ([ALL_GEO] + geoid_names).index(DEFAULT_GEO),
                                "showactive": True,
                                "x": 0.1,
                                "y": 1.15}])

fig.show()

## Rankings: by cumulative 7DMA error

In [None]:
# Interactive figure: cumulative 7DMA error of each predictor over time.
# A dropdown selects one geo (or ALL_GEO). Each button makes visible only the
# traces whose entry in ranking_geoid_plot_names equals the selected geo.
ranking_fig = go.Figure(layout=dict(title=dict(text=f'{DEFAULT_GEO} submission rankings',
                                               y=0.9,
                                               x=0.5,
                                               xanchor='center',
                                               yanchor='top'
                                               ),
                                    plot_bgcolor='#f2f2f2',
                                    xaxis_title="Date",
                                    yaxis_title="Cumulative 7DMA error"
                                    ))

# Keep track of trace visibility by geo name
ranking_geoid_plot_names = []

# Cumulative 7DMA error summed over all geos, per predictor and date.
# Only the numeric column is selected: with pandas >= 2.0, also selecting GeoID and
# PredictorName makes .sum() concatenate their strings and breaks reset_index().
all_df = ranking_df.groupby(["PredictorName", "Date"])[["CumulDiff7DMA"]].sum(). \
    sort_values(by=["PredictorName", "Date"]).reset_index()

# Add 1 trace per predictor, for all geos
for predictor_name in predictor_names:
    ranking_geoid_df = all_df[all_df.PredictorName == predictor_name]
    ranking_fig.add_trace(go.Scatter(x=ranking_geoid_df.Date,
                             y=ranking_geoid_df.CumulDiff7DMA,
                             name=predictor_name,
                             visible=(ALL_GEO == DEFAULT_GEO))
                 )
    ranking_geoid_plot_names.append(ALL_GEO)


# Add 1 trace per predictor, per geo
for predictor_name in predictor_names:
    for geoid_name in geoid_names:
        ranking_geoid_df = ranking_df[(ranking_df.GeoID == geoid_name) &
                                        (ranking_df.PredictorName == predictor_name)]
        ranking_fig.add_trace(go.Scatter(x=ranking_geoid_df.Date,
                                 y=ranking_geoid_df.CumulDiff7DMA,
                                 name=predictor_name,
                                 visible= (geoid_name == DEFAULT_GEO))
                     )
        ranking_geoid_plot_names.append(geoid_name)

# Format x axis
ranking_fig.update_xaxes(
dtick="D1",  # Means 1 day
tickformat="%d\n%b")

# Filter: one dropdown button per geo, toggling trace visibility and the title
buttons=[]
for geoid_name in ([ALL_GEO] + geoid_names):
    buttons.append(dict(method='update',
                        label=geoid_name,
                        args = [{'visible': [geoid_name==r for r in ranking_geoid_plot_names]},
                                {'title': f'{geoid_name} submission rankings'}]))
ranking_fig.update_layout(showlegend=True,
                          updatemenus=[{"buttons": buttons,
                                        "direction": "down",
                                        "active": ([ALL_GEO] + geoid_names).index(DEFAULT_GEO),
                                        "showactive": True,
                                        "x": 0.1,
                                        "y": 1.15}])

ranking_fig.show()

In [None]:
# all_df = ranking_df.groupby(["PredictorName", "Date"])[["CountryName", "RegionName", "PredictorName", "CumulDiff7DMA"]].sum(). \
#     sort_values(by=["PredictorName", "Date"]).reset_index()
# all_df
