In [None]:
import pandas as pd

# Load data set

In [None]:
# URL of the latest OxCGRT policy-tracker CSV (master branch on GitHub);
# this is the value passed to load_dataset to build latest_df.
LATEST_DATA_URL = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'

In [None]:
def load_dataset(url):
    """Read an OxCGRT-style CSV into a DataFrame.

    Args:
        url: Anything ``pd.read_csv`` accepts as a source (URL, path or
            file-like object). The data must contain ``Date`` and
            ``RegionName`` columns.

    Returns:
        A DataFrame with ``Date`` parsed as datetimes and missing
        ``RegionName`` values replaced by the empty string.
    """
    latest_df = pd.read_csv(url,
                            parse_dates=['Date'],
                            encoding="ISO-8859-1",
                            # Replaces error_bad_lines=False, which was removed in
                            # pandas 2.0: malformed rows are dropped with a warning
                            # instead of aborting the load.
                            on_bad_lines="warn")
    # Rows without a region get "" rather than NaN, so they can be matched
    # with a plain `RegionName == ""` comparison.
    latest_df["RegionName"] = latest_df["RegionName"].fillna("")
    return latest_df

In [None]:
latest_df = load_dataset(LATEST_DATA_URL)

In [None]:
latest_df.sample(3)

# Get NPIs

In [None]:
# Names of the NPI columns (C1-C8 and H1-H3) selected from the loaded data set.
NPI_COLUMNS = [
    'C1_School closing',
    'C2_Workplace closing',
    'C3_Cancel public events',
    'C4_Restrictions on gatherings',
    'C5_Close public transport',
    'C6_Stay at home requirements',
    'C7_Restrictions on internal movement',
    'C8_International travel controls',
    'H1_Public information campaigns',
    'H2_Testing policy',
    'H3_Contact tracing',
]

In [None]:
# Keep the identifying columns plus every NPI column
npis_df = latest_df[["CountryName", "RegionName", "Date", *NPI_COLUMNS]]

In [None]:
npis_df.sample(3)

# Dates

In [None]:
# First day (inclusive) of the evaluation period
start_date = pd.to_datetime("2020-08-01", format="%Y-%m-%d")

In [None]:
# Last day (inclusive) of the evaluation period
end_date = pd.to_datetime("2020-08-04", format="%Y-%m-%d")

In [None]:
# Restrict the NPIs to rows dated within [start_date, end_date], both ends included
actual_npis_df = npis_df.loc[
    npis_df["Date"].ge(start_date) & npis_df["Date"].le(end_date)
]

In [None]:
actual_npis_df.sample(3)

In [None]:
# File name pattern: <start date>_<end date>_npis_example.csv
npis_file_name = "_".join([
    start_date.strftime('%Y-%m-%d'),
    end_date.strftime('%Y-%m-%d'),
    "npis_example.csv",
])
npis_file_name

In [None]:
# Save to file
# actual_npis_df.to_csv("data/input/" + npis_file_name, index=False)

# Get actual cases between these dates

In [None]:
def get_actual_cases(df, start_date, end_date):
    """Compute daily new cases per country/region between two dates.

    Args:
        df: DataFrame with ``CountryName``, ``RegionName``, ``Date`` and
            ``ConfirmedCases`` columns. It is not modified.
        start_date: First date (inclusive) to return; must support
            subtraction of ``pd.offsets.Day``.
        end_date: Last date (inclusive) to return.

    Returns:
        A new DataFrame holding the four columns above plus
        ``ActualDailyNewCases``, sorted by country, region and date and
        limited to rows dated within ``[start_date, end_date]``.
    """
    # 1 day earlier to compute the daily diff
    start_date_for_diff = start_date - pd.offsets.Day(1)
    # Filter out the data set to include all the data needed to compute the diff
    in_diff_window = (df.Date >= start_date_for_diff) & (df.Date <= end_date)
    # sort_values returns a new frame, so the column assignment below never
    # writes into a slice of the caller's DataFrame (no SettingWithCopyWarning).
    actual_df = (
        df.loc[in_diff_window, ["CountryName", "RegionName", "Date", "ConfirmedCases"]]
        .sort_values(by=["CountryName", "RegionName", "Date"])
    )
    # Compute the diff within each country/region; the first row of each group
    # has no predecessor and is reported as 0 new cases.
    actual_df["ActualDailyNewCases"] = (
        actual_df.groupby(["CountryName", "RegionName"])["ConfirmedCases"]
        .diff()
        .fillna(0)
    )
    # Return only the data between start_date and end_date
    return actual_df[(actual_df.Date >= start_date) & (actual_df.Date <= end_date)]

In [None]:
# Parameters for a 7-day moving average of daily new cases: the rolling window
# length and the number of earlier days to load (presumably WINDOW_SIZE - 1, to
# fill the first window - TODO confirm). In the cells shown here they are
# referenced only by the commented-out get_actual_cases_new draft.
NUM_PREV_DAYS_TO_INCLUDE = 6
WINDOW_SIZE = 7

In [None]:
# def get_actual_cases_new(df, start_date, end_date):
#     actual_df = df[["CountryName", "RegionName", "Date", "ConfirmedCases"]].reset_index(drop=True)
#     # Filter out the data set but make sure to include data need for 7MA and Diff computations
#     start_date_for_diff = start_date - pd.offsets.Day(NUM_PREV_DAYS_TO_INCLUDE)
#     actual_df = actual_df[(actual_df.Date >= start_date_for_diff) & (actual_df.Date <= end_date)]
# #     actual_df.sort_values(by=["CountryName","RegionName","Date"], inplace=True)
#     actual_df = actual_df.sort_values(by=["CountryName","RegionName","Date"]).reset_index(drop=True)
    
#     # Compute the diff
#     actual_df["ActualDailyNewCases"] = actual_df.groupby(
#         ["CountryName", "RegionName"])["ConfirmedCases"].diff().fillna(0).reset_index(0, drop=True)
# #     # Make sure daily cases are not negative, which happens in case of corrections
# #     actual_df['ActualDailyNewCases'] = actual_df['ActualDailyNewCases'].clip(lower=0)
    
#     # 7 day moving average
#     actual_df["7DayActualDailyNewCases"] = actual_df.groupby(
#         ["CountryName", "RegionName"])["ActualDailyNewCases"].rolling(WINDOW_SIZE, center=False).mean().reset_index(0, drop=True)
# #     ma_df = actual_df["7DayActualDailyNewCases"] = actual_df.groupby(["CountryName", "RegionName"])["ActualDailyNewCases"].rolling(7, center=False).mean().reset_index(0, drop=True)

#     # Return only the data between start_date and end_date
# #     actual_df = actual_df[(actual_df.Date >= start_date) & (actual_df.Date <= end_date)]
#     return actual_df

In [None]:
actual_df = get_actual_cases(latest_df, start_date, end_date)

In [None]:
actual_df.head(12)

In [None]:
# ma_df = actual_df.groupby(["CountryName", "RegionName"])["ActualDailyNewCases"].rolling(7, center=False).mean().reset_index()
# ma_df.head(12)


# Get predictions from submissions

In [None]:
def get_predictions_from_file(predictor_name, predictions_file):
    """Load one predictor's submission and tag every row with its name.

    Args:
        predictor_name: Label stored in the ``PredictorName`` column.
        predictions_file: Anything ``pd.read_csv`` accepts as a source. The
            data must contain ``Date`` and ``RegionName`` columns.

    Returns:
        A DataFrame with ``PredictorName`` as the first column, ``Date``
        parsed as datetimes and missing ``RegionName`` values replaced by
        the empty string.
    """
    preds_df = pd.read_csv(predictions_file,
                           parse_dates=['Date'],
                           encoding="ISO-8859-1",
                           # Replaces error_bad_lines=False, which was removed in
                           # pandas 2.0: malformed rows are dropped with a warning.
                           on_bad_lines="warn")
    preds_df["RegionName"] = preds_df["RegionName"].fillna("")
    preds_df["PredictorName"] = predictor_name
    # Put PredictorName first
    preds_df = preds_df[["PredictorName"] + [col for col in preds_df.columns if col != "PredictorName"]]
    return preds_df

In [None]:
# Predictor display name -> path of the CSV file holding its predictions
predictions = {
    "Predictor #27": "examples/lstm/tests/fixtures/pred27/20200801_20200804_predictions.csv",
    "Predictor #30": "examples/lstm/tests/fixtures/pred30/20200801_20200804_predictions.csv",
    "Predictor #31": "examples/lstm/tests/fixtures/pred31/20200801_20200804_predictions.csv",
}

In [None]:
test_predictor_name = "Predictor #27"
get_predictions_from_file(test_predictor_name, predictions[test_predictor_name]).head()

In [None]:
actual_df.head()

In [None]:
# Left-join every predictor's submission onto the actual cases, so each actual
# row is kept even when a predictor has no matching prediction for it.
merged_frames = []
for predictor_name, predictions_file in predictions.items():
    preds_df = get_predictions_from_file(predictor_name, predictions_file)
    merged_df = actual_df.merge(preds_df, on=['CountryName', 'RegionName', 'Date'], how='left')
    merged_frames.append(merged_df)
# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
# every iteration); concatenate once instead. An empty mapping still yields an
# empty DataFrame, as before.
ranking_df = pd.concat(merged_frames) if merged_frames else pd.DataFrame()

In [None]:
# Absolute error of each prediction against the actual daily new cases
ranking_df["Diff"] = ranking_df["ActualDailyNewCases"].sub(ranking_df["PredictedDailyNewCases"]).abs()

In [None]:
# Order rows so that, for each country/region/date, the predictor with the
# smallest error comes first. Reassigning (instead of inplace=True) keeps the
# cell free of in-place mutation.
ranking_df = ranking_df.sort_values(by=["CountryName", "RegionName", "Date", "Diff"])

In [None]:
ranking_df.head(12)

In [None]:
ranking_df[(ranking_df.CountryName == "United States") &
           (ranking_df.Date == '2020-08-02')]

In [None]:
# Save to file
# ranking_df.to_csv("/Users/m_754337/workspace/esp-demo/xprize/tests/fixtures/ranking.csv", index=False)

# Ranking

## Global

In [None]:
ranking_df.groupby('PredictorName').Diff.sum().sort_values()

## Countries

In [None]:
# Total absolute error per country/region/predictor, best predictor first
# within each country/region. Only "Diff" is aggregated: the group keys already
# form the index, and selecting them as value columns too makes pandas >= 2.0
# sum (concatenate) the strings and duplicate the key names, which breaks the
# sort below and a later reset_index().
countries_ranking_df = (
    ranking_df
    .groupby(["CountryName", "RegionName", "PredictorName"])[["Diff"]]
    .sum()
    .sort_values(by=["CountryName", "RegionName", "Diff"])
)


In [None]:
countries_ranking_df.head(12)

## Specific country

In [None]:
cr_df = countries_ranking_df.reset_index()

In [None]:
cr_df[(cr_df.CountryName == "Italy") & (cr_df.RegionName == "")]

In [None]:
ranking_df[ranking_df.CountryName == "Italy"]

## Specific country (group by)

In [None]:
ranking_df[(ranking_df.CountryName == "United States") & (ranking_df.RegionName == "")].groupby(["PredictorName"]).Diff.sum().sort_values()

## Specific region

In [None]:
cr_df[(cr_df.CountryName == "United States") & (cr_df.RegionName == "California")]

## Continent

In [None]:
NORTH_AMERICA = ["Canada", "United States", "Mexico"]

In [None]:
cr_df[(cr_df.CountryName.isin(NORTH_AMERICA)) & (cr_df.RegionName == "")].groupby('PredictorName').Diff.sum().sort_values().reset_index()

In [None]:
cr_df[(cr_df.CountryName.isin(NORTH_AMERICA)) & (cr_df.RegionName == "")]