In [None]:
import numpy as np
import pandas as pd
from keras.models import Model, load_model

# Load data set

In [None]:
# Remote copy of the OxCGRT "latest" CSV (master branch of the tracker repo).
LATEST_DATA_URL = "https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv"
# Path of a local copy of the same CSV, relative to the working directory.
LOCAL_DATA_URL = 'data/additional/OxCGRT_latest.csv'

In [None]:
def load_dataset(url):
    """Load an OxCGRT-style CSV and fold the region into ``CountryName``.

    Parameters
    ----------
    url : str, path object or file-like object
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed file with ``Date`` parsed as datetimes, missing
        ``RegionName`` values replaced by ``''`` and ``CountryName`` rewritten
        to ``"<CountryName> / <RegionName>"`` on rows that have a region.
        Malformed lines are skipped rather than raising.
    """
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
    # `on_bad_lines="skip"` is its replacement (requires pandas >= 1.3).
    latest_df = pd.read_csv(url,
                            parse_dates=['Date'],
                            encoding="ISO-8859-1",
                            on_bad_lines="skip")
    # Handle regions. Assign the result back instead of calling
    # `fillna(..., inplace=True)` on the column selection: that is chained
    # assignment and does not reliably update the frame under copy-on-write.
    latest_df["RegionName"] = latest_df["RegionName"].fillna('')
    # Replace CountryName by CountryName / RegionName
    # np.where usage: if A then B else C
    latest_df["CountryName"] = np.where(latest_df["RegionName"] == '',
                                        latest_df["CountryName"],
                                        latest_df["CountryName"] + ' / ' + latest_df["RegionName"])
    return latest_df

In [None]:
# Fetch the most recent published data set from the remote URL.
latest_df = load_dataset(url=LATEST_DATA_URL)

In [None]:
# Eyeball three random rows of the loaded frame.
latest_df.sample(n=3)

# Get NPIs

In [None]:
# OxCGRT indicator columns used as NPIs in this notebook (C1-C8 and H1-H3).
NPI_COLUMNS = [
    "C1_School closing",
    "C2_Workplace closing",
    "C3_Cancel public events",
    "C4_Restrictions on gatherings",
    "C5_Close public transport",
    "C6_Stay at home requirements",
    "C7_Restrictions on internal movement",
    "C8_International travel controls",
    "H1_Public information campaigns",
    "H2_Testing policy",
    "H3_Contact tracing",
]

In [None]:
# Keep only the identifying columns plus the NPI indicator columns.
npis_df = latest_df[["CountryName", "Date", *NPI_COLUMNS]]

In [None]:
# Eyeball three random rows of the NPI frame.
npis_df.sample(n=3)

# Dates

In [None]:
# First day (inclusive) of the evaluation window, at day resolution.
start_date = np.datetime64("2020-08-01", "D")

In [None]:
# Last day (inclusive) of the evaluation window, at day resolution.
end_date = np.datetime64("2020-08-04", "D")

In [None]:
# Rows whose Date lies in [start_date, end_date]; both bounds are inclusive.
actual_npis_df = npis_df[npis_df["Date"].between(start_date, end_date)]

In [None]:
# Eyeball three random rows of the date-filtered NPI frame.
actual_npis_df.sample(n=3)

In [None]:
# Build "<start>_<end>_npis.csv", each date rendered at day precision with
# the dashes stripped (YYYYMMDD).
start_date_str, end_date_str = (
    np.datetime_as_string(day, unit="D").replace("-", "")
    for day in (start_date, end_date)
)
npis_file_name = f"{start_date_str}_{end_date_str}_npis.csv"
npis_file_name

In [None]:
# Save to file
# actual_npis_df.to_csv("data/input/" + npis_file_name, index=False)

# Get actual cases between these dates

In [None]:
def get_actual_cases(df, start=None, end=None):
    """Return per-row confirmed cases and daily new cases inside a date window.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``CountryName``, ``RegionName``, ``Date`` and
        ``ConfirmedCases``. Rows are assumed to be in chronological order
        within each ``CountryName`` (the diff is taken in row order).
    start, end : datetime-like, optional
        Inclusive window bounds. When omitted, the notebook-level
        ``start_date`` / ``end_date`` are used, as before.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` is not modified) holding the four input columns
        plus ``ActualDailyNewCases``, restricted to ``start <= Date <= end``.
    """
    # Fall back to the notebook-level window so existing calls keep working.
    if start is None:
        start = start_date
    if end is None:
        end = end_date
    # Copy so the new column is not written onto a view of the caller's frame.
    actual_df = df[["CountryName", "RegionName", "Date", "ConfirmedCases"]].copy()
    # Diff over the full history *before* applying the window. Diffing after
    # the filter leaves the first day of the window without a predecessor, so
    # it was always reported as 0 new cases.
    actual_df['ActualDailyNewCases'] = actual_df.groupby('CountryName').ConfirmedCases.diff().fillna(0)
    in_window = (actual_df.Date >= start) & (actual_df.Date <= end)
    return actual_df[in_window].copy()

In [None]:
# Actual case counts for the evaluation window, from the loaded data set.
actual_df = get_actual_cases(df=latest_df)

In [None]:
# First five rows of the actual-cases frame.
actual_df.head(n=5)

# Get predictions from submissions

In [None]:
def get_predictions_from_file(predictor_name, predictions_file):
    """Read a predictions CSV and tag every row with the predictor's name.

    Parameters
    ----------
    predictor_name : str
        Label written into the ``PredictorName`` column of every row.
    predictions_file : str, path object or file-like object
        Anything ``pandas.read_csv`` accepts; must contain a ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        The parsed file with ``PredictorName`` as its first column, followed
        by the file's columns in their original order. Malformed lines are
        skipped rather than raising.
    """
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
    # `on_bad_lines="skip"` is its replacement (requires pandas >= 1.3).
    preds_df = pd.read_csv(predictions_file,
                           parse_dates=['Date'],
                           encoding="ISO-8859-1",
                           on_bad_lines="skip")
    preds_df["PredictorName"] = predictor_name
    # Put PredictorName first
    preds_df = preds_df[["PredictorName"] + [col for col in preds_df.columns if col != "PredictorName"]]
    return preds_df

In [None]:
def get_predictions_from_model():
    """Placeholder for model-based predictions; not implemented, returns ``None``."""
    return None

In [None]:
# Predictor label -> path of that predictor's predictions CSV.
predictions = {
    "Predictor #27": "tests/fixtures/pred27/20200801_20200804_predictions.csv",
    "Predictor #30": "tests/fixtures/pred30/20200801_20200804_predictions.csv",
    "Predictor #31": "tests/fixtures/pred31/20200801_20200804_predictions.csv",
}

In [None]:
# Try the loader on a single submission and show its first rows.
test_predictor_name = "Predictor #27"
get_predictions_from_file(
    predictor_name=test_predictor_name,
    predictions_file=predictions[test_predictor_name],
).head()

In [None]:
# Re-display the actual cases next to the predictions shown above.
actual_df.head(n=5)

In [None]:
# Left-join every predictor's predictions onto the actual cases, then stack
# the per-predictor frames. `DataFrame.append` was removed in pandas 2.0 (and
# re-copied the accumulated frame on every iteration), so the pieces are
# collected in a list and concatenated once.
merged_frames = []
for predictor_name, predictions_file in predictions.items():
    preds_df = get_predictions_from_file(predictor_name, predictions_file)
    merged_df = actual_df.merge(preds_df, on=['CountryName', 'Date'], how='left')
    merged_frames.append(merged_df)
# `pd.concat` raises on an empty list; keep the original empty-frame result.
ranking_df = pd.concat(merged_frames) if merged_frames else pd.DataFrame()

In [None]:
# Absolute error between actual and predicted daily new cases, per row.
ranking_df["Diff"] = ranking_df["ActualDailyNewCases"].sub(ranking_df["PredictedDailyNewCases"]).abs()

In [None]:
# Order rows by place and date, then by ascending error within each day.
# Rebinding the sorted result replaces `inplace=True`, which brings no
# benefit and prevents chaining.
ranking_df = ranking_df.sort_values(by=["CountryName", "RegionName", "Date", "Diff"])

In [None]:
# First seven rows of the sorted ranking frame.
ranking_df.head(n=7)

In [None]:
# All predictors' rows for the United States on 2020-08-01.
ranking_df.loc[
    (ranking_df["CountryName"] == "United States")
    & (ranking_df["Date"] == "2020-08-01")
]

In [None]:
# Save to file
# ranking_df.to_csv("tests/fixtures/ranking.csv", index=False)

# Ranking

## Global

In [None]:
# Total absolute error per predictor over all rows, smallest first.
(
    ranking_df
    .groupby("PredictorName")["Diff"]
    .sum()
    .sort_values()
)

## Countries

In [None]:
# Total absolute error per (country, predictor), ordered by country and then
# by ascending error. Only `Diff` is aggregated: also selecting the grouping
# keys makes pandas >= 2 sum (concatenate) those string columns, so
# `CountryName` ends up as both an index level and a column and
# `sort_values` raises an ambiguity error.
countries_ranking_df = (
    ranking_df
    .groupby(["CountryName", "PredictorName"])[["Diff"]]
    .sum()
    .sort_values(by=["CountryName", "Diff"])
)

In [None]:
# First twelve rows of the per-country ranking.
countries_ranking_df.head(n=12)

## Specific country

In [None]:
# CountryName value used by the single-country ranking in the next cell.
country = 'United States'

In [None]:
# Total absolute error per predictor for the selected country, smallest first.
(
    ranking_df.loc[ranking_df["CountryName"] == country]
    .groupby("PredictorName")["Diff"]
    .sum()
    .sort_values()
)

In [None]:
# Move the group-by index levels back into ordinary columns.
cr_df = countries_ranking_df.reset_index(drop=False)

In [None]:
# Per-predictor totals for France.
cr_df.loc[cr_df["CountryName"] == "France"]

## Continent

In [None]:
# CountryName values grouped together for the continent-level ranking.
NORTH_AMERICA = [
    "Canada",
    "United States",
    "Mexico",
]

In [None]:
# Total absolute error per predictor over the listed countries, smallest first.
(
    ranking_df.loc[ranking_df["CountryName"].isin(NORTH_AMERICA)]
    .groupby("PredictorName")["Diff"]
    .sum()
    .sort_values()
)