In [1]:
import numpy as np
import pandas as pd
from keras.models import Model, load_model

Using Theano backend.


# Load data set

In [4]:
LATEST_DATA_URL = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'
LOCAL_DATA_URL = "tests/fixtures/OxCGRT_latest.csv"

In [6]:
def load_dataset(url):
    latest_df = pd.read_csv(url,
                            parse_dates=['Date'],
                            encoding="ISO-8859-1",
                            error_bad_lines=False)
    # Handle regions
    latest_df["RegionName"].fillna('', inplace=True)
    # Replace CountryName by CountryName / RegionName
    # np.where usage: if A then B else C
    latest_df["CountryName"] = np.where(latest_df["RegionName"] == '',
                                        latest_df["CountryName"],
                                        latest_df["CountryName"] + ' / ' + latest_df["RegionName"])
    return latest_df

In [7]:
latest_df = load_dataset(LATEST_DATA_URL)

  if (await self.run_code(code, result,  async_=asy)):


In [11]:
latest_df.sample(3)

Unnamed: 0,CountryName,CountryCode,RegionName,RegionCode,Date,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,...,StringencyIndex,StringencyIndexForDisplay,StringencyLegacyIndex,StringencyLegacyIndexForDisplay,GovernmentResponseIndex,GovernmentResponseIndexForDisplay,ContainmentHealthIndex,ContainmentHealthIndexForDisplay,EconomicSupportIndex,EconomicSupportIndexForDisplay
8418,Cameroon,CMR,,,2020-08-12,2.0,1.0,0.0,,2.0,...,60.19,60.19,72.62,72.62,54.49,54.49,59.85,59.85,25.0,25.0
3098,Belgium,BEL,,,2020-07-25,1.0,0.0,2.0,1.0,2.0,...,48.15,48.15,53.57,53.57,55.77,55.77,54.55,54.55,62.5,62.5
38604,Sweden,SWE,,,2020-02-14,0.0,,0.0,,0.0,...,0.0,0.0,0.0,0.0,10.26,10.26,12.12,12.12,0.0,0.0


# Get NPIs

In [8]:
NPI_COLUMNS = ['C1_School closing',
               'C2_Workplace closing',
               'C3_Cancel public events',
               'C4_Restrictions on gatherings',
               'C5_Close public transport',
               'C6_Stay at home requirements',
               'C7_Restrictions on internal movement',
               'C8_International travel controls',
               'H1_Public information campaigns',
               'H2_Testing policy',
               'H3_Contact tracing']

In [13]:
npis_df = latest_df[["CountryName", "Date"] + NPI_COLUMNS]

In [14]:
npis_df.sample(3)

Unnamed: 0,CountryName,Date,C1_School closing,C2_Workplace closing,C3_Cancel public events,C4_Restrictions on gatherings,C5_Close public transport,C6_Stay at home requirements,C7_Restrictions on internal movement,C8_International travel controls,H1_Public information campaigns,H2_Testing policy,H3_Contact tracing
35192,Saudi Arabia,2020-01-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21320,Italy,2020-04-22,3.0,3.0,2.0,4.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0
53331,United States / South Dakota,2020-03-11,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,2.0,1.0,1.0


# Dates

In [20]:
start_date = np.datetime64("2020-08-01")

In [21]:
end_date = np.datetime64("2020-08-04")

In [22]:
actual_npis_df = npis_df[(npis_df.Date >= start_date) & (npis_df.Date <= end_date)]

In [25]:
actual_npis_df.sample(3)

Unnamed: 0,CountryName,Date,C1_School closing,C2_Workplace closing,C3_Cancel public events,C4_Restrictions on gatherings,C5_Close public transport,C6_Stay at home requirements,C7_Restrictions on internal movement,C8_International travel controls,H1_Public information campaigns,H2_Testing policy,H3_Contact tracing
21182,Israel,2020-08-03,1.0,1.0,2.0,3.0,1.0,0.0,0.0,4.0,2.0,1.0,1.0
11060,Djibouti,2020-08-03,2.0,2.0,1.0,3.0,0.0,0.0,0.0,1.0,2.0,3.0,2.0
12745,Eritrea,2020-08-01,3.0,3.0,2.0,4.0,2.0,2.0,2.0,3.0,2.0,1.0,1.0


# Get actual cases between these dates

In [36]:
def get_actual_cases(df):
    actual_df = df[["CountryName", "RegionName", "Date", "ConfirmedCases"]]
    actual_df = actual_df[(actual_df.Date >= start_date) & (actual_df.Date <= end_date)]
    actual_df['ActualDailyNewCases'] = actual_df.groupby('CountryName').ConfirmedCases.diff().fillna(0)
    return actual_df

In [37]:
actual_df = get_actual_cases(latest_df)

In [38]:
actual_df.head()

Unnamed: 0,CountryName,RegionName,Date,ConfirmedCases,ActualDailyNewCases
213,Aruba,,2020-08-01,121.0,0.0
214,Aruba,,2020-08-02,121.0,0.0
215,Aruba,,2020-08-03,122.0,1.0
216,Aruba,,2020-08-04,124.0,2.0
454,Afghanistan,,2020-08-01,36710.0,0.0


# Get predictions from submissions

In [32]:
def get_predictions_from_file(predictor_name, predictions_file):
    preds_df = pd.read_csv(predictions_file,
                           parse_dates=['Date'],
                           encoding="ISO-8859-1",
                           error_bad_lines=False)
    preds_df["PredictorName"] = predictor_name
    return preds_df

In [29]:
def get_predictions_from_model():
    pass

In [115]:
predictions = {"Predictor #27": "tests/fixtures/20200727_predictions.csv",
               "Predictor #30": "tests/fixtures/20200730_predictions.csv",
               "Predictor #31": "tests/fixtures/20200731_predictions.csv",
              }

In [114]:
get_predictions_from_file("Predictor #27", "tests/fixtures/20200727_predictions.csv").head()

Unnamed: 0,CountryName,Date,PredictedDailyNewCases,PredictorName
0,Aruba,2020-08-01,0.78984,Predictor #27
1,Aruba,2020-08-02,0.85769,Predictor #27
2,Aruba,2020-08-03,0.0,Predictor #27
3,Aruba,2020-08-04,0.0,Predictor #27
4,Afghanistan,2020-08-01,73.132138,Predictor #27


In [74]:
actual_df.head()

Unnamed: 0,CountryName,RegionName,Date,ConfirmedCases,ActualDailyNewCases
213,Aruba,,2020-08-01,121.0,0.0
214,Aruba,,2020-08-02,121.0,0.0
215,Aruba,,2020-08-03,122.0,1.0
216,Aruba,,2020-08-04,124.0,2.0
454,Afghanistan,,2020-08-01,36710.0,0.0


In [75]:
ranking_df = pd.DataFrame()
for predictor_name, predictions_file in predictions.items():
    preds_df = get_predictions_from_file(predictor_name, predictions_file)
    merged_df = actual_df.merge(preds_df, on=['CountryName', 'Date'], how='left')
    ranking_df = ranking_df.append(merged_df)

In [76]:
ranking_df['Diff'] = (ranking_df["ActualDailyNewCases"] - ranking_df["PredictedDailyNewCases"]).abs()

In [77]:
ranking_df.sort_values(by=["CountryName","RegionName","Date","Diff"], inplace=True)

In [81]:
ranking_df.head(7)

Unnamed: 0,CountryName,RegionName,Date,ConfirmedCases,ActualDailyNewCases,PredictedDailyNewCases,PredictorName,Diff
4,Afghanistan,,2020-08-01,36710.0,0.0,71.030063,Predictor #30,71.030063
4,Afghanistan,,2020-08-01,36710.0,0.0,72.84468,Predictor #31,72.84468
4,Afghanistan,,2020-08-01,36710.0,0.0,73.132138,Predictor #27,73.132138
5,Afghanistan,,2020-08-02,36710.0,0.0,83.300741,Predictor #30,83.300741
5,Afghanistan,,2020-08-02,36710.0,0.0,84.184771,Predictor #31,84.184771
5,Afghanistan,,2020-08-02,36710.0,0.0,88.111154,Predictor #27,88.111154
6,Afghanistan,,2020-08-03,36710.0,0.0,73.019972,Predictor #30,73.019972


In [82]:
ranking_df[(ranking_df.CountryName == "United States") &
           (ranking_df.Date == '2020-08-01')]

Unnamed: 0,CountryName,RegionName,Date,ConfirmedCases,ActualDailyNewCases,PredictedDailyNewCases,PredictorName,Diff
716,United States,,2020-08-01,4562037.0,0.0,75688.520235,Predictor #30,75688.520235
716,United States,,2020-08-01,4562037.0,0.0,75965.313708,Predictor #31,75965.313708
716,United States,,2020-08-01,4562037.0,0.0,77886.825161,Predictor #27,77886.825161


In [87]:
# Save to file
# ranking_df.to_csv("/Users/m_754337/workspace/esp-demo/xprize/tests/fixtures/ranking.csv", index=False)

# Ranking

## Global

In [84]:
ranking_df.groupby('PredictorName').Diff.sum().sort_values()

PredictorName
Predictor #30    538846.017275
Predictor #31    543817.409773
Predictor #27    580506.131360
Name: Diff, dtype: float64

## Countries

In [102]:
countries_ranking_df = ranking_df.groupby(['CountryName', 'PredictorName'])[["CountryName", "PredictorName", "Diff"]].sum().sort_values(by=["CountryName", "Diff"])

In [108]:
countries_ranking_df.head(12)

Unnamed: 0_level_0,Unnamed: 1_level_0,Diff
CountryName,PredictorName,Unnamed: 2_level_1
Afghanistan,Predictor #30,269.016505
Afghanistan,Predictor #31,276.695271
Afghanistan,Predictor #27,282.474691
Albania,Predictor #30,200.881096
Albania,Predictor #31,203.352445
Albania,Predictor #27,211.087547
Algeria,Predictor #30,1003.766848
Algeria,Predictor #31,1024.419959
Algeria,Predictor #27,1110.465449
Andorra,Predictor #31,22.802686


## Specific country

In [88]:
country = "United States"

In [90]:
ranking_df[ranking_df.CountryName == country].groupby('PredictorName').Diff.sum().sort_values()

PredictorName
Predictor #30     93012.326626
Predictor #31     95197.295834
Predictor #27    102601.571423
Name: Diff, dtype: float64

In [106]:
cr_df = countries_ranking_df.reset_index()

In [107]:
cr_df[cr_df.CountryName == "France"]

Unnamed: 0,CountryName,PredictorName,Diff
129,France,Predictor #30,2396.493863
130,France,Predictor #31,2432.556056
131,France,Predictor #27,2521.886334


## Continent

In [109]:
NORTH_AMERICA = ["Canada", "United States", "Mexico"]

In [111]:
ranking_df[ranking_df.CountryName.isin(NORTH_AMERICA)].groupby('PredictorName').Diff.sum().sort_values()

PredictorName
Predictor #30    105175.948707
Predictor #31    107569.625580
Predictor #27    115495.464296
Name: Diff, dtype: float64