In [None]:
import pandas as pd

# Load data set

In [None]:
# Raw CSV of the latest OxCGRT data, served from the master branch of the
# OxCGRT/covid-policy-tracker GitHub repository.
LATEST_DATA_URL = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'

In [None]:
def load_dataset(url):
    """Load an OxCGRT-style CSV into a DataFrame.

    Args:
        url: anything `pd.read_csv` accepts (URL, local path or file-like
            object). The data must contain `Date` and `RegionName` columns.

    Returns:
        DataFrame with `Date` parsed as datetime and missing `RegionName`
        values replaced by the empty string. Malformed lines are skipped
        instead of raising.
    """
    read_options = dict(parse_dates=['Date'], encoding="ISO-8859-1")
    try:
        # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
        # `on_bad_lines="skip"` is its replacement.
        latest_df = pd.read_csv(url, on_bad_lines="skip", **read_options)
    except TypeError as exc:
        # Only fall back when the keyword itself is unknown (pandas < 1.3);
        # any other TypeError is a genuine error and must surface.
        if "on_bad_lines" not in str(exc):
            raise
        latest_df = pd.read_csv(url, error_bad_lines=False, **read_options)
    latest_df["RegionName"] = latest_df["RegionName"].fillna("")
    return latest_df

In [None]:
# Fetch and parse the data set located at LATEST_DATA_URL
latest_df = load_dataset(LATEST_DATA_URL)

In [None]:
# Display 3 randomly chosen rows of the loaded data
latest_df.sample(3)

# Get NPIs

In [None]:
# Indicator columns of the data set that are kept as NPIs (C1-C8 and H1-H3)
NPI_COLUMNS = [
    'C1_School closing',
    'C2_Workplace closing',
    'C3_Cancel public events',
    'C4_Restrictions on gatherings',
    'C5_Close public transport',
    'C6_Stay at home requirements',
    'C7_Restrictions on internal movement',
    'C8_International travel controls',
    'H1_Public information campaigns',
    'H2_Testing policy',
    'H3_Contact tracing',
]

In [None]:
# Keep the identifying columns followed by every NPI indicator column
npis_df = latest_df.loc[:, ["CountryName", "RegionName", "Date", *NPI_COLUMNS]]

In [None]:
# Display 3 randomly chosen rows of the NPI subset
npis_df.sample(3)

# Dates

In [None]:
# First day of the period of interest (used as an inclusive lower bound below)
start_date = pd.to_datetime("2020-08-01", format='%Y-%m-%d')

In [None]:
# Last day of the period of interest (used as an inclusive upper bound below)
end_date = pd.to_datetime("2020-08-04", format='%Y-%m-%d')

In [None]:
# NPI rows whose Date lies in [start_date, end_date], both ends included
actual_npis_df = npis_df[npis_df["Date"].between(start_date, end_date)]

In [None]:
# Display 3 randomly chosen rows of the date-filtered NPIs
actual_npis_df.sample(3)

In [None]:
# File name: <start date>_<end date>_npis_example.csv, dates as YYYY-MM-DD
npis_file_name = "_".join([
    start_date.strftime('%Y-%m-%d'),
    end_date.strftime('%Y-%m-%d'),
    "npis_example.csv",
])
npis_file_name

In [None]:
# Save to file
# actual_npis_df.to_csv("data/input/" + npis_file_name, index=False)

# Get actual cases between these dates

In [None]:
# NOTE(review): not referenced by any other cell visible in this notebook
NUM_PREV_DAYS_TO_INCLUDE = 6
# Number of days used both for the lead-in history before start_date and for
# the rolling-mean window
WINDOW_SIZE = 7

In [None]:
def get_actual_cases(df, start_date, end_date):
    """Compute actual daily new cases and their moving average per GeoID.

    Args:
        df: DataFrame with `CountryName`, `RegionName`, `Date` and
            `ConfirmedCases` columns.
        start_date: first day of the period of interest.
        end_date: last day of the period of interest (inclusive).

    Returns:
        A new DataFrame, sorted by `GeoID` then `Date`, covering
        `start_date - WINDOW_SIZE days` through `end_date`, with the added
        columns `GeoID`, `ActualDailyNewCases` and `ActualDailyNewCases7DMA`.
        Rows before `start_date` are intentionally kept (the notebook later
        slices them out as history). The first row of each `GeoID` has no
        previous day to diff against, so its `ActualDailyNewCases` is 0.
        `df` itself is not modified.
    """
    # Go back WINDOW_SIZE days so the daily diff and the moving average have
    # enough history by the time start_date is reached
    start_date_for_diff = start_date - pd.offsets.Day(WINDOW_SIZE)
    in_window = (df.Date >= start_date_for_diff) & (df.Date <= end_date)
    # Explicit copy: columns are added below, which must not happen on a
    # slice of the caller's frame (avoids SettingWithCopyWarning)
    actual_df = df.loc[in_window, ["CountryName", "RegionName", "Date", "ConfirmedCases"]].copy()
    actual_df['GeoID'] = actual_df['CountryName'] + '__' + actual_df['RegionName'].astype(str)
    actual_df = actual_df.sort_values(by=["GeoID", "Date"])
    # Daily new cases = day-over-day difference of the cumulative count
    actual_df["ActualDailyNewCases"] = actual_df.groupby("GeoID")["ConfirmedCases"].diff().fillna(0)
    # Trailing WINDOW_SIZE-day moving average; dropping the GeoID index level
    # re-aligns the result with actual_df's own index
    actual_df["ActualDailyNewCases7DMA"] = actual_df.groupby(
        "GeoID")['ActualDailyNewCases'].rolling(
        WINDOW_SIZE, center=False).mean().reset_index(0, drop=True)
    return actual_df

In [None]:
# Actual cases for the period, including the lead-in days before start_date
actual_df = get_actual_cases(latest_df, start_date, end_date)

In [None]:
# Display the first 12 rows (the frame is sorted by GeoID, then Date)
actual_df.head(12)

# Get historical data for 7 days moving average calculation
To compute the 7-day moving average of the predictions from the start date onwards, we need the actual daily new cases for the 7 days before the start date.

In [None]:
# History rows (strictly before start_date), with the actual daily cases
# relabelled as "PredictedDailyNewCases" so they line up with the
# predictions' column name
ma_df = (
    actual_df
    .loc[actual_df["Date"] < start_date,
         ["CountryName", "RegionName", "Date", "ActualDailyNewCases"]]
    .rename(columns={"ActualDailyNewCases": "PredictedDailyNewCases"})
)
ma_df.head()

# Get predictions from submissions

In [None]:
def get_predictions_from_file(predictor_name, predictions_file, ma_df):
    """Load one predictor's predictions and compute their moving average.

    Args:
        predictor_name: label stored in the `PredictorName` column.
        predictions_file: CSV (path, URL or file-like) with `CountryName`,
            `RegionName`, `Date` and `PredictedDailyNewCases` columns.
        ma_df: history rows with `CountryName`, `RegionName`, `Date` and
            `PredictedDailyNewCases` columns, prepended to the predictions
            so the rolling window has data to work with. It is not modified.

    Returns:
        DataFrame sorted by `GeoID` then `Date`, with `PredictorName` as the
        first column, containing the history rows (`Prediction == False`)
        followed by the predictions (`Prediction == True`), plus the
        `GeoID` and `PredictedDailyNewCases7DMA` columns.
    """
    read_options = dict(parse_dates=['Date'], encoding="ISO-8859-1")
    try:
        # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
        # `on_bad_lines="skip"` is its replacement.
        preds_df = pd.read_csv(predictions_file, on_bad_lines="skip", **read_options)
    except TypeError as exc:
        # Only fall back when the keyword itself is unknown (pandas < 1.3)
        if "on_bad_lines" not in str(exc):
            raise
        preds_df = pd.read_csv(predictions_file, error_bad_lines=False, **read_options)
    preds_df["RegionName"] = preds_df["RegionName"].fillna("")
    preds_df["PredictorName"] = predictor_name
    preds_df["Prediction"] = True

    # Tag the history on a new frame (`assign` returns a copy) so the
    # caller's ma_df is left untouched
    history_df = ma_df.assign(PredictorName=predictor_name, Prediction=False)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    preds_df = pd.concat([history_df, preds_df], ignore_index=True, sort=False)

    # Compute the WINDOW_SIZE-day moving average of PredictedDailyNewCases
    preds_df['GeoID'] = preds_df['CountryName'] + '__' + preds_df['RegionName'].astype(str)
    preds_df = preds_df.sort_values(by=["GeoID", "Date"])
    # Dropping the GeoID index level re-aligns the result with preds_df's index
    preds_df["PredictedDailyNewCases7DMA"] = preds_df.groupby(
        "GeoID")['PredictedDailyNewCases'].rolling(
        WINDOW_SIZE, center=False).mean().reset_index(0, drop=True)

    # Put PredictorName first
    preds_df = preds_df[["PredictorName"] + [col for col in preds_df.columns if col != "PredictorName"]]
    return preds_df

In [None]:
# Maps each predictor's display name to the CSV file holding its predictions
predictions = {"Predictor #27": "examples/lstm/tests/fixtures/pred27/20200801_20200804_predictions.csv",
               "Predictor #30": "examples/lstm/tests/fixtures/pred30/20200801_20200804_predictions.csv",
               "Predictor #31": "examples/lstm/tests/fixtures/pred31/20200801_20200804_predictions.csv",
              }

In [None]:
# Try the loader on a single predictor; a copy of ma_df is passed so the
# shared history frame cannot be altered by the call
test_predictor_name = "Predictor #27"
temp_df = get_predictions_from_file(test_predictor_name, predictions[test_predictor_name], ma_df.copy())
temp_df.head(12)

In [None]:
# (121 + 106 + 105 + 103 + 0 + 71 + 73.132138) / 7

In [None]:
# Display the first rows of the actual cases for comparison
actual_df.head()

In [None]:
# Build one merged frame per predictor and concatenate them once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies it on every iteration.
merged_frames = []
for predictor_name, predictions_file in predictions.items():
    # Pass a copy so the shared history frame is never altered by the loader
    preds_df = get_predictions_from_file(predictor_name, predictions_file, ma_df.copy())
    # Left join: keep every actual row, even where a predictor has no value
    merged_df = actual_df.merge(preds_df, on=['CountryName', 'RegionName', 'Date', 'GeoID'], how='left')
    merged_frames.append(merged_df)
# pd.concat raises on an empty list, so keep the original empty-frame result
ranking_df = pd.concat(merged_frames, sort=False) if merged_frames else pd.DataFrame()

In [None]:
# Absolute error between actual and predicted daily new cases
ranking_df['DiffDaily'] = (
    ranking_df["ActualDailyNewCases"]
    .sub(ranking_df["PredictedDailyNewCases"])
    .abs()
)

In [None]:
# Absolute error between the actual and predicted moving averages
ranking_df['Diff7DMA'] = (
    ranking_df["ActualDailyNewCases7DMA"]
    .sub(ranking_df["PredictedDailyNewCases7DMA"])
    .abs()
)

In [None]:
# Keep only the rows dated on or after start_date; this drops the history
# rows that were included solely for the moving-average computation
ranking_df = ranking_df[ranking_df["Date"] >= start_date]

In [None]:
# Order rows by country, region and date, then by moving-average error so
# the closest predictor comes first within each day
ranking_df = ranking_df.sort_values(by=["CountryName", "RegionName", "Date", "Diff7DMA"])

In [None]:
# # Set true dailycases before start_date
# ranking_df["Prediction"] = False
# ranking_df["Prediction"][ranking_df["Date"] >= start_date] = True

In [None]:
# ranking_df.head((7+4)*3)

In [None]:
# Display the first 12 rows (presumably 3 predictors x 4 days -- confirm)
ranking_df.head(3*4)

In [None]:
# All rows for country "United States" (every region) on 2020-08-02
ranking_df[(ranking_df.CountryName == "United States") &
           (ranking_df.Date == '2020-08-02')]

In [None]:
# Save to file
# ranking_df.to_csv("/Users/m_754337/workspace/esp-demo/xprize/tests/fixtures/ranking.csv", index=False)

# Ranking

## Global

In [None]:
# Total moving-average error per predictor over all rows, smallest first
ranking_df.groupby('PredictorName').Diff7DMA.sum().sort_values()

## Countries

In [None]:
# Total moving-average error per (country, region, predictor), ordered so the
# best predictor comes first within each country/region.
# Only "Diff7DMA" is selected for aggregation: also selecting the grouping
# keys makes sum() aggregate those string columns on pandas versions where
# numeric_only is no longer the default, leaving labels that exist both as
# index levels and as columns and breaking sort_values/reset_index.
countries_ranking_df = (
    ranking_df
    .groupby(["CountryName", "RegionName", "PredictorName"])[["Diff7DMA"]]
    .sum()
    .sort_values(by=["CountryName", "RegionName", "Diff7DMA"])
)


In [None]:
# Display the first 12 rows of the per-country/region ranking
countries_ranking_df.head(12)

## Specific country

In [None]:
# Turn the group keys (index levels) back into regular columns so they can
# be used in boolean filters below
cr_df = countries_ranking_df.reset_index()

In [None]:
# Ranking rows for Italy with an empty RegionName (missing region names were
# replaced by "" when the data was loaded)
cr_df[(cr_df.CountryName == "Italy") & (cr_df.RegionName == "")]

In [None]:
# Row-level detail (per date and predictor) for Italy
ranking_df[ranking_df.CountryName == "Italy"]

## Specific country (group by)

In [None]:
# Same kind of ranking computed directly from ranking_df: total
# moving-average error per predictor for "United States" rows with an empty
# RegionName, smallest first
ranking_df[(ranking_df.CountryName == "United States") & (ranking_df.RegionName == "")].groupby(["PredictorName"]).Diff7DMA.sum().sort_values()

## Specific region

In [None]:
# Ranking rows for the region "California" of "United States"
cr_df[(cr_df.CountryName == "United States") & (cr_df.RegionName == "California")]

## Continent

In [None]:
# Country names aggregated together for the continent-level ranking below
NORTH_AMERICA = ["Canada", "United States", "Mexico"]

In [None]:
# Total moving-average error per predictor over the NORTH_AMERICA countries
# (rows with an empty RegionName only), smallest first
cr_df[(cr_df.CountryName.isin(NORTH_AMERICA)) & (cr_df.RegionName == "")].groupby('PredictorName').Diff7DMA.sum().sort_values().reset_index()

In [None]:
# Per-country rows behind the NORTH_AMERICA aggregate above
cr_df[(cr_df.CountryName.isin(NORTH_AMERICA)) & (cr_df.RegionName == "")]