In [1]:
import warnings
import pandas as pd
from itertools import cycle
import gpflow
import numpy as np
from scipy import linalg
from gpflow.utilities import print_summary, positive
from gpflow.ci_utils import ci_niter
from gpflow.optimizers import NaturalGradient
from gpflow import set_trainable
import tensorflow as tf
import tensorflow_probability as tfp
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from collections import namedtuple
import statsmodels.api as sm
import timeit

sns.set(style="white")
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

np.random.seed(123)

gpflow.config.set_default_float(np.float64)
gpflow.config.set_default_summary_fmt("notebook")

  pd.set_option('display.max_colwidth', -1)


In [2]:
def country_selection(country_name, loc_mobility=None, loc_covid=None, print_summary=True, plots=True):
    
    avaliable_countries = ["Germany", "Japan", "Australia", "Canada", 
                          "United States", "South Korea", "Italy", "Argentina", 
                          "Turkey", "Thailand", "Colombia", "Mexico",
                          "India", "Kenya", "Bolivia", "Egypt"]
    
    if country_name not in avaliable_countries:
        warnings.warn("The selected country might not have optimal COVID data.")
    else:
        # https://en.wikipedia.org/wiki/List_of_countries_by_inequality-adjusted_HDI
        if country_name in ["Germany", "Japan", "Australia", "Canada"]:
            # Tier 1: 0.800–1.000 (very high)
            tier = 1         
        elif country_name in ["United States", "South Korea", "Italy", "Argentina"]:
            # Tier 2: 0.700–0.799 (high)
            tier = 2
        elif country_name in ["Turkey", "Thailand", "Algeria", "Mexico"]:
            # Tier 3: 0.550–0.699 (medium)
            tier = 3
        else:
            # Tier 4: 0.350–0.549 (low)
            tier = 4
            
    if loc_mobility==None:
        loc_mobility="./Data/Google/Global_Mobility_Report.csv"
        
    mobility_data = pd.read_csv(loc_mobility, 
                                encoding="ISO-8859-1", 
                                parse_dates=["date"],
                                dayfirst=True, low_memory=False)
    
    mobility_data = mobility_data[
        (mobility_data["country_region"]==country_name) & 
        (mobility_data["sub_region_1"].isnull()) & 
        (mobility_data["metro_area"].isnull())]
    mobility_data = mobility_data.iloc[:,7:].reset_index(drop=True) # keep only relevant columns
    mobility_data.iloc[:,1:] = mobility_data.iloc[:,1:].apply(lambda x: x/100 + 1)
    
    mobility_data = pd.DataFrame({
        "date": mobility_data["date"],
        "HOME": mobility_data['residential_percent_change_from_baseline'],
        "WORK": mobility_data['workplaces_percent_change_from_baseline'],
        "RECREATIONAL": mobility_data['retail_and_recreation_percent_change_from_baseline'],
        "OTHER": mobility_data.iloc[:,2:5].mean(axis=1)
    })
    mobility_data.reset_index(drop=True, inplace=True)
    
    if loc_covid==None:
        loc_covid="./Data/Microsoft/COVID_CASES.csv"
     # COVID Cases from Microsoft, includes Recovered and Infected
    ms_covid = pd.read_csv(loc_covid, 
                           encoding="ISO-8859-1",
                           parse_dates=["Updated"],
                           dayfirst=False, low_memory=False)
    
    ms_covid = ms_covid[(ms_covid["Country_Region"]==country_name) & (ms_covid["AdminRegion1"].isnull())]
    ms_covid = ms_covid.loc[:,["Updated", "Confirmed", "ConfirmedChange", "Deaths", "Recovered"]]   
    
    # Create the 3 categories S, I, R for US data (UK does not include R)
    ms_covid = ms_covid.fillna(method='ffill').fillna(0)

    # DIFF: Instead of smoothing the I, Daily cases and S, ammend the problematic R value @ 2020-07-18
    # To match the next 2020-07-19
    if country_name=="United States":
        ms_covid.iloc[178, 4] = 1117084.0
        
    # Read the countries population from World Bank Database 2019
    pop_dt = pd.read_excel(
        "./Data/Population/Data_Extract_From_World_Development_Indicators.xls")
    

    population_size = pop_dt.loc[pop_dt.CountryName==country_name, "Total"].values[0]
    
    ms_covid["I"] = ms_covid["Confirmed"] - ms_covid["Deaths"] - ms_covid["Recovered"]
    ms_covid["S"] = population_size - ms_covid["I"]
    ms_covid["index"] = ms_covid["Updated"].dt.dayofyear

    # DIFF: Create a time variable
    # If we have a weekend then assign 0 or weekday 1
    ms_covid["WEEKDAY"] = (ms_covid["Updated"].dt.dayofweek < 5).astype(int)
    
    ms_covid["ConfirmedChange"] = np.log(ms_covid["ConfirmedChange"].rolling(7).median()+1)
    ms_covid["I"] = ms_covid["I"].rolling(7).median()
    ms_covid["S"] = ms_covid["S"].rolling(7).median()

    ms_covid.dropna(inplace=True)
    ms_covid.reset_index(drop=True, inplace=True)
    
    final_data = ms_covid.set_index("Updated").merge(
    mobility_data.set_index("date"), how='left', left_index=True, right_index=True).reset_index()
    final_data.dropna(inplace=True)
    final_data.reset_index(drop=True, inplace=True)
    final_data.columns.values[0] = "Updated"
    
    if plots:
        
        fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(20, 4))
        fig.suptitle("{0} is a Tier {1} country in IHDI".format(country_name, int(tier)), fontsize=16)
        final_data.set_index("Updated")[["ConfirmedChange"]].plot(ax=axs[0, 0])
        final_data.set_index("Updated")[["I"]].plot(ax=axs[0, 1])
        final_data.set_index("Updated")[["S"]].plot(ax=axs[0, 2])
        final_data.set_index("Updated")[["HOME"]].plot(ax=axs[1, 0])
        final_data.set_index("Updated")[["WORK"]].plot(ax=axs[1, 1])
        final_data.set_index("Updated")[["OTHER"]].plot(ax=axs[1, 2])
    
    if print_summary:
        print(round(final_data.describe(), 3))
    
    return(final_data)

In [3]:
def create_multioutput_table(long=False):
    
    countries_list = ["Germany", "Japan", "Australia", "Canada", 
                      "United States", "South Korea", "Italy", "Argentina", 
                      "Turkey", "Thailand", "Colombia", "Mexico",
                      "India", "Kenya", "Bolivia", "Egypt"]
    
    if long: # Long table format. We create a column to name each country and we concat the tables.
        for i, country_name in enumerate(countries_list):

            print("\n Read {0} and extract the data...{1}".format(country_name, i))

            if i==0:
                final_data = country_selection(country_name, print_summary=False, plots=False)
                final_data["Country"] = country_name
            else:
                data_batch = country_selection(country_name, print_summary=False, plots=False)
                data_batch["Country"] = country_name
                final_data = pd.concat([final_data, data_batch], axis=0)
            
    else: # We add each country's table as a column extension. We add a suffix on each column country-group.
        for i, country_name in enumerate(countries_list):
            
            print("\n Read {0} and extract the data...{1}".format(country_name, i))
            
            if i==0:
                final_data = country_selection(country_name, print_summary=False, plots=False)
                final_data.columns = [str(col) + '_' + str(country_name) for col in final_data.columns]
            else:    
                data_batch = country_selection(country_name, print_summary=False, plots=False)
                data_batch.columns = [str(col) + '_' + str(country_name) for col in data_batch.columns]
                final_data = pd.concat([final_data, data_batch], axis=1)

        
    return(final_data) 

In [4]:
final_data = create_multioutput_table()


 Read Germany and extract the data...0

 Read Japan and extract the data...1

 Read Australia and extract the data...2

 Read Canada and extract the data...3

 Read United States and extract the data...4

 Read South Korea and extract the data...5

 Read Italy and extract the data...6

 Read Argentina and extract the data...7

 Read Turkey and extract the data...8

 Read Thailand and extract the data...9

 Read Colombia and extract the data...10

 Read Mexico and extract the data...11

 Read India and extract the data...12

 Read Kenya and extract the data...13

 Read Bolivia and extract the data...14

 Read Egypt and extract the data...15


In [5]:
final_data.shape

(164, 208)

## Data Preprocessing (similar to the experiment.py)

I will not create a class for that since we wish to check a very specific task.
The GP model that I will use is going to be the 

In [6]:
# Drop rows with missing values
# In this particular example are going to be 20-30 bottom rows. 
final_data.dropna(axis=0, inplace=True)

In [7]:
# Split the dataset
y = final_data.filter(regex='^ConfirmedChange', axis=1).to_numpy() # Extract columns that contain each country's daily COVID cases
y_cols = final_data.filter(regex='^ConfirmedChange', axis=1).columns


X = final_data.drop(final_data.filter(regex='^Updated|^Confirmed', axis=1).columns, axis=1).to_numpy() # Extract mobility and SI data for each country
X_cols = final_data.drop(final_data.filter(regex='^Updated|^Confirmed', axis=1).columns, axis=1).columns

X_train, X_test = X[:-21,:], X[-21:,:]
y_train, y_test = y[:-21,:], y[-21:,:]

In [8]:
print("The shape of y is: {0} and the shape of X is: {1}".format(y.shape, X.shape))

The shape of y is: (131, 16) and the shape of X is: (131, 160)


In [10]:
countries_list = ["Germany", "Japan", "Australia", "Canada",
                  "United States", "South Korea", "Italy", "Argentina", 
                  "Turkey", "Thailand", "Colombia", "Mexico",
                  "India", "Kenya", "Bolivia", "Egypt"]

In [11]:
P = len(y_cols) # number of target outputs, alternatively the number of countries: 16

# Create a temp dataset as an image of the input data X
temp_data = final_data.filter(X_cols, axis=1).copy()

# Empty lists for kernels and inducing points
mogp_kernel = []
Zs = []

# Create country specific kernels and inducing points through a loop
for i, country in enumerate(countries_list):
    
    # Extract the index location of the country-columns from the dataset X
    # For example, extract the location of mobility and SI columns for Germany
    col_index = [temp_data.columns.get_loc(c) for c in temp_data.filter(regex=str(country)+'$', axis=1).columns.values]
    
    # Create multi-output ARD kernel, one for each country. By linking the kernel to the appropriate X data location
    kern_temp = gpflow.kernels.SquaredExponential(lengthscales=np.ones(len(col_index)), active_dims=col_index) * \
        gpflow.kernels.Linear(active_dims=col_index) * \
        gpflow.kernels.RationalQuadratic(lengthscales=np.ones(len(col_index)), active_dims=col_index)
    
    #Add all the country-specific kernels together in a list
    mogp_kernel.append(kern_temp)
    
    # Create inducing point matrices (<number of points> X <columns allocated to each country>) for each country
    for j, col in enumerate(col_index):
        
        if j==0:
            Z = temp_data.iloc[np.random.randint(temp_data.shape[0], size=15), col]
        else:
            Z = np.vstack([Z, temp_data.iloc[np.random.randint(temp_data.shape[0], size=15), col]])
    
    # Add all the country-specific inducing point matrices together in a list
    Zs.append(Z.T)
            
            
    
# Create multi-output kernel from kernel list
kernel = gpflow.kernels.SeparateIndependent(mogp_kernel)

# Initialize as list inducing inducing variables
iv_list = [gpflow.inducing_variables.InducingPoints(Z) for Z in Zs]

# Create multi-output inducing variables from iv_list
iv = gpflow.inducing_variables.SeparateIndependentInducingVariables(iv_list)



In [14]:
# create SVGP model
m = gpflow.models.SVGP(kernel, gpflow.likelihoods.Gaussian(), inducing_variable=iv, num_latent_gps=P)

In [16]:
def optimize_model_with_scipy(model):
    optimizer = gpflow.optimizers.Scipy()
    optimizer.minimize(
        model.training_loss_closure((X_train, y_train)),
        variables=model.trainable_variables,
        method="l-bfgs-b",
        options={"disp": True, "maxiter": ci_niter(2000)},
    )


optimize_model_with_scipy(m)

Instructions for updating:
Use fn_output_signature instead


InvalidArgumentError:  indices[0] = 90 is not in [0, 10)
	 [[{{node StatefulPartitionedCall/GatherV2_106}}]] [Op:__inference__tf_eval_33983]

Function call stack:
_tf_eval


In [None]:
mogp_kernel

In [None]:
kern_list