### In this notebook, the predictions from the platform are downloaded, saved, and organized to be used by the proposed ensemble algorithms


In [1]:
import numpy as np
import pandas as pd
from epiweeks import Week
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from mosqlient import get_predictions, get_prediction_by_id

import seaborn as sns
import dataframe_image as dfi
from itertools import product
import matplotlib.dates as mdates

# Use a larger default font size in every matplotlib figure
plt.rcParams.update({'font.size': 14})

import warnings
# Silence every FutureWarning (note: this also hides library notices of that category)
warnings.simplefilter(action='ignore', category=FutureWarning)

### Get the predictions

In [2]:
def get_preds(model_id, predict_date = None):
    """
    Fetch the predictions registered for a given model id.

    When `predict_date` is provided it is forwarded to `get_predictions`
    as an additional keyword; otherwise only `model_id` is passed.
    The value returned by `get_predictions` is returned unchanged.
    """
    query = {'model_id': model_id}

    # Only forward the date filter when the caller actually supplied one
    if predict_date is not None:
        query['predict_date'] = predict_date

    return get_predictions(**query)

The cell below fetches the predictions and extracts key information such as the state, the predict_date, and the predicted date range (min_date and max_date). This information is used to deduplicate the predictions if necessary.

In [3]:
%%time 
# One descriptive row per prediction is collected in a plain list and the
# array is built once at the end, instead of re-allocating it with
# np.concatenate on every iteration.
pred_rows = []

for model_id in [21,22,25, 26, 27,28,30,34]:

    print(model_id)
    list_of_preds = get_preds(model_id)

    for pred in list_of_preds:

        df = pred.to_dataframe()

        if df.empty:
            # Without rows there is no state or date range to describe
            print(f'Prediction {pred.id} of model {model_id} has no rows; skipped')
            continue

        first_date, last_date = df.date.min(), df.date.max()

        # Columns: model_id, state, pred_id, year, predict_date, min_date, max_date.
        # Every field is stored as text (as in the original string array), so
        # later cells can keep comparing ids and years against strings.
        # `.iloc[0]` takes the first row by position, whatever the index labels are.
        pred_rows.append([str(model_id), str(df.adm_1.iloc[0]),
                          f'{pred.id}', str(first_date)[:4], str(pred.predict_date),
                          str(first_date), str(last_date)])

# reshape keeps the (n, 7) layout even when no prediction was collected
preds_det = np.array(pred_rows, dtype=str).reshape(-1, 7)
        

21
22
25
26
27
28
30
34
CPU times: user 1 s, sys: 60.1 ms, total: 1.06 s
Wall time: 29.3 s


In [4]:
# Label the collected prediction metadata, one row per prediction
df_desc = pd.DataFrame(
    data = preds_det,
    columns = ['model_id', 'state', 'pred_id', 'year', 'predict_date', 'min_date', 'max_date'],
)

df_desc.head()

Unnamed: 0,model_id,state,pred_id,year,predict_date,min_date,max_date
0,21,RJ,828,2024,2024-09-12,2024-10-06,2025-09-28
1,21,MT,827,2024,2024-09-12,2024-10-06,2025-09-28
2,21,MS,826,2024,2024-09-12,2024-10-06,2025-09-28
3,21,MT,825,2023,2024-09-12,2023-10-08,2024-09-29
4,21,MS,824,2023,2024-09-12,2023-10-08,2024-09-29


The function below is used to keep the latest prediction sent to the API when there are duplicated predictions.

In [5]:
# Custom function that selects which of the duplicated predictions to keep
def custom_filter(group):
    """
    Keep only the row(s) of `group` that hold the highest `pred_id`.

    The ids are compared numerically. They are stored as strings in this
    notebook, and a plain string maximum is lexicographic, so '99' would
    wrongly win over '100'.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of one group of duplicated predictions; must have a `pred_id`
        column holding numbers or numeric strings.

    Returns
    -------
    pd.DataFrame
        The rows of `group` whose `pred_id` equals the numeric maximum of
        the group, with their original index and column values untouched.
        Assumes a higher id means a later submission - TODO confirm.
    """
    pred_ids = pd.to_numeric(group.pred_id)
    return group.loc[pred_ids == pred_ids.max()]

# Keep only the selected prediction of each (model_id, state, year) group.
# The groups are iterated explicitly instead of using `groupby(...).apply(...)`:
# applying a function that still needs the grouping columns is deprecated in
# pandas, whereas iterating a groupby always yields the complete sub-frames.
dedup_keys = ['model_id', 'state', 'year']
latest_preds = [custom_filter(group) for _, group in df_desc.groupby(dedup_keys)]

# pd.concat rejects an empty list, so fall back to an empty frame with the same columns
result = (pd.concat(latest_preds, ignore_index=True) if latest_preds
          else df_desc.iloc[0:0].copy())
result.head()

  result = df_desc.groupby(['model_id', 'state',  'year']).apply(custom_filter).reset_index(drop=True)


Unnamed: 0,model_id,state,pred_id,year,predict_date,min_date,max_date
0,21,AC,776,2022,2024-09-12,2022-10-09,2023-10-01
1,21,AC,792,2023,2024-09-12,2023-10-08,2024-09-29
2,21,AC,808,2024,2024-09-12,2024-10-06,2025-09-28
3,21,AL,461,2022,2024-09-02,2022-10-09,2023-10-01
4,21,AL,483,2023,2024-09-02,2023-10-08,2024-09-29


In [6]:
# Number of deduplicated predictions kept for each model
result.model_id.value_counts()

model_id
21    81
22    81
30    81
34    81
27    79
28    74
25    13
26     2
Name: count, dtype: int64

Since the predictions will be used in the ensemble model, any model/state pair for which the predictions for 2023, 2024 and 2025 were not all sent is removed.

In [7]:
# Relabel the predictions of model 26 as model 25 (model ids are stored as strings here).
# NOTE(review): presumably both ids belong to the same model/team - confirm.
result.loc[result.model_id == '26', 'model_id'] = '25'

In [10]:
# A (model_id, state) pair is kept only if it has at least this many predictions
min_preds = 3

# Number of predictions available for each (model_id, state) pair
pair_counts = result.groupby(['model_id', 'state'])['pred_id'].count().reset_index()

filter_ = pair_counts.loc[pair_counts.pred_id < min_preds]

# Create list of pairs to exclude
exclude_pairs = list(set(zip(filter_.model_id, filter_.state)))

# Optional manual override: pairs listed in `exclude` are kept despite the rule above
#exclude = [('25', 'AM')]
#exclude_pairs = [item for item in exclude_pairs if item not in exclude]

# Filter out rows whose (model_id, state) matches any pair in exclude_pairs.
# A set lookup over the zipped columns avoids a row-wise DataFrame.apply
# combined with a linear search in a list.
excluded = set(exclude_pairs)
keep_row = pd.Series([pair not in excluded
                      for pair in zip(result.model_id, result.state)],
                     index=result.index, dtype=bool)
filtered_df = result[keep_row]

filtered_df.head()

Unnamed: 0,model_id,state,pred_id,year,predict_date,min_date,max_date
0,21,AC,776,2022,2024-09-12,2022-10-09,2023-10-01
1,21,AC,792,2023,2024-09-12,2023-10-08,2024-09-29
2,21,AC,808,2024,2024-09-12,2024-10-06,2025-09-28
3,21,AL,461,2022,2024-09-02,2022-10-09,2023-10-01
4,21,AL,483,2023,2024-09-02,2023-10-08,2024-09-29


In [11]:
# Number of predictions left for each model after the exclusion
filtered_df.model_id.value_counts()

model_id
21    81
22    81
30    81
34    81
27    78
28    66
25    15
Name: count, dtype: int64

Based on the dataframe with the information about the latest predictions, the cell below creates a dataframe with all the predictions and states that will be used to train and apply the ensemble models. The dataframe will contain the columns: ['date', 'pred', 'lower', 'upper', 'adm_1', 'model_id']

In [12]:
def season_dates(start_year):
    """
    Weekly dates covered by the season that starts in `start_year`.

    The range goes from the start date of epiweek 41 of `start_year` to the
    start date of epiweek 40 of the following year (both ends included),
    using a weekly frequency anchored on Sundays.

    Parameters
    ----------
    start_year : int
        Year in which the season begins.

    Returns
    -------
    pd.DatetimeIndex
    """
    first_day = Week(start_year, 41).startdate().strftime('%Y-%m-%d')
    last_day = Week(start_year + 1, 40).startdate().strftime('%Y-%m-%d')

    return pd.date_range(start = first_day, end = last_day, freq = 'W-SUN')


dates_22 = season_dates(2022)

dates_23 = season_dates(2023)

dates_24 = season_dates(2024)

In [13]:
%%time 
# Dates accepted for each season, keyed by the `year` label stored in `filtered_df`
season_dates_by_year = {'2022': dates_22, '2023': dates_23, '2024': dates_24}

# The frames are collected in a list and concatenated once after the loop
season_frames = []

for pred_, year in zip(filtered_df.pred_id, filtered_df.year):

    # Fail loudly on an unexpected year: stopping quietly would leave an
    # incomplete `df_preds` that could be saved without anyone noticing.
    if year not in season_dates_by_year:
        raise ValueError(f'Prediction {pred_} has unexpected year {year!r}; '
                         f'expected one of {sorted(season_dates_by_year)}')

    preds_ = get_prediction_by_id(id = pred_)

    # Keep only the state-level identifier among the administrative columns
    preds_df = preds_.to_dataframe().drop(['adm_0', 'adm_2', 'adm_3'], axis = 1)

    # NOTE(review): this is the id reported by the API, not the (possibly
    # relabelled) `model_id` stored in `filtered_df` - confirm this is intended.
    preds_df['model_id'] = preds_.model.id

    preds_df['date'] = pd.to_datetime(preds_df['date'])

    # to handle the teams that send overlapping predictions:
    # keep only the dates that belong to the season of this prediction
    preds_df = preds_df.loc[preds_df.date.isin(season_dates_by_year[year])]

    season_frames.append(preds_df)

# pd.concat rejects an empty list, so fall back to an empty frame
df_preds = pd.concat(season_frames, ignore_index = True) if season_frames else pd.DataFrame()


df_preds.head()

CPU times: user 10.2 s, sys: 1.22 s, total: 11.4 s
Wall time: 9min 20s


Unnamed: 0,date,pred,lower,upper,adm_1,model_id
0,2022-10-09,110.690113,64.291013,219.826948,AC,21
1,2022-10-16,144.952846,77.564037,279.722708,AC,21
2,2022-10-23,163.613937,92.915072,276.591399,AC,21
3,2022-10-30,170.240443,87.197631,378.407811,AC,21
4,2022-11-06,215.117888,111.835667,445.024983,AC,21


In [14]:
# Save all the predictions as a gzip-compressed CSV (compression is inferred from the '.gz' suffix);
# the DataFrame index is written as the first, unnamed column
df_preds.to_csv('preds_models.csv.gz')