# Tests for `load_data` 
This notebook serves to address [this](https://github.com/ciemss/pyciemss/issues/434) Github issue.

The new interface for calibrate requires csv files, but there are lots of ways that csv files can fail to provide information in the right format. Test the most common failure modes, such as:

1. Missing data
2. Incorrectly typed columns
3. Mislabeled columns
4. Header columns have one fewer column than data
5. Alignment issues
6. Escaping commas
7. Na, NaN, None, '',,

All of these issues will make it difficult to convert a dataframe to a correctly typed tensor.

### Load dependencies

In [1]:
import os
import pyciemss
from pyciemss.interfaces import calibrate

### Load a few models

In [2]:
MODEL_PATH = "https://raw.githubusercontent.com/DARPA-ASKEM/simulation-integration/main/data/models/"

petri1 = os.path.join(MODEL_PATH, "SEIRHD_base_model01_petrinet.json")
regnet1 = os.path.join(MODEL_PATH, "LV_goat_chupacabra_regnet.json")
stock1 = os.path.join(MODEL_PATH, "SIR_stockflow.json")

### Load datasets with various errors and forms of missing data

In [6]:
dataset1 = "../../../docs/source/sa-testing-notebooks/SIR_data_case_hosp.csv" # this dataset should work fine
dataset2 = "../../../docs/source/sa-testing-notebooks/SIR_missing_data.csv"
dataset3 = "../../../docs/source/sa-testing-notebooks/SIR_data_nan.csv"
dataset4 = "../../../docs/source/sa-testing-notebooks/SIR_missing_data_space.csv"
dataset5 = "../../../docs/source/sa-testing-notebooks/SIR_data_none_type.csv"
dataset6 = "../../../docs/source/sa-testing-notebooks/SIR_data_wrong_time_col_name.csv"

# Build `check_data` function to go inside of `load_data`

### Define `load` function

In [31]:
from typing import Dict, Tuple

import pandas as pd
import torch

def load_data(
    path: str, data_mapping: Dict[str, str] = {}
) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
    """
    Load data from a CSV file.

    - path: path to the CSV file
    - data_mapping: A mapping from column names in the data file to state variable names in the model.
        - keys: str name of column in dataset
        - values: str name of state/observable in model
    - If not provided, we will assume that the column names in the data file match the state variable names.
    """

    def check_data(data_path: str):
        # This function checks a dataset formatting errors, and returns a dataframe
        data_df = pd.read_csv(path)
        print(data_df.head())
        
        # Check that the first column name is "Timestamp"
        assert data_df.columns[0] == "Timestamp", "The column of timepoints must be first, and named 'Timestamp'."
        
        # Check that there are no NaN values or empty entries
        assert not data_df.isna().any().any(), "Dataset cannot contain NaN or empty entries"
        
        # Check that there is no missing data in the form of None type or char values
        assert data_df.applymap(lambda x: isinstance(x, (int, float))).all().all(), "Dataset cannot contain None type or char values. All entries must be of type `int` or `float`."
        
        return data_df
    
    df = check_data(path)

    data_timepoints = torch.tensor(df["Timestamp"].values, dtype=torch.float32)
    data = {}

    for col in df.columns:
        if col == "Timestamp":
            continue

        if col in data_mapping:
            data[data_mapping[col]] = torch.tensor(df[col].values, dtype=torch.float32)
        else:
            data[col] = torch.tensor(df[col].values, dtype=torch.float32)
        # TODO: address missing data

    return data_timepoints, data

### Check for errors in datasets

In [36]:
data_mapping = {"case": "I", "hosp": "H"}

load_data(dataset3, data_mapping)

   Timestamp  case  hosp
0        1.1  15.0   0.1
1        2.2   NaN   1.0
2        3.3  20.0   2.2


AssertionError: Dataset cannot contain NaN or empty entries

## Calibrate models to data

### Calibrate works when data is mapped to STATE VARIABLES...

In [4]:
data_mapping = {"case": "I", "hosp": "H"} # data_mapping = "column_name": "observable/state_variable"
num_iterations = 10
calibrated_results = calibrate(petri1, dataset1, data_mapping=data_mapping, num_iterations=num_iterations)
parameter_estimates = calibrated_results[0]
calibrated_results

(AutoGuideList(
   (0): AutoDelta()
   (1): AutoLowRankMultivariateNormal()
 ),
 49.29112687706947)

In [5]:
point_estimates = calibrated_results[0](0)
point_estimates

{'persistent_beta': tensor(0.3970, grad_fn=<ExpandBackward0>),
 'persistent_gamma': tensor(0.3801, grad_fn=<ExpandBackward0>),
 'persistent_hosp': tensor(0.0948, grad_fn=<ExpandBackward0>),
 'persistent_death_hosp': tensor(0.0401, grad_fn=<ExpandBackward0>),
 'persistent_I0': tensor(8.8743, grad_fn=<ExpandBackward0>)}

In [6]:
gaussian_estimates = calibrated_results[0](1)
gaussian_estimates

{'persistent_beta': tensor(0.4286, grad_fn=<ExpandBackward0>),
 'persistent_gamma': tensor(0.3866, grad_fn=<ExpandBackward0>),
 'persistent_hosp': tensor(0.1157, grad_fn=<ExpandBackward0>),
 'persistent_death_hosp': tensor(0.0383, grad_fn=<ExpandBackward0>),
 'persistent_I0': tensor(10.0013, grad_fn=<ExpandBackward0>)}

### Pass inferred parameters to sample 

In [9]:
start_time = 0.0
end_time = 100.
logging_step_size = 10.0
num_samples = 3

results = pyciemss.sample(petri1, end_time, logging_step_size, num_samples, 
                start_time=start_time, inferred_parameters=parameter_estimates)
results

{'data':     timepoint_id  sample_id  persistent_beta_param  persistent_gamma_param  \
 0              0          0               0.491275                0.369251   
 1              1          0               0.491275                0.369251   
 2              2          0               0.491275                0.369251   
 3              3          0               0.491275                0.369251   
 4              4          0               0.491275                0.369251   
 5              5          0               0.491275                0.369251   
 6              6          0               0.491275                0.369251   
 7              7          0               0.491275                0.369251   
 8              8          0               0.491275                0.369251   
 9              0          1               0.419461                0.365113   
 10             1          1               0.419461                0.365113   
 11             2          1               0

### ...but NOT when data is mapped to OBSERVABLES

In [10]:
# Try with infected_observable 
data_mapping = {"hosp": "hospitalized_observable", "case": "infected_observable"}
results = calibrate(petri1, dataset1, data_mapping=data_mapping)
results

KeyError: 'infected_observable'

### (1) Missing data

In [41]:
# Calibrate fails when there is missing data (an empty string) in the dataset
data_mapping = {"case": "I", "hosp": "H"} # data_mapping = "column_name": "observable/state_variable"
calibrated_results = calibrate(petri2, dataset2, data_mapping=data_mapping)
parameter_estimates = calibrated_results[0]
calibrated_results

ValueError: Error while computing log_prob at site 'I_noisy':
Expected value argument (Tensor of shape (3,)) to be within the support (Real()) of the distribution Normal(loc: torch.Size([3]), scale: torch.Size([3])), but found invalid values:
tensor([15., nan, 20.])
                               Trace Shapes:    
                                Param Sites:    
numeric_initial_state_func$$$_nodes.2._value    
numeric_initial_state_func$$$_nodes.3._value    
numeric_initial_state_func$$$_nodes.4._value    
numeric_initial_state_func$$$_nodes.5._value    
                               Sample Sites:    
                        persistent_beta dist |  
                                       value |  
                                    log_prob |  
                       persistent_gamma dist |  
                                       value |  
                                    log_prob |  
                        persistent_hosp dist |  
                                       value |  
                                    log_prob |  
                  persistent_death_hosp dist |  
                                       value |  
                                    log_prob |  
                          persistent_I0 dist |  
                                       value |  
                                    log_prob |  
                                H_noisy dist | 3
                                       value | 3
                                    log_prob |  
                                I_noisy dist | 3
                                       value | 3

In [42]:
# Calibrate fails when there is a NaN value in the dataset
data_mapping = {"case": "I", "hosp": "H"} # data_mapping = "column_name": "observable/state_variable"
calibrated_results = calibrate(petri2, dataset3, data_mapping=data_mapping)
parameter_estimates = calibrated_results[0]
calibrated_results

ValueError: Error while computing log_prob at site 'I_noisy':
Expected value argument (Tensor of shape (3,)) to be within the support (Real()) of the distribution Normal(loc: torch.Size([3]), scale: torch.Size([3])), but found invalid values:
tensor([15., nan, 20.])
                               Trace Shapes:    
                                Param Sites:    
numeric_initial_state_func$$$_nodes.2._value    
numeric_initial_state_func$$$_nodes.3._value    
numeric_initial_state_func$$$_nodes.4._value    
numeric_initial_state_func$$$_nodes.5._value    
                               Sample Sites:    
                        persistent_beta dist |  
                                       value |  
                                    log_prob |  
                       persistent_gamma dist |  
                                       value |  
                                    log_prob |  
                        persistent_hosp dist |  
                                       value |  
                                    log_prob |  
                  persistent_death_hosp dist |  
                                       value |  
                                    log_prob |  
                          persistent_I0 dist |  
                                       value |  
                                    log_prob |  
                                H_noisy dist | 3
                                       value | 3
                                    log_prob |  
                                I_noisy dist | 3
                                       value | 3

In [43]:
# Calibrate fails when there is a space ' ' in the dataset
data_mapping = {"case": "I", "hosp": "H"} # data_mapping = "column_name": "observable/state_variable"
calibrated_results = calibrate(petri2, dataset4, data_mapping=data_mapping)
parameter_estimates = calibrated_results[0]
calibrated_results

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [44]:
# Calibrate fails when there is None type in the dataset
data_mapping = {"case": "I", "hosp": "H"} # data_mapping = "column_name": "observable/state_variable"
calibrated_results = calibrate(petri2, dataset5, data_mapping=data_mapping)
parameter_estimates = calibrated_results[0]
calibrated_results

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

### (2) Incorrectly typed columns

### (3) Mislabeled columns

### (4) Header columns have one fewer column than data

### (5) Other alignment issues

### (6) Escaping commas

### (7) Na, NaN, None, empty string

### Check `load_data` function for each dataset

In [13]:
from typing import Dict, Tuple

import pandas as pd
import torch

def load_data(
    path: str, data_mapping: Dict[str, str] = {}
) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
    """
    Load data from a CSV file.

    - path: path to the CSV file
    - data_mapping: A mapping from column names in the data file to state variable names in the model.
        - keys: str name of column in dataset
        - values: str name of state/observable in model
    - If not provided, we will assume that the column names in the data file match the state variable names.
    """

    df = pd.read_csv(path)

    data_timepoints = torch.tensor(df["Timestamp"].values, dtype=torch.float32)
    data = {}

    for col in df.columns:
        if col == "Timestamp":
            continue

        if col in data_mapping:
            data[data_mapping[col]] = torch.tensor(df[col].values, dtype=torch.float32)
        else:
            data[col] = torch.tensor(df[col].values, dtype=torch.float32)
        # TODO: address missing data

    return data_timepoints, data

In [18]:
# Load data function works for datasets 1 - 3, but fails for datasets 4 (space ' ') and 5 (None type data)
load_data(dataset5, data_mapping)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.