# Tests for `load_data` 
This notebook serves to address [this](https://github.com/ciemss/pyciemss/issues/434) Github issue.

The new interface for calibrate requires csv files, but there are lots of ways that csv files can fail to provide information in the right format. Test the most common failure modes, such as:

1. Missing data
2. Incorrectly typed columns
3. Mislabeled columns
4. Header columns have one fewer column than data
5. Alignment issues
6. Escaping commas
7. Na, NaN, None, '',,

All of these issues will make it difficult to convert a dataframe to a correctly typed tensor.

### Load dependencies

In [1]:
import os
import pyciemss
from pyciemss.interfaces import calibrate

### Load models

In [2]:
MODEL_PATH = "https://raw.githubusercontent.com/DARPA-ASKEM/simulation-integration/main/data/models/"

# Petrinets
petri1 = os.path.join(MODEL_PATH, "SEIRD_base_model01_petrinet.json")
petri2 = os.path.join(MODEL_PATH, "SEIRHD_base_model01_petrinet.json")
petri3 = os.path.join(MODEL_PATH, "SEIRHD_with_reinfection01_petrinet.json")
petri4 = os.path.join(MODEL_PATH, "SEIRHD_NPI_Type1_petrinet.json")
petri5 = os.path.join(MODEL_PATH, "SEIRHD_NPI_Type2_petrinet.json")

# Regnets
regnet1 = os.path.join(MODEL_PATH, "LV_goat_chupacabra_regnet.json")
regnet2 = os.path.join(MODEL_PATH, "LV_sheep_foxes_regnet.json")
regnet3 = os.path.join(MODEL_PATH, "LV_rabbits_wolves_regnet.json")
regnet4 = os.path.join(MODEL_PATH, "LV_rabbits_wolves_model02_regnet.json")
regnet5 = os.path.join(MODEL_PATH, "LV_rabbits_wolves_model03_regnet.json")

# Stock-and-Flow
stock1 = os.path.join(MODEL_PATH, "SIR_stockflow.json")
stock2 = os.path.join(MODEL_PATH, "SEIR_stockflow.json")
stock3 = os.path.join(MODEL_PATH, "SEIRD_stockflow.json")
stock4 = os.path.join(MODEL_PATH, "SEIRHD_stockflow.json")
stock5 = os.path.join(MODEL_PATH, "SEIRHDS_stockflow.json")

### Load datasets

In [3]:
dataset1 = "../../../docs/source/sa-testing-notebooks/SIR_data_case_hosp.csv" # this dataset should work fine
dataset2 = "../../../docs/source/sa-testing-notebooks/SIR_missing_data.csv"
dataset3 = "../../../docs/source/sa-testing-notebooks/SIR_data_nan.csv"
dataset4 = "../../../docs/source/sa-testing-notebooks/SIR_missing_data_space.csv"
dataset5 = "../../../docs/source/sa-testing-notebooks/SIR_data_none_type.csv"

### Calibrate models to data

### Calibrate works when data is mapped to STATE VARIABLES...

In [23]:
data_mapping = {"case": "I", "hosp": "H"} # data_mapping = "column_name": "observable/state_variable"
calibrated_results = calibrate(petri2, dataset1, data_mapping=data_mapping)
parameter_estimates = calibrated_results[0]
calibrated_results

(AutoGuideList(
   (0): AutoDelta()
   (1): AutoLowRankMultivariateNormal()
 ),
 38.235247045755386)

In [37]:
point_estimates = calibrated_results[0](0)
point_estimates

{'persistent_beta': tensor(0.5314, grad_fn=<ExpandBackward0>),
 'persistent_gamma': tensor(0.3371, grad_fn=<ExpandBackward0>),
 'persistent_hosp': tensor(0.1528, grad_fn=<ExpandBackward0>),
 'persistent_death_hosp': tensor(0.0658, grad_fn=<ExpandBackward0>),
 'persistent_I0': tensor(6.7018, grad_fn=<ExpandBackward0>)}

In [38]:
gaussian_estimates = calibrated_results[0](1)
gaussian_estimates

{'persistent_beta': tensor(0.4895, grad_fn=<ExpandBackward0>),
 'persistent_gamma': tensor(0.3392, grad_fn=<ExpandBackward0>),
 'persistent_hosp': tensor(0.1540, grad_fn=<ExpandBackward0>),
 'persistent_death_hosp': tensor(0.0672, grad_fn=<ExpandBackward0>),
 'persistent_I0': tensor(6.5047, grad_fn=<ExpandBackward0>)}

### Pass inferred parameters to sample 

In [40]:
start_time = 0.0
end_time = 100.
logging_step_size = 10.0
num_samples = 3

pyciemss.sample(petri2, end_time, logging_step_size, num_samples, 
                start_time=start_time, inferred_parameters=parameter_estimates)

{'data':     timepoint_id  sample_id  persistent_beta_param  persistent_gamma_param  \
 0              0          0               0.520472                0.326866   
 1              1          0               0.520472                0.326866   
 2              2          0               0.520472                0.326866   
 3              3          0               0.520472                0.326866   
 4              4          0               0.520472                0.326866   
 5              5          0               0.520472                0.326866   
 6              6          0               0.520472                0.326866   
 7              7          0               0.520472                0.326866   
 8              8          0               0.520472                0.326866   
 9              0          1               0.528125                0.347415   
 10             1          1               0.528125                0.347415   
 11             2          1               0

### ...but NOT when data is mapped to OBSERVABLES

In [6]:
# Try with infected_observable 
data_mapping = {"hosp": "hospitalized_observable", "case": "infected_observable"}
results = calibrate(petri2, dataset2, data_mapping=data_mapping)
results

KeyError: 'hospitalized_observable'

In [4]:
data_mapping = {"case": "infected", "hosp": "hospitalized"}
results = calibrate(petri2, dataset2, data_mapping=data_mapping)
results

KeyError: 'hospitalized'

### (1) Missing data

In [41]:
# Calibrate fails when there is missing data (an empty string) in the dataset
data_mapping = {"case": "I", "hosp": "H"} # data_mapping = "column_name": "observable/state_variable"
calibrated_results = calibrate(petri2, dataset2, data_mapping=data_mapping)
parameter_estimates = calibrated_results[0]
calibrated_results

ValueError: Error while computing log_prob at site 'I_noisy':
Expected value argument (Tensor of shape (3,)) to be within the support (Real()) of the distribution Normal(loc: torch.Size([3]), scale: torch.Size([3])), but found invalid values:
tensor([15., nan, 20.])
                               Trace Shapes:    
                                Param Sites:    
numeric_initial_state_func$$$_nodes.2._value    
numeric_initial_state_func$$$_nodes.3._value    
numeric_initial_state_func$$$_nodes.4._value    
numeric_initial_state_func$$$_nodes.5._value    
                               Sample Sites:    
                        persistent_beta dist |  
                                       value |  
                                    log_prob |  
                       persistent_gamma dist |  
                                       value |  
                                    log_prob |  
                        persistent_hosp dist |  
                                       value |  
                                    log_prob |  
                  persistent_death_hosp dist |  
                                       value |  
                                    log_prob |  
                          persistent_I0 dist |  
                                       value |  
                                    log_prob |  
                                H_noisy dist | 3
                                       value | 3
                                    log_prob |  
                                I_noisy dist | 3
                                       value | 3

In [42]:
# Calibrate fails when there is a NaN value in the dataset
data_mapping = {"case": "I", "hosp": "H"} # data_mapping = "column_name": "observable/state_variable"
calibrated_results = calibrate(petri2, dataset3, data_mapping=data_mapping)
parameter_estimates = calibrated_results[0]
calibrated_results

ValueError: Error while computing log_prob at site 'I_noisy':
Expected value argument (Tensor of shape (3,)) to be within the support (Real()) of the distribution Normal(loc: torch.Size([3]), scale: torch.Size([3])), but found invalid values:
tensor([15., nan, 20.])
                               Trace Shapes:    
                                Param Sites:    
numeric_initial_state_func$$$_nodes.2._value    
numeric_initial_state_func$$$_nodes.3._value    
numeric_initial_state_func$$$_nodes.4._value    
numeric_initial_state_func$$$_nodes.5._value    
                               Sample Sites:    
                        persistent_beta dist |  
                                       value |  
                                    log_prob |  
                       persistent_gamma dist |  
                                       value |  
                                    log_prob |  
                        persistent_hosp dist |  
                                       value |  
                                    log_prob |  
                  persistent_death_hosp dist |  
                                       value |  
                                    log_prob |  
                          persistent_I0 dist |  
                                       value |  
                                    log_prob |  
                                H_noisy dist | 3
                                       value | 3
                                    log_prob |  
                                I_noisy dist | 3
                                       value | 3

In [43]:
# Calibrate fails when there is a space ' ' in the dataset
data_mapping = {"case": "I", "hosp": "H"} # data_mapping = "column_name": "observable/state_variable"
calibrated_results = calibrate(petri2, dataset4, data_mapping=data_mapping)
parameter_estimates = calibrated_results[0]
calibrated_results

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [44]:
# Calibrate fails when there is None type in the dataset
data_mapping = {"case": "I", "hosp": "H"} # data_mapping = "column_name": "observable/state_variable"
calibrated_results = calibrate(petri2, dataset5, data_mapping=data_mapping)
parameter_estimates = calibrated_results[0]
calibrated_results

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

### (2) Incorrectly typed columns

### (3) Mislabeled columns

### (4) Header columns have one fewer column than data

### (5) Other alignment issues

### (6) Escaping commas

### (7) Na, NaN, None, empty string