# Generate sensors from indicator data

In [None]:
# standard libraries
import os
import pickle
from datetime import datetime, timedelta
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# third party
import numpy as np
from pandas import date_range
from tqdm.notebook import tqdm

# first party
from config import Config
from data_containers import LocationSeries, SensorConfig
import sensorization.regression as reg
import sensorization.ar as ar

In [None]:
class Directory:
    """Build the paths of the pickled inputs/outputs and read or write them.

    Paths are derived from a ground-truth configuration (``gt_indicator``)
    and from the per-indicator configurations passed to the individual
    path helpers.
    """

    # Base directory under which indicator and sensor pickles are stored.
    ROOT = '../data/'

    def __init__(self, gt_indicator = Config.ground_truth_indicator):
        """Store the ground-truth configuration and derive the root folders.

        Args:
            gt_indicator: configuration object of the ground truth; its
                ``source`` and ``signal`` attributes are used to build
                ``jhu_path``.
        """
        self.gt = gt_indicator
        self.infections_root_dir = './results/ntf_tapered/'
        self.indicator_root_dir = os.path.join(Directory.ROOT, 'indicators')
        self.sensor_root_dir = os.path.join(Directory.ROOT, 'sensors')
        self.jhu_path = os.path.join(
            Directory.ROOT, f'jhu-csse_confirmed_incidence_prop/{self.gt.source}_{self.gt.signal}')

    def deconv_gt_file(self, as_of):
        """Return the path of the infections pickle for ``as_of``."""
        return os.path.join(self.infections_root_dir, f'as_of_{as_of}.p')

    def indicator_file(self, indicator, as_of):
        """Return the path of the pickle holding ``indicator`` as of ``as_of``."""
        return os.path.join(
            self.indicator_root_dir,
            f'{indicator.source}-{indicator.signal}_{as_of}.p')

    def sensor_file(self, config, as_of):
        """Return the path of the sensor pickle for ``config`` as of ``as_of``."""
        return os.path.join(
            self.sensor_root_dir,
            f'{config.source}_{config.signal}_{as_of}.p')

    def maybe_load_file(self, file_name, verbose=False):
        """Unpickle ``file_name`` if it exists.

        Args:
            file_name: path of the pickle to read.
            verbose: when True, print a message if the file is missing.

        Returns:
            The unpickled object, or ``False`` when the file does not exist.
        """
        if not os.path.isfile(file_name):
            if verbose:
                print(file_name, 'does not exist')
            return False

        # NOTE: pickle can execute arbitrary code while loading; only point
        # this at files produced by a trusted source.
        with open(file_name, 'rb') as handle:
            return pickle.load(handle)

    def maybe_write_file(self, data, file_name, overwrite=False, verbose=False):
        """Pickle ``data`` to ``file_name`` unless that would clobber a file.

        Args:
            data: object to pickle.
            file_name: destination path; missing parent folders are created.
            overwrite: when True, replace an existing file.
            verbose: when True, print a message if the write is skipped.

        Returns:
            ``True`` if the file was written, ``False`` if it already existed
            and ``overwrite`` was False.
        """
        if os.path.isfile(file_name) and not overwrite:
            if verbose:
                print(file_name, 'exists')
            return False

        dir_name = os.path.dirname(file_name)
        # A bare file name has an empty dirname, which os.makedirs rejects.
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)

        with open(file_name, 'wb') as handle:
            pickle.dump(data, handle)
        return True

    @staticmethod
    def exists(file_name, overwrite=False):
        """Return True when ``file_name`` is a file and must not be overwritten."""
        return os.path.isfile(file_name) and not overwrite
    
def conform(location_series):
    """Validate a series and key its data by calendar date.

    Returns ``None`` when the argument is ``None``, when its ``data`` is
    ``None``, or when every entry of its ``values`` is NaN.  Otherwise the
    same object is returned; if its first date is a ``datetime``, ``data``
    is replaced in place by a dict mapping each date's ``.date()`` to the
    corresponding value.
    """
    series = location_series
    if series is None or series.data is None:
        return None

    if np.all(np.isnan(series.values)):
        return None

    if isinstance(series.dates[0], datetime):
        # Drop the time-of-day component so keys are plain dates.
        day_keys = [stamp.date() for stamp in series.dates]
        series.data = dict(zip(day_keys, series.values))

    return series

### Generate regression sensors

In [None]:
# When False, sensor files that already exist on disk are left untouched.
overwrite = False

# As-of dates to process and the folder passed to
# Config.get_delay_distribution as storage_dir.
as_of_range = Config.as_of_range
delay_dist_dir = "../data/km_delay_distributions/"

# Indicators for which a regression sensor is generated.
indicators = [
    Config.fb_cliic,
    Config.dv_cli,
    Config.google_aa,
    Config.chng_cli,
    Config.chng_covid,
]

# Ground-truth configuration handed to Directory.
# NOTE(review): the trailing 2 is presumably the lag in days (it is read
# back as directory.gt.lag) -- confirm against SensorConfig.
infections_config = SensorConfig(
    'jhu-csse',
    'confirmed_incidence_prop',
    'deconv_infections',
    2,
)
directory = Directory(infections_config)

In [None]:
# For every as-of date, fit one regression sensor per indicator and location
# against the infections available on that date, and pickle the result.
for as_of in tqdm(as_of_range):
    delay_dist = Config.get_delay_distribution(
        as_of, storage_dir=delay_dist_dir)[as_of]
    infections_file = directory.deconv_gt_file(as_of)
    infections_data = directory.maybe_load_file(infections_file)
    # maybe_load_file reports a missing file with False (not None), so an
    # "is not None" assertion can never fire; check both values explicitly.
    if infections_data is None or infections_data is False:
        raise FileNotFoundError(f'{infections_file} could not be loaded')
    last_infections_date = as_of - timedelta(directory.gt.lag)

    for indicator in indicators:
        output_file = directory.sensor_file(indicator, as_of)
        # Decide whether to skip before unpickling the indicator data.
        if directory.exists(output_file, overwrite):
            print(output_file, 'exists')
            continue

        indicator_file = directory.indicator_file(indicator, as_of)
        indicator_data = directory.maybe_load_file(indicator_file)
        if not indicator_data:
            print(as_of, indicator.source, 'not available')
            continue

        last_indicator_date = as_of - timedelta(indicator.lag)
        last_date = min(last_infections_date, last_indicator_date)

        output = {}
        available_at_lowest_lag = True
        for loc, series in indicator_data.items():
            covariates = conform(series)
            response = conform(infections_data[series.geo_value])

            # conform() returns None for series without usable data; such a
            # location cannot be fitted, so move on to the next one.
            if covariates is None or response is None:
                continue

            # Check if indicator was available at its lowest latency
            if covariates.dates[-1] != last_indicator_date:
                print(as_of, indicator.source, f'not available at latency {indicator.lag}')
                available_at_lowest_lag = False
                break

            # Check if response was available at its lowest latency
            if response.dates[-1] != last_infections_date:
                print(as_of, "infections", f'not available at latency {directory.gt.lag}')
                available_at_lowest_lag = False
                break

            try:
                output[loc] = reg.compute_regression_sensor(
                    last_date, covariates, response, delay_dist)
            except np.linalg.LinAlgError as e:
                # A singular design matrix only drops this location; any
                # other linear-algebra failure is propagated.
                if str(e) != "Singular matrix":
                    raise

        # Nothing is written when any checked series missed its lowest latency.
        if available_at_lowest_lag:
            directory.maybe_write_file(output, output_file, overwrite)

### Generate autoregressive sensor

In [None]:
# Configuration under which the autoregressive sensor is stored.
indicator = SensorConfig('ar3', 'ntf_tapered_infections', 'ar3', lag=1)
overwrite = True

# For every as-of date, fit an AR sensor on the infections of each location
# and pickle the result.
for as_of in tqdm(as_of_range):
    delay_dist = Config.get_delay_distribution(
        as_of, storage_dir=delay_dist_dir)[as_of]
    infections_file = directory.deconv_gt_file(as_of)
    infections_data = directory.maybe_load_file(infections_file)
    # maybe_load_file reports a missing file with False (not None), so an
    # "is not None" assertion can never fire; check both values explicitly.
    if infections_data is None or infections_data is False:
        raise FileNotFoundError(f'{infections_file} could not be loaded')
    last_infections_date = as_of - timedelta(directory.gt.lag)

    output_file = directory.sensor_file(indicator, as_of)
    last_date = as_of - timedelta(indicator.lag)

    if directory.exists(output_file, overwrite):
        print(output_file, 'exists')
        continue

    output = {}
    available_at_lowest_lag = True
    for loc, series in infections_data.items():
        # Check if indicator was available at lowest latency
        if series.dates[-1] != last_infections_date:
            print(as_of, f'infections not available at latency {directory.gt.lag}')
            available_at_lowest_lag = False
            break

        response = conform(infections_data[series.geo_value])
        # conform() returns None for series without usable data; there is
        # nothing to fit for such a location.
        if response is None:
            continue

        # The infections serve as both covariate and response.
        est = ar.compute_ar_sensor(
            last_date, response, response, delay_dist,
            [1, 2, 3], 1)

        # Fill in first 3 days which are missing with the AR model
        # using the mean
        extra_dates = [d.date() for d in date_range(
            series.dates[0], est.dates[0] - timedelta(1))]
        extra_vals = np.full((len(extra_dates),), np.mean(est.values))
        output[loc] = LocationSeries(
            est.geo_value, est.geo_type, dict(zip(
                np.r_[extra_dates, est.dates], np.r_[extra_vals, est.values]))
        )

    # Nothing is written when any checked series missed its lowest latency.
    if available_at_lowest_lag:
        directory.maybe_write_file(output, output_file, overwrite)