In [6]:
#import packages we'll use
import pandas as pd
import numpy as np
from os import listdir
from os.path import join
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.base import clone
import datetime as dt

_default_date_format = "%Y-%m-%d"


#convert a date string into an ordinal day number
#strptime takes 2 inputs: (the date string, the format that string is written in)
def date_to_int(date_string: str, form: str=_default_date_format) -> int:
    """Return the date date_string, written in format form, as an integer.

    The integer is the value datetime's toordinal() gives for the parsed date.
    """
    parsed = dt.datetime.strptime(date_string, form)
    return parsed.toordinal()


#inverse of date_to_int: turns an ordinal day number back into a formatted date string
def int_to_date(ordinal: int, form: str=_default_date_format) -> str:
    """Return the day with ordinal number ordinal as a string formatted with form"""
    moment = dt.datetime.fromordinal(ordinal)
    return moment.strftime(form)


#adding lag: shifting features by n days. E.g. if n=14, then for a particular day we could be looking at
#the number of new covid cases 14 days ago, or the number of covid beds used 14 days ago
def add_lag(df, feature_dict):
    """Return a dataframe obtained from df by adding lag features.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; must contain every column named in feature_dict.
    feature_dict : dict
        Maps a column name to a collection of lags (list, range, ...).
        For each featurename and each non-zero n in feature_dict[featurename]
        a column called "featurename-n" is added, holding the column shifted
        by n rows. A lag of 0 is skipped because the unshifted column is
        already part of the result. An empty collection keeps the column
        without adding any lag for it.

    Returns
    -------
    pandas.DataFrame
        The columns named in feature_dict followed by the lag columns, with
        the first max-lag rows removed. If no lag is requested at all, no
        rows are removed.
    """
    base_columns = list(feature_dict)
    lagged = [
        df[feature].shift(n).rename(f"{feature}-{n}")
        for feature, lags in feature_dict.items()
        for n in lags
        if n != 0
    ]
    #adding the lagged series to the selected columns of the main data frame
    untrimmed = pd.concat([df[base_columns]] + lagged, axis=1)
    #default=0 keeps every row when no feature asks for a lag
    #(max() of an empty sequence would otherwise raise ValueError)
    longest_lag = max(
        (np.max(lags) for lags in feature_dict.values() if len(lags) > 0),
        default=0,
    )
    #getting rid of data that's too early--doesn't have enough previous days
    return untrimmed.iloc[longest_lag:]

    
#reading the spreadsheet. Making a dictionary where the keys are each state
#the values are the data frame gotten from the csv for that state
def load_states(state_dir=None):
    """Read state covid data.

    Parameters
    ----------
    state_dir : str, optional
        Directory holding one "<state>.csv" file per state. Defaults to
        ../data/input/simple_states (relative to the working directory).

    Returns
    -------
    covid_all_states : dict
        Maps each state name (the file name without ".csv") to the data
        frame read from that file.
    state_list : list
        The state names, in the same (alphabetical) order as the dict keys.
    """
    if state_dir is None:
        state_dir = join("..", "data", "input", "simple_states")
    covid_all_states = {}
    #listdir returns names in arbitrary order; sorting makes runs reproducible
    for filename in sorted(listdir(state_dir)):
        if filename.endswith(".csv"):
            state = filename[:-4]
            covid_all_states[state] = pd.read_csv(join(state_dir, filename))
    return covid_all_states, list(covid_all_states)


    
class FuturePrediction:
    """Time series cross-validator

    Provides train/test indices to split data in train/test sets.
    Each split trains on train_length consecutive rows and tests on the
    single row that lies future_interval rows after the end of that
    training window.

    Parameters
    ----------
    train_length : int
        Length of training period.

    future_interval : int
        Number of time periods in the future we want to predict

    interval : int, default=1
        One out of this many time periods is chosen to validate

    randomize : bool, default=False
        Randomize which days are chosen to validate
    """
    def __init__(self, train_length, future_interval,
                 interval=1, randomize=False):
        #store the settings on the object so that split() can read them later
        self.train_length = train_length
        self.future_interval = future_interval
        self.interval = interval
        self.randomize = randomize

    def split(self, df):
        """Yield (train_index, test_index) pairs of positional indices for df.

        train_index is an array of train_length consecutive positions and
        test_index is a one-element array holding the position
        future_interval rows after the last training row. Only len(df) is
        used, so any sized object works.

        Raises ValueError (when iteration starts) if interval is smaller
        than 1, because the window would never move forward.
        """
        if self.interval < 1:
            raise ValueError("interval must be at least 1")
        #last window start whose test row still lies inside df:
        #start + train_length + future_interval - 1 <= len(df) - 1
        last_start = len(df) - self.train_length - self.future_interval
        index = 0
        while index <= last_start:
            train_index = np.arange(index, index+self.train_length)
            test_index = np.array([index+self.train_length+self.future_interval-1])
            #yield makes this a generator: pairs are handed out one at a time
            #as the caller loops, instead of building the whole list up front
            yield train_index, test_index
            if self.randomize:
                #random step between 1 and 2*interval-1 (the upper bound of
                #randint is exclusive), so the average step is interval
                index = index+np.random.randint(1, 2*self.interval)
            else:
                index = index+self.interval
                
                
#plot the actual number of hospital beds used against the number predicted based on our model and also
#the baseline prediction: predicting that the number of beds used in n days will be exactly the same as now
def plot_predictions(reg, df, train_length, future_interval, features):
    """Plot actual, baseline and predicted bed counts on the current figure.

    For every split produced by FuturePrediction(train_length,
    future_interval) a fresh clone of reg is fitted on the training rows and
    asked to predict the single test row.

    Parameters
    ----------
    reg : regression object
        Estimator to clone and fit for each split.
    df : pandas.DataFrame
        Must contain the columns "beds", "date", f"beds-{future_interval}"
        and every column listed in features.
    train_length : int
        Number of rows in each training window.
    future_interval : int
        Number of rows between the end of the training window and the
        predicted row; the baseline is the "beds" value that many rows back.
    features : list
        Names of the columns used as model inputs.

    Returns
    -------
    None
        Lines are drawn with matplotlib.pyplot; the caller adds the title
        and shows the figure.
    """
    target = f"beds-{future_interval}"
    actuals, baselines, predicteds, test_dates = [], [], [], []
    cv = FuturePrediction(train_length, future_interval)
    for train_index, test_index in cv.split(df):
        train, test = df.iloc[train_index], df.iloc[test_index]
        reg_copy = clone(reg)
        reg_copy.fit(train[features], train["beds"])
        actuals.append(test["beds"].iloc[0])
        baselines.append(test[target].iloc[0])
        predicteds.append(reg_copy.predict(test[features])[0])
        test_dates.append(test["date"].iloc[0])

    plt.plot(actuals, label="actual")
    plt.plot(baselines, label="baseline")
    plt.plot(predicteds, label="predicted")
    #label the axis ends with the dates of the first and last predicted rows,
    #taken from the rows themselves so the labels always match the curves
    if test_dates:
        plt.xticks([0, len(test_dates)-1], [test_dates[0], test_dates[-1]])
    plt.xlabel("Date")
    plt.ylabel("Covid beds in use")
    plt.legend()
    return None

def validate(covid_state, regressor, train_length,
             future_interval, max_lag, interval, param_grid,
             plot=False, train_start="2021-01-01",
             train_end="2021-07-01", test_end="2021-09-01"):
    """ Performs state-by-state optimization of hyperparameters

        For each state the hyperparameters are chosen with GridSearchCV on
        the training window, the best estimator is then cross-validated on
        the holdout window, and its mean squared error is compared with the
        baseline "beds in future_interval days equal beds today". The ratio
        regression MSE / baseline MSE is printed per state.

        Parameters:
        covid_state: dictionary
            Dictionary of data frames with state data. Each frame needs the
            columns beds, cases_7day, vaccines, date and day_number.
        regressor: regression object
        train_length: integer
            Number of training data points to fit
        future_interval: integer
            Number of days in the future we want to predict
        max_lag: integer
            Max number of days worth of data to use in each data point
        interval: integer
            Space between testing days for GridSearchCV
        param_grid: dictionary
            Hyperparameter values to check (passed to GridSearchCV)
        plot: boolean
            If True, plot the holdout predictions for every state
        train_start, train_end: string
            First and last day (inclusive, format %Y-%m-%d) of the window
            used for the hyperparameter search
        test_end: string
            The holdout window is every day after train_end and before
            test_end (both exclusive)

        Returns:
            Mean over all states of regression MSE / baseline MSE
    """
    cv = FuturePrediction(train_length, future_interval, interval)
    cv_test = FuturePrediction(train_length, future_interval)

    #these do not depend on the state, so build them once
    lags = range(future_interval, future_interval+max_lag)
    baseline_column = f"beds-{future_interval}"
    features = [f"beds-{k}" for k in lags] + \
                [f"cases_7day-{k}" for k in lags] + \
                [f"vaccines-{future_interval}"]
    train_start_day = date_to_int(train_start)
    train_end_day = date_to_int(train_end)
    test_end_day = date_to_int(test_end)

    ratios = []
    for state in covid_state:
        df = add_lag(covid_state[state],
                    {"beds": lags,
                    "cases_7day": lags,
                    "vaccines": [future_interval],
                    "date":[],
                    "day_number":[]})
        train = df.loc[(df.day_number >= train_start_day) &
                      (df.day_number <= train_end_day)].copy()
        test = df.loc[(df.day_number > train_end_day) &
                      (df.day_number < test_end_day)].copy()

        #GridSearchCV clones the regressor for every hyperparameter combination
        #in param_grid (key: hyperparameter name, value: list of candidate
        #values), fits each clone and keeps the best-scoring one
        gs = GridSearchCV(regressor,
                    param_grid=param_grid,
                    scoring="neg_mean_squared_error",
                    cv=cv.split(train),
                    n_jobs=-1)
        gs.fit(train[features], train["beds"])
        #best_estimator_: attribute set by GridSearchCV after fitting
        reg = gs.best_estimator_

        #materialize the holdout splits so the very same rows can be reused
        #for the baseline below
        test_splits = list(cv_test.split(test))
        q = cross_validate(reg, test[features],
                       test["beds"],
                       scoring="neg_mean_squared_error",
                       cv=test_splits)
        regression_mse = -q["test_score"].mean()

        #score the baseline on exactly the rows the regressor was scored on,
        #so both errors are averaged over the same days
        scored_rows = [row for _, test_index in test_splits for row in test_index]
        baseline_error = test[baseline_column].iloc[scored_rows] - \
                         test["beds"].iloc[scored_rows]
        baseline_mse = (baseline_error**2).mean()

        ratio = regression_mse/baseline_mse
        print(state, ratio)
        ratios.append(ratio)
        if plot:
            plot_predictions(reg, test, train_length, future_interval, features)
            plt.title(f"{state} holdout data")
            plt.show()
    return np.mean(ratios)

In [7]:
class KFold:
    """Holds a number of pieces and a randomize flag.

    Constructing an instance prints 'abcd' followed by the number of pieces.
    """
    def __init__(self, number_of_pieces, randomize=False):
        #the name on the right of = must match the parameter; the attribute
        #name on the left is our choice (self.number=number_of_pieces would
        #work too). Storing the values on self makes them available to any
        #other method of the class.
        self.randomize = randomize
        self.number_of_pieces = number_of_pieces
        print('abcd', self.number_of_pieces)
    
        

In [8]:
#build a KFold with 5 pieces; the constructor prints 'abcd 5'
kfold = KFold(number_of_pieces=5)
#a notebook cell only displays the value of its last expression,
#so of the two lines below only kfold.randomize shows up in the output
kfold.number_of_pieces
kfold.randomize

abcd 5


False