In [1]:
import numpy as np
import pandas as pd

from datetime import datetime
from scipy.interpolate import interp1d

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer


## Task Description
Predict the drought level for each of the next 6 weeks (one prediction per week), using the previous 180 days of data.
## This notebook: creating pipelines and saving transformed data as CSVs

We want to split the processing into two sub-pipelines:
raw df -> semi-processed (saved to CSV) -> fully processed.
Because of what pipelines can and cannot do, we will need two pipelines:
* encoding training Ys according to interpolation
* encoding test Ys separately (discrete values)

The data is quite big, so we want to pre-process it and save the results as CSVs whenever possible.

Pipeline 1 (for the training data)
1. encode date

Get the training labels. We should save these Y labels separately, e.g. for memory efficiency.

2. round labels (Y/N)
3. interpolate NaNs (linear or nearest)
4. create 6 week ahead prediction vectors

Losses that we can use for training for each training label set:
* nearest interpolation, rounded labels: discrete
* nearest interpolation, unrounded: continuous
* linear interpolation: continuous 

(when evaluating on test set, we will always round predictions)

Pipeline 2 // training data:
1. normalize (with var or no)
2. add soil data (Y/N) + linearize (param: int, past observations)

For the test data:
1. delete NaN rows
2. round labels
3. create 6 week ahead prediction vectors

## Pipeline 1 with pd.pipe

In [8]:
def interpolate_nans(padata, pkind = 'linear'):
    """
    Fill the non-finite entries of a 1-D array by interpolating over positions.

    The finite samples are used as knots (x = position in the array,
    y = value) and every position is then evaluated on the interpolant, so
    gaps before the first / after the last finite sample are extrapolated.

    see: https://stackoverflow.com/a/53050216/2167159

    Parameters
    ----------
    padata : array-like, 1-D
        Values to fill; NaN and +/-inf mark the gaps.
    pkind : str, default 'linear'
        Interpolation kind forwarded to ``scipy.interpolate.interp1d``
        (this notebook uses 'linear' and 'nearest').

    Returns
    -------
    numpy.ndarray
        Float array with the same length as ``padata``. If there is no finite
        sample the result is all NaN; if there is exactly one, that value is
        repeated for every position.
    """
    values = np.asarray(padata, dtype=float)
    positions = np.arange(values.shape[0])
    good_positions, = np.where(np.isfinite(values))

    # interp1d needs at least two knots to define a segment; handle the
    # degenerate cases explicitly instead of failing inside scipy.
    if good_positions.size == 0:
        return np.full(values.shape[0], np.nan)
    if good_positions.size == 1:
        return np.full(values.shape[0], values[good_positions[0]])

    f = interp1d(good_positions
               , values[good_positions]
               , bounds_error=False
               , copy=False
               , fill_value="extrapolate"
               , kind=pkind)
    return f(positions)

def sort_df(df):
    """Sort ``df`` by (existing index, 'date') in place and return it.

    'date' is temporarily appended to the index so that one ascending index
    sort orders the rows by the existing index first and by date second; it
    is then moved back to a regular column. Every step uses ``inplace=True``,
    so the frame passed in is modified and the same object is returned.
    """
    date_level = 'date'
    df.set_index(date_level, append=True, inplace=True)
    # ascending on every index level
    df.sort_index(ascending=True, inplace=True)
    df.reset_index(level=date_level, inplace=True)
    return df

def normalize(df):
    """Centre every column except 'date' and 'score' on its own mean.

    Simple centering by subtracting the column mean, rounded to 2 decimals.
    Do this before date encoding. 'score' is left untouched because past
    scores can be part of the training data. The frame is modified in place
    and the same object is returned.
    """
    excluded = ['date', 'score']
    feature_mask = ~df.columns.isin(excluded)
    features = df.drop(columns=excluded)
    centered = features - features.mean()
    df.loc[:, feature_mask] = centered.round(2)
    return df


def sin_cos_encoder(df):
    """Parse 'date' and add cyclical day-of-year features 'sin' and 'cos'.

    The 'date' column is replaced by its datetime version (no column is
    dropped) and the day of the year is mapped onto a circle with a period
    of 366 days. The frame is modified in place and returned.
    """
    parsed_dates = pd.to_datetime(df['date'], yearfirst=True,
                                  format="%Y/%m/%d")
    df['date'] = parsed_dates

    # angle of the day on the yearly circle, shared by both features
    angle = 2 * np.pi * df['date'].dt.dayofyear / 366
    df['sin'] = np.sin(angle)
    df['cos'] = np.cos(angle)

    return df

def interpolate_round(df, pkind='nearest', rnd=True):
    '''Optionally round the 'score' labels, then fill missing scores per fips.

    input df must be sorted and have index fips and score col

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by fips with a 'score' column; rows of one fips are
        expected to be in chronological order.
    pkind : str, default 'nearest'
        Interpolation kind passed on to ``interpolate_nans``.
    rnd : bool, default True
        Round the scores to 0 decimals before interpolating.

    Returns
    -------
    pandas.DataFrame
        A new frame whose 'score' column is rounded/filled; ``df`` itself is
        not modified, so the same raw frame can be fed to several
        (pkind, rnd) combinations. 'score' is int32 when ``pkind == 'nearest'``
        and ``rnd`` is true, otherwise it holds the interpolator's floats.
    '''
    score = df['score']
    #round values
    if rnd:
        score = score.round(decimals=0)

    # interpolate NaN labels for each fips code; transform keeps the original
    # row order and also copes with a fips that has a single row
    filled = (score.groupby(level=0)
                   .transform(lambda s: interpolate_nans(s.to_numpy(), pkind)))

    if (pkind == 'nearest') and rnd: #the interpolator outputs floats
        filled = filled.astype('int32')

    # assign() works on a copy of df, leaving the caller's frame untouched;
    # values are set positionally because row order is unchanged
    return df.assign(score=filled.to_numpy())

def extract_y(df, n_weeks=6, days_per_week=7):
    '''Add look-ahead label columns 'w1' .. 'w<n_weeks>' built from 'score'.

    input df is sorted has fips index, column is label

    For every row, column 'w<i>' holds the 'score' found ``days_per_week * i``
    rows later within the same fips; rows too close to the end of their fips
    get NaN. Training y can have rounded or not rounded labels, the score is
    used as it comes (float or int).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by fips with a 'score' column, one row per step in
        chronological order.
    n_weeks : int, default 6
        Number of look-ahead columns to create.
    days_per_week : int, default 7
        Number of rows that make up one look-ahead step.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra 'w*' columns (NaNs included); ``df`` is
        left unchanged.
    '''
    # shifting inside each fips group keeps one county's future scores from
    # leaking into the tail of the previous county
    score_by_fips = df.groupby(level=0)['score']

    targets = {}
    for week in range(1, n_weeks + 1):
        shifted = score_by_fips.shift(periods=-days_per_week * week)
        # positional assignment: shift() preserves the row order of df
        targets['w' + str(week)] = shifted.to_numpy()

    return df.assign(**targets) #outputs df with score w1,w2,w3, includes NaNs etc



## Dataset creation
How to deal with NaNs in the test/validation set?

Begin with once-a-week predictions. When we predict (once a week), we have the drought measurement for the prediction day, so we can interpolate between measurements all the way into the past.

The true test accuracy can only be measured on predictions on measurement days. So we'll treat the validation data the same way. 

Step 1: process the X, y pair, then remove the NaN rows from X. We can then remove the NaN rows from all subsequent y's, and the indices will match.

In [9]:
# Input CSVs, in the order in which the loops below process them.
files = [
    "validation_timeseries.csv",
    "test_timeseries.csv",
    "train_timeseries.csv",
]

In [10]:
#do everything to X and y in a single pass
week_cols = ['w1','w2','w3','w4','w5','w6']

for file in files:
    # sort by (fips, date) first: the interpolation and the look-ahead shift
    # both work on row order within a fips
    df = sort_df(pd.read_csv(file).set_index('fips'))

    #keep a copy of raw scores (taken before the labels are interpolated)
    score = df['score'].copy()

    # NOTE(review): normalize() subtracts the mean of whatever frame it is
    # given, so each file here is centred on its own mean rather than on the
    # training mean -- confirm this is intended.
    dfTrans = (df.pipe(normalize)
              .pipe(sin_cos_encoder)
              .pipe(interpolate_round, pkind='linear', rnd=False)
              .pipe(extract_y))

    dfTrans['score_copy'] = score
    # rows without a complete set of look-ahead labels cannot be used
    dfTrans = dfTrans.dropna(subset=week_cols)

    #write out interpolated scores, can delete rows as needed with original scores
    y_cols = ['date','score'] + week_cols
    dfTrans[y_cols].to_csv('linear' + file)

    #keep original scores with NaNs as 'score_copy', 'score' is interpolated scores
    dfTrans = dfTrans.drop(columns=week_cols)
    dfTrans.to_csv("X-" + file)

In [4]:
#write out the y label variants as CSVs, one file per (interpolation kind, rounding) pair
for file in files:
    df = pd.read_csv(file).set_index('fips')
    # labels only, sorted by (fips, date): interpolate_round and extract_y
    # both expect rows in order within each fips
    df = sort_df(df[['date','score']].copy())

    for rd in [True, False]:
        for kind in ['nearest', 'linear']:
            # every variant starts from a fresh copy of the raw labels, so no
            # variant can see rounding/filling done for a previous one
            dfTrans = (df.copy()
                         .pipe(interpolate_round, pkind=kind, rnd=rd)
                         .pipe(extract_y))
            dfTrans.to_csv(kind + str(rd) + file)
        

In [2]:
# Reload the processed training features written out earlier and index them by fips.
df = pd.read_csv("X-train_timeseries.csv")
df = df.set_index('fips')

In [4]:
# Column names of the reloaded frame (fips is the index, so it is not listed).
df.columns

Index(['date', 'PRECTOT', 'PS', 'QV2M', 'T2M', 'T2MDEW', 'T2MWET', 'T2M_MAX',
       'T2M_MIN', 'T2M_RANGE', 'TS', 'WS10M', 'WS10M_MAX', 'WS10M_MIN',
       'WS10M_RANGE', 'WS50M', 'WS50M_MAX', 'WS50M_MIN', 'WS50M_RANGE',
       'score', 'sin', 'cos', 'score_copy'],
      dtype='object')

In [5]:
# Preview the first rows of the reloaded frame.
df.head()

Unnamed: 0_level_0,date,PRECTOT,PS,QV2M,T2M,T2MDEW,T2MWET,T2M_MAX,T2M_MIN,T2M_RANGE,...,WS10M_MIN,WS10M_RANGE,WS50M,WS50M_MAX,WS50M_MIN,WS50M_RANGE,score,sin,cos,score_copy
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,2000-01-01,-2.42,3.85,1.83,1.94,6.56,6.52,2.28,4.05,-1.77,...,-0.44,-1.81,-0.68,-1.77,0.1,-1.87,0.571429,0.017166,0.999853,
1001,2000-01-02,-2.44,3.89,2.6,3.89,7.76,7.72,4.12,5.2,-1.09,...,-0.1,-1.67,-0.2,-1.68,0.59,-2.27,0.714286,0.034328,0.999411,
1001,2000-01-03,1.01,3.49,3.94,5.69,9.57,9.53,4.05,7.91,-3.86,...,0.73,-0.6,2.0,1.71,2.74,-1.02,0.857143,0.051479,0.998674,
1001,2000-01-04,13.31,3.63,-1.4,-1.4,-0.86,-0.89,-0.59,-5.25,4.65,...,0.15,0.32,1.2,1.5,0.61,0.9,1.0,0.068615,0.997643,1.0
1001,2000-01-05,-2.64,4.49,-4.87,-8.94,-10.24,-10.19,-7.86,-10.07,2.21,...,-1.41,-1.29,-2.59,-2.96,-2.48,-0.49,1.142857,0.085731,0.996318,
