> ## US Drought & Meteorological Data Starter Notebook
This notebook will walk you through loading the data and creating a Dummy Classifier, showing a range of F1 scores that correspond to random predictions given the class priors.

## Loading & Visualizing the Data
In this section, we load the training and validation data into numpy arrays and visualize the drought classes and meteorological attributes.

We load the csv files for training, validation and testing into the ``files`` dictionary.

In [2]:
import numpy as np
import pandas as pd
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
from datetime import datetime
sns.set_style('white')

# Map each split name to the CSV file that holds it. Split names are matched
# as substrings of the file name, so 'valid' also picks up 'validation_...'.
files = {}

for dirname, _, filenames in os.walk('.'):
    for filename in filenames:
        # Only consider CSV files: a bare substring test would also match
        # unrelated files (notebooks, checkpoints, 'latest.txt', ...) and
        # silently overwrite the entry for a split.
        if not filename.lower().endswith('.csv'):
            continue
        for split in ('train', 'valid', 'test'):
            if split in filename:
                files[split] = os.path.join(dirname, filename)

print(files)

{'valid': './validation_timeseries.csv', 'test': './test_timeseries.csv', 'train': './train_timeseries.csv'}


The following classes exist, ranging from no drought (``None``) to exceptional drought (``D4``).
This could be treated as a regression, ordinal or classification problem, but for now we will treat it as 6 distinct classes.

In [3]:
# Drought categories listed from no drought ('None') up to 'D4'; a category's
# position in the list is its integer id.
class2id = {
    label: idx
    for idx, label in enumerate(['None', 'D0', 'D1', 'D2', 'D3', 'D4'])
}
# Reverse lookup: integer id -> category name.
id2class = dict(zip(class2id.values(), class2id.keys()))

Now we load each dataset from its CSV file into a pandas DataFrame indexed by county (``fips``) and ``date``. No rows are filtered at this stage; missing drought scores are handled later, when the windows are built.

In [4]:
# Read every discovered split into a DataFrame indexed by (fips, date).
dfs = {
    split: pd.read_csv(csv_path).set_index(['fips', 'date'])
    for split, csv_path in files.items()
}

In [17]:
import numpy as np
from scipy.interpolate import interp1d

def interpolate_nans(padata, pkind='linear'):
    """
    Replace the non-finite entries (NaN, +/-inf) of a 1-D array by
    interpolating over the array index, extrapolating at both ends.

    see: https://stackoverflow.com/a/53050216/2167159

    Parameters
    ----------
    padata : array-like, 1-D
        Values to fill; converted to a float array.
    pkind : str, default 'linear'
        Interpolation kind, forwarded to ``scipy.interpolate.interp1d``.

    Returns
    -------
    numpy.ndarray
        Float array of the same length as ``padata``. Finite entries keep
        their value; the others are interpolated or extrapolated.

    Notes
    -----
    ``interp1d`` cannot be built from fewer than two samples, so two cases
    are handled explicitly instead of raising:

    * no finite value at all: a copy of the input is returned unchanged;
    * exactly one finite value: that value is used for every position.
    """
    padata = np.asarray(padata, dtype=float)
    aindexes = np.arange(padata.shape[0])
    agood_indexes, = np.where(np.isfinite(padata))
    if agood_indexes.size == 0:
        # Nothing to anchor an interpolation on.
        return padata.copy()
    if agood_indexes.size == 1:
        # A single sample defines a constant series.
        return np.full(padata.shape[0], padata[agood_indexes[0]])
    f = interp1d(agood_indexes
               , padata[agood_indexes]
               , bounds_error=False
               , copy=False
               , fill_value="extrapolate"
               , kind=pkind)
    return f(aindexes)

In [30]:
# load one of 'train', 'valid' or 'test'
def loadXY(df, shuffle=True, random_state=42, window_size=180, target_size=12, soil_path='soil_data.csv'):
    """
    Cut a drought time-series frame into fixed-length windows with targets.

    For every county (index level 0) that has at least one non-missing
    ``score``, the county's rows are cut into consecutive, non-overlapping
    windows of ``window_size`` rows, starting at row 1. Each window is paired
    with the first ``target_size`` non-missing scores found in the
    ``target_size * 7`` rows that follow it. Windows followed by fewer than
    ``target_size`` scores are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``score`` column plus feature columns, whose index
        level 0 holds the county identifier matched against the ``fips``
        column of the soil file.
    shuffle, random_state
        Accepted for backward compatibility but currently unused: windows are
        returned in iteration order.
    window_size : int
        Number of rows per input window.
    target_size : int
        Number of future scores to predict per window.
    soil_path : str
        CSV file holding the per-county static features.

    Returns
    -------
    X_static : numpy.ndarray, shape (n, n_static_columns)
    X_time : numpy.ndarray, shape (n, window_size, n_time_columns)
    y_past : numpy.ndarray, shape (n, window_size)
        Scores inside the window with missing values interpolated.
    y_target : numpy.ndarray, shape (n, target_size)

    Raises
    ------
    IndexError
        If a county in ``df`` has no row in the soil file.
    """
    soil_df = pd.read_csv(soil_path)
    time_data_cols = sorted([c for c in df.columns if c not in ['fips','date','score']])
    # NOTE(review): 'fips' is not in this exclusion list, so the soil file's
    # 'fips' column (used for the lookup below) is kept as a static feature.
    # Confirm that this is intended.
    static_data_cols = sorted([c for c in soil_df.columns if c not in ['soil','lat','lon']])

    # Number of rows scanned after each window when looking for target scores.
    horizon = target_size * 7
    # Upper bound on the total number of windows; the arrays are trimmed to
    # the real count on return.
    max_windows = len(df) // window_size
    X_static = np.empty((max_windows, len(static_data_cols)))
    X_time = np.empty((max_windows, window_size, len(time_data_cols)))
    y_past = np.empty((max_windows, window_size))
    y_target = np.empty((max_windows, target_size))
    count = 0

    # Loop-invariant: county id of every row in df.
    fips_index = df.index.get_level_values(0)
    # Only counties with at least one recorded score are processed.
    scored_fips = df.dropna(subset=['score']).index.get_level_values(0).unique()
    for fips in tqdm(scored_fips):
        start_i = 1  # fixed window offset; row 0 never starts a window
        fips_df = df[fips_index == fips]
        X = fips_df[time_data_cols].values
        y = fips_df['score'].values
        X_s = soil_df[soil_df['fips']==fips][static_data_cols].values[0]
        for i in range(start_i, len(y)-(window_size+horizon), window_size):
            future_y = y[i+window_size:i+window_size+horizon]
            targets = future_y[~np.isnan(future_y)][:target_size]
            if len(targets) < target_size:
                # Too few scores after this window. Assigning them anyway
                # would either raise or silently broadcast a single score
                # across every target slot.
                continue
            X_time[count] = X[i:i+window_size]
            y_past[count] = interpolate_nans(y[i:i+window_size])
            y_target[count] = targets
            X_static[count] = X_s
            count += 1
    return X_static[:count], X_time[:count], y_past[:count], y_target[:count]

In [46]:
# Preview the first 91 rows of the training split.
dfs['train'].head(91)


Unnamed: 0_level_0,Unnamed: 1_level_0,PRECTOT,PS,QV2M,T2M,T2MDEW,T2MWET,T2M_MAX,T2M_MIN,T2M_RANGE,TS,WS10M,WS10M_MAX,WS10M_MIN,WS10M_RANGE,WS50M,WS50M_MAX,WS50M_MIN,WS50M_RANGE,score
fips,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1001,2000-01-01,0.22,100.51,9.65,14.74,13.51,13.51,20.96,11.46,9.50,14.65,2.20,2.94,1.49,1.46,4.85,6.04,3.23,2.81,
1001,2000-01-02,0.20,100.55,10.42,16.69,14.71,14.71,22.80,12.61,10.18,16.60,2.52,3.43,1.83,1.60,5.33,6.13,3.72,2.41,
1001,2000-01-03,3.65,100.15,11.76,18.49,16.52,16.52,22.73,15.32,7.41,18.41,4.03,5.33,2.66,2.67,7.53,9.52,5.87,3.66,
1001,2000-01-04,15.95,100.29,6.42,11.40,6.09,6.10,18.09,2.16,15.92,11.31,3.84,5.67,2.08,3.59,6.73,9.31,3.74,5.58,1.0000
1001,2000-01-05,0.00,101.15,2.95,3.86,-3.29,-3.20,10.82,-2.66,13.48,2.65,1.60,2.50,0.52,1.98,2.94,4.85,0.65,4.19,
1001,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001,2000-03-27,4.08,98.90,8.91,14.53,11.78,11.78,22.22,7.08,15.14,14.54,2.58,4.73,1.41,3.32,5.12,7.62,2.72,4.91,
1001,2000-03-28,0.00,99.07,7.46,14.58,9.10,9.10,24.09,5.37,18.71,14.53,2.97,5.13,1.60,3.53,5.80,8.68,3.52,5.16,1.2818
1001,2000-03-29,1.77,99.65,8.12,14.82,10.34,10.35,21.28,8.17,13.11,14.89,1.84,2.67,0.89,1.78,3.64,5.34,1.90,3.45,
1001,2000-03-30,7.52,99.52,11.90,19.60,16.55,16.54,26.70,13.35,13.35,19.65,2.18,3.20,1.02,2.18,4.31,6.42,1.64,4.78,


In [32]:
# Build the training windows: 90 rows of history per sample, one target score.
X_static_train, X_time_train, y_past_train, y_target_train = loadXY(dfs['train'], window_size=90, target_size=1)

HBox(children=(FloatProgress(value=0.0, max=3108.0), HTML(value='')))




In [44]:
# Feature values at the last time step of the first training window.
X_time_train[0][-1]

array([  0.14, 100.14,   8.85,  17.07,  12.09,  12.09,  25.05,   9.47,
        15.58,  17.03,   1.7 ,   2.1 ,   1.15,   0.96,   3.4 ,   4.96,
         2.05,   2.9 ])

In [36]:
# Target score(s) paired with the first training window.
y_target_train[0]

array([1.])

In [1]:
!pip install sktime



In [47]:
import numpy as np
from sklearn.linear_model import RidgeClassifierCV
from sklearn.pipeline import make_pipeline

from sktime.datasets import load_arrow_head  # univariate dataset
from sktime.datasets.base import load_basic_motions  # multivariate dataset
from sktime.transformations.panel.rocket import MiniRocket, MiniRocketMultivariate

In [None]:
# Fit the MiniRocket transform on the training windows, then apply it to them.
# NOTE(review): loadXY builds X_time_train as (samples, window_size, features).
# sktime panel transformers conventionally take (instances, channels,
# timepoints) - confirm whether the last two axes need swapping here and in
# the validation transform; both must use the same layout.
minirocket_multi = MiniRocketMultivariate()
minirocket_multi.fit(X_time_train)
X_train_transform = minirocket_multi.transform(X_time_train)

In [None]:
# Inspect the transformed training features.
X_train_transform

In [None]:
# The classifier needs discrete class labels, but loadXY returns raw drought
# scores (which can be fractional) with shape (n_samples, target_size).
# Take the first target step and round it to the nearest integer class id.
y_train = np.round(y_target_train[:, 0]).astype(int)
# NOTE(review): the `normalize` argument is not accepted by recent
# scikit-learn releases - confirm against the installed version.
classifier = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
classifier.fit(X_train_transform, y_train)

In [None]:
# Build the validation windows with the same window and target sizes as training.
X_static_valid, X_time_valid, y_past_valid, y_target_valid = loadXY(dfs['valid'], window_size=90, target_size=1)

In [None]:
# Apply the already-fitted MiniRocket transform to the validation windows.
# NOTE(review): X_time_valid must use the same axis layout as the array the
# transform was fitted on - confirm if the training layout is changed.
X_valid_transform = minirocket_multi.transform(X_time_valid)

In [None]:
# score() reports classification accuracy, so compare against integer class
# ids (first target step rounded to the nearest class), not the raw
# fractional scores of shape (n_samples, target_size).
classifier.score(X_valid_transform, np.round(y_target_valid[:, 0]).astype(int))