In [1]:
# Autoreload packages that are modified
%load_ext autoreload
%autoreload 2

from datetime import datetime, timedelta
import glob
import os
import sys
import time

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import cartopy.crs as ccrs
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd

# Make the forecast_rodeo helper modules importable from the current working
# directory. The membership check keeps sys.path from accumulating duplicate
# entries every time this cell is re-run.
cwd = os.getcwd()
for module_dir in (f"{cwd}/forecast_rodeo", f"{cwd}/forecast_rodeo/src/experiments"):
    if module_dir not in sys.path:
        sys.path.append(module_dir)
from experiments_util import get_target_date, month_day_subset
from stepwise_util import default_stepwise_candidate_predictors

In [2]:
#https://stackoverflow.com/questions/43147983/could-not-create-cudnn-handle-cudnn-status-internal-error
# Work around the cuDNN handle-creation failure linked above by turning on
# memory growth for the GPUs TensorFlow can see.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
# set_memory_growth returns None, so its result is not bound to a name.
# Apply the same setting to every visible GPU rather than only the first one.
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [3]:
target = "contest_tmp2m" # "contest_precip" or "contest_tmp2m"
target_horizon = "34w" # "34w" or "56w"

data_path = os.path.expanduser("forecast_rodeo/results/regression/shared")
data_matrices_folder = f"{target}_{target_horizon}"
# List the available HDF5 files for reference only.
fs = glob.glob(f"{data_path}/{data_matrices_folder}/*.h5")
print(fs)
# glob returns matches in arbitrary (filesystem-dependent) order, so selecting
# fs[0] / fs[1] by position can silently swap the two inputs. Build each path
# from its file name instead.
# NOTE(review): the "<kind>-<target>_<horizon>.h5" naming is taken from the
# listing recorded for contest_tmp2m_34w; confirm it holds for other settings.
lat_lon_date_data_file = f"{data_path}/{data_matrices_folder}/lat_lon_date_data-{data_matrices_folder}.h5"
date_data_file = f"{data_path}/{data_matrices_folder}/date_data-{data_matrices_folder}.h5"

['forecast_rodeo/results/regression/shared/contest_tmp2m_34w/lat_lon_date_data-contest_tmp2m_34w.h5', 'forecast_rodeo/results/regression/shared/contest_tmp2m_34w/date_data-contest_tmp2m_34w.h5']


## Define dates of interest for prediction

In [9]:
# Submission dates: 26 dates spaced 14 days apart, starting on April 18 of each
# year from 2011 through 2017.
submission_date_objs = [
    datetime(year, 4, 18) + timedelta(14 * step)
    for year in range(2011, 2018)
    for step in range(26)
]
# Dates are carried around as 'YYYYMMDD' strings.
submission_dates = [day.strftime('%Y%m%d') for day in submission_date_objs]
target_date_objs = [get_target_date(date_str, target_horizon) for date_str in submission_dates]
target_dates = [day.strftime('%Y%m%d') for day in target_date_objs]

# The rest of the notebook works with the first submission/target pair only.
submission_date, target_date_obj, target_date = (
    submission_dates[0],
    target_date_objs[0],
    target_dates[0],
)

print(f"submission date: {submission_date}, target date: {target_date}")

# some vars
gt_col = target.split('_')[-1]  # 'tmp2m'
clim_col = f"{gt_col}_clim"     # 'tmp2m_clim'
anom_col = f"{gt_col}_anom"     # 'tmp2m_anom'
base_col = 'zeros'
group_by_cols = ['lat', 'lon']
# Derive the settings that depend on the chosen target / horizon instead of
# editing them by hand: 1948 for precip, 1978 for temp; 29 days for 34w,
# 43 days for 56w. An unsupported target or horizon raises KeyError here
# rather than silently running with the wrong value.
first_train_year = {'precip': 1948, 'tmp2m': 1978}[gt_col]
start_delta = {'34w': 29, '56w': 43}[target_horizon]
# Last date usable for training: start_delta days before the target date.
last_train_date = target_date_obj - timedelta(start_delta)
print(f"submission date: {submission_date}")
print(f"last train date: {last_train_date}")

# Candidate predictor columns for this target / horizon.
candidate_x_cols = default_stepwise_candidate_predictors(target, target_horizon, hindcast=False)

# Non-predictor columns that are also needed downstream.
bookkeeping_cols = [base_col, clim_col, anom_col, 'start_date', 'lat', 'lon', 'target', 'year', 'ones']
relevant_cols = set(candidate_x_cols + bookkeeping_cols + group_by_cols)
print(relevant_cols)
print(candidate_x_cols)
# List the relevant columns that are not candidate predictors.
for c in relevant_cols:
    if c in candidate_x_cols:
        continue
    print(c)

submission date: 20110418, target date: 20110502
submission date: 20110418
last train date: 2011-04-03 00:00:00
{'wind_hgt_10_2010_1_shift30', 'tmp2m_shift29', 'nmme0_wo_ccsm3_nasa', 'rhum_shift30', 'start_date', 'ones', 'tmp2m_shift58_anom', 'tmp2m_shift29_anom', 'zeros', 'tmp2m_anom', 'phase_shift17', 'lat', 'sst_2010_2_shift30', 'icec_2010_2_shift30', 'wind_hgt_10_2010_2_shift30', 'sst_2010_1_shift30', 'mei_shift45', 'sst_2010_3_shift30', 'nmme_wo_ccsm3_nasa', 'tmp2m_clim', 'year', 'lon', 'tmp2m_shift58', 'icec_2010_1_shift30', 'icec_2010_3_shift30', 'target', 'pres_shift30'}
['ones', 'tmp2m_shift29', 'tmp2m_shift29_anom', 'tmp2m_shift58', 'tmp2m_shift58_anom', 'rhum_shift30', 'pres_shift30', 'nmme_wo_ccsm3_nasa', 'nmme0_wo_ccsm3_nasa', 'mei_shift45', 'phase_shift17', 'sst_2010_1_shift30', 'sst_2010_2_shift30', 'sst_2010_3_shift30', 'icec_2010_1_shift30', 'icec_2010_2_shift30', 'icec_2010_3_shift30', 'wind_hgt_10_2010_1_shift30', 'wind_hgt_10_2010_2_shift30']
start_date
zeros
tmp2m_

## Load the data files

In [5]:
# Load both raw HDF5 inputs (date-level file first, then the lat/lon/date file).
date_data, lat_lon_date_data = (
    pd.read_hdf(h5_path) for h5_path in (date_data_file, lat_lon_date_data_file)
)

In [6]:
def _recent_relevant(frame):
    """Return rows of `frame` from `first_train_year` onward, keeping only columns in `relevant_cols`."""
    recent_rows = frame.start_date.dt.year >= first_train_year
    wanted_cols = frame.columns.isin(relevant_cols)
    return frame.loc[recent_rows, wanted_cols]


# Apply the same row/column filter to both inputs, then attach the date-level
# columns to every lat/lon/date row via a left join on start_date.
data = _recent_relevant(lat_lon_date_data)
data = pd.merge(data, _recent_relevant(date_data), on="start_date", how="left")
# Drop the references to the unfiltered frames.
del lat_lon_date_data, date_data

print(len(data.columns))
print(data.columns)

23
Index(['lat', 'lon', 'start_date', 'rhum_shift30', 'pres_shift30',
       'nmme_wo_ccsm3_nasa', 'nmme0_wo_ccsm3_nasa', 'tmp2m_clim', 'tmp2m_anom',
       'tmp2m_shift29', 'tmp2m_shift29_anom', 'tmp2m_shift58',
       'tmp2m_shift58_anom', 'mei_shift45', 'phase_shift17',
       'sst_2010_1_shift30', 'sst_2010_2_shift30', 'sst_2010_3_shift30',
       'icec_2010_1_shift30', 'icec_2010_2_shift30', 'icec_2010_3_shift30',
       'wind_hgt_10_2010_1_shift30', 'wind_hgt_10_2010_2_shift30'],
      dtype='object')


In [8]:
# Restrict the data to the window of `margin_in_days` around the target date.
margin_in_days = 56
print(f"target date: {target_date_obj}, margin in days: {margin_in_days}")
date_window = month_day_subset(data, target_date_obj, margin_in_days)
sub_data = date_window.copy()  # work on an independent copy of the subset
del data, date_window

target date: 2011-05-02 00:00:00, margin in days: 56


In [10]:
print(sub_data.columns)
# Add the derived columns in place: calendar year of start_date plus two
# constant columns ('ones' and 'zeros').
derived_cols = {
    'year': sub_data.start_date.dt.year,
    'ones': 1.0,
    'zeros': 0.0,
}
for col_name, col_values in derived_cols.items():
    sub_data[col_name] = col_values
print(sub_data.columns)

Index(['lat', 'lon', 'start_date', 'rhum_shift30', 'pres_shift30',
       'nmme_wo_ccsm3_nasa', 'nmme0_wo_ccsm3_nasa', 'tmp2m_clim', 'tmp2m_anom',
       'tmp2m_shift29', 'tmp2m_shift29_anom', 'tmp2m_shift58',
       'tmp2m_shift58_anom', 'mei_shift45', 'phase_shift17',
       'sst_2010_1_shift30', 'sst_2010_2_shift30', 'sst_2010_3_shift30',
       'icec_2010_1_shift30', 'icec_2010_2_shift30', 'icec_2010_3_shift30',
       'wind_hgt_10_2010_1_shift30', 'wind_hgt_10_2010_2_shift30'],
      dtype='object')
Index(['lat', 'lon', 'start_date', 'rhum_shift30', 'pres_shift30',
       'nmme_wo_ccsm3_nasa', 'nmme0_wo_ccsm3_nasa', 'tmp2m_clim', 'tmp2m_anom',
       'tmp2m_shift29', 'tmp2m_shift29_anom', 'tmp2m_shift58',
       'tmp2m_shift58_anom', 'mei_shift45', 'phase_shift17',
       'sst_2010_1_shift30', 'sst_2010_2_shift30', 'sst_2010_3_shift30',
       'icec_2010_1_shift30', 'icec_2010_2_shift30', 'icec_2010_3_shift30',
       'wind_hgt_10_2010_1_shift30', 'wind_hgt_10_2010_2_shift30', 'ye

In [11]:
# The regression target is climatology plus anomaly
# (tmp2m_clim + tmp2m_anom for the tmp2m target).
sub_data['target'] = sub_data[clim_col].add(sub_data[anom_col])

# Keep only rows where the target and every candidate predictor are present.
required_cols = candidate_x_cols + ['target']
sub_data_valid_targets = sub_data.dropna(subset=required_cols)

In [12]:
# Preview the first rows that survived the NaN filter (plain-text rendering via print).
print(sub_data_valid_targets.head())

       lat    lon start_date  rhum_shift30  pres_shift30  nmme_wo_ccsm3_nasa  \
1161  27.0  261.0 1982-03-07     63.620651  99115.838309           17.858008   
1162  27.0  261.0 1982-03-08     63.311109  99094.217494           17.858008   
1163  27.0  261.0 1982-03-09     63.496221  98991.845982           17.858008   
1164  27.0  261.0 1982-03-10     67.211407  98925.521903           17.858008   
1165  27.0  261.0 1982-03-11     67.912997  98960.091518           17.858008   

      nmme0_wo_ccsm3_nasa  tmp2m_clim  tmp2m_anom  tmp2m_shift29  ...  \
1161            13.923172   20.142679    3.084091      13.839942  ...   
1162            13.923172   20.109707    4.078370      15.032883  ...   
1163            13.923172   20.109381    4.329497      16.121520  ...   
1164            13.923172   20.213552    4.179805      16.665229  ...   
1165            13.923172   20.343285    3.923927      17.309051  ...   

      sst_2010_3_shift30  icec_2010_1_shift30  icec_2010_2_shift30  \
1161      

In [13]:
# Keep only the relevant columns, then group the rows by grid point (lat, lon).
# relevant_cols is a set, and pandas no longer accepts a set as a .loc indexer
# (deprecated in 1.5, TypeError from 2.0). Pass a sorted list instead, which
# also makes the column order deterministic.
data_grouped_by_latlon = sub_data_valid_targets.loc[:, sorted(relevant_cols)].groupby(group_by_cols)

In [14]:
# Grid point of interest.
lat_oi = 37.0
lon_oi = 238.0

# Pull out every row belonging to that (lat, lon) group.
point_oi = (lat_oi, lon_oi)
data_at_lat_lon = data_grouped_by_latlon.get_group(point_oi)

## Do regression
We want to predict the temperature `tmp2m` (stored in `target`) at the selected grid point, using the features listed in `candidate_x_cols`.

In [22]:
# Regression inputs for the selected grid point.
Y = data_at_lat_lon.loc[:, 'target']            # response
X = data_at_lat_lon.loc[:, candidate_x_cols]    # predictors
dates = data_at_lat_lon.loc[:, 'start_date']    # date of each row

In [21]:
# Show the predictor column names.
# NOTE(review): the recorded output below also contains a row count (4146) that
# this cell does not print, so the output is stale relative to the code.
print(X.columns)

Index(['ones', 'tmp2m_shift29', 'tmp2m_shift29_anom', 'tmp2m_shift58',
       'tmp2m_shift58_anom', 'rhum_shift30', 'pres_shift30',
       'nmme_wo_ccsm3_nasa', 'nmme0_wo_ccsm3_nasa', 'mei_shift45',
       'phase_shift17', 'sst_2010_1_shift30', 'sst_2010_2_shift30',
       'sst_2010_3_shift30', 'icec_2010_1_shift30', 'icec_2010_2_shift30',
       'icec_2010_3_shift30', 'wind_hgt_10_2010_1_shift30',
       'wind_hgt_10_2010_2_shift30'],
      dtype='object')
4146


In [26]:
# Show the target series followed by the matching dates, one after the other.
print(Y, dates, sep="\n")

2192545    11.084695
2192546    10.933958
2192547    10.668923
2192548    10.623228
2192549    10.389796
             ...    
2205767    15.621224
2205768    16.091940
2205769    16.508874
2205770    16.654487
2205771    16.728271
Name: target, Length: 4146, dtype: float64
2192545   1982-03-07
2192546   1982-03-08
2192547   1982-03-09
2192548   1982-03-10
2192549   1982-03-11
             ...    
2205767   2018-05-19
2205768   2018-05-20
2205769   2018-05-21
2205770   2018-05-22
2205771   2018-05-23
Name: start_date, Length: 4146, dtype: datetime64[ns]


In [20]:
# Compare the predictor block with and without NaN-containing rows.
predictors_at_lat_lon = data_at_lat_lon[candidate_x_cols]
print(len(data_at_lat_lon.columns))
print(len(candidate_x_cols))
print(predictors_at_lat_lon.dropna())
print(predictors_at_lat_lon)

27
19
         ones  tmp2m_shift29  tmp2m_shift29_anom  tmp2m_shift58  \
2192545   1.0      11.977637            0.617696       8.547097   
2192546   1.0      12.454148            1.037342       8.430989   
2192547   1.0      12.822954            1.350301       8.251327   
2192548   1.0      13.023114            1.476647       8.240416   
2192549   1.0      13.065857            1.516251       8.350149   
...       ...            ...                 ...            ...   
2205767   1.0      14.791862            0.453646      12.988252   
2205768   1.0      14.944562            0.521953      13.111979   
2205769   1.0      14.852663            0.365151      13.479980   
2205770   1.0      14.639905            0.108873      13.840489   
2205771   1.0      14.531434           -0.085892      14.187591   

         tmp2m_shift58_anom  rhum_shift30  pres_shift30  nmme_wo_ccsm3_nasa  \
2192545           -1.787171     88.196827  99062.348493           10.088030   
2192546           -1.887478    

In [44]:
def print_nan_counts(frame):
    """Print a separator, the row count of `frame`, then one "<column>: <missing count>" line per column.

    Uses DataFrame.isna(), so it works for any column dtype.
    """
    print("--------------------")
    print(len(frame))
    for col_name, num_nan in frame.isna().sum().items():
        print(f"{col_name}: {num_nan}")


# 1) Every row in the date window.
print_nan_counts(sub_data)

# 2) Rows with no missing value in any column.
tmp = sub_data.dropna()
print_nan_counts(tmp)

# 3) Rows with no missing value in the listed columns only.
# NOTE(review): 'sample_weight' is not created by any cell visible in this
# notebook, so on a fresh kernel it may be absent. Restrict the subset to the
# columns that actually exist instead of failing with a KeyError.
nan_check_cols = [name for name in ['target', 'sample_weight'] if name in sub_data.columns]
tmp2 = sub_data.dropna(subset=nan_check_cols)
print_nan_counts(tmp2)


print(sub_data.icec_2010_1_shift30)
print(tmp.icec_2010_1_shift30)
print(tmp2.icec_2010_1_shift30)

--------------------
2363372
lat: 0
lon: 0
start_date: 0
rhum_shift30: 43176
pres_shift30: 43176
nmme_wo_ccsm3_nasa: 216908
nmme0_wo_ccsm3_nasa: 216908
tmp2m_clim: 514
tmp2m_anom: 58082
tmp2m_shift29: 43176
tmp2m_shift29_anom: 43176
tmp2m_shift58: 40092
tmp2m_shift58_anom: 40092
mei_shift45: 40092
phase_shift17: 42662
sst_2010_1_shift30: 216908
sst_2010_2_shift30: 216908
sst_2010_3_shift30: 216908
icec_2010_1_shift30: 216908
icec_2010_2_shift30: 216908
icec_2010_3_shift30: 216908
wind_hgt_10_2010_1_shift30: 43176
wind_hgt_10_2010_2_shift30: 43176
year: 0
ones: 0
zeros: 0
sample_weight: 0
target: 58082
--------------------
2131044
lat: 0
lon: 0
start_date: 0
rhum_shift30: 0
pres_shift30: 0
nmme_wo_ccsm3_nasa: 0
nmme0_wo_ccsm3_nasa: 0
tmp2m_clim: 0
tmp2m_anom: 0
tmp2m_shift29: 0
tmp2m_shift29_anom: 0
tmp2m_shift58: 0
tmp2m_shift58_anom: 0
mei_shift45: 0
phase_shift17: 0
sst_2010_1_shift30: 0
sst_2010_2_shift30: 0
sst_2010_3_shift30: 0
icec_2010_1_shift30: 0
icec_2010_2_shift30: 0
icec_20

In [None]:
# print(sub_data.loc[(sub_data.lat==27.0) & (sub_data.lon == 261.0)].head())

