# Climate event detection task
Sandbox for preprocessing and first learning test

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import os
import shutil
import sys
sys.path.append('../..')

os.environ["CUDA_VISIBLE_DEVICES"] = "2"  # change to chosen GPU to use, nothing if work on CPU

# on CLI,  export LD_LIBRARY_PATH=/usr/local/cuda-9.0/extras/CUPTI/lib64:$LD_LIBRARY_PATH

import numpy as np
import time
import matplotlib.pyplot as plt
import healpy as hp
import pandas as pd

from tqdm import tqdm
from mpl_toolkits.mplot3d import Axes3D
import cartopy.crs as ccrs

In [None]:
import h5py

In [None]:
from deepsphere import models, experiment_helper, plot, utils
from deepsphere.data import LabeledDatasetWithNoise, LabeledDataset
import hyperparameters

List of parameters:
* TMQ: Total (vertically integrated) precipitable water
* U850: Zonal wind at 850 mbar pressure surface
* V850: Meridional wind at 850 mbar pressure surface
* UBOT: Lowest model level zonal wind
* VBOT: Lowest model level meridional wind
* QREFHT: Reference height humidity
* PS: Surface pressure
* PSL: sea level pressure
* T200: temp at 200 mbar pressure surface
* T500: temp at 500 mbar pressure surface
* PRECT: Total (convective and large-scale) precipitation rate (liq + ice)
* TS: Surface temperature (radiative)
* Z100: Geopotential Z at 100 mbar pressure surface
* Z200: Geopotential Z at 200 mbar pressure surface
* ZBOT: Lowest model level height

resolution of 768 x 1152 equirectangular grid (25-km at equator)

The labels are 0 for the background class, 1 for tropical cyclone (TC), and 2 for atmospheric river (AR)

In [None]:
# Root directory of the climate dataset. Note that later loader cells rebind
# `path` to glob patterns, so cells that build file names from `path`
# depend on execution order.
path = '../../data/Climate/'

In [None]:
# Timestamp and run id that identify one pre-meshed (.npz) sample file.
year, month, day, hour, run = 2106, 1, 1, 0, 1
datapath = (
    '../../data/Climate/data_5_all/'
    f'data-{year}-{month:0>2d}-{day:0>2d}-{hour:0>2d}-{run}-mesh.npz'
)

In [None]:
# Open the sample archive and pull out the two arrays used by the cells below.
plop = np.load(datapath)
data, labels = plop["data"], plop["labels"]

In [None]:
data.shape

In [None]:
# Per-channel statistics of the dataset. The file is opened explicitly
# read-only: with no mode argument, h5py releases before 3.0 open in append
# mode, which can create or modify the file.
stats = h5py.File('../../data/Climate/stats.h5', 'r')
stats = stats['climate']["stats"] # (16 X 4) (mean, max, min, std)

In [None]:
# Timestamp and run id that identify one raw equirectangular (.h5) sample file.
year, month, day, hour, run = 2106, 1, 1, 0, 1
datapath = (
    '../../data/Climate/'
    f'data-{year}-{month:0>2d}-{day:0>2d}-{hour:0>2d}-{run}.h5'
)

In [None]:
# Open the raw sample read-only (h5py < 3.0 defaults to append mode when no
# mode is given). `data` and `labels` are lazy h5py datasets backed by `h5f`,
# so the file must stay open while they are in use.
h5f = h5py.File(datapath, 'r')
data = h5f['climate']["data"] # (16,768,1152) numpy array
labels = h5f['climate']["labels"] # (768,1152) numpy array

In [None]:
# Longitude / latitude (degrees) of every pixel of the 768 x 1152 grid.
# Longitudes start at 0 and latitudes at -90; the end points 360 and +90
# are excluded.
n_lat, n_lon = 768, 1152
lon_ = np.arange(n_lon) / n_lon * 360
lat_ = np.arange(n_lat) / n_lat * 180 - 90
lon, lat = np.meshgrid(lon_, lat_)

In [None]:
from deepsphere.utils import icosahedron_graph

In [None]:
# Icosahedron graph built with argument 5 — presumably the subdivision level
# matching the 10242-vertex arrays loaded from data_5_all; TODO confirm.
g = icosahedron_graph(5)

In [None]:
# Vertex coordinates of the icosahedron graph converted to degrees for plotting.
icolong, icolat = np.rad2deg(g.lon), np.rad2deg(g.lat)

In [None]:
# Orthographic view of channel 0 with the event masks overlaid.
fig = plt.figure(figsize=(4, 4))
ax = fig.add_subplot(1, 1, 1, projection=ccrs.Orthographic(90, 0))
ax.set_global()
ax.coastlines(linewidth=2)

# Slice channel 0 (TMQ, per the plot title) once; `data` may be a lazy h5py
# dataset, in which case every slice is a separate read from disk.
tmq = data[0,:,:]
scat1 = plt.scatter(lon, lat, s=1, rasterized=True,
            c=tmq, cmap=plt.get_cmap('RdYlBu_r'), alpha=1, transform=ccrs.PlateCarree())

# Label convention stated in the dataset notes of this notebook:
# 0 = background, 1 = tropical cyclone (TC), 2 = atmospheric river (AR).
# The masks previously used the two ids the other way round.
TC = labels[:,:]==1
AR = labels[:,:]==2
scat2 = ax.scatter(lon[AR], lat[AR], s=0.5, color='c', label='AR', transform=ccrs.PlateCarree())
scat3 = ax.scatter(lon[TC], lat[TC], s=0.5, color='m', label='TC', transform=ccrs.PlateCarree())
ax.legend(markerscale=5, fontsize=10, loc=1, frameon=False, ncol=1, bbox_to_anchor=(0.1, 0.18))
ticks = range(np.min(tmq).astype(int), np.max(tmq).astype(int), 20)
cb = plt.colorbar(scat1, ax=ax, orientation="horizontal",anchor=(1.0,0.0), shrink=0.7, pad=0.05, ticks=ticks)
cb.ax.tick_params(labelsize=10)
# NOTE(review): one label fewer than ticks — presumably the first tick lies
# below the data minimum and is not drawn; confirm with the matplotlib
# version in use.
cb.ax.set_xticklabels([f'${t}mm$' for t in ticks[1:]])

ax.text(0, 7e6, f'HAPPI20 Climate, TMQ, {year}-{month:02d}-{day:02d}-{hour:02d}-{run}', horizontalalignment='center')

In [None]:
# Flat (plate carree) map of input channel 0 on the full lat/lon grid.
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection=ccrs.PlateCarree())
ax.set_global()
ax.coastlines()

plt.scatter(lon, lat, c=data[0, :, :], s=1, alpha=1, cmap='RdYlBu_r')

In [None]:
# Flat (plate carree) map of the label mask on the full lat/lon grid.
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection=ccrs.PlateCarree())
ax.set_global()
ax.coastlines()

plt.scatter(lon, lat, c=labels[:, :], s=1, alpha=1, cmap='RdYlBu_r')

In [None]:
# Flat map of row 0 of `data`, drawn at the icosahedron vertex positions.
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection=ccrs.PlateCarree())
ax.set_global()
ax.coastlines()

plt.scatter(icolong, icolat, c=data[0, :], s=20, alpha=1, cmap='RdYlBu_r')

In [None]:
# Flat map of row 0 of `labels`, drawn at the icosahedron vertex positions.
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection=ccrs.PlateCarree())
ax.set_global()
ax.coastlines()

plt.scatter(icolong, icolat, c=labels[0, :], s=20, alpha=1, cmap='RdYlBu_r')

In [None]:
from scipy.interpolate import griddata, RectBivariateSpline, RegularGridInterpolator, LinearNDInterpolator, interp2d, NearestNDInterpolator

In [None]:
from scipy.interpolate import griddata#, RectBivariateSpline, RegularGridInterpolator
Nside = 32
pix = np.arange(12*Nside**2)
# Longitude / latitude of every HEALPix pixel (nested ordering), reshaped so
# that each row is one (lon, lat) pair.
coords_hp = hp.pix2ang(Nside, pix, nest=True, lonlat=True)
coords_hp = np.asarray(coords_hp).T
# Source grid as flat (lon, lat) pairs in the same layout as coords_hp. The
# interpolation below works in this 2-D space, so the 3-D unit vectors that
# used to be computed here (and were overwritten at once) are not needed.
coords_map = np.stack([lon, lat], axis=-1).reshape((-1, 2))

In [None]:
# Timing 1: scattered-data linear interpolation of channel 0 onto the HEALPix pixels.
t = time.time()
map_hp1 = griddata(coords_map, data[0].flatten(), coords_hp, method='linear')
elapsed = time.time()-t
print("time taken:", elapsed)

# Timing 2: the same resampling using the regular (lon, lat) grid structure.
t = time.time()
f = RegularGridInterpolator(points=(lon_, lat_), values=data[0].T)
map_hp3 = f(coords_hp)
elapsed = time.time()-t
print("time taken:", elapsed)

In [None]:
# Flat map of the regular-grid interpolation result at the HEALPix pixel positions.
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection=ccrs.PlateCarree())
ax.set_global()
ax.coastlines()

plt.scatter(coords_hp[:, 0], coords_hp[:, 1], c=map_hp3, s=10, alpha=1, cmap='RdYlBu_r')

In [None]:
coords_map.shape

In [None]:
# Class ids must not be blended, so labels are resampled with
# nearest-neighbour lookup in (lon, lat) space.
flat_labels = labels[:].flatten()
f = NearestNDInterpolator(coords_map, flat_labels, rescale=False)
new_labels = f(coords_hp)

In [None]:
# Flat map of the resampled labels at the HEALPix pixel positions.
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection=ccrs.PlateCarree())
ax.set_global()
ax.coastlines()

plt.scatter(coords_hp[:, 0], coords_hp[:, 1], c=new_labels, s=10, alpha=1, cmap='RdYlBu_r')

## data

In [None]:
from glob import glob
year = 2106
path = '../../data/Climate/data-{}-01*.h5'.format(year)
# Sort the matches: glob() returns files in arbitrary, filesystem-dependent
# order, which would make the sample order irreproducible.
files = sorted(glob(path))
# One slot per file, filled in place by the loop below.
datas = np.zeros((len(files),16,768,1152))
labels = np.zeros((len(files),768,1152))
for i, file in enumerate(files):
    # Open read-only and close each file as soon as its arrays are copied,
    # so open handles do not accumulate across the loop.
    with h5py.File(file, 'r') as data:
        datas[i] = data['climate']['data']
        labels[i] = data['climate']['labels']

In [None]:
from glob import glob
year = 2106
path = '../../data/Climate/data_5_all/data-{}-*.npz'.format(year)
# Sort the matches: glob() order is arbitrary, and the train/validation split
# in the next cell is taken by index, so an unsorted list makes that split
# irreproducible.
files = sorted(glob(path))
datas = np.zeros((len(files),16,10242))
labels = np.zeros((len(files),3,10242))
for i, file in enumerate(files):
    # np.load on an .npz keeps the file open; the context manager closes it
    # once both arrays have been copied out.
    with np.load(file) as data:
        datas[i] = data['data']
        labels[i] = data['labels']
# Collapse the 3 label channels into one class index per vertex.
labels = np.argmax(labels, axis=1)
# (sample, channel, vertex) -> (sample, vertex, channel)
datas = np.transpose(datas, axes=(0,2,1))

In [None]:
# Index-based split: the first `limit` samples train, the remainder validate.
limit = 6000
x_train = datas[:limit]
x_val = datas[limit:]
labels_train = labels[:limit]
labels_val = labels[limit:]

training = LabeledDataset(x_train, labels_train)
validation = LabeledDataset(x_val, labels_val)

Jiang separation

In [None]:
# Precomputed normalisation constants, 16 values each. The partition loader
# below applies them as (x - precomp_mean) / precomp_std along the channel axis.
# Presumably listed in the channel order of the data files — TODO confirm.
precomp_mean = [26.160023, 0.98314494, 0.116573125, -0.45998842, 0.1930554, 0.010749293, 98356.03, 100982.02, 216.13145, 258.9456, 3.765611e-08, 288.82578, 288.03925, 342.4827, 12031.449, 63.435772]
precomp_std =  [17.04294, 8.164175, 5.6868863, 6.4967732, 5.4465833, 0.006383436, 7778.5957, 3846.1863, 9.791707, 14.35133, 1.8771327e-07, 19.866386, 19.094095, 624.22406, 679.5602, 4.2283397]


In [None]:
# 2x2 rotation matrix built from the cosine and sine of pi/4.
cos45 = np.cos(np.pi/4)
sin45 = np.sin(np.pi/4)
rotmat = np.array([[cos45, sin45],
                   [-sin45, cos45]])
# change to magnitude

In [None]:
# Load the listed partitions into memory as {'data': (N, 10242, 16), 'labels': (N, 10242)}.
# Each <partition>.txt file lists one sample file name per line.
data = {}
# NOTE: only 'val' is loaded here; data['train'] / data['test'] stay undefined
# unless those names are added to this list.
for partition in ['val']:
    with open(path+'data_5_all/'+partition+".txt", "r") as f:
        lines = f.readlines()
    flist = [os.path.join(path, 'data_5_all', l.replace('\n', '')) for l in lines]
    data[partition] = {'data': np.zeros((len(flist),10242,16)),
                       'labels': np.zeros((len(flist),10242))}
    for i, f in enumerate(flist):
        # np.load on an .npz keeps the file open; close it once the sample is copied.
        with np.load(f) as file:
            # Standardise every channel with the precomputed mean / std.
            data[partition]['data'][i] = (file['data'].T - precomp_mean) / precomp_std
            # Channels 1 and 2 both end up holding arctan2(ch1, ch2) of the
            # normalised values (channel 1 is overwritten first, then copied).
            data[partition]['data'][i,:,1] = np.arctan2(data[partition]['data'][i,:,1], data[partition]['data'][i,:,2])
            data[partition]['data'][i,:,2] = data[partition]['data'][i,:,1]
            # Same treatment for channels 3 and 4.
            data[partition]['data'][i,:,3] = np.arctan2(data[partition]['data'][i,:,3], data[partition]['data'][i,:,4])
            data[partition]['data'][i,:,4] = data[partition]['data'][i,:,3]
            # Label channels -> class index. The builtin int replaces the
            # np.int alias, which was removed in NumPy 1.24.
            data[partition]['labels'][i] = np.argmax(file['labels'].astype(int), axis=0)

In [None]:
# Shorthand names for the training partition (needs data['train'] to be loaded).
train_split = data['train']
x_train = train_split['data']
labels_train = train_split['labels']

In [None]:
# Shorthand names for the test partition (needs data['test'] to be loaded).
test_split = data['test']
x_test = test_split['data']
labels_test = test_split['labels']

In [None]:
# Wrap the in-memory validation arrays; the train / test wrappers are left
# disabled.
# training = LabeledDataset(data['train']['data'], data['train']['labels'])
validation = LabeledDataset(data['val']['data'], data['val']['labels'])
# test = LabeledDataset(data['test']['data'], data['test']['labels'])

In [None]:
del data

TF dataset with Jiang separation

In [None]:
from ClimateDataLoader import IcosahedronDataset, EquiangularDataset

In [None]:
# Icosahedron dataset for the 'train' partition under data_5_all/.
# Relies on `path` being the dataset root directory at this point.
training = IcosahedronDataset(path+'data_5_all/', 'train')

In [None]:
# Icosahedron dataset for the 'val' partition under data_5_all/.
validation = IcosahedronDataset(path+'data_5_all/', 'val')

In [None]:
# tf.data pipeline over the training set; the argument 32 is presumably the
# batch size (the dry-run cell below also divides N by 32) — TODO confirm.
tf_train = training.get_tf_dataset(32)

In [None]:
import tensorflow as tf
from tqdm import tqdm

# Dry run: pull batches from the training pipeline to check that it iterates
# from start to finish.
iterator = tf_train.make_one_shot_iterator()
data_next = iterator.get_next()
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
steps = 1 + training.N // 32
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    try:
        for i in tqdm(range(steps)):
            out = sess.run(data_next)
    except tf.errors.OutOfRangeError:
        # The one-shot iterator raises this once the dataset is exhausted.
        print("Done")

In [None]:
# Equiangular dataset with s3=False — presumably read from local disk under
# `path`; TODO confirm in ClimateDataLoader.
training_local = EquiangularDataset(path, s3=False)

In [None]:
# Same as training_local, for the 'val' partition.
validation_local = EquiangularDataset(path, 'val', s3=False)

In [None]:
# tf.data pipeline over the local equiangular set, built with argument 1
# (presumably the batch size — TODO confirm).
tf_local = training_local.get_tf_dataset(1)

In [None]:
from pygsp.graphs import SphereEquiangular

In [None]:
# Equiangular sphere graph; twice the bandwidth (384, 576) equals the
# 768 x 1152 source grid, presumably the resulting sampling size — TODO confirm in pygsp.
g2 = SphereEquiangular(bandwidth=(384, 576), sampling='SOFT')

In [None]:
# Vertex coordinates of the equiangular graph converted to degrees for plotting.
glong, glat = np.rad2deg(g2.lon), np.rad2deg(g2.lat)

In [None]:
import tensorflow as tf
from tqdm import tqdm
# Import the module rather than `from time import time`: rebinding the global
# name `time` to the function breaks every `time.time()` call made by other
# cells of this notebook.
import time

# Fetch the first batch of the local equiangular pipeline, plot it, and
# report the elapsed wall-clock time.
data_next = tf_local.make_one_shot_iterator().get_next()
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# NOTE(review): tf_local is built with argument 1 while this divides by 32;
# harmless here because the loop stops after the first batch.
steps = training_local.N // 32 + 1
t_start = time.time()
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    try:
        for i in tqdm(range(steps)):
            out = sess.run(data_next)
            fig = plt.figure(figsize=(20, 10))
            ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())

            ax.set_global()
            ax.coastlines()

            # First tuple element, first sample, last-axis index 0 — presumably
            # (inputs, sample, lat, lon, channel); TODO confirm.
            plt.scatter(glong, glat, s=1,
                        c=out[0][0,:,:,0], cmap=plt.get_cmap('RdYlBu_r'), alpha=1)
            # Only one batch is needed for the visual check.
            break
    except tf.errors.OutOfRangeError:
        print("Done")
print(time.time()-t_start)

In [None]:
# Equiangular dataset with s3=True — presumably streamed from the S3 bucket
# rather than local disk; TODO confirm in ClimateDataLoader.
training_s3 = EquiangularDataset(path, s3=True)

In [None]:
# tf.data pipeline for the S3-backed set, built with argument 32 (presumably
# the batch size — TODO confirm).
tf_s3 = training_s3.get_dataset_s3(32)

In [None]:
import tensorflow as tf
from tqdm import tqdm
# Import the module rather than `from time import time`: rebinding the global
# name `time` to the function breaks every `time.time()` call made by other
# cells of this notebook.
import time

# Iterate the S3-backed pipeline, printing the time per batch and the total.
data_next = tf_s3.make_one_shot_iterator().get_next()
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
steps = training_s3.N // 32 + 1
t_start = time.time()
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    try:
        for i in tqdm(range(steps)):
            t_begin = time.time()
            out = sess.run(data_next)
            print('loop time: ', time.time()-t_begin)
    except tf.errors.OutOfRangeError:
        # The one-shot iterator raises this once the dataset is exhausted.
        print("Done")
print(time.time()-t_start)

In [None]:
from tensorflow.python.lib.io import file_io
# Query the metadata of one remote sample through TensorFlow's file API;
# the bucket name is hard-coded.
gstats = file_io.stat('s3://10380-903b2ba14e0d980c25436f9ca5bb29f5/Datasets/Climate/data-2106-02-23-03-1.h5')

In [None]:
gstats.length

## Training

In [None]:
# Experiment name; the params cells store it under 'dir_name'. A later cell
# rebinds EXP_NAME for the equiangular run.
EXP_NAME = 'TestClimate_nopooling_ico_4layers_k4'

In [None]:
# Bandwidth pair for the equiangular configuration; twice these values equals
# the 768 x 1152 source grid.
bw1, bw2 = 384, 576

In [None]:
import tensorflow as tf

# Seven-layer icosahedron configuration with decreasing 'nsides' entries.
n_layers = 7
num_epochs = 25  # Number of passes through the training data.
batch_size = 32
n_evaluations = 100
params = {
    'nsides': [5, 5, 4, 3, 2, 1, 0, 0],
    'F': [32, 64, 128, 256, 512, 512, 512],
    'K': [4] * n_layers,
    'batch_norm': [True] * n_layers,
    'sampling': 'icosahedron',
    'dir_name': EXP_NAME,
    'num_feat_in': 16,
    'conv': 'chebyshev5',
    'pool': 'average',
    'activation': 'relu',
    'statistics': None,
    'regularization': 0,
    'dropout': 1,
    'num_epochs': num_epochs,
    'batch_size': batch_size,
    # decay_rate=1 keeps the learning rate constant at 1e-3.
    'scheduler': lambda step: tf.train.exponential_decay(1e-3, step, decay_steps=2000, decay_rate=1),
    # Alternatives tried in this notebook: GradientDescentOptimizer, AdamOptimizer.
    'optimizer': lambda lr: tf.train.RMSPropOptimizer(lr, decay=0.9, momentum=0.),
    # Evaluate n_evaluations times, evenly spread over the whole run.
    'eval_frequency': int(num_epochs * (training.N) / batch_size / n_evaluations),
    'M': [],
    'Fseg': 3,
    'dense': True,
    'tf_dataset': training.get_tf_dataset(batch_size),
}

In [None]:
import tensorflow as tf

# Four-layer icosahedron configuration with every 'nsides' entry fixed at 5.
n_layers = 4
num_epochs = 30  # Number of passes through the training data.
batch_size = 8
n_evaluations = 60
params = {
    'nsides': [5, 5, 5, 5, 5],
    'F': [32, 64, 128, 256],
    'K': [4] * n_layers,
    'batch_norm': [True] * n_layers,
    'sampling': 'icosahedron',
    'dir_name': EXP_NAME,
    'num_feat_in': 16,
    'conv': 'chebyshev5',
    'pool': 'average',
    'activation': 'relu',
    'statistics': None,
    'regularization': 0,
    'dropout': 1,
    'num_epochs': num_epochs,
    'batch_size': batch_size,
    # decay_rate=1 keeps the learning rate constant at 1e-3.
    'scheduler': lambda step: tf.train.exponential_decay(1e-3, step, decay_steps=2000, decay_rate=1),
    # Alternatives tried in this notebook: GradientDescentOptimizer, AdamOptimizer.
    'optimizer': lambda lr: tf.train.RMSPropOptimizer(lr, decay=0.9, momentum=0.),
    # Evaluate n_evaluations times, evenly spread over the whole run.
    'eval_frequency': int(num_epochs * (training.N) / batch_size / n_evaluations),
    'M': [],
    'Fseg': 3,
    'dense': True,
    'weighted': False,
    'tf_dataset': training.get_tf_dataset(batch_size),
}

In [None]:
# Experiment name for the equiangular run; it replaces the earlier EXP_NAME
# and is used by the cleanup and params cells that follow.
EXP_NAME = "TestClimate_pooling_weight_equi"

In [None]:
# Remove the summaries and checkpoints of this experiment so the next run
# starts from scratch; directories that do not exist are ignored.
for template in ('../../summaries/{}/', '../../checkpoints/{}/'):
    shutil.rmtree(template.format(EXP_NAME), ignore_errors=True)

In [None]:
import tensorflow as tf

# Four-layer equiangular configuration; the 'nsides' pairs shrink by a factor
# of 4 per axis, twice.
n_layers = 4
num_epochs = 25  # Number of passes through the training data.
batch_size = 1
n_evaluations = 100
full_res = (bw1, bw2)
quarter_res = (bw1//4, bw2//4)
sixteenth_res = (bw1//16, bw2//16)
params = {
    'nsides': [full_res, full_res, quarter_res, sixteenth_res, sixteenth_res],
    'F': [8, 32, 64, 128],
    'K': [5] * n_layers,
    'batch_norm': [True] * n_layers,
    'sampling': 'equiangular',
    'dir_name': EXP_NAME,
    'num_feat_in': 16,
    'conv': 'chebyshev5',
    'pool': 'average',
    'activation': 'relu',
    'statistics': None,
    'regularization': 0,
    'dropout': 1,
    'num_epochs': num_epochs,
    'batch_size': batch_size,
    # decay_rate=1 keeps the learning rate constant at 1e-3.
    'scheduler': lambda step: tf.train.exponential_decay(1e-3, step, decay_steps=2000, decay_rate=1),
    # Alternatives tried in this notebook: GradientDescentOptimizer, RMSPropOptimizer.
    'optimizer': lambda lr: tf.train.AdamOptimizer(lr, beta1=0.9, beta2=0.999, epsilon=1e-8),
    # Evaluate n_evaluations times, evenly spread over the whole run.
    'eval_frequency': int(num_epochs * (training_local.N) / batch_size / n_evaluations),
    'M': [],
    'Fseg': 3,
    'dense': True,
    'tf_dataset': training_local.get_tf_dataset(batch_size),
}

In [None]:
# print([12*nside**2 for nside in params['nsides']])
# Build the network from whichever `params` cell was executed last.
model = models.deepsphere(**params)

Jiang: 328,339

DeepSphere ico deep: 12,926,432

DeepSphere ico shallow: 141,624

DeepSphere equi: z

In [None]:
print("the number of parameters in the model is: {:,}".format(model.get_nbr_var()))

In [None]:
# Train the model; restore=True presumably resumes from an existing
# checkpoint — TODO confirm in deepsphere.models.
# NOTE(review): the equiangular params cell builds its pipeline from
# `training_local`, whereas `training` / `validation` are the icosahedron
# datasets — check that the arguments match the configuration being fitted.
model.fit(training, validation, use_tf_dataset=True, cache='TF', restore=True)

In [None]:
# Class predictions for the test inputs (needs x_test from the 'test' partition).
predictions = model.predict(x_test)

In [None]:
# Per-class scores for the test inputs; the argument 3 is presumably the
# number of classes (params['Fseg'] is 3) — TODO confirm.
probabilities = model.probs(x_test, 3)

In [None]:
# Flat map of the predicted classes for the first test sample.
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection=ccrs.PlateCarree())
ax.set_global()
ax.coastlines()

plt.scatter(icolong, icolat, c=predictions[0, :], s=20, alpha=1, cmap='RdYlBu_r')

In [None]:
# Flat map of the ground-truth classes for the first test sample.
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection=ccrs.PlateCarree())
ax.set_global()
ax.coastlines()

plt.scatter(icolong, icolat, c=labels_test[0, :], s=20, alpha=1, cmap='RdYlBu_r')

In [None]:
# Orthographic (globe) view of the predicted classes for the first test sample.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection=ccrs.Orthographic(0, 0))
ax.set_global()
ax.coastlines(linewidth=2)

# The vertex coordinates are lon/lat degrees, hence the PlateCarree transform.
plt.scatter(icolong, icolat, c=predictions[0, :], s=100, alpha=1,
            cmap='RdYlBu_r', transform=ccrs.PlateCarree())

In [None]:
from sklearn.metrics import accuracy_score, average_precision_score
from sklearn.preprocessing import label_binarize

In [None]:
def accuracy(pred_cls, true_cls, nclass=3):
    """Compute per-class, mean per-class and overall accuracy, in percent.

    Parameters
    ----------
    pred_cls : numpy array of predicted class indices.
    true_cls : numpy array of ground-truth class indices, same shape as
        ``pred_cls``.
    nclass : number of classes; classes ``0 .. nclass - 1`` are evaluated.

    Returns
    -------
    tuple of (per-class accuracy array, mean of the per-class accuracies,
    overall accuracy over all evaluated classes).

    A class with no ground-truth element gives a 0/0 division, so its entry
    (and therefore the mean) is NaN.
    """
    accu = []
    tot_int = 0
    tot_cl = 0
    # Iterate over the requested number of classes instead of a hard-coded 3,
    # so that the ``nclass`` argument is honoured.
    for i in range(nclass):
        # Elements of class i that were also predicted as class i.
        intersect = np.sum(((pred_cls == i) * (true_cls == i)))
        thiscls = np.sum(true_cls == i)
        accu.append(intersect / thiscls * 100)
        tot_int += intersect
        tot_cl += thiscls
    return np.array(accu), np.mean(accu), tot_int/tot_cl * 100

In [None]:
def average_precision(score_cls, true_cls, nclass=3):
    """Compute the per-class average precision and its mean.

    Parameters
    ----------
    score_cls : array of class scores whose last axis has length ``nclass``;
        flattened to ``(-1, nclass)``.
    true_cls : array of ground-truth class indices; flattened to 1-D.
    nclass : number of classes. The default of 3 matches this notebook;
        ``label_binarize`` returns a single column for two classes, so this
        helper is meant for ``nclass >= 3``.

    Returns
    -------
    tuple of (array of per-class average precision, mean over classes).
    """
    # One-hot encode the ground truth over the requested classes.
    true = label_binarize(true_cls.reshape(-1), classes=list(range(nclass)))
    score = score_cls.reshape(-1, nclass)
    # `average` must be passed by keyword: scikit-learn made the arguments
    # after y_score keyword-only. average=None returns one score per class.
    AP = average_precision_score(true, score, average=None)
    return AP, np.mean(AP)

In [None]:
accuracy(predictions, labels_test)

In [None]:
average_precision(probabilities, labels_test)

mAP for the positive classes is 0.7541626

In [None]:
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# Balanced class weights for the three labels. `classes` and `y` are passed
# by keyword because current scikit-learn releases made them keyword-only;
# the keyword form also works on older releases.
compute_class_weight('balanced', classes=np.array([0,1,2]), y=labels_train.flatten())