In [7]:
import keras
import random
import numpy as np
import xarray as xr
import pandas as pd
import cartopy.crs as ccrs
import matplotlib.pyplot as plt

from matplotlib import colors
from keras import Sequential
from keras.layers import Dense, Activation, Conv2D, Flatten, Input, Reshape, AveragePooling2D, MaxPooling2D, Conv2DTranspose, TimeDistributed, LSTM, GlobalAveragePooling2D, BatchNormalization
from keras.regularizers import l2

# import from parent level
import sys 
sys.path.append('..')
from path_parameters import TRAIN_PATH, TEST_PATH, OUTPUT_PATH

# Seed the generators this notebook can reach directly so that reruns are
# comparable. The original cell seeded only the stdlib `random` module.
# NOTE(review): TensorFlow/Keras keeps its own generator, which these calls do
# not touch - confirm whether weight initialisation also needs seeding.
SEED = 13
random.seed(SEED)
np.random.seed(SEED)

2022-04-04 18:54:46.787544: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-04 18:54:46.787571: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [10]:
# Path prefixes come from the shared `path_parameters` module; file names are
# appended to them by plain string concatenation.
data_path, test_path, output_path = TRAIN_PATH, TEST_PATH, OUTPUT_PATH

# Target file for the precipitation run of this CNN-LSTM configuration.
lstm_output_path = output_path + "outputs_ssp245_predict_pr_10to1_kern3_bs5.nc"

# Load data

In [None]:
# Scenarios whose input/output files are loaded for training.
simus = [
    'ssp126',
    'ssp370',
    # 'ssp370-lowNTCF',  (currently disabled)
    'ssp585',
]

In [None]:
X_train = []
Y_train = []

# Ensemble member selected from each scenario's output file (loop-invariant).
mbr = 1

for simu in simus:
    input_name = 'inputs_' + simu + '.nc'
    output_name = 'outputs_' + simu + '.nc'

    # Inputs: the historical file and the scenario file combined into one
    # dataset, then loaded into memory.
    input_xr = xr.open_mfdataset([data_path + 'inputs_historical.nc',
                                  data_path + input_name]).compute()

    # Outputs: one member of the historical run and one member of the scenario
    # run, joined along the time dimension.
    # NOTE(review): the historical part uses member 2 while the scenario part
    # uses member `mbr` (1) - confirm that the mismatch is intentional.
    output_xr = xr.concat(
        [xr.open_dataset(data_path + 'outputs_historical.nc').sel(member=2),
         xr.open_dataset(data_path + output_name).sel(member=mbr)],
        dim='time',
    ).compute()

    # Rescale both precipitation fields by 86400 (presumably per-second to
    # per-day - TODO confirm units), align dimension names with the inputs and
    # drop the leftover scalar coordinates. `drop_vars` replaces the
    # deprecated `Dataset.drop`.
    output_xr = (
        output_xr
        .assign({"pr": output_xr.pr * 86400, "pr90": output_xr.pr90 * 86400})
        .rename({'lon': 'longitude', 'lat': 'latitude'})
        .transpose('time', 'latitude', 'longitude')
        .drop_vars(['member', 'quantile'])
    )

    # Append to list
    X_train.append(input_xr)
    Y_train.append(output_xr)

# Data normalization

In [None]:
# Fixed [min, max] bounds used to rescale each input variable.
minmax_inputs = {
    'CO2': [0, 10000],
    'CH4': [0, 1],
    'BC': [0, 6e-11],
    'SO2': [0, 2e-09],
}

In [None]:
# Fixed [min, max] bounds used to rescale each target variable.
minmax_outputs = {
    'tas': [-7, 15],
    'diurnal_temperature_range': [-2.25, 6],
    'pr': [-11, 14],
    'pr90': [-25.5, 35],
}

In [None]:
# Utilities for normalizing the emissions data
def normalize(data, var, minmax_dict):
    """Linearly rescale `data` using the bounds stored for `var`.

    Parameters
    ----------
    data : scalar or array-like supporting arithmetic
        Values to rescale.
    var : hashable
        Key into `minmax_dict`.
    minmax_dict : mapping
        Maps a variable name to a `[min, max]` pair.

    Returns
    -------
    `(data - min) / (max - min)`; a value equal to `min` maps to 0 and a value
    equal to `max` maps to 1. Values outside the bounds are not clipped.
    """
    bounds = minmax_dict[var]
    lower, upper = bounds[0], bounds[1]
    return (data - lower) / (upper - lower)

def unnormalize(data, var, minmax_dict):
    """Invert `normalize`: map rescaled values back to the original range.

    Parameters
    ----------
    data : scalar or array-like supporting arithmetic
        Rescaled values.
    var : hashable
        Key into `minmax_dict`.
    minmax_dict : mapping
        Maps a variable name to a `[min, max]` pair.

    Returns
    -------
    `data * (max - min) + min`.
    """
    bounds = minmax_dict[var]
    lower, upper = bounds[0], bounds[1]
    return data * (upper - lower) + lower

In [None]:
# Rescale every training dataset with the fixed bounds defined above. Each
# variable keeps its own dimensions; only its values are replaced.
X_train_norm = [
    dataset.assign({
        name: (dataset[name].dims, normalize(dataset[name].data, name, minmax_inputs))
        for name in ['CO2', 'CH4', 'SO2', 'BC']
    })
    for dataset in X_train
]

Y_train_norm = [
    dataset.assign({
        name: (dataset[name].dims, normalize(dataset[name].data, name, minmax_outputs))
        for name in ['tas', 'diurnal_temperature_range', 'pr', 'pr90']
    })
    for dataset in Y_train
]

## Reshape data to feed into the model 

In [None]:
# Length, in years, of the moving temporal window fed to the model.
slider = 10

In [None]:
# Functions for reshaping the data 
def input_for_training(X_train_xr):
    """Cut a forcing dataset into overlapping windows of `slider` steps.

    The dataset is round-tripped through a pandas DataFrame (presumably so
    that all four variables come back on the same dimensions - TODO confirm),
    the four variables are stacked into one array, and the axes are reordered
    so that the stacked-variable axis is last and the former last axis is
    first (assumed to be time - TODO confirm).

    Parameters
    ----------
    X_train_xr : xarray.Dataset
        Must contain the variables 'CO2', 'CH4', 'SO2' and 'BC'.

    Returns
    -------
    numpy.ndarray
        One entry per window start `i` in `0 .. T - slider - 1`, each holding
        steps `i .. i + slider - 1`, where `T` is the length of the first axis.

    NOTE(review): the window starting at `T - slider` (the one that ends on
    the last step) is not produced.
    """
    on_common_dims = xr.Dataset.from_dataframe(X_train_xr.to_dataframe())
    stacked = np.array([on_common_dims[name].data for name in ('CO2', 'CH4', 'SO2', 'BC')])
    stacked = stacked.transpose(3, 1, 2, 0)

    n_steps = stacked.shape[0]
    windows = [stacked[start:start + slider] for start in range(n_steps - slider)]
    return np.array(windows)

def output_for_training(Y_train_xr, var):
    """Build the training targets that pair with `input_for_training` windows.

    Parameters
    ----------
    Y_train_xr : mapping of variable name -> object exposing `.data`
        Source of the target field; the first axis of `.data` is the one that
        is windowed.
    var : hashable
        Name of the variable to extract.

    Returns
    -------
    numpy.ndarray
        For each window start `i` in `0 .. T - slider - 1`, a length-1 stack
        holding the field at index `i + slider - 1`, i.e. the last step
        covered by that window.

    NOTE(review): the targets span indices `slider - 1 .. T - 2`, so the final
    step of the series is never used as a target.
    """
    target_field = Y_train_xr[var].data

    n_windows = target_field.shape[0] - slider
    last_step_offset = slider - 1
    return np.array([[target_field[start + last_step_offset]] for start in range(n_windows)])

# CNN - LSTM architecture
## Test on one variable
## Build model

In [None]:
# Target variable for this single-variable trial run.
var_to_predict = 'tas'

# Window every scenario separately, then stack the windows of all scenarios
# along the first (sample) axis.
input_windows = [input_for_training(X_train_norm[k]) for k in range(len(simus))]
target_windows = [output_for_training(Y_train_norm[k], var_to_predict) for k in range(len(simus))]

X_train_all = np.concatenate(input_windows, axis=0)
Y_train_all = np.concatenate(target_windows, axis=0)
X_train_all.shape, Y_train_all.shape

Inspiration for the time-distributed architecture: https://medium.com/smileinnovation/how-to-work-with-time-distributed-data-in-a-neural-network-b8b39aa4ce00

In [None]:
# Start from a clean slate: drop the reference to any previously built model
# and reset the state Keras has accumulated in this kernel.
cnn_model = None
keras.backend.clear_session()

In [None]:
# CNN-LSTM: the same small convolutional encoder is applied to every step of
# the window, the per-step features are summarised by an LSTM, and a dense
# layer emits one 96 x 144 map.
cnn_model = Sequential([
    # One sample = `slider` steps of a 96 x 144 field with 4 stacked variables.
    Input(shape=(slider, 96, 144, 4)),
    TimeDistributed(Conv2D(20, (3, 3), padding='same', activation='relu'),
                    input_shape=(slider, 96, 144, 4)),
    TimeDistributed(AveragePooling2D(2)),
    # Collapse each step's feature maps to one value per filter.
    TimeDistributed(GlobalAveragePooling2D()),
    LSTM(25, activation='relu'),
    # Linear read-out, reshaped to (1, 96, 144) to match the training targets.
    Dense(1 * 96 * 144),
    Activation('linear'),
    Reshape((1, 96, 144)),
])


Possible improvements to try:

- Weight regularization: https://machinelearningmastery.com/how-to-reduce-overfitting-in-deep-learning-with-weight-regularization/
- Batch normalization (rescales the inputs between layers): https://machinelearningmastery.com/how-to-accelerate-learning-of-deep-neural-networks-with-batch-normalization/
- Tuning the learning rate / optimizer: https://machinelearningmastery.com/understand-the-dynamics-of-learning-rate-on-deep-learning-neural-networks/

In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the
# model built in the previous cell.
cnn_model.summary()

In [None]:
# Mean squared error is both the training loss and the reported metric.
cnn_model.compile(
    optimizer="rmsprop",
    loss="mse",
    metrics=["mse"],
)

# Train model

In [None]:
# Training settings for the single-variable trial run.
fit_options = dict(batch_size=5, epochs=20, verbose=1)

hist = cnn_model.fit(X_train_all, Y_train_all, **fit_options)

In [None]:
# Save the entire model as a SavedModel.
#!mkdir -p saved_model


# Make final prediction for submission

In [None]:
# Open the test inputs: the historical file plus the ssp245 scenario file,
# combined into one in-memory dataset.
test_files = [data_path + 'inputs_historical.nc',
              test_path + 'inputs_ssp245.nc']
X_test = xr.open_mfdataset(test_files).compute()

# Rescale with the same fixed bounds as the training inputs.
X_test = X_test.assign({
    name: (X_test[name].dims, normalize(X_test[name].data, name, minmax_inputs))
    for name in ['CO2', 'CH4', 'SO2', 'BC']
})

# Cut into the overlapping windows the model expects.
X_test_np = input_for_training(X_test)

In [None]:
# One predicted map per test window.
raw_pred = cnn_model.predict(X_test_np)

# Drop the singleton second axis: (window, 1, lat, lon) -> (window, lat, lon).
pred_maps = raw_pred.reshape(raw_pred.shape[0], raw_pred.shape[2], raw_pred.shape[3])

# Back to physical units, wrapped as a labelled array.
# NOTE(review): output_for_training pairs window i with time index
# i + slider - 1, whereas the time labels here start at index `slider`, so the
# labels look one step late - confirm, and fix together with the windowing
# helpers and the plot cell that selects year 2100.
pred_da = xr.DataArray(
    unnormalize(pred_maps, var_to_predict, minmax_outputs),
    dims=['time', 'lat', 'lon'],
    coords=[X_test.time.data[slider:], X_test.latitude.data, X_test.longitude.data],
)

# Keep the scenario years only and expose the result as a one-variable dataset.
m_pred = (
    pred_da
    .transpose('lat', 'lon', 'time')
    .sel(time=slice(2015, 2101))
    .to_dataset(name=var_to_predict)
)
m_pred

In [None]:
# Plot results
# Diverging colour scale centred on zero, shared by all maps.
divnorm = colors.TwoSlopeNorm(vmin=-2., vcenter=0., vmax=4.)

print(var_to_predict)
for yr in [2020, 2050, 2100]:
    fig, ax = plt.subplots(1, 1,
                           subplot_kw=dict(projection=ccrs.PlateCarree()),
                           figsize=(9, 5))

    # `m_pred` is a Dataset at this point (it was built with `to_dataset`),
    # and the Dataset plot accessor has no `pcolormesh`; select the predicted
    # variable first so the DataArray plotting method is used.
    m_pred[var_to_predict].sel(time=yr).plot.pcolormesh(
        ax=ax, cmap="coolwarm", norm=divnorm,
        cbar_kwargs={"label": var_to_predict})
    ax.set_title(f"Estimated {yr}")

In [None]:
# unit conversion if applicable (i.e. for precipitation)
# m_pred = m_pred.assign({var: m_pred[var] / 86400})

In [6]:
# save test predictions as .nc
# `output_path` is only the directory prefix (file names are appended to it
# everywhere else in this notebook), so a file name has to be appended here
# too. The naming follows the per-variable files written by the training loop.
file_tag = "dtr" if var_to_predict == "diurnal_temperature_range" else var_to_predict
m_pred.to_netcdf(output_path + "outputs_ssp245_predict_{}.nc".format(file_tag))

NameError: name 'm_pred' is not defined

# Run training and predictions for each target variable

In [None]:
# Targets to train and predict, one model per variable.
vars_to_predict = ['tas', 'diurnal_temperature_range', 'pr', 'pr90']

# Test inputs: historical file plus the ssp245 scenario file, loaded in memory.
X_test = xr.open_mfdataset([
    data_path + 'inputs_historical.nc',
    test_path + 'inputs_ssp245.nc',
]).compute()

# Rescale with the same fixed bounds as the training inputs.
normalized_inputs = {
    name: (X_test[name].dims, normalize(X_test[name].data, name, minmax_inputs))
    for name in ['CO2', 'CH4', 'SO2', 'BC']
}
X_test = X_test.assign(normalized_inputs)

# Cut into the overlapping windows the model expects.
X_test_np = input_for_training(X_test)

In [9]:
# The input windows do not depend on the target variable, so they are built
# once instead of being recomputed on every iteration.
X_train_all = np.concatenate([input_for_training(X_train_norm[i]) for i in range(len(simus))], axis=0)

for var_to_predict in vars_to_predict:

    print(var_to_predict)

    # Data: only the targets change from one variable to the next.
    Y_train_all = np.concatenate([output_for_training(Y_train_norm[i], var_to_predict) for i in range(len(simus))], axis=0)
    print(X_train_all.shape)
    print(Y_train_all.shape)

    # Model: reset Keras state and drop the previous model before rebuilding.
    keras.backend.clear_session()
    cnn_model = None

    cnn_model = Sequential()
    cnn_model.add(Input(shape=(slider, 96, 144, 4)))
    cnn_model.add(TimeDistributed(Conv2D(20, (3, 3), padding='same', activation='relu'), input_shape=(slider, 96, 144, 4)))
    cnn_model.add(TimeDistributed(AveragePooling2D(2)))
    cnn_model.add(TimeDistributed(GlobalAveragePooling2D()))
    cnn_model.add(LSTM(25, activation='relu'))
    cnn_model.add(Dense(1 * 96 * 144))
    cnn_model.add(Activation('linear'))
    cnn_model.add(Reshape((1, 96, 144)))

    cnn_model.compile(optimizer="rmsprop", loss="mse", metrics=["mse"])

    # `use_multiprocessing` applies to generator / Sequence inputs only, so it
    # is not passed for these in-memory arrays (this matches the
    # single-variable cell, where it is disabled).
    # NOTE(review): an earlier comment listed per-variable epoch counts
    # (25 for tas; 5 for pr_3, pr, dtr_2), but every variable currently trains
    # for 20 epochs - confirm which is intended.
    hist = cnn_model.fit(X_train_all,
                         Y_train_all,
                         batch_size=5,
                         epochs=20,
                         verbose=1)

    # Make predictions using trained model
    m_pred = cnn_model.predict(X_test_np)
    # Drop the singleton second axis: (window, 1, lat, lon) -> (window, lat, lon)
    m_pred = m_pred.reshape(m_pred.shape[0], m_pred.shape[2], m_pred.shape[3])
    # NOTE(review): output_for_training pairs window i with time index
    # i + slider - 1, whereas the labels below start at index `slider`, so the
    # time axis looks one step late - confirm, and fix together with the
    # windowing helpers.
    m_pred = xr.DataArray(m_pred, dims=['time', 'lat', 'lon'],
                          coords=[X_test.time.data[slider:], X_test.latitude.data, X_test.longitude.data])
    # unnormalize
    m_pred.data = unnormalize(m_pred.data, var_to_predict, minmax_outputs)
    xr_prediction = m_pred.transpose('lat', 'lon', 'time').sel(time=slice(2015, 2101)).to_dataset(name=var_to_predict)

    # Undo the x86400 rescaling applied to the precipitation targets at load time.
    if var_to_predict in ("pr", "pr90"):
        xr_prediction = xr_prediction.assign({var_to_predict: xr_prediction[var_to_predict] / 86400})

    # Save test predictions as .nc; the diurnal temperature range uses the
    # short tag "dtr". The placeholder is filled in on the file name only, so
    # braces in `output_path` can never be misread as format fields.
    file_tag = "dtr" if var_to_predict == "diurnal_temperature_range" else var_to_predict
    xr_prediction.to_netcdf(output_path + "outputs_ssp245_predict_{}.nc".format(file_tag), 'w')
    xr_prediction.close()


NameError: name 'vars_to_predict' is not defined