In [1]:
# Iman Wahle
# August 2020
# Adapted from CFL_ElNino_Demonstration.ipynb by Krzysztof Chalupka
# This notebook walks through aggregating microlevel climate data to
# macrolevel features as causal hypotheses for future testing. The neural
# network used for density learning has been adapted from the original
# implementation to use tensorflow. 

In [2]:
import os
import sys
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.colors as colors

import core_ml_tf as cmt # new tf backend
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [3]:
# Grid shape of our images, stored here for plotting reference: each flattened
# map (55 * 9 = 495 values per row) is reshaped to this shape before it is drawn.
imshape = (55, 9)

The Dataset
-----------

The following code loads our data into the correct format:
* X -- numpy array of size (n_datapoints, n_input_dim). Each row corresponds to one input value.
* Y -- numpy array of size (n_datapoints, n_output_dim). Each row is one output value.

In our case, each row of X is a (flattened) map of Pacific zonal wind strength, and each row of Y is a (flattened) map of Pacific water temperature over the same region. Here n_input_dim == n_output_dim, but this need not be the case in general.

In [4]:
# Load the data.
## PLUG YOUR OWN DATA HERE. 'coords' is only needed to display climate maps; your data
## needs to contain only the X and Y arrays.
# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary code --
# only load data files that come from a source you trust.
X, Y, coords = joblib.load('elnino_data.pkl')

# Create a randomized, normalized training and validation set.
# The scalers are fit on the full dataset; they are reused later to map the
# standardized arrays back to raw units for plotting.
x_scaler = StandardScaler().fit(X)
y_scaler = StandardScaler().fit(Y)

# Fix the shuffle seed so that the train/validation split -- and therefore the row
# order of np.vstack([X_tr, X_ts]) used further down -- is identical on every run.
SPLIT_SEED = 0
X_tr, X_ts, Y_tr, Y_ts = train_test_split(
    X, Y, shuffle=True, train_size=0.85, random_state=SPLIT_SEED)

# Standardize every split with the same scalers and cast to float32 for the network.
X_tr = x_scaler.transform(X_tr).astype('float32')
Y_tr = y_scaler.transform(Y_tr).astype('float32')
X_ts = x_scaler.transform(X_ts).astype('float32')
Y_ts = y_scaler.transform(Y_ts).astype('float32')

  


In [5]:
# Report the dimensions of the raw input and output arrays.
print('X shape:', X.shape)
print('Y.shape:', Y.shape)

X shape: (13140, 495)
Y.shape: (13140, 495)


Learning P(Y | X)
--------------------
The first step of Causal Feature Learning (CFL) is to cluster x's according to their conditional densities P(Y | x). In this demonstration, for simplicity, we will approximate P(Y | x) with its expected value E[Y | x]. This means we assume that if two conditional distributions have equal means, they themselves are equal. It is possible to efficiently relax this assumption by using Mixture Density Networks (Bishop 1995) to approximate all moments of a distribution.

Learning E[Y | x] amounts to regressing y on x. We do this using a neural network implemented in TensorFlow (the `core_ml_tf` module imported above).

In [None]:
# Make sure the checkpoint directory exists before training starts writing into it.
# NOTE(review): it is not visible from this notebook whether cmt.train_network creates
# the directory itself; this call is a no-op when the directory is already there.
os.makedirs('checkpoints', exist_ok=True)

# Fit the regression network on the standardized training split, using the held-out
# split for validation. Presumably the '{}' in save_fname is filled in with the epoch
# number (a later cell loads 'checkpoints/epoch_2400') -- confirm in core_ml_tf.
cmt.train_network(X_tr, Y_tr, X_ts, Y_ts,
                  save_fname='checkpoints/epoch_{}',
                  n_epochs=3001,
                  lr=1e-3, verbose=True)

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
nn_input_layer (InputLayer)  [(None, 495)]             0         
_________________________________________________________________
nn_dropout1 (Dropout)        (None, 495)               0         
_________________________________________________________________
nn_dense1 (Dense)            (None, 1024)              507904    
_________________________________________________________________
nn_dropout2 (Dropout)        (None, 1024)              0         
_________________________________________________________________
nn_layer2 (Dense)            (None, 1024)              1049600   
_________________________________________________________________
nn_dropout3 (Dropout)        (None, 1024)              0         
_________________________________________________________________
nn_output_layer (Dense)      (None, 495)               5073

In [None]:
# Rebuild the network with input/output widths taken from the data, then restore
# the weights stored in the 'epoch_2400' checkpoint.
n_input_dim, n_output_dim = X.shape[1], Y.shape[1]
model = cmt.get_model(n_input_dim, n_output_dim)
model.load_weights('checkpoints/epoch_2400')

Finding the Observational Partition of X
-----------------------------------------
Finding the observational partition of X amounts to clustering the X data according to P(Y | X). That is, we put two x's in the same bucket if the neural net we trained maps them to similar values; in practice we run k-means on the network's predictions.

In [None]:
N_CLASSES = 4

# The network was trained on the standardized arrays, so it must be evaluated on
# standardized inputs as well -- not on the raw X. Stacking [train, validation] also
# puts the rows in the same order as the np.vstack([Y_tr, Y_ts]) /
# np.vstack([X_tr, X_ts]) arrays that x_lbls is used to index in the cells below
# (train_test_split shuffles, so this order differs from the row order of X).
X_all = np.vstack([X_tr, X_ts])
yhat = model.predict(X_all)

# Cluster the predicted E[Y | x]; x_lbls[i] is the class of row i of X_all.
# `n_jobs` is no longer accepted by KMeans (deprecated in scikit-learn 0.23, removed
# in 1.0); a fixed random_state keeps the class ids stable between runs.
x_lbls = KMeans(n_clusters=N_CLASSES, n_init=10, random_state=0).fit_predict(yhat)

Finding the Observational Partition of Y
---------------------------------------------
To find the partition of Y, we want to put together all y1 and y2 if P(y1 | x) == P(y2 | x) for each x. This procedure is a little bit more interesting than clustering the x's, and is described in our UAI 2016 paper. We describe it briefly here.

We've already clustered x's in a way that guarantees that if x1 and x2 belong to the same x_lbls class, then P(y | x1) == P(y | x2). Thus, the requirement P(y1 | x) == P(y2 | x) can be substituted by P(y1 | x_lbls==O) == P(y2 | x_lbls==O) for any observational X-class O. Since x_lbls is discrete, we should have plenty of data for each x_lbls class. We will approximate P(y | x_lbls==O) using the mean squared distance from y to the four closest (excluding itself) Y-points whose corresponding x's belong to x_lbls==O.

In [None]:
# Stack the standardized targets once, in [train, validation] order. (Rebuilding this
# array inside the loops would copy the whole dataset for every (y, class) pair.)
Y_all = np.vstack([Y_tr, Y_ts])
n_points = Y_all.shape[0]

# The class ids and the rows belonging to each class do not change while we loop,
# so compute them up front. Y_by_class[k] holds the y's whose x has label x_classes[k].
# NOTE: this is only meaningful if x_lbls[i] is the label of the x paired with Y_all[i].
x_classes = np.unique(x_lbls)
Y_by_class = [Y_all[x_lbls == x_lbl] for x_lbl in x_classes]

# y_ftrs[i, k]: how close Y_all[i] lies to the y's observed under X-class k.
y_ftrs = np.zeros((n_points, x_classes.size))
# Loop, not vectorized, to save memory. Can take a while.
for y_id, y in enumerate(Y_all):
    if y_id % 100 == 0:
        sys.stdout.write('\rComputing P(y | x_lbls) features, iter {}/{}...'.format(y_id, n_points))
        sys.stdout.flush()
    for x_lbl_id, Y_class in enumerate(Y_by_class):
        # Squared Euclidean distances from y to every y in this class, ascending.
        sorted_dists = np.sort(np.sum((y - Y_class)**2, axis=1))
        # Mean distance to the 4 closest points, skipping the very closest one
        # (which is y itself, at distance 0, when y's own x belongs to this class).
        # NOTE(review): when y's own x is NOT in this class, the skipped point is a
        # genuine neighbour rather than y -- kept as in the original procedure.
        y_ftrs[y_id][x_lbl_id] = sorted_dists[1:5].mean()
print('Done. Clustering P(y | x_lbls).')

# `n_jobs` is no longer accepted by KMeans (deprecated in scikit-learn 0.23, removed
# in 1.0); a fixed random_state keeps the class ids stable between runs.
y_lbls = KMeans(n_clusters=N_CLASSES, n_init=10, random_state=0).fit_predict(y_ftrs)

Understanding the Results
--------------------------
Visualizing the observational partition is data-specific. In our case, since both X and Y are images, we can visualize the means of each observational cluster to gain some insight into what it contains. 

But, one of the virtues of the method is that it is interpretation-agnostic. The observational partition can be used as a causal hypothesis to drive experimentation. This can be done whether the inputs and outputs are easily interpretable or not at all.

In [None]:
def plot_cluster_means(data_raw, labels, column, levels, cmap, title):
    """Draw one contour map per cluster in a single column of the current figure.

    Each panel shows the cluster's mean map minus the mean over all rows of
    `data_raw`, reshaped to `imshape` and transposed for plotting.

    Args:
        data_raw: array of shape (n_datapoints, n_dim), one flattened map per row.
        labels: cluster id of every row of `data_raw`.
        column: column of the (N_CLASSES, 2) subplot grid to draw into.
        levels: contour levels passed to `contourf`.
        cmap: name of the matplotlib colormap.
        title: text prefixed to the cluster id in every panel title.
    """
    overall_mean = data_raw.mean(axis=0)
    for cluster_id in range(N_CLASSES):
        ax = plt.subplot2grid((N_CLASSES, 2), (cluster_id, column))
        # Plot the cluster's mean difference from all frames' mean.
        cluster_mean = (data_raw[labels == cluster_id].mean(axis=0) - overall_mean).reshape(imshape).T
        ax.contourf(coords['x'].ravel(), coords['y'].ravel(), cluster_mean, levels=levels, cmap=cmap)
        ax.set_title('{} cluster {}'.format(title, cluster_id))
        ax.set_xticks([]); ax.set_yticks([])


fig = plt.figure(figsize=(15,10), facecolor='white')

# Undo the standardization so that the maps are shown in the data's original units.
X_raw = x_scaler.inverse_transform(np.vstack([X_tr, X_ts]))
Y_raw = y_scaler.inverse_transform(np.vstack([Y_tr, Y_ts]))

# Left column: X (wind) clusters. Right column: Y (temperature) clusters.
plot_cluster_means(X_raw, x_lbls, column=0, levels=np.linspace(-0.5,0.5,30),
                   cmap='BrBG_r', title='Wind')
plot_cluster_means(Y_raw, y_lbls, column=1, levels=np.linspace(-1,1.5,30),
                   cmap='coolwarm', title='Temperature')

# Compute and print P(y_lbl | x_lbl)
# Row i of P_CE counts how often each y-class co-occurs with the i-th x-class.
P_CE = np.array([np.bincount(y_lbls.astype(int)[x_lbls==x_lbl],
    minlength=y_lbls.max()+1).astype(float) for x_lbl in np.sort(np.unique(x_lbls))])
# Counts -> joint distribution -> row-normalize to condition on the x-class.
P_CE = P_CE/P_CE.sum()
P_E_given_C = P_CE/P_CE.sum(axis=1, keepdims=True)

print('P(TempCluster | WindCluster):')
print(P_E_given_C)