In [1]:
import dit
import os

import numpy as np
import pandas as pd
from collections import Counter

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import auc

from dit import ScalarDistribution
from tqdm import tqdm as tqdm

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from src.data_loader import Shifted_Data_Loader
import multiprocessing

Using TensorFlow backend.


In [2]:
# DL = Shifted_Data_Loader('fashion_mnist',rotation=None,translation=0.8,autoload=False,flatten=False)

In [3]:
proj_root = '/home/elijahc/projects/vae'
models_root = os.path.join(proj_root,'models')
dates = ['2019-01-{}'.format(n) for n in np.arange(10)+14]
paths = [os.path.join(models_root,d) for d in dates]
trans_amt = np.arange(10)/10
fa_10_iso_df = pd.read_pickle('../data/style_embeddings/fashion_mnist_isomap_10_neighbor.pk').set_index('test_idx').sort_index()
isos = fa_10_iso_df.isomap_dim_1.values

paths

['/home/elijahc/projects/vae/models/2019-01-14',
 '/home/elijahc/projects/vae/models/2019-01-15',
 '/home/elijahc/projects/vae/models/2019-01-16',
 '/home/elijahc/projects/vae/models/2019-01-17',
 '/home/elijahc/projects/vae/models/2019-01-18',
 '/home/elijahc/projects/vae/models/2019-01-19',
 '/home/elijahc/projects/vae/models/2019-01-20',
 '/home/elijahc/projects/vae/models/2019-01-21',
 '/home/elijahc/projects/vae/models/2019-01-22',
 '/home/elijahc/projects/vae/models/2019-01-23']

In [4]:
isos.shape

(10000,)

In [5]:
dit.shannon.entropy

<function dit.shannon.shannon.entropy(dist, rvs=None, rv_mode=None)>

In [6]:
z_encodings = np.array([np.load(os.path.join(p,'layer_activations','z_enc.npy')) for p in paths])
dense_1 = np.array([np.load(os.path.join(p,'layer_activations','dense_1.npy')) for p in paths])
dense_2 = np.array([np.load(os.path.join(p,'layer_activations','dense_2.npy')) for p in paths])

dxs = np.array([np.load(os.path.join(p,'layer_activations','dx.npy')) for p in paths])-14
dys = np.array([np.load(os.path.join(p,'layer_activations','dy.npy')) for p in paths])-14
cids = np.array([np.load(os.path.join(p,'layer_activations','y_train.npy')) for p in paths])
dfs = [pd.DataFrame.from_records({'dx':dxs[i],'dy':dys[i],'class_id':cids[i],'eccentricity':[tx]*10000}) for i,tx in enumerate(trans_amt) ]

In [7]:
# DLs = [Shifted_Data_Loader('fashion_mnist',flatten=False,autoload=False) for _ in np.arange(10)]

In [8]:
# from keras.models import Model
# classifiers = [Model(m.input,m.get_layer('class').output) for m in models]
# class_encodings = [c.predict]

In [9]:
sub_dfs = []
for cid in np.arange(10):
    c_idxs = fa_10_iso_df.class_id.values==cid
    subset_df = fa_10_iso_df[c_idxs]
    scaler = MinMaxScaler(feature_range=(-14,14))
    sc_isos = scaler.fit_transform(isos[c_idxs].reshape(-1,1)).flatten()
    subset_df['scaled_isomap_dim_1'] = sc_isos
    sub_dfs.append(subset_df)

fa_10_iso_df = pd.concat(sub_dfs,axis=0).sort_index()
iso = np.array([fa_10_iso_df.isomap_dim_1.values.tolist() for _ in np.arange(10)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [10]:
class RunResult():
    def __init__(self, z_raw, dx,dy,iso,class_id,dense_1,dense_2):
        self.z_raw = z_raw
        self.z_dim = z_raw.shape[-1]
        self.dx = dx
        self.dy = dy
        self.dense_1 = dense_1
        self.dense_2 = dense_2
        self.isomap_1D_raw = iso
        self.class_id = class_id
        
    def z_enc(self,feat_range=30):
        z_n = [self.z_raw[:,n] for n in np.arange(self.z_dim)]
        return [MinMaxScaler(feature_range=(0,feat_range)).fit_transform(nvec.reshape(-1,1)).flatten().astype(int) for nvec in z_n]
    
    def q_dense_2(self,feat_range=30):
        # Computes quantized activation levels of dense_1 across feat_range levels
        d_n = [self.dense_1[:,n] for n in np.arange(self.dense_2.shape[-1])]
        return [MinMaxScaler(feature_range=(0,feat_range)).fit_transform(nvec.reshape(-1,1)).flatten().astype(int) for nvec in d_n]
    
    def q_dense_1(self,feat_range=30):
        # Computes quantized activation levels of dense_1 across feat_range levels
        d_n = [self.dense_1[:,n] for n in np.arange(self.dense_1.shape[-1])]
        return [MinMaxScaler(feature_range=(0,feat_range)).fit_transform(nvec.reshape(-1,1)).flatten().astype(int) for nvec in d_n]
    
    def iso(self,feat_range=30):
        return MinMaxScaler(feature_range=(0,feat_range)).fit_transform(self.isomap_1D_raw.reshape(-1,1)).flatten().astype(int)
    
    def joint_dist(self,Y,X=None,n_cores=3,verbose=False):
        if X is None:
            if verbose:
                print('No X given, using z_enc as X')
            X = self.z_enc()
        if verbose:   
            print('spinning up {} cores...'.format(n_cores))
        pool = multiprocessing.Pool(processes=n_cores)
        
        pairs = [list(zip(n,Y)) for n in X]
        n_vec = pool.map(Counter,pairs)
        n_pmf = [{k:v/float(sum(C.values())) for k,v in C.items()} for C in n_vec]
        n_cdists = pool.map(dit.Distribution,n_pmf)
        
        pool.close()
#         pool.join()
    #     n_dists = [ScalarDistribution(d) for d in n_pmf]

        return n_cdists
    
    def entropy(self,X):
        jdists = self.z_enc_joint_dist(X)
        
        return [dit.shannon.entropy(d) for d in jdists]
    
    def mutual_info(self,Y,X=None,n_cores=3):
        # Calculates I(z_enc; X)
        if X is None:
            X = self.z_enc()
            
        jdists = self.joint_dist(Y,X,n_cores=n_cores)
        
        return [dit.shannon.mutual_information(d,[0],[1]) for d in jdists]
    
    def prior_layer_info(self,X,lname):
        {
            'z':self.dense_2,
            'dense_2': self.dense_1,
        }
    
    def conditional_entropy(self,X):
        jdists = self.z_enc_joint_dist(X)
        
        cond_H = [dit.shannon.mutual_information(d,[1],[0]) for d in jdists]
        
        return cond_H


# n,dx = make_joint_dists(z_encodings[3],dxs[3])

In [11]:
z_result_sets = [RunResult(z_encodings[i],dxs[i],dys[i],iso[i],cids[i],dense_1[i],dense_2[i]) for i in np.arange(10)]

In [12]:
rr = z_result_sets[3]

In [13]:
# qd2 = rr.q_dense_2(feat_range=30)

In [15]:
z_dx_I = [rr.mutual_info(rr.dx) for rr in tqdm(z_result_sets,desc='z_dx_I')]
z_dy_I = [rr.mutual_info(rr.dy) for rr in tqdm(z_result_sets,desc='z_dy_I')]
z_iso_I = [rr.mutual_info(rr.iso(feat_range=30)) for rr in tqdm(z_result_sets,desc='z_iso_I')]
z_class_I = [rr.mutual_info(rr.class_id) for rr in tqdm(z_result_sets,desc='z_class_I')]

z_dx_I: 100%|██████████| 10/10 [00:38<00:00,  4.09s/it]
z_dy_I: 100%|██████████| 10/10 [00:35<00:00,  3.85s/it]
z_iso_I: 100%|██████████| 10/10 [00:45<00:00,  4.51s/it]
z_class_I: 100%|██████████| 10/10 [00:36<00:00,  3.69s/it]


In [16]:
z_I_df = [pd.DataFrame.from_records({'dx':x,'dy':y,'style':i,'class':c}) for x,y,i,c in zip(z_dx_I,z_dy_I,z_iso_I,z_class_I)]
for df,tx in zip(z_I_df,trans_amt):
    df['translation']=tx
    df['xcov']=10

In [17]:
zI = pd.concat(z_I_df)

In [None]:
# This takes a long time, already done, just load them

# print('calculating mutual info for dx...')
d2_dx_I = [rr.mutual_info(rr.dx,X=rr.q_dense_2(),n_cores=8) for rr in tqdm(z_result_sets,desc='d2_dx_I')]

# print('calculating mutual info for dy...')
# d2_dy_I = [rr.mutual_info(rr.dy) for rr in d2_result_sets]

# print('calculating mutual info for style (Iso)...')
# d2_iso_I = [rr.mutual_info(rr.iso(feat_range=30)) for rr in d2_result_sets]

# print('calculating mutual info for class...')
# d2_class_I = [rr.mutual_info(rr.class_id) for rr in d2_result_sets]

d2_dx_I:   0%|          | 0/10 [00:00<?, ?it/s]Process ForkPoolWorker-127:
Process ForkPoolWorker-125:
Process ForkPoolWorker-123:
Process ForkPoolWorker-126:
Process ForkPoolWorker-124:
Process ForkPoolWorker-121:
Process ForkPoolWorker-128:
Process ForkPoolWorker-122:
Exception in thread Thread-131:
Traceback (most recent call last):
  File "/home/elijahc/.pyenv/versions/3.5.2/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/home/elijahc/.pyenv/versions/3.5.2/lib/python3.5/threading.py", line 862, in run
    self._target(*self._args, **self._kwargs)
  File "/home/elijahc/.pyenv/versions/3.5.2/lib/python3.5/multiprocessing/pool.py", line 429, in _handle_results
    task = get()
  File "/home/elijahc/.pyenv/versions/3.5.2/lib/python3.5/multiprocessing/connection.py", line 251, in recv
    return ForkingPickler.loads(buf.getbuffer())
_pickle.UnpicklingError: invalid load key, ''.

Traceback (most recent call last):


In [None]:
# d2_I_df = [pd.DataFrame.from_records({'dx':x,'dy':y,'style':i,'class':c}) for x,y,i,c in zip(d2_dx_I,d2_dy_I,d2_iso_I,d2_class_I)]
# for df,tx in zip(d2_I_df,trans_amt):
#     df['translation']=tx
#     df['xcov']=10

# for p,df in zip(paths,d2_I_df):
#     df.to_pickle(os.path.join(p,'d2_smi_df.pk'))

In [None]:
d2_I_df = [pd.read_pickle(os.path.join(p,'d2_smi_df.pk')) for p in paths]

In [None]:
d2_I_df[3].head()

In [None]:
# pd.concat(z_I_df).to_pickle('../data/style_embeddings/z_I.pk')

In [None]:
d2I = pd.concat(d2_I_df)
d2I.head()

In [None]:
sns.set_context('talk')
g = sns.PairGrid(d2I,hue='translation',vars=['class','dx','dy','style'],hue_order=[0.9,0.3,0.1],palette='GnBu_d',diag_sharey=False)

# g.map_diag(sns.countplot,)
g.map_offdiag(plt.scatter,s=15)
# g.set(ylim=(0, 1),xlim=(0,1),)
g.add_legend();

In [None]:
sns.set_context('talk')
g = sns.PairGrid(zI,hue='translation',vars=['class','dx','dy','style'],hue_order=[0.9,0.3,0.1],palette='GnBu_d',diag_sharey=False)

g.map_offdiag(plt.scatter,s=15)
# g.set(ylim=(0, 1),xlim=(0,1),)
g.add_legend();

In [None]:
fig,axs = plt.subplots(1,10,figsize=(30,3),
#                        sharey=True,sharex=True,
                      )

for df,ax in zip(d2_I_df,axs):
    sns.distplot(df['dx'],ax=ax,)

In [None]:
fig,axs = plt.subplots(1,10,figsize=(30,3),sharey=True,sharex=True)
# ax.set_ylim(-0.1,0.8)
# ax.set_xlim(-0.1,0.8)
df = d2_I_df[6]
points = []
for df,i in zip(d2_I_df,np.arange(10)):
    
    pts_0 = axs[i].scatter(df['dx'],df['dy'],c=df['class'],cmap='viridis',s=3)
#     pts_1 = axs[1,i].scatter(df['dx'],df['dy'],c=df['style'],cmap='viridis',s=3)
    if i == 0:
        axs[i].set_ylabel('class')
plt.colorbar(pts_0)
# sns.scatterplot(x='dx',y='dy',hue='class',data=z_I_df[5],palette='plasma',legend=False)

In [None]:
fig,axs = plt.subplots(1,10,figsize=(30,3),sharey=True,sharex=True)
# ax.set_ylim(-0.1,0.8)
# ax.set_xlim(-0.1,0.8)
df = d2_I_df[6]
points = []
for df,i in zip(d2_I_df,np.arange(10)):
    
    pts_0 = axs[i].scatter(df['dx'],df['dy'],c=df['style'],cmap='viridis',s=3)
#     pts_1 = axs[1,i].scatter(df['dx'],df['dy'],c=df['style'],cmap='viridis',s=3)
    if i == 0:
        axs[i].set_ylabel('class')
plt.colorbar(pts_0)

In [None]:
# z_I_df = [df['spatial_var']=tx for df,tx in zip(z_I_df,tx_vals)]

In [None]:
rr = result_set[5]

In [None]:
tx_vals = trans_amt
plt.plot(tx_vals,np.array(z_dx_I).mean(axis=1))
plt.plot(tx_vals,np.array(z_dy_I).mean(axis=1))
plt.plot(tx_vals,np.array(z_iso_I).mean(axis=1))
plt.plot(tx_vals,np.array(z_class_I).mean(axis=1))
plt.legend(['dx','dy','style','class'])
plt.xlabel('Spatial Variation')
plt.ylabel('Avg Mutual Info')

In [None]:
import seaborn as sns
sns.set_context('talk')
fig,axs = plt.subplots(4,10,sharex=True,sharey=True,figsize=(20,6))
for fx,fy,fisos,fclass,i in zip(z_dx_I,z_dy_I,z_iso_I,z_class_I,np.arange(10)):
    axs[0,i].scatter(np.arange(25),sorted(fx,reverse=True))
    axs[1,i].scatter(np.arange(25),sorted(fy,reverse=True))
    axs[2,i].scatter(np.arange(25),sorted(fisos,reverse=True))
    axs[3,i].scatter(np.arange(25),sorted(fclass,reverse=True))

    axs[0,0].set_ylabel('I(dX|Z)')
    axs[1,0].set_ylabel('I(dY|Z)')
    axs[2,0].set_ylabel('I(S|Z)')
    axs[3,0].set_ylabel('I(C|Z)')
    
    for ax in axs[3]:
        ax.set_xticks([])
plt.tight_layout()

plt.savefig('../figures/2019-01-28/unit_shanon_waterfall.png')

In [None]:
plt.plot(tx_vals,[auc(np.arange(25)/25.0,z_dx_I[i]) for i in np.arange(10)])
plt.plot(tx_vals,[auc(np.arange(25)/25.0,z_dy_I[i]) for i in np.arange(10)])
plt.plot(tx_vals,[auc(np.arange(25)/25.0,z_iso_I[i]) for i in np.arange(10)])
plt.plot(tx_vals,[auc(np.arange(25)/25.0,z_class_I[i]) for i in np.arange(10)])
plt.legend(['dx','dy','style','class'])
plt.xlabel('Spatial Variation')
plt.ylabel('AUC ()')
plt.tight_layout()
plt.savefig('../figures/2019-01-28/shannon_auc_vs_spatial_variation.png')