In [1]:
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
import torch
import os
import pandas as pd 
from torchvision import datasets, transforms, models
from torch.utils.data import Dataset
from mftma.manifold_analysis_correlation import manifold_analysis_corr
from mftma.utils.make_manifold_data import make_manifold_data
from mftma.utils.activation_extractor import extractor
from mftma.utils.analyze_pytorch import analyze
import getpass
import mat73
import scipy
from datetime import datetime
from collections import defaultdict
import scipy.io as sio   
from scipy.io import loadmat
class CFAR100_fake_dataset(Dataset):
    """Synthetic, CIFAR-shaped dataset read from a MATLAB ``ops_out`` struct.

    ``data_dir`` is passed straight to the ``.mat`` readers, so it is the path
    of the file itself.  The file must contain a struct named ``ops_out`` with
    the fields ``data``, ``Adjacency`` and ``class_id``.

    Attributes set while loading:
        vals:      the raw ``ops_out`` container returned by the reader.
        adj:       ``ops_out['Adjacency']``.
        dat:       first 3*32*32 columns of ``data`` reshaped to (-1, 3, 32, 32).
        target:    ``class_id`` transposed and shifted down by one, as float64.
        n_samples: size of the first axis of ``dat``.
    """

    def __init__(self, data_dir=None):
        self.data_dir = data_dir
        self.dat, self.target = self.load_data()
        self.n_samples = self.dat.shape[0]

    def __len__(self):
        """Return the number of samples."""
        return self.n_samples

    def __getitem__(self, idx):
        """Return ``(sample, target)``: a float tensor and the squeezed label."""
        item = self.dat[idx]
        targ = np.squeeze(self.target[idx])
        return (torch.tensor(item, dtype=torch.float), targ)

    def load_data(self):
        """Read ``ops_out`` and return ``(samples, targets)``.

        The scipy reader is tried first and mat73 is used if it fails.  Also
        stores the raw struct on ``self.vals`` and the adjacency field on
        ``self.adj``.

        Raises:
            ValueError: if ``data`` has fewer than 3*32*32 columns.
        """
        try:
            annot = loadmat(self.data_dir)
            # scipy wraps a MATLAB struct in a 1x1 array; unwrap the record.
            vals = annot['ops_out'][0, 0]
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C /
            # SystemExit raised during a long load are not swallowed and
            # followed by a second load attempt.
            data_dict = mat73.loadmat(self.data_dir)
            vals = data_dict['ops_out']
        dat = vals['data']
        self.vals = vals
        self.adj = vals['Adjacency']

        n_features = 3 * 32 * 32
        if dat.shape[1] < n_features:
            raise ValueError(
                "expected at least {} feature columns, found {}".format(
                    n_features, dat.shape[1]))
        # Keep only the image-sized prefix of each row, then fold it into
        # (channels, height, width).
        dat_new = np.reshape(dat[:, :n_features], (-1, 3, 32, 32))
        # class_id is stored one-based; shift it to zero-based labels.
        target = np.double(np.transpose(vals['class_id']) - 1.0)
        return dat_new, target
    
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

ModuleNotFoundError: No module named 'scipy.linalg'

ModuleNotFoundError: No module named 'scipy.linalg'

In [None]:
# Pick the per-user locations of the trained models (save_dir) and of the
# synthetic datasets (data_dir).  Both keep their trailing separator because
# the checkpoint path is later built by plain string concatenation.
user = getpass.getuser()
print(user)
known_dirs = {
    'eghbalhosseini': (
        '/Users/eghbalhosseini/MyData/neural_manifolds/VGG16_training_on_synthetic/',
        '/Users/eghbalhosseini/MyData/neural_manifolds/VGG16_training_on_synthetic/',
    ),
    'ehoseini': (
        '/om/user/ehoseini/MyData/neural_manifolds/VGG16_training_on_synthetic/',
        '/om/user/ehoseini/MyData/neural_manifolds/',
    ),
}
if user in known_dirs:
    save_dir, data_dir = known_dirs[user]
else:
    # An unrecognised user used to leave both names undefined, which only
    # surfaced as a NameError in a later cell.  Fall back to the working
    # directory and say so, so a missing file is reported with a real path.
    save_dir = data_dir = os.path.join(os.getcwd(), '')
    print("Unknown user {!r}: using {} for save_dir and data_dir".format(user, save_dir))

#data_file='synthtree_nobj_50000_nclass_50_nfeat_3072_norm_1.mat'

# 1. Partition dataset

In [None]:
# Load the synthetic "partition" dataset from data_dir.
data_file = 'synthpartition_nobj_50000_nclass_50_nfeat_3072_norm_1.mat'
train_dataset = CFAR100_fake_dataset(
    data_dir=os.path.join(data_dir, data_file),
)

# Draw the samples used for the manifold analysis.
# NOTE(review): presumably examples_per_class examples for each of
# sampled_classes classes -- confirm against mftma's make_manifold_data.
sampled_classes = 50
examples_per_class = 25
data = make_manifold_data(
    train_dataset,
    sampled_classes,
    examples_per_class,
    max_class=50,
    seed=0,
)
# Move every sampled entry onto the compute device.
data = [samples.to(device) for samples in data]

## Load model and extract activations

In [None]:
# Rebuild the checkpoint path from the metadata stored with the dataset.
# ``vals`` is either a numpy struct record (scipy reader) or a dict (mat73
# reader); both support key access but not attribute access, and the scipy
# reader wraps scalars/strings in size-1 arrays, so unwrap with ``.item()``.
meta = train_dataset.vals
structure = str(np.asarray(meta['structure']).item())
n_class = int(np.asarray(meta['n_class']).item())
exm_per_class = int(np.asarray(meta['exm_per_class']).item())
model_save_path = (save_dir + 'VGG16_synthdata_' + structure
                   + '_nclass_' + str(n_class)
                   + '_n_exm_' + str(exm_per_class))

model = models.vgg16(num_classes=50)
# NOTE(review): torch.load unpickles the checkpoint; only load trusted files.
model.load_state_dict(torch.load(model_save_path, map_location=device))
model = model.to(device)
model = model.eval()

# Collect the activations of the convolutional and linear layers.
activations = extractor(model, data, layer_types=['Conv2d', 'Linear'])
list(activations.keys())

In [None]:
# Sanity check: shape of one entry of the extracted input-layer activations.
# NOTE(review): the meaning of the two [1] indices depends on the extractor's
# output layout -- confirm against mftma.
activations['layer_0_Input'][1][1].shape

In [None]:
# Settings for the saved training-epoch file read in the following cell.
train_dir = "/Users/eghbalhosseini/Desktop/"
epoch_id, layer_num = 1, 0


In [None]:
# Re-select the compute device: CUDA when available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# NOTE(review): os.path.join discards every component that precedes an
# absolute one, so while train_dir is an absolute path save_dir has no effect
# on the result -- confirm which location is intended.
datafile = os.path.join(save_dir, train_dir, 'train_epoch_{}'.format(epoch_id))

# NOTE(review): read_pickle unpickles the file; only use it on trusted files.
epoch_dat = pd.read_pickle(datafile)

In [None]:
# Flatten each layer's activations and shrink very wide layers with a random
# projection.  The loop variable is deliberately not called ``data`` so the
# notebook-level ``data`` (the sampled inputs) is not overwritten.
# NOTE: this cell rewrites ``activations`` in place and is not idempotent --
# running it twice transposes the already-flattened arrays again.
projection_dim = 5000
for layer, layer_acts in list(activations.items()):
    # Each entry becomes a 2-D array: rows are the flattened trailing axes,
    # columns index the entry's first axis.
    X = [d.reshape(d.shape[0], -1).T for d in layer_acts]
    # Get the number of features in the flattened data
    N = X[0].shape[0]
    # If N is greater than projection_dim, project down to that many features
    if N > projection_dim:
        print("Projecting {}".format(layer))
        M = np.random.randn(projection_dim, N)
        # Normalise each projection row to unit length.
        M /= np.sqrt(np.sum(M*M, axis=1, keepdims=True))
        X = [np.matmul(M, d) for d in X]
    activations[layer] = X

In [None]:
# Per-layer summary statistics, appended in the iteration order of
# ``activations``.
capacities = []
radii = []
dimensions = []
correlations = []

for k, X in activations.items():
    # Analyze each layer's activations.
    # NOTE(review): the positional 0 and 300 are presumably the margin and the
    # number of sampled directions -- confirm against manifold_analysis_corr.
    a, r, d, r0, K = manifold_analysis_corr(X, 0, 300, n_reps=1)

    # Summarise across manifolds: harmonic mean for capacity, arithmetic mean
    # for radius and dimension.
    a = 1/np.mean(1/a)
    r = np.mean(r)
    d = np.mean(d)
    # ``{:.4f}`` gives four decimals; the former ``{:4f}`` only set a minimum
    # field width and left the default six decimals.
    print("{} capacity: {:.4f}, radius {:.4f}, dimension {:.4f}, correlation {:.4f}".format(k, a, r, d, r0))

    # Store for later
    capacities.append(a)
    radii.append(r)
    dimensions.append(d)
    correlations.append(r0)

In [None]:
# One panel per summary statistic, plotted against layer order.
fig, axes = plt.subplots(1, 4, figsize=(18, 4))

panels = [
    (capacities, r'$\alpha_M$'),
    (radii, r'$R_M$'),
    (dimensions, r'$D_M$'),
    (correlations, r'$\rho_{center}$'),
]
for ax, (series, label) in zip(axes, panels):
    ax.plot(series, linewidth=5)
    ax.set_ylabel(label, fontsize=18)

# Tick labels are the second and third '_'-separated parts of each layer key.
names = []
for key in activations.keys():
    parts = key.split('_')
    names.append(parts[1] + ' ' + parts[2])

tick_positions = list(range(len(names)))
for ax in axes:
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(names, rotation=90, fontsize=16)
    ax.tick_params(axis='both', which='major', labelsize=14)

plt.tight_layout()
plt.show()

# 2. Tree dataset

In [None]:
# Load the synthetic "tree" dataset from data_dir.
data_file = 'synthtree_nobj_50000_nclass_50_nfeat_3072_norm_1.mat'
train_dataset = CFAR100_fake_dataset(
    data_dir=os.path.join(data_dir, data_file),
)

# Draw the samples used for the manifold analysis.
# NOTE(review): presumably examples_per_class examples for each of
# sampled_classes classes -- confirm against mftma's make_manifold_data.
sampled_classes = 50
examples_per_class = 50
data = make_manifold_data(
    train_dataset,
    sampled_classes,
    examples_per_class,
    max_class=50,
    seed=0,
)
# Move every sampled entry onto the compute device.
data = [samples.to(device) for samples in data]

In [None]:
# Rebuild the checkpoint path from the metadata stored with the dataset.
# ``vals`` is either a numpy struct record (scipy reader) or a dict (mat73
# reader); both support key access but not attribute access, and the scipy
# reader wraps scalars/strings in size-1 arrays, so unwrap with ``.item()``.
meta = train_dataset.vals
structure = str(np.asarray(meta['structure']).item())
n_class = int(np.asarray(meta['n_class']).item())
exm_per_class = int(np.asarray(meta['exm_per_class']).item())
model_save_path = (save_dir + 'VGG16_synthdata_' + structure
                   + '_nclass_' + str(n_class)
                   + '_n_exm_' + str(exm_per_class))

model = models.vgg16(num_classes=50)
# NOTE(review): torch.load unpickles the checkpoint; only load trusted files.
model.load_state_dict(torch.load(model_save_path, map_location=device))
model = model.to(device)
model = model.eval()

# Collect the activations of the convolutional and linear layers.
activations = extractor(model, data, layer_types=['Conv2d', 'Linear'])
list(activations.keys())

In [None]:
# Flatten each layer's activations and shrink very wide layers with a random
# projection.  The loop variable is deliberately not called ``data`` so the
# notebook-level ``data`` (the sampled inputs) is not overwritten.
# NOTE: this cell rewrites ``activations`` in place and is not idempotent --
# running it twice transposes the already-flattened arrays again.
projection_dim = 5000
for layer, layer_acts in list(activations.items()):
    # Each entry becomes a 2-D array: rows are the flattened trailing axes,
    # columns index the entry's first axis.
    X = [d.reshape(d.shape[0], -1).T for d in layer_acts]
    # Get the number of features in the flattened data
    N = X[0].shape[0]
    # If N is greater than projection_dim, project down to that many features
    if N > projection_dim:
        print("Projecting {}".format(layer))
        M = np.random.randn(projection_dim, N)
        # Normalise each projection row to unit length.
        M /= np.sqrt(np.sum(M*M, axis=1, keepdims=True))
        X = [np.matmul(M, d) for d in X]
    activations[layer] = X

In [None]:
# Per-layer summary statistics, appended in the iteration order of
# ``activations``.
capacities = []
radii = []
dimensions = []
correlations = []

for k, X in activations.items():
    # Analyze each layer's activations.
    # NOTE(review): the positional 0 and 300 are presumably the margin and the
    # number of sampled directions -- confirm against manifold_analysis_corr.
    a, r, d, r0, K = manifold_analysis_corr(X, 0, 300, n_reps=1)

    # Summarise across manifolds: harmonic mean for capacity, arithmetic mean
    # for radius and dimension.
    a = 1/np.mean(1/a)
    r = np.mean(r)
    d = np.mean(d)
    # ``{:.4f}`` gives four decimals; the former ``{:4f}`` only set a minimum
    # field width and left the default six decimals.
    print("{} capacity: {:.4f}, radius {:.4f}, dimension {:.4f}, correlation {:.4f}".format(k, a, r, d, r0))

    # Store for later
    capacities.append(a)
    radii.append(r)
    dimensions.append(d)
    correlations.append(r0)

In [None]:
# One panel per summary statistic, plotted against layer order.
fig, axes = plt.subplots(1, 4, figsize=(18, 4))

panels = [
    (capacities, r'$\alpha_M$'),
    (radii, r'$R_M$'),
    (dimensions, r'$D_M$'),
    (correlations, r'$\rho_{center}$'),
]
for ax, (series, label) in zip(axes, panels):
    ax.plot(series, linewidth=5)
    ax.set_ylabel(label, fontsize=18)

# Tick labels are the second and third '_'-separated parts of each layer key.
names = []
for key in activations.keys():
    parts = key.split('_')
    names.append(parts[1] + ' ' + parts[2])

tick_positions = list(range(len(names)))
for ax in axes:
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(names, rotation=90, fontsize=16)
    ax.tick_params(axis='both', which='major', labelsize=14)

plt.tight_layout()
plt.show()