In [1]:
import fiftyone as fo
import torchvision
import os 
import torch
from PIL import Image
import numpy as np
import fiftyone.brain as fob
from sklearn.neighbors import NearestNeighbors


## Create Dataset

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
samples = []

data_root2 = '../Data/Snickers_Real_Image/10/'
data_root1 = '../Data/Snickers_Digital_Twin/10/'

folder1 = os.listdir(data_root1)
folder2 = os.listdir(data_root2)


for count, file in enumerate(folder1):
    if count >len(folder2):
        break
    sample = fo.Sample(filepath = os.path.join(data_root1,file))
    label = 'Synthetic'
    sample["ground_truth"] = fo.Classification(label=label)
    samples.append(sample)

for count, file in enumerate(folder2):
    sample = fo.Sample(filepath = os.path.join(data_root2,file))
    label = 'Real'
    sample["ground_truth"] = fo.Classification(label=label)
    samples.append(sample)

# Create dataset
dataset = fo.Dataset("my-classification-dataset")
dataset.add_samples(samples)

 100% |█████████████████| 281/281 [126.0ms elapsed, 0s remaining, 2.3K samples/s]  


['6261aa7a5d9990c51a3ea3b9',
 '6261aa7a5d9990c51a3ea3bb',
 '6261aa7a5d9990c51a3ea3bc',
 '6261aa7a5d9990c51a3ea3bf',
 '6261aa7a5d9990c51a3ea3c0',
 '6261aa7a5d9990c51a3ea3c1',
 '6261aa7a5d9990c51a3ea3c2',
 '6261aa7a5d9990c51a3ea3c7',
 '6261aa7a5d9990c51a3ea3c8',
 '6261aa7a5d9990c51a3ea3c9',
 '6261aa7a5d9990c51a3ea3ca',
 '6261aa7a5d9990c51a3ea3cb',
 '6261aa7a5d9990c51a3ea3cc',
 '6261aa7a5d9990c51a3ea3cd',
 '6261aa7a5d9990c51a3ea3ce',
 '6261aa7a5d9990c51a3ea3d7',
 '6261aa7a5d9990c51a3ea3d8',
 '6261aa7a5d9990c51a3ea3d9',
 '6261aa7a5d9990c51a3ea3da',
 '6261aa7a5d9990c51a3ea3db',
 '6261aa7a5d9990c51a3ea3dc',
 '6261aa7a5d9990c51a3ea3dd',
 '6261aa7a5d9990c51a3ea3de',
 '6261aa7a5d9990c51a3ea3df',
 '6261aa7a5d9990c51a3ea3e0',
 '6261aa7a5d9990c51a3ea3e1',
 '6261aa7a5d9990c51a3ea3e2',
 '6261aa7a5d9990c51a3ea3e3',
 '6261aa7a5d9990c51a3ea3e4',
 '6261aa7a5d9990c51a3ea3e5',
 '6261aa7a5d9990c51a3ea3e6',
 '6261aa7a5d9990c51a3ea3f7',
 '6261aa7a5d9990c51a3ea3f8',
 '6261aa7a5d9990c51a3ea3f9',
 '6261aa7a5d99

## Image loading 

In [3]:
def pil_loader(path: str) -> Image.Image:
    with open(path, 'rb') as f:
        img = Image.open(f)
        return img.convert('RGB')
    
def get_image(path,transform):
    
    x = pil_loader(path)
    x = transform(x)
    return(x)

## Image Normalization

In [4]:
transform = torchvision.transforms.Compose([ 
    torchvision.transforms.Resize(size = (256,256)),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
])

## Setup pre-trained model for generating embeddings

In [5]:
model = torchvision.models.resnet50(pretrained=True).to(device)

model.eval()

class SaveFeatures():
    features=None
    def __init__(self, m): 
        self.hook = m.register_forward_hook(self.hook_fn)
        self.features = None
    def hook_fn(self, module, input, output): 
        out = output.detach()#.cpu().numpy()
        self.features = out.flatten()

    def remove(self): 
        self.hook.remove()
        
hook = SaveFeatures(model.avgpool)


def model_embedding(model, images):
    ## renset50
    _ = model(images)
    return hook.features


## Generate Embeddings

In [6]:
model.eval()
n_features = 2048
num_images = len(dataset.values('filepath'))

with torch.no_grad():
    
    embedding = torch.zeros(num_images, n_features)
    index_count = 0 
    for index, (f, label_dict) in enumerate(zip(dataset.values('filepath'), dataset.values('ground_truth'))):
      
        images = get_image(f, transform)
        images = images.unsqueeze(0).to(device)  
        
        out = model_embedding(model, images.to(device))
        embedding[index,:] = (out)

        

## umap Dimensionality reduction

In [7]:
results = fob.compute_visualization(
    dataset,
    embeddings=embedding.detach().cpu().numpy(),
    num_dims=2,
    method="umap",
    brain_key="None",
    verbose=True,
    seed=51,
)


Generating visualization...
UMAP(random_state=51, verbose=True)
Thu Apr 21 12:03:43 2022 Construct fuzzy simplicial set



The TBB threading layer requires TBB version 2019.5 or later i.e., TBB_INTERFACE_VERSION >= 11005. Found TBB_INTERFACE_VERSION = 9107. The TBB threading layer is disabled.



Thu Apr 21 12:03:43 2022 Finding Nearest Neighbors
Thu Apr 21 12:03:45 2022 Finished Nearest Neighbor Search
Thu Apr 21 12:03:47 2022 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

Thu Apr 21 12:03:49 2022 Finished embedding


## Voxel 51 visualization

In [8]:
session = fo.launch_app(view=dataset.view())
plot = results.visualize(labels="ground_truth.label")
plot.show(height=720)
session.plots.attach(plot)





FigureWidget({
    'data': [{'customdata': array(['6261aa7a5d9990c51a3ea4c5', '6261aa7a5d9990c51a3ea4c6',
    …

## Calculate data overlap

In [9]:
y = np.array([int('Real' in d['label']) for d in dataset.values('ground_truth')])
X = results.points
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(X)
nearest_class = []

## find nearest points to Synthetic data
for data in X[y==0]:
    distances, indices = nbrs.kneighbors(data.reshape(1, -1))
    nearest_class.append(y[indices[0][1]])    

n_real = np.sum([int(x==1) for x in nearest_class])
n_fake = np.sum([int(x==0) for x in nearest_class])


ns = (len(nearest_class))
z = 0
x = n_fake/len(nearest_class)
a = x
b = z
c = np.sqrt(ns)
p_val = (-np.sqrt(-4*a**2*b**2*c**2+4*a*b**2*c**2 + b**4) + 2*a*c**2 + b**2) / (2*(b**2+c**2))

nd = len(y)
nq = (y==1).sum() 
data_overlap = -(p_val-1) * (nd / nq)
print('data overlap = %f' %data_overlap)

data overlap = 0.384347
