In [1]:
#install editable version of this repository
!python -m pip install -e .

In [12]:
import os 
import sys
from importlib import reload

# Import Pytorch & Huggingface ==========================
# Set the cache directories before torch / datasets are imported below.
# Cluster-specific absolute paths.
# NOTE(review): none of these four variables is read by any cell visible in this notebook;
# cache_dir below comes from the `configuration` module instead — confirm whether they are still needed.
functions_path="/orfeo/scratch/area/evillegas/glm/glm_analysis"
huggingface_path="/orfeo/cephfs/scratch/area/evillegas/.cache/"
environment_path="/orfeo/cephfs/scratch/area/evillegas/glm/dgxtorch/bin/activate"
home_path="/orfeo/scratch/area/evillegas/"


# cache_dir is taken from the local `configuration` module (its `huggingface_path` attribute),
# not from the `huggingface_path` variable assigned above.
from configuration import huggingface_path as cache_dir
# To override the cache location manually, assign cache_dir here instead.
# Point the Hugging Face, Torch and XDG caches at sub-paths of cache_dir.
os.environ['HF_HOME'] = f"{cache_dir}/huggingface"
os.environ['TRANSFORMERS_CACHE']= f"{cache_dir}/huggingface/hub"
os.environ['TORCH_HOME']= f"{cache_dir}/torch"
os.environ['XDG_CACHE_HOME']= cache_dir

#imports 
import torch
from torch.cuda import memory_allocated, empty_cache
from datasets import load_dataset
from torch.utils.data import DataLoader

# Import local functions ==================================
import activation_extractor
# Reload BEFORE the star import: importlib.reload() does not rebind names that were
# already copied into this namespace by `from ... import *`, so star-importing first
# would leave those names pointing at the pre-reload objects.
reload(activation_extractor)
from activation_extractor import *

<module 'activation_extractor' from '/orfeo/cephfs/scratch/area/evillegas/glm/activation-extractor/src/activation_extractor/__init__.py'>

In [13]:
# Import the package submodules and reload each one so that source edits to the
# editable install are picked up without restarting the kernel.
from activation_extractor.extractors import intermediateExtractorBase 
reload(intermediateExtractorBase)
from activation_extractor.inferencers import inferencerBase
reload(inferencerBase)
from activation_extractor.model_functions import load_models, tokenize_funs, inference_funs, default_hooked_layers
reload(load_models)
reload(tokenize_funs)
reload(inference_funs)
# The return value of this last reload (the module object) is what the cell displays.
reload(default_hooked_layers)

<module 'activation_extractor.model_functions.default_hooked_layers' from '/orfeo/cephfs/scratch/area/evillegas/glm/activation-extractor/src/activation_extractor/model_functions/default_hooked_layers.py'>

# Proteins/DNA

In [15]:
# Toy input: two homopolymer sequences (11 x "A" and 10 x "C")
sequences = [
    "AAAAAAAAAAA",
    "CCCCCCCCCC",
]

In [5]:
# Model identifier for the DNA example
model_name = "InstaDeepAI/nucleotide-transformer-500m-human-ref"
# Build the inferencer on the CUDA device with half=False
inferencer = activation_extractor.Inferencer(
    model_name,
    half=False,
    device='cuda',
)

In [17]:
#intermediate activation extractor
layers_to_hook = activation_extractor.get_layers_to_hook(inferencer.model,inferencer.model_type)
extractor = activation_extractor.IntermediateExtractor(inferencer.model, layers_to_hook)
extractor.register_hooks()

try:
    #inference
    processed = inferencer.process(sequences) #tokenize
    outputs = inferencer.inference(processed)
finally:
    # Always detach the hooks, even when tokenization or inference raises;
    # otherwise they stay attached to inferencer.model for every later cell.
    extractor.clear_all_hooks()

#extractor outputs
#extractor.save_outputs('results/embeddings/test')

In [4]:
# Display what the extractor returns from get_outputs() for the DNA run.
extractor.get_outputs()

# Images 

In [8]:
from io import BytesIO

from PIL import Image
import requests

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Bound the wait and fail loudly on an HTTP error instead of handing PIL an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
# Decode from an in-memory buffer so the image does not depend on an open network stream.
image = Image.open(BytesIO(response.content))

In [4]:
# Model identifier for the image example
model_name = "timm/vgg16.tv_in1k"
# Build the inferencer on the CUDA device with half=False
inferencer = activation_extractor.Inferencer(
    model_name,
    half=False,
    device='cuda',
)

In [5]:
#intermediate activation extractor
# Uses the package-level API, exactly like the Proteins/DNA section
# (no cell in this notebook imports a module named `intermediateExtractor`).
layers_to_hook = activation_extractor.get_layers_to_hook(inferencer.model,inferencer.model_type)
extractor = activation_extractor.IntermediateExtractor(inferencer.model, layers_to_hook)
extractor.register_hooks()

try:
    #inference
    processed = inferencer.process(image)
    outputs = inferencer.inference(processed)
finally:
    # Always detach the hooks, even when preprocessing or inference raises;
    # otherwise they stay attached to inferencer.model for every later cell.
    extractor.clear_all_hooks()

#extractor outputs
#extractor.save_outputs('results/embeddings/test')

In [6]:
# Display what the extractor returns from get_outputs() for the image run.
extractor.get_outputs()

# Multimodal

In [19]:
from io import BytesIO

from PIL import Image
import requests

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Bound the wait and fail loudly on an HTTP error instead of handing PIL an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
# Decode from an in-memory buffer so the image does not depend on an open network stream.
image = Image.open(BytesIO(response.content))
text = ["a photo of a cat", "a photo of a dog"]

# Paired text + image input passed to inferencer.process() in the multimodal example.
input_data = {"text":text,
             "image":image}

In [23]:
# Model identifier for the multimodal example
model_name = "openai/clip-vit-base-patch32"
# Build the inferencer on the CUDA device with half=False
inferencer = activation_extractor.Inferencer(
    model_name,
    half=False,
    device='cuda',
)

In [24]:
#intermediate activation extractor
# Uses the package-level API, exactly like the Proteins/DNA section
# (no cell in this notebook imports a module named `intermediateExtractor`).
layers_to_hook = activation_extractor.get_layers_to_hook(inferencer.model,inferencer.model_type)
extractor = activation_extractor.IntermediateExtractor(inferencer.model, layers_to_hook)
extractor.register_hooks()

try:
    #inference
    processed = inferencer.process(input_data)
    outputs = inferencer.inference(processed)
finally:
    # Always detach the hooks, even when preprocessing or inference raises;
    # otherwise they stay attached to inferencer.model for every later cell.
    extractor.clear_all_hooks()

#extractor outputs
#extractor.save_outputs('results/embeddings/test')

In [7]:
# Display what the extractor returns from get_outputs() for the multimodal run.
extractor.get_outputs()

# Inference Over a Dataset

In [5]:
from activation_extractor.scripts.inference import main_inference, load_the_data

In [2]:
# --- Run configuration: DNA example ---
model_name = "InstaDeepAI/nucleotide-transformer-500m-human-ref"
output_folder = "test"
emb_format = "mean"
save_method = "numpy"
max_batches = 1

# Dataset settings for the DNA run
data_args = {
    "data_type": "dna",
    "target_col": "sequence",
    "batch_size": 4,
    "data_source": "huggingface",
    "dataset_name": "InstaDeepAI/human_reference_genome",
    "dataset_partition": "validation",
    # Local-file alternative to the two Hugging Face entries above:
    #   "data_source": "local",
    #   "input_path": "test.csv",
    "max_length": 999,
}


In [3]:
# --- Run configuration: image example ---
model_name = "facebook/convnext-tiny-224"
output_folder = "test"
emb_format = "mean"
save_method = "numpy"
max_batches = 1

# Dataset settings for the image run
data_args = {
    "data_type": "image",
    "target_col": "img",
    "batch_size": 1,
    "data_source": "huggingface",
    "dataset_name": "uoft-cs/cifar100",
    "dataset_partition": "train",
    # Local-file alternative to the two Hugging Face entries above:
    #   "data_source": "local",
    #   "input_path": "test.csv",
    "max_length": 999,
}


In [18]:
def collate_pil(batch):
    """
    Convert a batch of PIL images to numpy arrays.

    Args:
        batch: iterable of images (anything ``np.array`` accepts).

    Returns:
        list of ``numpy.ndarray``, one per input item, in input order.
    """
    # The converted list must be returned; without the return every caller received None.
    return [np.array(img) for img in batch]

In [34]:
# First ten training images as numpy arrays.
# Index the row first, then the column: dataset["train"]["img"][i] would load the whole
# "img" column on every iteration just to pick one element.
ds = [np.array(dataset["train"][i]["img"]) for i in range(10)]

In [36]:
# Shape of the first element of ds (the saved output shows (32, 32, 3)).
ds[0].shape

(32, 32, 3)

In [60]:
from torchvision.datasets import CIFAR100
import torchvision.transforms as transforms

In [64]:
# Preprocessing pipeline: image -> PyTorch tensor, then per-channel normalisation
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(
            (0.5071, 0.4867, 0.4408),
            (0.2675, 0.2565, 0.2761),
        ),
    ]
)

# CIFAR-100 training split, stored under ./data and downloaded if not already present
train_dataset = CIFAR100(
    transform=transform,
    download=True,
    train=True,
    root='./data',
)

# Shuffled batches of 64 samples, loaded with 2 worker subprocesses
train_loader = DataLoader(
    train_dataset,
    num_workers=2,
    shuffle=True,
    batch_size=64,
)

Files already downloaded and verified


In [45]:
# Unshuffled loader over train_dataset: two samples per batch, default collation
data_loader = DataLoader(
    train_dataset,
    collate_fn=None,
    shuffle=False,
    batch_size=2,
)

In [23]:
from datasets import load_dataset
from torch.utils.data import DataLoader
import numpy as np

# Hugging Face copy of CIFAR-100
dataset = load_dataset("uoft-cs/cifar100")
# Unshuffled loader over the train split: two items per batch, collated by collate_pil
data_loader = DataLoader(
    dataset["train"],
    collate_fn=collate_pil,
    shuffle=False,
    batch_size=2,
)

In [7]:
# Build the data loader through the package helper, passing data_args as keyword arguments.
data_loader = load_the_data(**data_args)

In [67]:
# Pull a single batch from train_loader and display it.
# NOTE(review): the bare `batch` below prints the full tensors (see the saved output);
# showing only shapes would keep the notebook lighter.
batch = next(iter(train_loader))
batch

[tensor([[[[ 0.2153,  0.5819,  0.2593,  ...,  1.3588,  1.3002,  1.1829],
           [ 0.1714,  0.4646,  0.2447,  ...,  1.2269,  1.0656,  1.1096],
           [ 0.0101, -0.0339, -0.1072,  ...,  0.5965,  0.6552,  0.9630],
           ...,
           [-1.1480, -1.1334, -1.2360,  ..., -1.2507, -1.1041, -1.0894],
           [-1.1774, -1.1627, -1.2653,  ..., -1.2067, -1.0747, -0.6496],
           [-1.2213, -1.2213, -1.2946,  ..., -0.1951,  0.3473,  1.0510]],
 
          [[-0.1240,  0.4417,  0.0901,  ...,  1.4661,  1.4966,  1.2979],
           [-0.1393,  0.2735,  0.0442,  ...,  1.2979,  1.1297,  1.1603],
           [-0.2157, -0.3074, -0.3380,  ...,  0.3958,  0.4111,  0.7781],
           ...,
           [-1.2706, -1.2553, -1.3624,  ..., -1.4235, -1.4082, -1.4541],
           [-1.3165, -1.2859, -1.3776,  ..., -1.4694, -1.4388, -0.8731],
           [-1.3471, -1.3471, -1.4235,  ..., -0.3686,  0.3041,  1.2214]],
 
          [[-0.0199,  0.4914,  0.2073,  ...,  1.5708,  1.5992,  1.3436],
           [-

In [4]:
# Run the inference script entry point with the run configuration variables and data_args.
# NOTE(review): the saved output of this cell (convnext-tiny-224 run) ends in
# "ValueError: Invalid image type ... got <class 'NoneType'>", i.e. this call did not complete.
main_inference(model_name, output_folder, emb_format, save_method, max_batches, data_args)

Output folder is: test/facebook/convnext-tiny-224/mean


ValueError: Invalid image type. Expected either PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray, but got <class 'NoneType'>.

In [29]:
# Shape of the first training image. Row-first indexing fetches one image
# instead of loading the whole "img" column to read a single element.
# NOTE(review): assumes `ds` is a Hugging Face DatasetDict at this point — `ds` is rebound by other cells.
np.array(ds["train"][0]["img"]).shape

(32, 32, 3)

In [9]:
from datasets import load_dataset

# Load the MS COCO 2017 URL/text dataset.
# NOTE(review): this rebinds `ds`, which other cells use for different objects.
ds = load_dataset("ChristophSchuhmann/MS_COCO_2017_URL_TEXT")

Downloading data:   0%|          | 0.00/18.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/591753 [00:00<?, ? examples/s]

In [12]:
# URL of the first training example. Row-first indexing reads a single row
# instead of loading the whole 'URL' column of the train split.
ds['train'][0]['URL']

'http://images.cocodataset.org/train2017/000000391895.jpg'