In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, RobertaModel, RobertaTokenizer, ViTModel, BlipProcessor, BlipForQuestionAnswering , CLIPProcessor, CLIPModel, get_linear_schedule_with_warmup, AutoModelForSequenceClassification, AutoModelForImageClassification, AutoImageProcessor
from transformers import BeitImageProcessor, BeitForImageClassification
import pickle 
import torch.nn as nn
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

from sklearn.model_selection import train_test_split
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader, Dataset
from torch.nn import CrossEntropyLoss
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor, Normalize

# Testing fusion approaches

In [None]:
# Create two random tensors of shape (6, 768) used as dummy inputs for the
# fusion experiments below. The generator is seeded so every later cell shows
# the same numbers after Restart Kernel -> Run All.
torch.manual_seed(0)
a = torch.randn(6, 768)
b = torch.randn(6, 768)

In [None]:
# Hadamard (element-wise) product of the two tensors; result keeps their shape
d = torch.mul(a, b)
d.shape

In [None]:
# Matrix product of a (6, 768) with b reshaped to (768, 6).
# NOTE(review): reshape(768, 6) re-reads b's values in row-major order; it has
# the right shape for matmul but is NOT b's transpose, so e differs from
# a @ b.T. Confirm which of the two was intended.
e = torch.matmul(a, b.reshape(768, 6))
e

In [None]:
# Pairwise dot products between the rows of a and the rows of b
a @ b.T

In [None]:
# Element-wise check that the * operator and Tensor.mul give the same values
torch.eq(a * b, d)

In [None]:
# Additive fusion keeps the operand shape
torch.add(a, b).shape

In [None]:
# 'ik,jk->ij' contracts the shared k axis: f[i, j] = sum_k a[i, k] * b[j, k].
# With a and b both (6, 768) this yields a (6, 6) matrix of row-by-row dot products.
f = torch.einsum('ik,jk->ij', a, b)
f

In [None]:
# f is the (6, 6) matrix of pairwise dot products, so the matching reference is
# a @ b.T. The element-wise product a * b is (6, 768) and cannot be broadcast
# against f, which made the original comparison raise. allclose is used rather
# than == because einsum and matmul may round float sums differently.
torch.allclose(f, torch.matmul(a, b.T), atol=1e-4)

In [None]:
# Compute the (6, 6) product a @ b.T via einsum, then rescale every row
# (dim=1) to unit L2 norm.
g = torch.einsum('ij,jk->ik', a, b.T)
g = torch.nn.functional.normalize(g, p=2, dim=1)
g

In [None]:
# Same einsum as for g, but against b.reshape(768, 6) instead of b.T
# (reshape re-reads b's values row-major, so it is not the transpose).
i = torch.einsum('ij,jk->ik', a, b.reshape(768, 6))
# Normalise i itself. The original passed g here, which discarded the einsum
# result above and simply re-normalised g.
i = torch.nn.functional.normalize(i, p=2, dim=1)
i

In [None]:
# Outer product of the first row of each tensor: h[m, n] = a[0, m] * b[0, n],
# giving a (768, 768) matrix. torch.ger is a deprecated alias of torch.outer,
# so the supported name is used.
h = torch.outer(a[0], b[0])
h.shape

In [None]:
from transformers import ViTImageProcessor, ViTForImageClassification
from PIL import Image
import requests

# Alternative image source (COCO sample fetched over HTTP), kept for reference:
#url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
#image = Image.open(requests.get(url, stream=True).raw)

# Local sample image; the path is relative to the notebook's working directory.
image = Image.open('./images/332177888.jpg')

processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
model = AutoModel.from_pretrained('google/vit-base-patch16-224')
# Ask the model to return the per-layer hidden states as well.
model.config.output_hidden_states = True

inputs = processor(images=image, return_tensors="pt")
# Feature extraction only: disable autograd so no graph is built or retained.
with torch.no_grad():
    outputs = model(**inputs)

In [None]:
# Display the processor output that is passed to the model
inputs

In [None]:
# Probe whether this config exposes a per-stage `hidden_sizes` attribute.
# Only a missing attribute is the expected failure; a bare `except:` would also
# swallow KeyboardInterrupt and genuine bugs, so the handler is narrowed.
try:
    print(model.config.hidden_sizes)
except AttributeError:
    print("hehe")

In [None]:
# Shape of the model's final hidden states
outputs.last_hidden_state.shape

In [None]:
# Same tensor with dim 0 squeezed out (squeeze only removes it when its size is 1)
outputs.last_hidden_state.squeeze(0).shape

In [None]:
# Position 0 along dim 1 for every batch item
# (presumably the [CLS] token embedding - TODO confirm)
outputs.last_hidden_state[:, 0, :].shape

In [None]:
# Row 0 after squeezing dim 0; matches the previous selection only for a
# single-image batch (assumed here - TODO confirm)
outputs.last_hidden_state.squeeze(0)[0, :].shape

In [None]:
# Run the processor again and keep only its pixel tensor
tmp = processor(images=image, return_tensors="pt").pixel_values
# To drop the leading batch dim (expected result 3 x 224 x 224 - TODO confirm):
# tmp = tmp.squeeze(0)
tmp.shape

In [None]:
from transformers import AutoImageProcessor, ResNetForImageClassification
from PIL import Image
import requests
import torch

# Sample image fetched over HTTP from the COCO val2017 set
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)

images = [image, image]  # Batch size 2

# Processor and model are both loaded from the microsoft/resnet-50 checkpoint
processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
model = AutoModel.from_pretrained("microsoft/resnet-50")
# Ask the model to return its intermediate hidden states as well
model.config.output_hidden_states = True

# Preprocess the two-image batch into PyTorch tensors
inputs = processor(images, return_tensors="pt")

# TODO: add pooling

In [None]:
# Display the full configuration of the currently loaded model
model.config

In [None]:
# The model loaded above is ResNet, whose config has no scalar `hidden_size`;
# it lists one channel width per stage in `hidden_sizes` instead, so reading
# `hidden_size` raised AttributeError.
model.config.hidden_sizes

In [None]:
# Forward pass for feature extraction only: run under no_grad so no autograd
# graph is built for the batch.
with torch.no_grad():
    states = model(**inputs).last_hidden_state
states.shape

In [None]:
# Feature map of the first batch item (all remaining dims kept)
states[0].shape

In [None]:
from transformers import MobileViTFeatureExtractor, MobileViTForImageClassification
from PIL import Image
import requests

# Sample image fetched over HTTP from the COCO val2017 set
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Processor and model are both loaded from the apple/mobilevit-small checkpoint
feature_extractor = AutoImageProcessor.from_pretrained("apple/mobilevit-small")
model = AutoModel.from_pretrained("apple/mobilevit-small")

inputs = feature_extractor(images=image, return_tensors="pt")

# Feature extraction only: disable autograd so no graph is built or retained.
with torch.no_grad():
    outputs = model(**inputs)
# TODO - add pooling

In [None]:
# Shape of the final hidden states from the most recent forward pass
outputs.last_hidden_state.shape

In [None]:
# Display the full configuration of the currently loaded model
model.config

In [None]:
from transformers import BeitImageProcessor, BeitForImageClassification
from PIL import Image
import requests

# Sample image fetched over HTTP from the COCO val2017 set
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)

processor = BeitImageProcessor.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
model = BeitForImageClassification.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
# Needed so that `outputs.hidden_states` is populated by the forward pass.
model.config.output_hidden_states = True

inputs = processor(images=image, return_tensors="pt")
# Inference only: disable autograd so no graph is built or retained.
with torch.no_grad():
    outputs = model(**inputs)

In [None]:
# Display everything the classification model returned
outputs

In [None]:
# Shape of the last entry in the returned hidden_states sequence
outputs.hidden_states[-1].shape

In [None]:
# Shape of the preprocessed pixel tensor that was fed to the model
inputs.pixel_values.shape

In [None]:
# Hidden width recorded in the loaded model's config
model.config.hidden_size

In [None]:
import torch
import torch.nn as nn

# Sizes of the dummy input: (BATCH_SIZE, HIDDEN_SIZE, X, Y)
BATCH_SIZE, HIDDEN_SIZE = 10, 256
X = Y = 5

# Random 4-D tensor with exactly those dimensions
input_tensor = torch.randn((BATCH_SIZE, HIDDEN_SIZE, X, Y))

# Define a new layer that will transform the input tensor to the desired shape
class TransformLayer(nn.Module):
    """Collapse a (batch, channels, H, W) tensor to (batch, channels).

    Every channel is averaged over its spatial positions with
    ``nn.AdaptiveAvgPool2d((1, 1))``; the two singleton spatial dims are then
    flattened away. The layer has no learnable parameters.
    """

    def __init__(self):
        super().__init__()
        # Adaptive pooling to 1x1 works for any spatial size.
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))

    def forward(self, x):
        """Return the per-channel spatial mean of ``x``.

        Args:
            x: tensor of shape (batch, channels, H, W).

        Returns:
            Tensor of shape (batch, channels).
        """
        x = self.avg_pool(x)
        # Sizes are taken from the input itself rather than from the notebook
        # globals BATCH_SIZE / HIDDEN_SIZE, so any batch or channel count works.
        return x.flatten(start_dim=1)

# Instantiate the pooling-based TransformLayer defined above
transform_layer = TransformLayer()

# Run the dummy input through it
output_tensor = transform_layer(input_tensor)

# Show the shapes before and after the transformation
print(f"Input tensor shape: {input_tensor.shape}")
print(f"Output tensor shape: {output_tensor.shape}")

# Inline check: the spatial dims must be gone, leaving (BATCH_SIZE, HIDDEN_SIZE)
assert output_tensor.shape == (BATCH_SIZE, HIDDEN_SIZE), "The output tensor does not have the correct shape."

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Sizes of the dummy input: (BATCH_SIZE, HIDDEN_SIZE, X, Y)
BATCH_SIZE, HIDDEN_SIZE = 10, 256
X = Y = 5

# Random 4-D tensor with exactly those dimensions
input_tensor = torch.randn((BATCH_SIZE, HIDDEN_SIZE, X, Y))

# Define a new layer that will transform the input tensor to the desired shape
class TransformLayer(nn.Module):
    """Collapse a (batch, channels, H, W) tensor to (batch, channels) with a
    learned weighting of the spatial positions.

    The H*W positions of each channel are flattened and projected to a single
    value by one ``nn.Linear(H*W, 1)`` that is shared across all channels.

    Note: this reuses the name of the pooling-based ``TransformLayer`` defined
    in an earlier cell and therefore replaces it once this cell has run.

    Args:
        hidden_size: number of channels. Accepted for interface compatibility;
            the linear layer is shared across channels, so it is not needed to
            size any parameter.
        x: spatial height of the expected input.
        y: spatial width of the expected input.
    """

    def __init__(self, hidden_size, x, y):
        super().__init__()
        # Number of spatial positions fed to the linear layer per channel
        self.num_features = x * y
        self.linear = nn.Linear(self.num_features, 1)

    def forward(self, x):
        """Project every channel's spatial grid to one scalar.

        Args:
            x: tensor of shape (batch, channels, H, W) with H*W equal to
                ``self.num_features``.

        Returns:
            Tensor of shape (batch, channels).
        """
        # Merge H and W. Sizes come from the input rather than from the
        # notebook globals, so any batch size / channel count is accepted.
        x = x.flatten(start_dim=2)
        print(f"Shape after reshaping: {x.shape}")
        # nn.Linear acts on the last dim: (batch, channels, H*W) -> (batch, channels, 1)
        x = self.linear(x)
        print(f"Shape after linear layer: {x.shape}")
        # Drop the trailing singleton dim
        return x.squeeze(-1)

# Instantiate the linear-projection TransformLayer for an X-by-Y spatial grid
transform_layer = TransformLayer(HIDDEN_SIZE, X, Y)

# Run the dummy input through it
output_tensor = transform_layer(input_tensor)

# Show the shapes before and after the transformation
print(f"Input tensor shape: {input_tensor.shape}")
print(f"Output tensor shape: {output_tensor.shape}")

# Inline check: the spatial dims must be gone, leaving (BATCH_SIZE, HIDDEN_SIZE)
assert output_tensor.shape == (BATCH_SIZE, HIDDEN_SIZE), "The output tensor does not have the correct shape."

In [None]:
import torch
import torch.nn as nn

# Sizes of the dummy input: (BATCH_SIZE, X, HIDDEN_SIZE)
BATCH_SIZE, HIDDEN_SIZE = 10, 256
X = 5

# Random 3-D tensor with exactly those dimensions
input_tensor = torch.randn((BATCH_SIZE, X, HIDDEN_SIZE))

# Define a new layer that will transform the input tensor to the desired shape using a pooling layer
class PoolingTransformLayer(nn.Module):
    """Collapse a (batch, X, hidden_size) tensor to (batch, hidden_size) by
    averaging over the X dimension.

    Args:
        hidden_size: size of the last dimension of the expected input.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        # AdaptiveAvgPool1d pools over the LAST dim, hence the transpose in forward.
        self.avg_pool = nn.AdaptiveAvgPool1d(1)

    def forward(self, x):
        """Return the mean of ``x`` over dim 1.

        Args:
            x: tensor of shape (batch, X, hidden_size).

        Returns:
            Tensor of shape (batch, hidden_size).
        """
        x = x.transpose(1, 2)  # (batch, hidden_size, X)
        x = self.avg_pool(x)   # (batch, hidden_size, 1)
        # Batch size is read from the input instead of the notebook global, so
        # any batch size works; hidden_size is still enforced by the view.
        return x.view(x.size(0), self.hidden_size)

# Instantiate the average-pooling layer for HIDDEN_SIZE features
pooling_transform_layer = PoolingTransformLayer(HIDDEN_SIZE)

# Run the dummy input through it
output_tensor_pooling = pooling_transform_layer(input_tensor)

# Show the shapes before and after the transformation
print(f"Input tensor shape: {input_tensor.shape}")
print(f"Output tensor shape (pooling): {output_tensor_pooling.shape}")


In [None]:
import torch
import torch.nn as nn

# Sizes of the dummy input: (BATCH_SIZE, X, HIDDEN_SIZE)
BATCH_SIZE, HIDDEN_SIZE = 10, 256
X = 5

# Random 3-D tensor with exactly those dimensions
input_tensor = torch.randn((BATCH_SIZE, X, HIDDEN_SIZE))

# Average over dim 1 (the X axis) without any pooling module
output_tensor_pooling = torch.mean(input_tensor, dim=1)

# Show the shapes before and after the reduction
print(f"Input tensor shape: {input_tensor.shape}")
print(f"Output tensor shape (pooling): {output_tensor_pooling.shape}")

In [None]:
import torch
import torch.nn as nn

# Sizes of the dummy input: (BATCH_SIZE, X, HIDDEN_SIZE)
BATCH_SIZE, HIDDEN_SIZE = 10, 256
X = 5

# Random 3-D tensor with exactly those dimensions
input_tensor = torch.randn((BATCH_SIZE, X, HIDDEN_SIZE))

# Define a new layer that will transform the input tensor to the desired shape using a linear layer
class LinearTransformLayer(nn.Module):
    """Collapse a (batch, X, hidden_size) tensor to (batch, hidden_size) with a
    single learned projection.

    The X and hidden dims are flattened into one vector of length
    ``x * hidden_size`` which is mapped to ``hidden_size`` by one ``nn.Linear``.

    Args:
        hidden_size: size of the last input dimension and of the output.
        x: size of the middle input dimension.
    """

    def __init__(self, hidden_size, x):
        super().__init__()
        self.hidden_size = hidden_size
        self.linear = nn.Linear(x * hidden_size, hidden_size)

    def forward(self, x):
        """Flatten all non-batch dims and project them.

        Args:
            x: tensor of shape (batch, X, hidden_size).

        Returns:
            Tensor of shape (batch, hidden_size).
        """
        # Batch size is read from the input rather than the notebook global.
        # reshape (unlike view) also accepts non-contiguous inputs such as the
        # result of a transpose.
        x = x.reshape(x.size(0), -1)
        return self.linear(x)

# Instantiate the linear-projection layer for inputs of shape (batch, X, HIDDEN_SIZE)
linear_transform_layer = LinearTransformLayer(HIDDEN_SIZE, X)

# Run the dummy input through it
output_tensor_linear = linear_transform_layer(input_tensor)

# Show the shapes before and after the transformation
print(f"Input tensor shape: {input_tensor.shape}")
print(f"Output tensor shape (linear): {output_tensor_linear.shape}")

# Inline check: the X dim must be gone, leaving (BATCH_SIZE, HIDDEN_SIZE)
assert output_tensor_linear.shape == (BATCH_SIZE, HIDDEN_SIZE), "The output tensor does not have the correct shape."

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Dimensions shared by both dummy inputs
BATCH_SIZE, HIDDEN_SIZE = 10, 256
X = Y = 5

# One 4-D tensor (BATCH_SIZE, HIDDEN_SIZE, X, Y) and one 3-D tensor (BATCH_SIZE, X, HIDDEN_SIZE)
input_tensor_4d = torch.randn((BATCH_SIZE, HIDDEN_SIZE, X, Y))
input_tensor_3d = torch.randn((BATCH_SIZE, X, HIDDEN_SIZE))

# Define a general function to transform the input tensor to the desired shape using a pooling layer
def transform_tensor_pooling(input_tensor):
    """Average-pool a 3-D or 4-D tensor down to two dimensions.

    * 4-D input (batch, channels, H, W): mean over H and W, giving
      (batch, channels).
    * 3-D input (batch, length, hidden): mean over ``length``, giving
      (batch, hidden).

    The pooled shape is printed before the singleton dims are removed.

    Raises:
        ValueError: if the input is neither 3-D nor 4-D.
    """
    shape = input_tensor.shape
    rank = len(shape)
    if rank not in (3, 4):
        raise ValueError("Unsupported tensor shape")

    if rank == 4:
        features = shape[1]
        pooled = F.adaptive_avg_pool2d(input_tensor, (1, 1))
    else:
        features = shape[2]
        # 1-D pooling reduces the last dim, so move `length` there first
        pooled = F.adaptive_avg_pool1d(input_tensor.transpose(1, 2), 1)

    print("Shape after pool: " + str(pooled.shape))
    return pooled.view(shape[0], features)

# Define a general function to transform the input tensor to the desired shape using a linear layer
def transform_tensor_linear(input_tensor, linear_layer=None):
    """Reduce a 3-D or 4-D tensor to two dimensions with a linear projection.

    * 4-D input (batch, channels, H, W): each channel's H*W values are
      projected to one scalar, giving (batch, channels). The expected layer is
      ``nn.Linear(H * W, 1)``.
    * 3-D input (batch, length, hidden): the flattened ``length * hidden``
      vector is projected to ``hidden``, giving (batch, hidden). The expected
      layer is ``nn.Linear(length * hidden, hidden)``.

    Args:
        input_tensor: the 3-D or 4-D tensor to reduce.
        linear_layer: optional projection to apply. When omitted, a new,
            randomly initialised ``nn.Linear`` of the right size is created on
            every call (the original behaviour); such a layer is discarded on
            return, so pass a persistent layer when the weights must be
            trained or the output must be repeatable.

    Returns:
        The projected 2-D tensor.

    Raises:
        ValueError: if the input is neither 3-D nor 4-D.
    """
    tensor_shape = input_tensor.shape

    if len(tensor_shape) == 4:
        batch, channels, height, width = tensor_shape
        if linear_layer is None:
            linear_layer = nn.Linear(height * width, 1)
        # reshape (unlike view) also handles non-contiguous inputs
        flat = input_tensor.reshape(batch, channels, -1)
        return linear_layer(flat).squeeze(-1)

    if len(tensor_shape) == 3:
        batch, length, hidden = tensor_shape
        if linear_layer is None:
            linear_layer = nn.Linear(length * hidden, hidden)
        return linear_layer(input_tensor.reshape(batch, -1))

    raise ValueError("Unsupported tensor shape")

# Example usage: run both dummy inputs through each helper and print the
# resulting shapes (pooling variant first, then the linear variant).
print(transform_tensor_pooling(input_tensor_4d).shape)
print(transform_tensor_pooling(input_tensor_3d).shape)

print(transform_tensor_linear(input_tensor_4d).shape)
print(transform_tensor_linear(input_tensor_3d).shape)