In [1]:
import whisper 
import matplotlib.pyplot as plt
import seaborn as sns
from torch import Tensor, nn
from typing import Dict, Iterable, Optional
import torch.nn.functional as F
import torch
import numpy as np
from torch_intermediate_layer_getter import IntermediateLayerGetter as MidGetter

In [2]:
model1 = whisper.load_model("base")

# load audio and pad/trim it to fit 30 seconds
audio1 = whisper.load_audio("a_gisela.mp3")
audio1 = whisper.pad_or_trim(audio1)

# make log-Mel spectrogram and move to the same device as the model
mel1 = whisper.log_mel_spectrogram(audio1).to(model1.device)

selected_layers = {
    'blocks.0': 'blocks.0',
    'blocks.1': 'blocks.1'
}

mid_getter = MidGetter(model1.encoder, return_layers=selected_layers, keep_output=True)
mel1 = torch.reshape(mel1, (1, 80, 3000))
mid_outputs1, model_output1 = mid_getter(mel1)
print(mid_outputs1['blocks.0'].size(), mid_outputs1['blocks.0'])
print(mid_outputs1['blocks.1'].size(), mid_outputs1['blocks.1'])

torch.Size([1, 1500, 512]) tensor([[[-0.1221,  0.3127,  0.4795,  ...,  0.5934,  0.9347,  0.3327],
         [ 0.9969,  1.4110,  0.7859,  ...,  0.6107,  0.6294,  0.5011],
         [ 1.1947,  1.5941,  0.5548,  ...,  0.7294,  0.5479,  0.5069],
         ...,
         [ 0.9574, -0.4389,  0.5143,  ...,  0.0867,  0.2856,  0.3189],
         [ 0.5062,  0.2554,  0.3975,  ...,  0.0898,  0.3056,  0.3125],
         [-0.1968,  1.0142,  0.8037,  ...,  0.1604,  0.3955,  0.3482]]],
       grad_fn=<AddBackward0>)
torch.Size([1, 1500, 512]) tensor([[[-0.4058,  0.2471,  0.3003,  ...,  0.3183,  0.8087,  0.8048],
         [ 0.8013,  1.2728,  0.8933,  ...,  0.5245,  0.6384,  0.6156],
         [ 1.0496,  1.4448,  0.6581,  ...,  0.6079,  0.6210,  0.6813],
         ...,
         [ 0.8739, -0.5718,  0.6698,  ...,  0.1327,  0.1670,  0.3734],
         [ 0.4980,  0.1949,  0.6519,  ...,  0.0632,  0.1781,  0.4272],
         [-0.3434,  1.1119,  1.5858,  ...,  0.2205,  0.2329,  0.8794]]],
       grad_fn=<AddBackward0>)


In [3]:
model2 = whisper.load_model("base")

# load audio and pad/trim it to fit 30 seconds
audio2 = whisper.load_audio("a_bram.mp3")
audio2 = whisper.pad_or_trim(audio2)

# make log-Mel spectrogram and move to the same device as the model
mel2 = whisper.log_mel_spectrogram(audio2).to(model2.device)

selected_layers = {
    'blocks.0': 'blocks.0',
    'blocks.1': 'blocks.1'
}

mid_getter = MidGetter(model2.encoder, return_layers=selected_layers, keep_output=True)
mel2 = torch.reshape(mel2, (1, 80, 3000))
mid_outputs2, model_output1 = mid_getter(mel2)
print(mid_outputs2['blocks.0'].size(), mid_outputs2['blocks.0'])
print(mid_outputs2['blocks.1'].size(), mid_outputs2['blocks.1'])

torch.Size([1, 1500, 512]) tensor([[[-0.1563,  0.3312,  0.4908,  ...,  0.6978,  0.9326,  0.2687],
         [ 0.9851,  1.4307,  0.8256,  ...,  0.7265,  0.6520,  0.4039],
         [ 1.1762,  1.5970,  0.6187,  ...,  0.8394,  0.6291,  0.4031],
         ...,
         [ 0.9504, -0.4161,  0.5163,  ...,  0.0962,  0.2683,  0.2886],
         [ 0.4936,  0.2754,  0.4065,  ...,  0.1024,  0.2941,  0.2839],
         [-0.2025,  1.0311,  0.7810,  ...,  0.1786,  0.3837,  0.3372]]],
       grad_fn=<AddBackward0>)
torch.Size([1, 1500, 512]) tensor([[[-0.4080,  0.2444,  0.3422,  ...,  0.3989,  0.7763,  0.7941],
         [ 0.8298,  1.2738,  0.9891,  ...,  0.6430,  0.6292,  0.5122],
         [ 1.0802,  1.4241,  0.7343,  ...,  0.7464,  0.6629,  0.6262],
         ...,
         [ 0.8745, -0.5507,  0.6610,  ...,  0.1279,  0.1456,  0.3485],
         [ 0.4821,  0.2120,  0.6462,  ...,  0.0647,  0.1625,  0.4041],
         [-0.3639,  1.1347,  1.5000,  ...,  0.2270,  0.1641,  0.8910]]],
       grad_fn=<AddBackward0>)


In [6]:
tensor1 = mid_outputs1['blocks.0']
tensor2 = mid_outputs2['blocks.0']

euclidean_distance = torch.norm(tensor1 - tensor2)
print(f"Euclidean Distance: {euclidean_distance}")

cosine_similarity = F.cosine_similarity(tensor1.unsqueeze(0), tensor2.unsqueeze(0))
print(f"Cosine Similarity: {cosine_similarity}")

manhattan_distance = torch.sum(torch.abs(tensor1 - tensor2))
print(f"Manhattan Distance: {manhattan_distance}")

Euclidean Distance: 64.55815887451172
Cosine Similarity: tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]]], grad_fn=<SumBackward1>)
Manhattan Distance: 29300.666015625
