In [1]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Using cached https://download.pytorch.org/whl/cu118/torch-2.3.1%2Bcu118-cp38-cp38-win_amd64.whl (2673.1 MB)
Collecting torchvision
  Using cached https://download.pytorch.org/whl/cu118/torchvision-0.18.1%2Bcu118-cp38-cp38-win_amd64.whl (4.9 MB)
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/cu118/torchaudio-2.3.1%2Bcu118-cp38-cp38-win_amd64.whl (4.0 MB)
Collecting filelock (from torch)
  Using cached https://download.pytorch.org/whl/filelock-3.13.1-py3-none-any.whl (11 kB)
Collecting sympy (from torch)
  Using cached https://download.pytorch.org/whl/sympy-1.12-py3-none-any.whl (5.7 MB)
Collecting networkx (from torch)
  Using cached https://download.pytorch.org/whl/networkx-3.2.1-py3-none-any.whl (1.6 MB)
Collecting fsspec (from torch)
  Using cached https://download.pytorch.org/whl/fsspec-2024.2.0-py3-none-any.whl (170 kB)
Collecting mkl<=2021.4.0,>=2021.1.1 (from torch)
  Using cached

In [2]:
import torch
# Report whether this PyTorch install can use a CUDA device.
torch.cuda.is_available()

True

In [3]:
!pip install tiktoken  openl3  soundfile

Collecting tiktoken
  Using cached tiktoken-0.7.0-cp38-cp38-win_amd64.whl.metadata (6.8 kB)
Collecting openl3
  Using cached openl3-0.4.2-py2.py3-none-any.whl
Collecting soundfile
  Using cached soundfile-0.12.1-py2.py3-none-win_amd64.whl.metadata (14 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Using cached regex-2024.5.15-cp38-cp38-win_amd64.whl.metadata (41 kB)
Collecting tensorflow>=2.0.0 (from openl3)
  Using cached tensorflow-2.13.1-cp38-cp38-win_amd64.whl.metadata (2.6 kB)
Collecting scipy>=0.19.1 (from openl3)
  Using cached scipy-1.10.1-cp38-cp38-win_amd64.whl.metadata (58 kB)
Collecting kapre>=0.3.5 (from openl3)
  Using cached kapre-0.3.7-py3-none-any.whl
Collecting resampy<0.3.0,>=0.2.1 (from openl3)
  Using cached resampy-0.2.2-py3-none-any.whl
Collecting h5py>=2.7.0 (from openl3)
  Using cached h5py-3.11.0-cp38-cp38-win_amd64.whl.metadata (2.5 kB)
Collecting moviepy>=1.0.0 (from openl3)
  Using cached moviepy-1.0.3-py3-none-any.whl
Collecting scikit-image>=0.14.3 (fr

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.3.1+cu118 requires typing-extensions>=4.8.0, but you have typing-extensions 4.5.0 which is incompatible.


In [36]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [37]:
import openl3
import soundfile

class AudioEncoder:
    """Wrap a pre-trained OpenL3 audio model and return its embeddings as tensors.

    Args:
        input_repr: Spectrogram representation, forwarded to
            ``openl3.models.load_audio_embedding_model``.
        content_type: Content type of the pre-trained model, forwarded as is.
        embedding_size: Embedding dimensionality, forwarded as is.

    The defaults load the ``mel256`` / ``music`` / ``6144`` model.
    """

    def __init__(self, input_repr="mel256", content_type="music", embedding_size=6144) -> None:
        # Load the model once; every encode() call reuses it.
        self.audio_encoder_model = openl3.models.load_audio_embedding_model(
            input_repr=input_repr,
            content_type=content_type,
            embedding_size=embedding_size,
        )

    def encode(self, audio_path):
        """Read ``audio_path`` with soundfile and embed it with OpenL3.

        Args:
            audio_path: Path of the audio file passed to ``soundfile.read``.

        Returns:
            A ``torch.float32`` tensor built from the embedding array returned
            by ``openl3.get_audio_embedding``. The timestamps returned
            alongside it are discarded.
        """
        audio, sr = soundfile.read(audio_path)
        embeddings, _timestamps = openl3.get_audio_embedding(
            audio, sr, model=self.audio_encoder_model
        )
        return torch.tensor(embeddings, dtype=torch.float32)
    


In [38]:
import tiktoken

class TextTokenizer:
    """Tokenize text with tiktoken's ``cl100k_base`` encoding."""

    def __init__(self) -> None:
        self.encodermodel = tiktoken.get_encoding("cl100k_base")

    def encode(self, text, dtype=torch.float32):
        """Return the token ids of ``text`` as a 1-D tensor.

        Args:
            text: String to tokenize.
            dtype: dtype of the returned tensor. Defaults to ``torch.float32``
                so the result can be fed straight into a linear layer; pass
                ``torch.long`` to keep integer ids (e.g. for ``nn.Embedding``).

        Returns:
            A tensor with one element per token. The values are token ids,
            not learned embeddings, and the length varies with the input text.
        """
        token_ids = self.encodermodel.encode(text)
        return torch.tensor(token_ids, dtype=dtype)


In [39]:
class JointEmbeddingModel(nn.Module):
    """Project audio and text features into one shared embedding space.

    Args:
        audio_embed_dim: Size of the last dimension of the audio input.
        text_embed_dim: Size of the last dimension of the text input.
        shared_embed_dim: Size of the shared space both inputs are mapped to.
    """

    def __init__(self, audio_embed_dim, text_embed_dim, shared_embed_dim):
        super().__init__()
        self.audio_fc = nn.Linear(audio_embed_dim, shared_embed_dim)
        self.text_fc = nn.Linear(text_embed_dim, shared_embed_dim)

    def forward(self, audio_embedding, text_embedding):
        """Apply the two linear projections.

        Returns:
            ``(audio_shared, text_shared)``, each with ``shared_embed_dim`` as
            its last dimension.
        """
        audio_shared = self.audio_fc(audio_embedding)
        text_shared = self.text_fc(text_embedding)
        return audio_shared, text_shared

    def compute_loss(self, audio_shared, text_shared, margin=1.0):
        """Margin-based contrastive loss between projected audio and text.

        Matching pairs are pulled together (mean pairwise distance). Each audio
        row is pushed at least ``margin`` away from the text of the previous
        sample in the batch, obtained by rolling the text batch by one.

        Args:
            audio_shared: Projected audio, ``(batch, dim)`` or ``(dim,)``.
            text_shared: Projected text, ``(batch, dim)`` or ``(dim,)``.
            margin: Minimum distance required between non-matching pairs.

        Returns:
            Scalar loss tensor.
        """
        # Promote unbatched vectors to a batch of one so that dim 0 is always
        # the sample axis; rolling a 1-D vector would shift its features.
        if audio_shared.dim() == 1:
            audio_shared = audio_shared.unsqueeze(0)
        if text_shared.dim() == 1:
            text_shared = text_shared.unsqueeze(0)

        positive_pair_dist = F.pairwise_distance(audio_shared, text_shared)
        loss = torch.mean(positive_pair_dist)

        # With a single text sample the rolled batch is the matching text, so
        # there is no negative to push away and the hinge term is skipped.
        if text_shared.shape[0] > 1:
            negative_text = torch.roll(text_shared, shifts=1, dims=0)
            negative_pair_dist = F.pairwise_distance(audio_shared, negative_text)
            loss = loss + torch.mean(F.relu(margin - negative_pair_dist))
        return loss


In [40]:
# Build the tokenizer and the audio encoder once so later cells can reuse them.
text_tokenizer = TextTokenizer()
audio_encoder = AudioEncoder()

In [41]:
# Sample text/audio pair used by the cells below; the audio path is relative.
sample_text = "Hello My name is Dhruv"
sample_audio_path = "test_file.wav"

In [42]:
# Encode the sample pair and print the resulting tensor shapes.
# NOTE(review): `text_embedding` holds the tiktoken token ids cast to float
# (one element per token); it is not a learned embedding.
text_embedding = text_tokenizer.encode(sample_text)
audio_embedding = audio_encoder.encode(sample_audio_path)
print("Text Embedding Shape:", text_embedding.shape)
print("Audio Embedding Shape:", audio_embedding.shape)

Text Embedding Shape: torch.Size([7])
Audio Embedding Shape: torch.Size([1701, 6144])


In [43]:
# Input widths for the two projection layers, read from the sample tensors.
audio_embed_dim = audio_embedding.shape[-1]  
# NOTE(review): text_embedding is a 1-D tensor of token ids, so this is the
# token count of `sample_text`. A model built from it presumably only accepts
# texts that tokenize to exactly this many tokens — confirm this is intended.
text_embed_dim = text_embedding.shape[-1]    
shared_embed_dim = 256

In [44]:
# Instantiate the joint model and print its layer summary.
model = JointEmbeddingModel(audio_embed_dim, text_embed_dim, shared_embed_dim)
print(model)

JointEmbeddingModel(
  (audio_fc): Linear(in_features=6144, out_features=256, bias=True)
  (text_fc): Linear(in_features=7, out_features=256, bias=True)
)


In [45]:
# Run one forward pass on the sample pair and evaluate the loss on the projections.
audio_shared, text_shared = model(audio_embedding, text_embedding)
loss = model.compute_loss(audio_shared, text_shared)

In [46]:
# Print the projected shapes and the scalar loss value.
print("Audio Shared Embedding Shape:", audio_shared.shape)
print("Text Shared Embedding Shape:", text_shared.shape)
print("Loss:", loss.item())

Audio Shared Embedding Shape: torch.Size([1701, 256])
Text Shared Embedding Shape: torch.Size([256])
Loss: 153423.375
