In [1]:
import numpy as np
import os, h5py
import torch
from transformers import AutoModel, AutoTokenizer
import librosa

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the pretrained Indic-BERT encoder together with its matching tokenizer.
# Both are fetched from the same checkpoint id, so it is spelled out only once.
INDIC_BERT_CHECKPOINT = "ai4bharat/indic-bert"
model = AutoModel.from_pretrained(INDIC_BERT_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(INDIC_BERT_CHECKPOINT)

In [2]:
# Input / output locations for the Malayalam subset of the Dravidian dataset:
#   audio_folder          - directory holding the source audio clips
#   output_folder         - directory where computed embeddings are stored
#   audio_embeddings_path - HDF5 file (inside output_folder) for the audio embeddings
data_root = "../../datasets/Dravidian Dataset/malayalam_data"
audio_folder = f"{data_root}/audio"
output_folder = f"{data_root}/embeddings"
audio_embeddings_path = os.path.join(output_folder, "audio_embeddings.h5")

In [None]:
# Build a mapping {clip name (file name without extension) -> embedding vector}
# for every ".mp3" file found directly inside audio_folder.
audio_embeddings = {}

# os.listdir returns entries in arbitrary order; sorting makes the processing
# (and printing) order reproducible across runs and machines.
for audio_file in sorted(os.listdir(audio_folder)):
    if not audio_file.endswith(".mp3"):
        continue

    audio_path = os.path.join(audio_folder, audio_file)
    audio_name = os.path.splitext(audio_file)[0]

    # Decode the clip, resampling to 44.1 kHz, and compute 13 MFCCs per frame.
    y, sr = librosa.load(audio_path, sr=44100)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)

    # Serialise the flattened MFCC matrix as space-separated numbers so that it
    # can be fed to the text tokenizer.
    # NOTE(review): the tokenizer truncates to 512 tokens, so only the leading
    # values of the flattened matrix can influence the embedding - confirm this
    # is the intended feature-extraction scheme.
    text = " ".join(str(value) for value in mfccs.flatten())

    # Tokenize and encode the text. The deprecated legacy arguments
    # (truncation_strategy / pad_to_max_length) are intentionally not passed:
    # truncation=True and padding=True already select the same behaviour.
    # Arguments that merely restated library defaults were dropped as well.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
        return_attention_mask=True,
        return_token_type_ids=True,
        verbose=False,
    )

    # Inference only: disable gradient tracking, then mean-pool the last hidden
    # layer over the token axis to obtain a single vector for the clip.
    with torch.no_grad():
        outputs = model(**inputs)
        audio_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    print(audio_embedding.shape)  # Shape of the embedding vector

    # Store the mean-pooled embedding under the clip's name.
    audio_embeddings[audio_name] = audio_embedding

In [None]:
# Persist the embeddings: one HDF5 dataset per clip, keyed by the clip name.
# h5py does not create missing parent directories, so make sure the target
# folder exists before opening the file for writing (mode 'w' truncates any
# existing file at that path).
embeddings_dir = os.path.dirname(audio_embeddings_path)
if embeddings_dir:  # os.makedirs("") would raise, so skip when the path has no directory part
    os.makedirs(embeddings_dir, exist_ok=True)

with h5py.File(audio_embeddings_path, 'w') as hf:
    for audio_id, embedding in audio_embeddings.items():
        hf.create_dataset(audio_id, data=embedding)

In [4]:
# Sanity check: re-open the saved HDF5 file read-only, print the names of all
# top-level entries, then print each entry's name alongside its stored shape.
with h5py.File(audio_embeddings_path, mode='r') as file:
    print("Groups in the HDF5 file:")
    # Iterating an h5py file yields the names of its top-level members.
    g = [entry_name for entry_name in file]
    print(g)
    for i in g:
        entry_shape = file[i].shape
        print(f"{i}: {entry_shape}")

Groups in the HDF5 file:
['MAL_MSA_01', 'MAL_MSA_02', 'MAL_MSA_03', 'MAL_MSA_04', 'MAL_MSA_05', 'MAL_MSA_06', 'MAL_MSA_07', 'MAL_MSA_08', 'MAL_MSA_09', 'MAL_MSA_10', 'MAL_MSA_11', 'MAL_MSA_12', 'MAL_MSA_13', 'MAL_MSA_14', 'MAL_MSA_15', 'MAL_MSA_16', 'MAL_MSA_17', 'MAL_MSA_18', 'MAL_MSA_19', 'MAL_MSA_20', 'MAL_MSA_21', 'MAL_MSA_22', 'MAL_MSA_23', 'MAL_MSA_24', 'MAL_MSA_25', 'MAL_MSA_26', 'MAL_MSA_27', 'MAL_MSA_28', 'MAL_MSA_29', 'MAL_MSA_30', 'MAL_MSA_31', 'MAL_MSA_32', 'MAL_MSA_33', 'MAL_MSA_34', 'MAL_MSA_35', 'MAL_MSA_36', 'MAL_MSA_37', 'MAL_MSA_38', 'MAL_MSA_39', 'MAL_MSA_40', 'MAL_MSA_41', 'MAL_MSA_42', 'MAL_MSA_43', 'MAL_MSA_44', 'MAL_MSA_45', 'MAL_MSA_46', 'MAL_MSA_47', 'MAL_MSA_48', 'MAL_MSA_49', 'MAL_MSA_50', 'MAL_MSA_51', 'MAL_MSA_52', 'MAL_MSA_53', 'MAL_MSA_54', 'MAL_MSA_55', 'MAL_MSA_56', 'MAL_MSA_57', 'MAL_MSA_58', 'MAL_MSA_59', 'MAL_MSA_60', 'MAL_MSA_61', 'MAL_MSA_62', 'MAL_MSA_63', 'MAL_MSA_64', 'MAL_MSA_65', 'MAL_MSA_66', 'MAL_MSA_67', 'MAL_MSA_68', 'MAL_MSA_69', 'MAL_MSA