In [4]:
from imagebind import data
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import os
import torch
import glob
import time
import kdbai_client as kdbai
import pandas as pd
import sounddevice as sd
import wave
import threading
import soundfile as sf
from imagebind import data
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType

In [6]:
# Use the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build ImageBind (huge variant, pretrained=True) and put it in evaluation mode.
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
# Move the weights to the same device GenerarEmbeddings loads its audio tensors
# onto. Without this, embeddings computed before any later `model.to(device)`
# call fail with a device mismatch on CUDA machines.
model.to(device)

ImageBindModel(
  (modality_preprocessors): ModuleDict(
    (vision): RGBDTPreprocessor(
      (cls_token): tensor((1, 1, 1280), requires_grad=True)
      
      (rgbt_stem): PatchEmbedGeneric(
        (proj): Sequential(
          (0): PadIm2Video()
          (1): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
        )
      )
      (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
        (pos_embed): tensor((1, 257, 1280), requires_grad=True)
        
      )
    )
    (text): TextPreprocessor(
      (pos_embed): tensor((1, 77, 1024), requires_grad=True)
      (mask): tensor((77, 77), requires_grad=False)
      
      (token_embedding): Embedding(49408, 1024)
    )
    (audio): AudioPreprocessor(
      (cls_token): tensor((1, 1, 768), requires_grad=True)
      
      (rgbt_stem): PatchEmbedGeneric(
        (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10), bias=False)
        (norm_layer): LayerNorm((768,), eps=1e-05, elementwise_affine=

In [7]:
# Log in to KDB.AI Cloud.
# The API key is read from the environment so the secret is never stored in
# (or leaked through) the notebook file.
# NOTE(review): any key that was ever committed in this notebook should be
# revoked and rotated.
KDBAI_API_KEY = os.environ.get('KDBAI_API_KEY')
if not KDBAI_API_KEY:
    raise RuntimeError(
        "Set the KDBAI_API_KEY environment variable before running this cell."
    )
# The endpoint is not a secret; it can still be overridden per environment.
KDBAI_ENDPOINT = os.environ.get('KDBAI_ENDPOINT', 'https://cloud.kdb.ai/instance/xtg7rzi99h')
session = kdbai.Session(api_key=KDBAI_API_KEY, endpoint=KDBAI_ENDPOINT)
db = session.database('default')

In [8]:
# Table configuration.
# Schema: each row holds a text label ("id") and its embedding vector.
schema = [
    dict(name="id", type="str"),
    dict(name="embeddings", type="float32s"),
]

# Index definition: a flat index over the "embeddings" column.
# NOTE(review): dims=1024 must match the length of the vectors inserted, and
# "CS" is presumably cosine similarity -- confirm against the KDB.AI docs.
index = [
    dict(
        name="flat_index",
        type="flat",
        column="embeddings",
        params=dict(dims=1024, metric="CS"),
    ),
]

# Table creation: drop any previous table with this name, then create it anew.
table = 'tb_audio'
try:
    db.table(table).drop()
except kdbai.KDBAIException:
    # Best effort: presumably raised when the table does not exist yet.
    pass
table_audio = db.create_table(table, schema=schema, indexes=index)

In [9]:
def GenerarEmbeddings(audio_path):
    """Compute the model's audio embedding for a single audio file.

    Uses the module-level ``model`` and ``device``.

    Args:
        audio_path: Path of the audio file to embed.

    Returns:
        The model's audio output, moved to the CPU and flattened into a
        one-dimensional NumPy array.
    """
    # The loader takes a list of paths, so the single file is wrapped in one.
    inputs = {
        ModalityType.AUDIO: data.load_and_transform_audio_data([audio_path], device)
    }

    # Forward pass only; no gradients are tracked.
    with torch.no_grad():
        outputs = model(inputs)

    return outputs[ModalityType.AUDIO].cpu().numpy().flatten()

def PrepararEmbedding(label, list_emb):
    """Build the DataFrame used to load the embeddings of one label into the table.

    Args:
        label: Value stored in the "id" column of every row.
        list_emb: List of embeddings, one per row.

    Returns:
        A pandas DataFrame with the columns "id" and "embeddings" and one row
        per embedding.
    """
    embeddings = list(list_emb)
    ids = [label] * len(embeddings)
    return pd.DataFrame({"id": ids, 'embeddings': embeddings})

def InsertarAudios(path, label):
    """Embed every file found directly inside ``path`` and insert the results under ``label``.

    Args:
        path: Directory that holds the audio files of one label.
        label: Value stored in the "id" column for every inserted embedding.

    Returns:
        None. A message is printed and nothing is inserted when the path does
        not exist or contains no files.
    """
    if not os.path.exists(path):
        print('No existe el path.')
        return

    # Keep regular files only: the wildcard also matches sub-directories, which
    # are not audio files and must not reach the embedding step. Sorting makes
    # the insertion order reproducible, since glob's order is unspecified.
    audios_wav = sorted(
        archivo
        for archivo in glob.glob(os.path.join(path, '*'))
        if os.path.isfile(archivo)
    )
    if not audios_wav:
        print('No existen audios en el path.')
        return

    list_emb = [GenerarEmbeddings(audio) for audio in audios_wav]
    table_audio.insert(PrepararEmbedding(label, list_emb))
      
def BusquedaAudio(audio_path):
    """Search the table for the stored embedding closest to a recorded clip.

    Args:
        audio_path: Path of the clip without its ".wav" extension.

    Returns:
        The first two fields of the best match, as a list. In the recorded
        notebook output these are a score followed by the stored label.
    """
    print(f"Analizando: {audio_path}")
    wav_path = audio_path + '.wav'
    vector_consulta = GenerarEmbeddings(wav_path)

    # One query vector and n=1: take the first result frame and its first row.
    resultados = table_audio.search(vectors={'flat_index': [vector_consulta]}, n = 1)
    mejor_fila = resultados[0].iloc[0]
    busqueda = mejor_fila[0:2].to_list()

    print(f"Análisis: {busqueda}")
    return busqueda

#### Funciones de Audio

In [18]:
def GenerarAudio(hz, tiempo, i):
    """Record one mono clip and hand it to a background thread for saving.

    Args:
        hz: Sampling rate passed to the recorder.
        tiempo: Clip length; ``tiempo * hz`` frames are recorded.
        i: Clip number, printed and used as the suffix of the output name.
    """
    print(f"\nGrabando: {i}")
    total_frames = int(tiempo * hz)
    grabacion = sd.rec(total_frames, samplerate= hz, dtype='int16', channels=1)
    sd.wait()  # do not continue until the recording call has completed

    # Saving thread: the file is written off the calling thread.
    destino = f"./demo{i}"
    hilo_guardado = threading.Thread(target = GuardarAudio, args = (destino, 1, hz, grabacion))
    hilo_guardado.start()
    
def GuardarAudio(filename, channels, hz, audio):
    """Write a recording to ``<filename>.wav`` and start its search in a new thread.

    Args:
        filename: Output path without the ".wav" extension.
        channels: Number of audio channels written to the WAV header.
        hz: Frame rate written to the WAV header.
        audio: Recorded samples; must expose ``tobytes()``. The header declares
            2 bytes per sample.
    """
    print(f"Guardando: {filename}")
    with wave.open(f"{filename}.wav", 'wb') as salida:
        # (nchannels, sampwidth, framerate, nframes, comptype, compname);
        # the frame count is fixed up by the wave module when frames are written.
        salida.setparams((channels, 2, hz, 0, 'NONE', 'not compressed'))
        salida.writeframes(audio.tobytes())

    # Embedding-search thread: started only after the WAV file is closed.
    hilo_busqueda = threading.Thread(target= BusquedaAudio, args = (filename,))
    hilo_busqueda.start()
    
def ComprimirAudioFLAC(filename):
    """Re-encode ``<filename>.wav`` as ``<filename>.flac``, then delete the WAV in a new thread.

    Args:
        filename: Path of the clip without its extension.
    """
    origen = f"{filename}.wav"
    destino = f"{filename}.flac"

    muestras, frecuencia = sf.read(origen)
    sf.write(destino, muestras, frecuencia, format='FLAC')
    print(f"Comprimido en: {filename}.flac")

    # The original WAV is removed on a separate thread once the FLAC is written.
    hilo_eliminar = threading.Thread(target= EliminarWAV, args = (filename,))
    hilo_eliminar.start()
     
def EliminarWAV(filename):
    """Delete ``<filename>.wav`` when it exists and report the removal.

    Args:
        filename: Path of the clip without its ".wav" extension.
    """
    objetivo = f"{filename}.wav"
    if not os.path.exists(objetivo):
        # Nothing to delete.
        return
    os.remove(objetivo)
    print(f"Eliminando: {objetivo}")

In [20]:
# Recording parameters.
hz = 44100            # sampling rate passed to the recorder
tiempo = 4            # clip length; tiempo * hz frames are recorded per clip
n_grabaciones = 100   # number of clips recorded before the loop ends

# `device` must stay a torch device string: GenerarEmbeddings uses it to place
# the audio tensors, so it must not be reused for anything else.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

for i in range(n_grabaciones):
    # Clear the terminal on both Windows ('cls') and POSIX ('clear') hosts.
    os.system('cls' if os.name == 'nt' else 'clear')
    GenerarAudio(hz,tiempo, i)


Grabando: 0
Guardando: ./demo0
Analizando: ./demo0

Grabando: 1
Análisis: [0.7457628, 'Seba']
Guardando: ./demo1
Analizando: ./demo1

Grabando: 2
Análisis: [0.8044289, 'Seba']
Guardando: ./demo2
Analizando: ./demo2

Grabando: 3
Análisis: [0.5695438, 'Seba']
Guardando: ./demo3
Analizando: ./demo3

Grabando: 4
Análisis: [0.55974305, 'Seba']
Guardando: ./demo4
Analizando: ./demo4

Grabando: 5
Análisis: [0.7768005, 'Diego']
Guardando: ./demo5
Analizando: ./demo5

Grabando: 6
Análisis: [0.7932022, 'Diego']
Guardando: ./demo6
Analizando: ./demo6

Grabando: 7
Análisis: [0.44745898, 'Seba']
Guardando: ./demo7
Analizando: ./demo7

Grabando: 8
Análisis: [0.67809576, 'Seba']
Guardando: ./demo8
Analizando: ./demo8

Grabando: 9
Análisis: [0.681122, 'Diego']
Guardando: ./demo9
Analizando: ./demo9

Grabando: 10
Análisis: [0.8680649, 'Diego']
Guardando: ./demo10
Analizando: ./demo10

Grabando: 11
Análisis: [0.80227625, 'Diego']
Guardando: ./demo11
Analizando: ./demo11

Grabando: 12
Análisis: [0.77814

KeyboardInterrupt: 

In [16]:
# Raw string: the Windows backslashes stay literal and the invalid escape
# sequences "\P" and "\S" no longer trigger a warning. The path value is the same.
InsertarAudios(r"E:\Prueba\Seba", 'Seba')

In [17]:
# Query the table with no arguments; as the last expression of the cell, its
# result is rendered as the cell output.
table_audio.query()

Unnamed: 0,id,embeddings
0,Diego,"[-0.06433546, 0.3962934, -0.002036959, -0.1009..."
1,Diego,"[0.06982847, 0.35479352, 0.14583497, -0.019050..."
2,Diego,"[0.17226578, -0.021067247, -0.08030585, -0.225..."
3,Seba,"[-0.5066348, 0.7424317, 0.5863964, -0.3717103,..."
4,Seba,"[0.047204513, 0.038371634, 0.15980327, -0.2737..."
5,Seba,"[0.20582007, 0.1990518, -0.34196186, -0.122000..."
6,Seba,"[-0.5127218, 0.97185636, -0.29173413, -0.57133..."
