<a href="https://colab.research.google.com/github/clauciorank/ZapAudio/blob/main/ZAP_audio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the necessary libraries
!pip install FlagEmbedding
!pip install peft


# Import necessary libraries
from FlagEmbedding import BGEM3FlagModel
import pandas as pd
import numpy as np

# Model card: https://huggingface.co/BAAI/bge-m3
model = BGEM3FlagModel(
    'BAAI/bge-m3',
    use_fp16=True,  # True: speeds up computation
)

In [9]:
# Load the transcribed audios; a `drive_path` column pointing at each file on
# Google Drive is added to this frame further down in the cell.
# NOTE(review): the results table shown later contains an 'Unnamed: 0' column,
# which suggests the CSV was saved together with its index — consider
# `index_col=0` if that column is not needed; confirm before changing.

df = pd.read_csv('/content/drive/MyDrive/ZapAudio/transcribed_files.csv')

def create_drive_path(x):
  """Map a transcription file entry to its location on Google Drive.

  Args:
    x: Path of the audio file as stored in the `file` column, using `/` as
      separator (e.g. 'files_original/AUD-20230208-WA0004.opus').

  Returns:
    The absolute Drive path of the audio file inside
    '/content/drive/MyDrive/ZapAudio/files_original/'.
  """
  # Use the LAST path component as the file name. Indexing the second
  # component (`split('/')[1]`) raised IndexError for a bare file name and
  # returned a directory name for paths with more than two components.
  file_name = x.rsplit('/', 1)[-1]

  drive_path = f'/content/drive/MyDrive/ZapAudio/files_original/{file_name}'

  return drive_path


# Build the Drive location of every audio file from its `file` entry
df['drive_path'] = df['file'].apply(create_drive_path)

In [10]:
# Load the cached corpus embeddings, or compute and cache them on the first run
# (a GPU is not necessary, but the computation will be very slow on CPU).

import os
import pickle

EMBEDDINGS_PATH = '/content/drive/MyDrive/ZapAudio/embeddings.pkl'

if os.path.exists(EMBEDDINGS_PATH):
    # NOTE(security): pickle.load can execute arbitrary code — only load a
    # file that you generated yourself.
    with open(EMBEDDINGS_PATH, 'rb') as f:
        embeddings_2 = pickle.load(f)
else:
    # No cache yet: encode every transcription once and store the result so
    # later runs can skip this expensive step.
    sentences_2 = df['text'].to_list()
    embeddings_2 = model.encode(sentences_2, return_dense=True, return_sparse=True, return_colbert_vecs=True)
    with open(EMBEDDINGS_PATH, 'wb') as f:
        pickle.dump(embeddings_2, f)

In [56]:
# Query: the phrase or keywords to look for in the transcriptions
sentences_1 = ["venha aqui casa tomar uma cerveja tá só eu e o nenê"]

# Encode the query, requesting the dense, sparse and colbert outputs
embeddings_1 = model.encode(
    sentences_1,
    batch_size=12,
    max_length=8192,
    return_dense=True,
    return_sparse=True,
    return_colbert_vecs=True,
)

In [57]:
# Compute the similarity scores: Dense, Sparse (lexical) and multi-vector (colbert)

def get_lexical(emb1, emb2):
  """Lexical matching score of the first query against every corpus entry.

  Calls `model.compute_lexical_matching_score` with
  `emb1['lexical_weights'][0]` and each element of `emb2['lexical_weights']`.

  Returns:
    A list with one score per element of `emb2['lexical_weights']`, in the
    same order.
  """
  return [
      model.compute_lexical_matching_score(emb1['lexical_weights'][0], doc_weights)
      for doc_weights in emb2['lexical_weights']
  ]

def get_colbert(emb1, emb2):
  """Colbert (multi-vector) score of the first query against every corpus entry.

  Calls `model.colbert_score` with `emb1['colbert_vecs'][0]` and each element
  of `emb2['colbert_vecs']`.

  Returns:
    A list with one score per element of `emb2['colbert_vecs']`, in the same
    order.
  """
  return [
      model.colbert_score(emb1['colbert_vecs'][0], doc_vecs)
      for doc_vecs in emb2['colbert_vecs']
  ]

# Work on a real copy: plain `df_copy = df` only creates a second name for the
# same frame, so the score columns would silently be added to `df` as well.
df_copy = df.copy()

# Dense score: product of the query vectors with every corpus vector; row 0
# belongs to the single query in `sentences_1`.
similarity = embeddings_1['dense_vecs'] @ embeddings_2['dense_vecs'].T

# Assign the raw arrays/lists instead of wrapping them in pd.Series: a Series
# is aligned on the index (silently producing NaN when the index is not
# 0..n-1 or the lengths differ), whereas an array is assigned by position and
# raises if the embeddings do not match the number of rows.
df_copy['dense_vecs'] = similarity[0]
df_copy['lexical'] = get_lexical(embeddings_1, embeddings_2)
df_copy['colbert'] = np.array(get_colbert(embeddings_1, embeddings_2))

# NOTE(review): `total` combines only the dense and lexical scores; the
# colbert score is not included — confirm this is intended.
df_copy['total'] = df_copy['dense_vecs'] + df_copy['lexical']


In [58]:
# Keep the five rows with the highest colbert score, renumbered from 0
final_df = (
    df_copy
    .sort_values(by=['colbert'], ascending=False)
    .iloc[:5]
    .reset_index(drop=True)
)

final_df

Unnamed: 0.1,Unnamed: 0,file,text,drive_path,dense_vecs,lexical,colbert,total
0,1160,files_original/AUD-20230208-WA0004.opus,Vim aqui em casa hoje tomar uma cerveja? Tá s...,/content/drive/MyDrive/ZapAudio/files_original...,0.927325,0.390171,0.922483,1.317497
1,2430,files_original/AUD-20230614-WA0053.opus,"Vem aqui em casa tomar uma cerveja, eu te son...",/content/drive/MyDrive/ZapAudio/files_original...,0.83723,0.428462,0.84328,1.265693
2,743,files_original/AUD-20200908-WA0028.opus,"Vem aqui em casa tomar uma cerveja, tá sonhan...",/content/drive/MyDrive/ZapAudio/files_original...,0.784908,0.348948,0.791664,1.133855
3,2096,files_original/AUD-20201204-WA0029.mp3,Estão prontos pro gole de hoje? Opa! Opa! Vem...,/content/drive/MyDrive/ZapAudio/files_original...,0.624032,0.249092,0.652091,0.873124
4,1237,files_original/AUD-20191123-WA0072.opus,"Uma sede, tomar uma cerveja, que Deus livre.",/content/drive/MyDrive/ZapAudio/files_original...,0.642813,0.163316,0.60757,0.806129


In [59]:
from IPython.display import Audio, display

# Render an inline audio player for each of the best matches
for audio_path in final_df['drive_path']:
  display(Audio(audio_path))
