In [4]:
import os
import librosa
import numpy as np
import sqlite3

In [5]:
def get_size_kb(file_path):
    """Return the size of the file at ``file_path`` in kilobytes (bytes / 1024)."""
    size_in_bytes = os.path.getsize(file_path)
    return size_in_bytes / 1024

def calculate_silence_ratio(file_path, silence_threshold=1e-4, frame_length=2048, hop_length=512):
    """Estimate the fraction of an audio file that is silent.

    The signal is cut into frames of ``frame_length`` samples, one frame
    starting every ``hop_length`` samples. A frame counts as silent when its
    energy (the sum of its squared samples) is below ``silence_threshold``.

    Args:
        file_path: Path of the audio file to analyse.
        silence_threshold: Energy below which a frame is considered silent.
        frame_length: Number of samples per frame.
        hop_length: Number of samples between the starts of consecutive frames.

    Returns:
        float: Silent frames divided by total frames, always within [0, 1].
        Returns 0.0 when the file contains no samples.
    """
    # sr=None asks librosa not to resample; the rate itself is not needed
    # because the ratio is computed from frame counts.
    y, _sr = librosa.load(file_path, sr=None)

    frame_starts = range(0, len(y), hop_length)
    if len(frame_starts) == 0:
        # No samples at all: avoid a 0/0 division (which would yield NaN).
        return 0.0

    # Short-term energy of each frame.
    energy = np.array([
        np.sum(np.square(y[start:start + frame_length]))
        for start in frame_starts
    ])
    silent_frames = np.count_nonzero(energy < silence_threshold)

    # Divide by the number of frames rather than converting to seconds with
    # hop_length: the final hop is usually shorter than hop_length, so crediting
    # it a full hop could push the ratio above 1. Both forms agree whenever the
    # signal length is a multiple of hop_length.
    return float(silent_frames / len(energy))

def get_wav_informations(file_path):
    """Collect basic metadata for one audio file.

    Args:
        file_path: Path of the audio file to inspect.

    Returns:
        tuple: ``(size, sr, channel, duration, silence_ratio)`` where ``size``
        is the file size in KB, ``sr`` the sampling rate reported by
        ``librosa.load``, ``channel`` the number of audio channels,
        ``duration`` the length in seconds and ``silence_ratio`` the value
        returned by ``calculate_silence_ratio``.
    """
    size = get_size_kb(file_path)

    # librosa.load defaults to sr=22050 and mono=True, which would make every
    # file report the same rate and a single channel. Ask for the native rate
    # and the original channel layout instead.
    wav, sr = librosa.load(file_path, sr=None, mono=False)

    # With mono=False a single-channel file comes back 1-D and a
    # multi-channel file as (channels, samples).
    channel = 1 if wav.ndim == 1 else wav.shape[0]

    duration = librosa.get_duration(y=wav, sr=sr)
    silence_ratio = calculate_silence_ratio(file_path)
    return size, sr, channel, duration, silence_ratio

In [6]:
# Folder holding the recordings that the cells below index and transcribe.
# Set the WAV_FOLDER environment variable to point at another folder without
# editing the notebook; the fallback is the original hard-coded local path.
folder_path = os.environ.get("WAV_FOLDER", r"C:\Users\tqdcr\OneDrive\Documents\vnpt\s2t\rd")

In [7]:
# Connection to the SQLite database file "wav.db". The path is relative, so it
# resolves against the kernel's current working directory.
con = sqlite3.connect("wav.db")

In [8]:
# Cursor on `con`, used by the following cells to execute SQL statements.
cur = con.cursor()

In [None]:
# Table receiving one row of metadata per audio file.
# IF NOT EXISTS keeps this cell re-runnable: wav.db persists on disk, so a
# plain CREATE TABLE fails with "table wav already exists" on a second run.
cur.execute("CREATE TABLE IF NOT EXISTS wav(file_name, file_path, file_size, sr, channel, duration, silence_ratio)")

In [None]:
# Second table with the same columns as `wav`.
# IF NOT EXISTS keeps this cell re-runnable: wav.db persists on disk, so a
# plain CREATE TABLE fails with "table wav_nlp already exists" on a second run.
cur.execute("CREATE TABLE IF NOT EXISTS wav_nlp(file_name, file_path, file_size, sr, channel, duration, silence_ratio)")

In [53]:
# Index every audio file found directly inside `folder_path` into the `wav` table.
if os.path.isdir(folder_path):
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)

        # Only process files, not subdirectories.
        if not os.path.isfile(file_path):
            continue

        # Skip files of 1 KB or less; they are treated as noise files.
        if get_size_kb(file_path) <= 1:
            continue

        file_size, sr, channel, duration, silence_ratio = get_wav_informations(file_path)

        # Bind values with placeholders instead of formatting them into the
        # SQL text: a quote character in a file name would otherwise break the
        # statement, and a double-quoted token can be read as a column name.
        # Values are coerced to built-in types so sqlite3 can bind them even
        # if they arrive as NumPy scalars.
        cur.execute(
            "INSERT INTO wav VALUES (?, ?, ?, ?, ?, ?, ?)",
            (
                file_name,
                file_path,
                float(file_size),
                int(sr),
                int(channel),
                float(duration),
                float(silence_ratio),
            ),
        )
        # Commit per file so rows already inserted survive a failure on a later file.
        con.commit()

In [56]:
def resampled_wav_function(wav, orig_sr, target_sr):
    """Resample ``wav`` from ``orig_sr`` to ``target_sr``.

    Thin wrapper that forwards its arguments to ``librosa.resample`` and
    returns whatever that call returns.
    """
    return librosa.resample(wav, orig_sr=orig_sr, target_sr=target_sr)

In [1]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Speech-recognition pipeline built once and reused by the transcription cells
# below, which call it on a waveform and read the 'text' entry of the result.
transcriber = pipeline("automatic-speech-recognition", model="vinai/PhoWhisper-medium")

In [None]:
# Transcribe every file recorded in the `wav` table.
# fetchall() materialises the rows first so the loop does not depend on the
# shared cursor staying untouched while it runs.
for _file_name, file_path in cur.execute("SELECT file_name, file_path FROM wav").fetchall():
    # Load the audio for this row; previously `wav` was never assigned here.
    wav, sr = librosa.load(file_path, sr=None)
    # Same 16 kHz resampling as the folder-based transcription cell
    # (presumably the rate the model expects; confirm).
    if sr != 16000:
        wav = resampled_wav_function(wav, orig_sr=sr, target_sr=16000)
    output = transcriber(wav)['text']
    # print() takes no `output=` keyword; pass the text positionally.
    print(output)

In [None]:
# Transcribe every file found directly inside `folder_path`.
if os.path.isdir(folder_path):
    # Loop through each file in the folder
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)

        # Only process files, not subdirectories
        if not os.path.isfile(file_path):
            continue

        # Load the path that was just checked. The previous 'rd/<file_name>'
        # path only resolved when the working directory happened to be the
        # parent of the data folder.
        wav, sr = librosa.load(file_path, sr=None)

        # Resample to 16 kHz before transcription
        # (presumably the rate the model expects; confirm).
        if sr != 16000:
            wav = resampled_wav_function(wav, orig_sr=sr, target_sr=16000)

        output = transcriber(wav)['text']
        print(output)

