In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import soundfile as sf
import torch
import numpy as np
import os
import gc


# Locations used by the rest of the notebook:
#   audio_directory  - folder scanned for input audio files
#   output_directory - folder that receives the saved feature archives
audio_directory = "/kaggle/input/infore/infore_16k_denoised"
output_directory = "/kaggle/working/features"

# Create the output folder up front; an already-existing folder is not an error.
os.makedirs(output_directory, exist_ok=True)

In [None]:
# Load the processor and the encoder from the same pretrained checkpoint.
checkpoint = "nguyenvulebinh/wav2vec2-base-vietnamese-250h"

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = Wav2Vec2Processor.from_pretrained(checkpoint)
model = Wav2Vec2Model.from_pretrained(checkpoint)

# Inference only: switch to eval mode, then move the weights to `device`.
model.eval()
model.to(device)

In [None]:
def load_audio(file_path):
    """Read an audio file with soundfile.

    Returns the ``(data, samplerate)`` pair produced by ``sf.read``.
    """
    audio_data, sample_rate = sf.read(file_path)
    return audio_data, sample_rate

# Helper function to extract features
def extract_features(audio_file):
    """Compute Wav2Vec2 embeddings for a single audio file.

    Parameters
    ----------
    audio_file : str
        Path to an audio file readable by ``load_audio``.

    Returns
    -------
    tuple
        ``(features, duration)``. ``features`` is the model's
        ``last_hidden_state`` with the batch axis squeezed out, copied to the
        CPU as a NumPy array of shape ``(time_steps, feature_dim)``.
        ``duration`` is the clip length in seconds (frames / sample rate).

    Raises
    ------
    ValueError
        If the file contains no samples. The processor is also expected to
        raise ``ValueError`` when the file's sample rate differs from the rate
        it was configured for (see the Transformers feature-extractor docs).
    """
    # Load audio
    speech, samplerate = load_audio(audio_file)
    if len(speech) == 0:
        # Fail early with a clear message instead of an opaque model error.
        raise ValueError(f"{audio_file} contains no audio samples")
    duration = len(speech) / samplerate  # Duration in seconds

    # soundfile returns (frames, channels) for multi-channel files; a 2-D
    # array would be interpreted by the processor as a batch, so downmix to
    # a single channel first.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)

    # Tokenize and preprocess. Pass the file's real sample rate (not a
    # hard-coded 16000) so a mismatch is reported rather than silently
    # producing embeddings for audio at the wrong rate.
    input_values = processor(
        speech, return_tensors="pt", sampling_rate=samplerate
    ).input_values.to(device)

    # Get features (output of Wav2Vec2 hidden states); no gradients needed.
    with torch.no_grad():
        embeddings = model(input_values).last_hidden_state.squeeze(0)  # (time_steps, feature_dim)

    return embeddings.cpu().numpy(), duration  # Return features and duration

In [None]:
batch_size = 1000  # Number of audio files per batch

# Sort the listing: os.listdir returns entries in arbitrary order, which would
# make batch membership and key numbering differ from run to run.
audio_files = sorted(
    os.path.join(audio_directory, f)
    for f in os.listdir(audio_directory)
    if f.endswith(".wav")
)

# batch_idx is the offset of the batch's first file in audio_files
# (0, batch_size, 2 * batch_size, ...), not a batch counter.
for batch_idx in range(0, len(audio_files), batch_size):
    batch_number = batch_idx // batch_size + 1
    batch_files = audio_files[batch_idx:batch_idx + batch_size]
    batch_features = {}

    print(f"Processing batch {batch_number} with {len(batch_files)} files...")

    for idx, file in enumerate(batch_files):
        try:
            # Extract features and metadata
            features, duration = extract_features(file)
            # Global 1-based position of the file. batch_idx is already an
            # offset, so it must not be multiplied by batch_size again.
            batch_features[f"file_{batch_idx + idx + 1}"] = {
                "features": features,
                "path": file,
                "duration": duration,
            }
        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"Error processing {file}: {e}")

        if (idx + 1) % 100 == 0:
            print(f"Processed {idx + 1} files in this batch...")

    # Save batch features.
    # NOTE: each value is a dict, which NumPy stores as a pickled 0-d object
    # array; reading it back needs np.load(..., allow_pickle=True) and .item().
    batch_output_path = os.path.join(output_directory, f"features_batch_{batch_number}.npz")
    np.savez_compressed(batch_output_path, **batch_features)
    print(f"Saved features for batch {batch_number} to {batch_output_path}")

    # Release this batch's arrays before starting the next one.
    del batch_features
    gc.collect()

print("Feature extraction complete!")