# In [None]:
import os
import numpy as np
import torch
import librosa
import soundfile as sf
import subprocess
from tqdm import tqdm
from transformers import ASTFeatureExtractor, ASTModel
from google.colab import drive

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Checkpoint identifier handed to both from_pretrained() calls below.
model_name = "MIT/ast-finetuned-audioset-10-10-0.4593"

# Feature extractor that turns raw waveforms into the model's input tensors.
feature_extractor = ASTFeatureExtractor.from_pretrained(model_name)

# Load the model, move it to the selected device and switch it to
# evaluation mode (this script only runs inference).
model = ASTModel.from_pretrained(model_name)
model = model.to(device)
model.eval()

def extract_audio_from_video(video_path, audio_output_path):
    """Extract the audio track of a video into a mono 16 kHz AAC file via ffmpeg.

    Args:
        video_path: Path of the input video file.
        audio_output_path: Path the audio is written to. An existing file at
            this path is overwritten (ffmpeg ``-y``).

    Returns:
        True if ffmpeg exited successfully; False if ffmpeg reported an error
        or the ``ffmpeg`` executable could not be found. Failures are printed,
        not raised.
    """
    command = [
        'ffmpeg',
        '-i', video_path,
        '-vn',            # no video stream in the output
        '-ac', '1',       # one audio channel
        '-ar', '16000',   # 16 kHz sample rate
        '-c:a', 'aac',
        '-y',             # overwrite the output without asking
        audio_output_path
    ]

    try:
        subprocess.run(
            command,
            check=True,
            # ffmpeg reads interactive commands from stdin; detach it so a
            # batch run can neither block on nor consume the parent's stdin.
            stdin=subprocess.DEVNULL,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except subprocess.CalledProcessError as e:
        # ffmpeg's log can contain bytes that are not valid UTF-8 (e.g. file
        # metadata); a strict decode would raise here and turn a handled
        # failure into a crash.
        stderr_text = (e.stderr or b'').decode('utf-8', errors='replace')
        print(f"Error extracting audio from {video_path}: {stderr_text}")
        return False
    except FileNotFoundError:
        print("Error: ffmpeg not found. Please ensure it is installed.")
        return False
    return True

def extract_segmented_ast_feature(audio_path, num_segments=8, silence_threshold=0.005):
    """Compute one AST embedding per equal-length segment of an audio file.

    The audio is loaded as 16 kHz mono, cut into ``num_segments`` consecutive
    segments (the last one also takes the leftover samples), and each segment
    is passed through the module-level ``feature_extractor`` and ``model``.
    The first-token hidden state of every segment is returned.

    Args:
        audio_path: Path of the audio file to load.
        num_segments: Number of consecutive segments to split the audio into.
        silence_threshold: If the peak absolute amplitude is below this value
            the clip is treated as silent.

    Returns:
        A NumPy array with one row per segment. An all-zero float32 array of
        shape ``(num_segments, 768)`` is returned when the file cannot be
        loaded, when each segment would be shorter than 100 samples (which
        includes empty audio), or when the clip is silent.
    """
    sample_rate = 16000
    # Width of the zero fallback rows; presumably equal to the model's hidden
    # size so fallback and real embeddings share a shape -- confirm if the
    # checkpoint is changed.
    embed_dim = 768

    def _zero_features():
        return np.zeros((num_segments, embed_dim), dtype=np.float32)

    try:
        waveform, _ = librosa.load(audio_path, sr=sample_rate, mono=True)
    except Exception as e:
        # Best effort: an unreadable file yields zero features, but say so
        # instead of failing silently.
        print(f"Warning: could not load audio {audio_path}: {e}")
        return _zero_features()

    # Check the length before the amplitude: np.max() raises ValueError on an
    # empty array, and an empty waveform always fails this check.
    segment_length = len(waveform) // num_segments
    if segment_length < 100:
        return _zero_features()

    if np.max(np.abs(waveform)) < silence_threshold:
        return _zero_features()

    # Equal-length segments; the final one runs to the end of the waveform so
    # the remainder samples are not dropped.
    last = num_segments - 1
    segments = [
        waveform[i * segment_length:] if i == last
        else waveform[i * segment_length:(i + 1) * segment_length]
        for i in range(num_segments)
    ]

    inputs = feature_extractor(
        segments,
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding="max_length"
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        # Keep only the first token's hidden state for each segment.
        embeddings = outputs.last_hidden_state[:, 0, :]

    return embeddings.cpu().numpy()

# Mount Google Drive so the dataset and output folders are reachable.
drive.mount('/content/drive')

BASE_DIR = '/content/drive/MyDrive/VEA/'

input_dir = os.path.join(BASE_DIR, 'data16')
output_dir = os.path.join(BASE_DIR, 'features_audio')
temp_audio_dir = os.path.join(BASE_DIR, 'temp_audio_extracted')

os.makedirs(output_dir, exist_ok=True)
os.makedirs(temp_audio_dir, exist_ok=True)

video_extensions = ('.mp4', '.avi', '.mov', '.mkv')

try:
    video_files = [f for f in os.listdir(input_dir) if f.lower().endswith(video_extensions)]
except FileNotFoundError:
    print(f"Input directory not found: {input_dir}")
    video_files = []

if video_files:
    # Videos whose audio could not be extracted; reported at the end.
    failed_files = []

    for file in tqdm(video_files, desc="Processing videos and extracting features"):
        base_name = os.path.splitext(file)[0]
        input_path = os.path.join(input_dir, file)

        output_filename = base_name + ".npy"
        output_path = os.path.join(output_dir, output_filename)
        temp_audio_path = os.path.join(temp_audio_dir, base_name + ".aac")

        # Resume support: skip videos whose features were already saved.
        if os.path.exists(output_path):
            continue

        try:
            success = extract_audio_from_video(input_path, temp_audio_path)

            if success:
                feature = extract_segmented_ast_feature(temp_audio_path, num_segments=8, silence_threshold=0.005)
                np.save(output_path, feature)
            else:
                failed_files.append(file)
        finally:
            # Always remove the intermediate audio, including a partial file
            # left by a failed ffmpeg run or when feature extraction raises.
            if os.path.exists(temp_audio_path):
                os.remove(temp_audio_path)

    if failed_files:
        print(f"\nExtraction finished, but audio extraction failed for {len(failed_files)} file(s):")
        for name in failed_files:
            print(f"  {name}")
    else:
        print("\nExtraction complete. All features saved to Google Drive.")
else:
    print("No video files found.")