# Notebook cell In [11]:
import os
import json
import numpy as np
import librosa
from mindnlp.transformers import AutoModelForAudioClassification, AutoFeatureExtractor
import mindspore as ms

# Route Hugging Face Hub downloads through the hf-mirror.com mirror.
# HF_ENDPOINT is the variable that selects the Hub endpoint; HF_HOME is the
# local cache *directory* and must not be pointed at a URL.
# NOTE(review): libraries usually read this variable at import time, so it may
# need to be set before the model library is imported — confirm.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

def create_ast_to_esc_mapping():
    """Build the lookup from AST class indices to short sound-type labels.

    Returns:
        A new ``dict`` whose keys are integer class indices and whose values
        are short label strings. The trailing comment on each entry names the
        class that index is expected to correspond to.

    NOTE(review): indices 358 and 359 are both annotated as "Knock" and both
    map to ``'knock'`` — verify against the model's ``id2label`` table.
    """
    # Animal sounds
    animal_sounds = [
        (74, 'dog'),          # Dog
        (75, 'bark'),         # Bark
        (76, 'yip'),          # Yip
        (77, 'howl'),         # Howl
        (78, 'bow-wow'),      # Bow-wow
        (79, 'growling'),     # Growling
        (80, 'whimper'),      # Whimper (dog)
        (99, 'chicken'),      # Chicken, rooster
        (100, 'cluck'),       # Cluck
        (101, 'crowing'),     # Crowing
        (93, 'pig'),          # Pig
        (94, 'oink'),         # Oink
        (90, 'cattle'),       # Cattle, bovinae
        (91, 'moo'),          # Moo
        (132, 'frog'),        # Frog
        (133, 'croak'),       # Croak
        (81, 'cat'),          # Cat
        (82, 'purr'),         # Purr
        (83, 'meow'),         # Meow
        (84, 'hiss'),         # Hiss
        (85, 'caterwaul'),    # Caterwaul
        (126, 'insect'),      # Insect
        (127, 'cricket'),     # Cricket
        (128, 'mosquito'),    # Mosquito
        (129, 'fly'),         # Fly, housefly
        (130, 'buzz'),        # Buzz
        (131, 'bee'),         # Bee, wasp, etc.
        (97, 'sheep'),        # Sheep
        (96, 'bleat'),        # Bleat
        (117, 'crow'),        # Crow
        (118, 'caw'),         # Caw
    ]

    # Environmental sounds
    environment_sounds = [
        (289, 'rain'),            # Rain
        (290, 'raindrop'),        # Raindrop
        (291, 'rain_surface'),    # Rain on surface
        (294, 'ocean'),           # Ocean
        (295, 'waves'),           # Waves, surf
        (298, 'fire'),            # Fire
        (299, 'crackle'),         # Crackle
        (111, 'bird'),            # Bird
        (112, 'bird_call'),       # Bird vocalization
        (113, 'chirp'),           # Chirp, tweet
        (448, 'drip'),            # Drip
        (283, 'wind'),            # Wind
        (284, 'rustling'),        # Rustling leaves
        (285, 'wind_noise'),      # Wind noise
        (449, 'pour'),            # Pour
        (450, 'trickle'),         # Trickle, dribble
        (451, 'gush'),            # Gush
        (374, 'toilet_flush'),    # Toilet flush
        (286, 'thunderstorm'),    # Thunderstorm
        (287, 'thunder'),         # Thunder
    ]

    # Human sounds
    human_sounds = [
        (23, 'baby_cry'),                 # Baby cry, infant cry
        (49, 'sneeze'),                   # Sneeze
        (63, 'clapping'),                 # Clapping
        (41, 'breathing'),                # Breathing
        (42, 'wheeze'),                   # Wheeze
        (47, 'cough'),                    # Cough
        (53, 'footsteps'),                # Walk, footsteps
        (16, 'laughter'),                 # Laughter
        (17, 'baby_laugh'),               # Baby laughter
        (18, 'giggle'),                   # Giggle
        (19, 'snicker'),                  # Snicker
        (20, 'belly_laugh'),              # Belly laugh
        (21, 'chuckle'),                  # Chuckle, chortle
        (375, 'toothbrush'),              # Toothbrush
        (376, 'electric_toothbrush'),     # Electric toothbrush
        (43, 'snoring'),                  # Snoring
        (54, 'chewing'),                  # Chewing, mastication
    ]

    # Indoor sounds
    indoor_sounds = [
        (358, 'knock'),       # Knock
        (359, 'knock'),       # Knock
        (491, 'clicking'),    # Clicking
        (386, 'keyboard'),    # Computer keyboard
        (361, 'squeak'),      # Squeak
        (364, 'dishes'),      # Dishes, pots, and pans
        (377, 'vacuum'),      # Vacuum cleaner
        (395, 'alarm'),       # Alarm clock
        (407, 'tick'),        # Tick
        (408, 'tick-tock'),   # Tick-tock
        (443, 'shatter'),     # Shatter
    ]

    # Vehicle sounds (the original grouping also lists bells, fireworks and sawing here)
    vehicle_sounds = [
        (339, 'helicopter'),      # Helicopter
        (347, 'chainsaw'),        # Chainsaw
        (396, 'siren'),           # Siren
        (397, 'civil_siren'),     # Civil defense siren
        (308, 'horn'),            # Vehicle horn, car horn, honking
        (309, 'toot'),            # Toot
        (343, 'engine'),          # Engine
        (344, 'light_engine'),    # Light engine
        (348, 'medium_engine'),   # Medium engine
        (349, 'heavy_engine'),    # Heavy engine
        (329, 'train'),           # Train
        (330, 'train_whistle'),   # Train whistle
        (331, 'train_horn'),      # Train horn
        (201, 'church_bell'),     # Church bell
        (340, 'airplane'),        # Fixed-wing aircraft, airplane
        (432, 'fireworks'),       # Fireworks
        (433, 'firecracker'),     # Firecracker
        (421, 'sawing'),          # Sawing
    ]

    # Concatenate in the original order so the dict's insertion order is kept.
    return dict(
        animal_sounds
        + environment_sounds
        + human_sounds
        + indoor_sounds
        + vehicle_sounds
    )

def preprocess_function(audio_path, feature_extractor):
    """Load an audio file and turn it into model input values.

    Args:
        audio_path: Path of the audio file to read.
        feature_extractor: Callable feature extractor; it is invoked with the
            waveform and must return an object exposing ``input_values``.

    Returns:
        The ``input_values`` attribute of the feature extractor's output.
    """
    target_rate = 16000

    # librosa resamples to the requested rate while loading; the returned
    # rate is therefore the same value and is not needed afterwards.
    waveform, _ = librosa.load(audio_path, sr=target_rate)

    # "ms" asks the extractor for MindSpore tensors (presumably; see the
    # extractor's documentation).
    features = feature_extractor(
        waveform,
        sampling_rate=target_rate,
        return_tensors="ms",
        padding=True,
    )
    return features.input_values

def infer(audio_path):
    """Classify one audio file and return its mapped top-5 predictions.

    The pretrained AST checkpoint and its feature extractor are loaded on
    every call. Of the five highest-scoring classes, only those that have an
    entry in the label mapping are reported.

    Args:
        audio_path: Path of the audio file to classify.

    Returns:
        A list of ``(sound_type, probability)`` tuples ordered from most to
        least probable. It may hold fewer than five items, or none. ``None``
        is returned when preprocessing or inference raised an exception; the
        error and its traceback are printed in that case.
    """
    # Select the execution device.
    ms.set_context(device_target='Ascend')
    ms.set_context(device_id=0)

    # Load the AST model and the matching feature extractor.
    checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"
    model = AutoModelForAudioClassification.from_pretrained(checkpoint)
    feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)

    # Class index -> sound-type label lookup.
    label_map = create_ast_to_esc_mapping()

    try:
        # Preprocess the audio into model inputs.
        input_values = preprocess_function(audio_path, feature_extractor)

        # Run the model in evaluation mode.
        model.set_train(False)
        outputs = model(input_values)

        # Softmax over the class axis, then take the first batch row.
        scores = outputs.logits.softmax(axis=-1).asnumpy()[0]

        # Indices of the five largest scores, highest first.
        top_ids = np.argsort(scores)[::-1][:5]

        # Keep only the classes that the lookup knows about.
        return [
            (label_map[class_id], scores[class_id])
            for class_id in top_ids
            if class_id in label_map
        ]

    except Exception as e:
        print(f"处理文件 {audio_path} 时出错: {str(e)}")
        print(f"错误类型: {type(e)}")
        import traceback
        print(traceback.format_exc())
        return None

def main():
    """Run inference on one sample clip and print the mapped predictions.

    Prints one ``label: percent`` line per prediction, highest probability
    first. A failed run (``infer`` returned ``None``) and a successful run
    whose top-5 classes are all missing from the label mapping (``infer``
    returned an empty list) are reported with different messages.
    """
    # Path of the audio file to test; replace with your own file.
    audio_path = "./data/ESC-50-master/audio_16k/1-61252-A-11.wav"

    results = infer(audio_path)

    # infer() returns None only when it caught an exception.
    if results is None:
        print("推理失败")
        return

    # An empty list is a successful run with nothing to report, not a
    # failure: none of the top-5 classes had an entry in the mapping.
    if not results:
        print(f"\n音频文件 {audio_path} 的前5个预测结果均不在标签映射中")
        return

    print(f"\n音频文件 {audio_path} 的推理结果：")
    # Sort by probability (second tuple element), highest first.
    sorted_results = sorted(results, key=lambda x: x[1], reverse=True)
    for sound_type, probability in sorted_results:
        print(f"{sound_type}: {probability*100:.2f}%")

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main() 




# Sample output from running the cell above:
# 音频文件 ./data/ESC-50-master/audio_16k/1-61252-A-11.wav 的推理结果：
# ocean: 43.49%
# waves: 42.51%
# wind: 4.36%
# wind_noise: 3.82%
