In [8]:
import os
import pandas as pd
import librosa
import numpy as np
from pydub import AudioSegment
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torch
import torch.nn as nn
import torch.optim as optim

# Select the compute device: prefer the GPU when CUDA is available, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device set to:", device)

# Audio preprocessing
def preprocess_audio(file_path, sample_rate=16000, n_mfcc=13):
    """Load an audio file and summarise it as one fixed-length MFCC vector.

    Args:
        file_path: Path of the audio file to read.
        sample_rate: Rate in Hz the audio is resampled to while loading.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        A 1-D NumPy array with the mean of each MFCC coefficient, taken
        over the transposed MFCC matrix's first axis.
    """
    print("Preprocessing audio file:", file_path)
    # librosa.load resamples to `sr` and down-mixes to mono on its own, so the
    # former pydub decode/resample pass (whose result was never used) only
    # cost a second read of every file and has been removed.
    audio_data, _ = librosa.load(file_path, sr=sample_rate, mono=True)
    mfccs = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=n_mfcc)
    # Collapse the per-frame coefficients into a single vector so every
    # recording yields features of the same length regardless of duration.
    mfccs_processed = np.mean(mfccs.T, axis=0)
    return mfccs_processed

# Text preprocessing
def preprocess_text(text):
    """Return *text* converted to lower case."""
    lowered = text.lower()
    return lowered

# Data loading
def load_data(audio_dir, csv_file, num_files=100):
    """Extract audio features and binary labels for the numbered recordings.

    Recordings are looked up as ``00001.wav``, ``00002.wav``, ... inside
    ``audio_dir``; their story type is read from the ``filename`` and
    ``Story_type`` columns of ``csv_file``.

    Args:
        audio_dir: Directory that contains the ``.wav`` files.
        csv_file: Path of the attributes CSV.
        num_files: How many consecutively numbered recordings, starting at
            1, to load. Defaults to 100.

    Returns:
        A dict with two parallel lists: ``"audio_features"`` (one
        ``preprocess_audio`` result per file) and ``"labels"`` (1 when the
        story type is ``'True Story'``, otherwise 0).

    Raises:
        IndexError: If a recording has no row in the CSV.
    """
    print("Loading data...")
    attributes = pd.read_csv(csv_file)
    # Build the filename -> story type table once instead of scanning the
    # whole frame for every file. Keeping the first row per filename matches
    # the previous `.values[0]` behaviour when a filename is duplicated.
    story_types = (
        attributes.drop_duplicates(subset='filename', keep='first')
        .set_index('filename')['Story_type']
        .to_dict()
    )
    data = {
        "audio_features": [],
        "labels": []
    }
    for idx in range(1, num_files + 1):
        filename = f"{idx:05d}.wav"
        file = os.path.join(audio_dir, filename)
        if filename not in story_types:
            # Same exception type as before, but it now names the culprit.
            raise IndexError(
                f"No row for '{filename}' in attributes file '{csv_file}'"
            )
        story_type = story_types[filename]
        print("Processing file:", file)
        data['audio_features'].append(preprocess_audio(file))
        data['labels'].append(1 if story_type == 'True Story' else 0)
    print("Data loaded.")
    return data

# A simple multi-layer perceptron
class MLP(nn.Module):
    """Three-layer fully connected network producing two output logits.

    Layer widths are ``input_dim -> 128 -> 64 -> 2`` with a ReLU after each
    of the first two layers; the final layer's output is returned as-is.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_wide, hidden_narrow, n_outputs = 128, 64, 2
        self.fc1 = nn.Linear(input_dim, hidden_wide)
        self.fc2 = nn.Linear(hidden_wide, hidden_narrow)
        self.fc3 = nn.Linear(hidden_narrow, n_outputs)

    def forward(self, x):
        """Return the raw (un-normalised) output of the last layer for *x*."""
        activations = x
        for hidden_layer in (self.fc1, self.fc2):
            activations = torch.relu(hidden_layer(activations))
        return self.fc3(activations)

# Train the MLP model
def train_mlp_model(features, labels, epochs=10, lr=0.001):
    """Fit a new MLP on the given data using full-batch Adam updates.

    Args:
        features: 2-D array-like of inputs; its second dimension sets the
            network's input size. Converted to a float32 tensor.
        labels: Array-like of integer class indices, converted to a long
            tensor and used as cross-entropy targets.
        epochs: Number of optimisation steps, each over the whole data set.
            Defaults to 10.
        lr: Learning rate passed to Adam. Defaults to 0.001.

    Returns:
        The trained MLP, located on ``device`` and still in training mode
        (call ``.eval()`` before using it for inference).
    """
    print("Training MLP model...")
    features = torch.tensor(features, dtype=torch.float32).to(device)
    labels = torch.tensor(labels, dtype=torch.long).to(device)

    model = MLP(features.shape[1]).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    model.train()
    # Raise `epochs` to train for longer; every epoch is one full-batch step.
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        print(f'Epoch {epoch+1}, Loss: {loss.item()}')
    print("MLP model training complete.")
    return model

# Train the random forest model
def train_random_forest(audio_features, labels):
    """Fit and return a random forest classifier on the given data.

    The forest uses 100 trees, a maximum depth of 10 and a fixed random
    seed of 42.
    """
    print("Training Random Forest model...")
    forest_params = {"n_estimators": 100, "max_depth": 10, "random_state": 42}
    forest = RandomForestClassifier(**forest_params)
    forest.fit(audio_features, labels)
    print("Random Forest model training complete.")
    return forest

# Ensemble prediction
def ensemble_predict(mlp_model, rf_model, audio_features):
    """Combine the MLP and random-forest predictions into one label per sample.

    Where the two models agree, their shared prediction is used. With only
    two voters there is no majority to fall back on, so every disagreement
    is settled by its own fair coin flip between 0 and 1. The flips come
    from NumPy's global random state; seed it with ``np.random.seed`` for
    reproducible results.

    Args:
        mlp_model: Trained torch module; the argmax over dimension 1 of its
            output is taken as the predicted class.
        rf_model: Fitted estimator exposing ``predict``.
        audio_features: 2-D array-like of inputs, one row per sample.

    Returns:
        A NumPy array with one predicted label per sample.
    """
    # Convert the audio features to a tensor on the active device.
    audio_features_tensor = torch.tensor(audio_features, dtype=torch.float32).to(device)

    # Predict with the MLP in evaluation mode and without gradient tracking.
    mlp_model.eval()
    with torch.no_grad():
        mlp_outputs = mlp_model(audio_features_tensor)
    mlp_predictions = mlp_outputs.argmax(dim=1).cpu().numpy()

    # Predict with the random forest.
    rf_predictions = rf_model.predict(audio_features)

    # Draw one tie-break PER SAMPLE. Without `size`, np.random.choice returns
    # a single scalar that np.where broadcasts, which forced every
    # disagreeing sample in a call into the same class.
    tie_breaks = np.random.choice([0, 1], size=mlp_predictions.shape, p=[0.5, 0.5])
    ensemble_predictions = np.where(mlp_predictions == rf_predictions, mlp_predictions, tie_breaks)

    return ensemble_predictions

# Entry point
def main():
    """Run the whole pipeline: load data, train both models, score the ensemble."""
    print("Starting main process...")

    # Load the features and labels for every recording.
    dataset = load_data('CBU0521DD_stories', 'CBU0521DD_stories_attributes.csv')
    feature_matrix = np.array(dataset['audio_features'])
    targets = np.array(dataset['labels'])

    # Hold out 20% of the samples for evaluation; the seed fixes the split.
    train_x, test_x, train_y, test_y = train_test_split(
        feature_matrix, targets, test_size=0.2, random_state=42
    )

    # Train the two base models on the same training split.
    mlp = train_mlp_model(train_x, train_y)
    forest = train_random_forest(train_x, train_y)

    # Combine both models' predictions on the held-out split.
    predictions = ensemble_predict(mlp, forest, test_x)

    # Report the ensemble's accuracy and per-class metrics.
    print("Ensemble Accuracy:", accuracy_score(test_y, predictions))
    print("Classification Report:\n", classification_report(test_y, predictions))


if __name__ == "__main__":
    main()

Device set to: cpu
Starting main process...
Loading data...
Processing file: CBU0521DD_stories\00001.wav
Preprocessing audio file: CBU0521DD_stories\00001.wav
Processing file: CBU0521DD_stories\00002.wav
Preprocessing audio file: CBU0521DD_stories\00002.wav
Processing file: CBU0521DD_stories\00003.wav
Preprocessing audio file: CBU0521DD_stories\00003.wav
Processing file: CBU0521DD_stories\00004.wav
Preprocessing audio file: CBU0521DD_stories\00004.wav
Processing file: CBU0521DD_stories\00005.wav
Preprocessing audio file: CBU0521DD_stories\00005.wav
Processing file: CBU0521DD_stories\00006.wav
Preprocessing audio file: CBU0521DD_stories\00006.wav
Processing file: CBU0521DD_stories\00007.wav
Preprocessing audio file: CBU0521DD_stories\00007.wav
Processing file: CBU0521DD_stories\00008.wav
Preprocessing audio file: CBU0521DD_stories\00008.wav
Processing file: CBU0521DD_stories\00009.wav
Preprocessing audio file: CBU0521DD_stories\00009.wav
Processing file: CBU0521DD_stories\00010.wav
Prep