## MusicGen 音樂生成及預測系統


In [5]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import librosa
import warnings
warnings.filterwarnings('ignore')
# 設定隨機種子
SEED = 12
torch.manual_seed(SEED)
np.random.seed(SEED)
# 載入資料與預處理
data = pd.read_csv('data_features.csv')
X = data.drop(['filename', 'label'], axis=1).values
# 標準化 (使用原 notebook 相同邏輯)
scaler = StandardScaler()
X = scaler.fit_transform(X)

### 音樂風格分辨器模型
### Music Genre Classifier Model

In [6]:
class GenreClassifier(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 1024),  # 增加第一層神經元
            nn.ReLU(),
            nn.Dropout(0.6),              # 降低Dropout
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(128, 10)
        )

        
    def forward(self, x):
        return self.layers(x)

def extract_features_for_prediction(audio_path):
    """提取與訓練時相同的26維特徵，使用3秒片段"""
    try:
        # 載入音頻檔案
        y, sr = librosa.load(audio_path, sr=22050)
        
        # 計算可以切成幾個3秒片段
        segment_length = sr * 3  # 3秒的樣本數
        num_segments = len(y) // segment_length
        
        if num_segments == 0:
            # 如果音頻長度不足3秒，補零
            y = np.pad(y, (0, segment_length - len(y)), 'constant')
            num_segments = 1
        
        # 儲存所有片段的特徵
        all_features = []
        
        # 處理每個3秒片段
        for i in range(num_segments):
            start = i * segment_length
            end = start + segment_length
            segment = y[start:end]
            
            features = []
            
            # 1. chroma_stft
            chroma_stft = librosa.feature.chroma_stft(y=segment, sr=sr)
            features.append(np.mean(chroma_stft))
            
            # 2. rmse (使用 rms)
            rms = librosa.feature.rms(y=segment)
            features.append(np.mean(rms))
            
            # 3. spectral_centroid
            spectral_centroid = librosa.feature.spectral_centroid(y=segment, sr=sr)
            features.append(np.mean(spectral_centroid))
            
            # 4. spectral_bandwidth
            spectral_bandwidth = librosa.feature.spectral_bandwidth(y=segment, sr=sr)
            features.append(np.mean(spectral_bandwidth))
            
            # 5. rolloff
            rolloff = librosa.feature.spectral_rolloff(y=segment, sr=sr)
            features.append(np.mean(rolloff))
            
            # 6. zero_crossing_rate
            zcr = librosa.feature.zero_crossing_rate(segment)
            features.append(np.mean(zcr))
            
            # 7-26. mfcc1 到 mfcc20
            mfcc = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=20)
            for j in range(20):
                features.append(np.mean(mfcc[j]))
            
            all_features.append(features)
        
        # 計算所有片段特徵的平均值
        mean_features = np.mean(all_features, axis=0)
        return mean_features.reshape(1, -1)
        
    except Exception as e:
        print(f"特徵提取失敗: {e}")
        return None

def predict_music_genre(audio_path, model, scaler):
    """預測音樂風格"""
    
    # 音樂類型標籤 (與訓練時相同順序)
    genres = ['blues', 'classical', 'country', 'disco', 'hiphop', 
              'jazz', 'metal', 'pop', 'reggae', 'rock']
    
    # 提取特徵
    features = extract_features_for_prediction(audio_path)
    
    if features is None:
        return None, None, None
    
    # 標準化特徵 (使用訓練時的 scaler)
    features_scaled = scaler.transform(features)
    
    # 轉換為 PyTorch tensor
    features_tensor = torch.tensor(features_scaled, dtype=torch.float32).to(device)
    
    # 預測
    model.eval()
    with torch.no_grad():
        outputs = model(features_tensor)
        probabilities = torch.softmax(outputs, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
        confidence = probabilities[0, predicted_class].item()
    
    predicted_genre = genres[predicted_class]
    all_probabilities = probabilities[0].cpu().numpy()
    
    return predicted_genre, confidence, all_probabilities

# 載入最佳模型
# 初始化模型
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GenreClassifier(input_size=X.shape[1]).to(device)
model.load_state_dict(torch.load('best_model.pth'))
model.eval()

GenreClassifier(
  (layers): Sequential(
    (0): Linear(in_features=26, out_features=1024, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.6, inplace=False)
    (3): Linear(in_features=1024, out_features=512, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.4, inplace=False)
    (6): Linear(in_features=512, out_features=256, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.4, inplace=False)
    (9): Linear(in_features=256, out_features=128, bias=True)
    (10): ReLU()
    (11): Dropout(p=0.4, inplace=False)
    (12): Linear(in_features=128, out_features=10, bias=True)
  )
)

### 載入MusicGen
### Load MusicGen Model

In [17]:
from transformers import AutoProcessor, MusicgenForConditionalGeneration
import scipy.io.wavfile

# 載入模型和處理器
processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
musicgen = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
print("✓ MusicGen 模型載入成功")
print(f"模型類型: {type(model)}")
print(f"處理器類型: {type(processor)}")


Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summ

✓ MusicGen 模型載入成功
模型類型: <class '__main__.GenreClassifier'>
處理器類型: <class 'transformers.models.musicgen.processing_musicgen.MusicgenProcessor'>


### Prompt 字典 (未完成!)
MusicGen接收使用者的prompt生成音樂，需使用合適的prompt才可生成指定風格(紀錄在style_test，名稱後有括號的代表預測的風格錯誤，需要修改prompt)，相同prompt容易生成一樣的音樂，需要更多不同prompt但能生成相同風格
### Prompt Dictionary (Unfinished!)
MusicGen generates music based on user prompts. To create music in a specific style, appropriate prompts are required. (Recorded in style_test; names with parentheses indicate incorrectly predicted styles, meaning the prompt needs modification.) The same prompt tends to generate similar music, so more varied prompts are needed to produce the same style.

In [27]:
def generate_optimized_prompts():
    """生成針對你的分類器優化的 prompt"""
    optimized_prompts = {
        'blues': """Traditional 12-bar blues with electric guitar string bending, 
           harmonica wailing, slow shuffle rhythm 65 BPM, minor pentatonic scale,
           NOT reggae off-beat, NOT Caribbean style, Delta blues tradition,
           melancholic and soulful, vintage tube amplifier tone""",
        
        'classical': "Classical string quartet, violin melody with cello bass line, piano accompaniment, baroque counterpoint style, concert hall reverb, elegant and refined, moderate tempo 100 BPM, D major key",
        
        'country': """Country ballad with acoustic guitar fingerpicking, pedal steel guitar slides,
            fiddle melody, storytelling vocal style, 3/4 waltz time signature,
            90 BPM, G major key, rural American atmosphere,
            NO electronic beats, NO distorted guitars, NO urban elements,
            Nashville production style, nostalgic and heartfelt""",
        
        'disco': """1970s disco with four-on-the-floor kick drum, orchestral string sections,
          syncopated bass guitar slap technique, brass stabs on beats 2 and 4,
          125 BPM dance tempo, Bb major key, Studio 54 atmosphere,
          NO hip-hop production, NO reggae off-beat, NO metal distortion,
          Saturday Night Fever era sound""",
        
        'hiphop': """Modern hip-hop with 808 kick drum, trap-influenced hi-hat rolls, 
            sub-bass frequencies, minimal harmonic content, 85 BPM tempo,
            NOT disco strings, NOT four-on-the-floor pattern,
            urban street atmosphere, quantized rhythm programming""",
        
        'jazz': "Jazz standard with tenor saxophone lead, piano comping, walking upright bass, swing drum pattern, sophisticated chord changes, improvisational feel, 130 BPM, F major",
        
        'metal': "Heavy metal with palm-muted electric guitars, power chord progressions, double bass drum, aggressive and intense, fast tempo 150 BPM, E minor, high-gain distortion",
        
        'pop': "Contemporary pop with synthesizer lead melody, clean electric guitar arpeggios, steady four-four drum beat, bright and commercial production, 110 BPM, C major key",
        
        'reggae': "Classic reggae with off-beat guitar skank, bass emphasis on beats 1 and 3, one-drop drum pattern, relaxed tempo 75 BPM, Caribbean style, A minor key",
        
        'rock': """Classic rock anthem with clean electric guitar power chords, live drum kit,
            driving eighth-note rhythm, stadium atmosphere, 135 BPM, A major key,
            NO hip-hop beats, NO electronic elements, NO palm muting,
            vintage Marshall amplifier tone, energetic and uplifting"""
    }
    
    return optimized_prompts
prompt = generate_optimized_prompts()
prompt['jazz']

'Jazz standard with tenor saxophone lead, piano comping, walking upright bass, swing drum pattern, sophisticated chord changes, improvisational feel, 130 BPM, F major'

In [28]:
genres = ['blues', 'classical', 'country', 'disco', 'hiphop', 
          'jazz', 'metal', 'pop', 'reggae', 'rock']

genre_dict = {i+1: genre for i, genre in enumerate(genres)}

print("音樂風格編號對照表：")
for num, genre in genre_dict.items():
    print(f"{num:2d}. {genre:>10}")

音樂風格編號對照表：
 1.      blues
 2.  classical
 3.    country
 4.      disco
 5.     hiphop
 6.       jazz
 7.      metal
 8.        pop
 9.     reggae
10.       rock


### 生成音樂與預測風格系統
根據編號選擇生成的風格，並由分辨器預測風格是否正確(duration_tokens=1503 為30秒,生成約5-10分鐘,可改成更短測試)
### Music Generation and Style Prediction System
Select the desired generation style by its ID, and the discriminator will predict whether the generated style is correct. (A duration_tokens value of 1503 corresponds to 30 seconds; generation typically takes 5-10 minutes, but can be shortened for testing.)

In [31]:
def generate_and_predict(prompt, filename, duration_tokens=1503):
    """生成音樂並進行風格預測"""
    # 獲取當前時間並格式化
    from datetime import datetime
    current_time = datetime.now().strftime("%H%M")
    
    # 在檔案名稱中加入時間戳記
    filename_with_time = f"{filename.rsplit('.', 1)[0]}_{current_time}.wav"
    
    # 生成音樂
    inputs = processor(
        text=[prompt],
        padding=True,
        return_tensors="pt",
    )
    
    audio_values = musicgen.generate(**inputs, max_new_tokens=duration_tokens)
    sampling_rate = musicgen.config.audio_encoder.sampling_rate
    
    # 保存檔案
    scipy.io.wavfile.write(filename_with_time, 
                          rate=sampling_rate, 
                          data=audio_values[0, 0].numpy())
    print(f"已保存: {filename_with_time}")
    
    # 進行風格預測
    predicted_genre, confidence, all_probs = predict_music_genre(filename_with_time, model, scaler)
    
    if predicted_genre is not None:
        print("\n=== 音樂風格預測結果 ===")
        print(f"預測風格: {predicted_genre}")
        print(f"信心度: {confidence:.4f}")
        print()
        
        print("所有風格的機率分布:")
        for i, (genre, prob) in enumerate(zip(genres, all_probs)):
            print(f"  {genre:>10}: {prob:.4f} ({'█' * int(prob * 20)})")
        
        # 顯示前3名預測結果
        top_3_indices = all_probs.argsort()[-3:][::-1]
        print(f"\n前3名預測結果:")
        for i, idx in enumerate(top_3_indices):
            print(f"  {i+1}. {genres[idx]:>10}: {all_probs[idx]:.4f}")
            
    else:
        print("預測失敗，請檢查音檔路徑和格式")

# 使用範例
folder_path = 'music'
music_style = 4
file_name = genre_dict[music_style]  # 選擇第二個風格
audio_path = f'{folder_path}/{file_name}.wav'

In [32]:
duration_dict = {
    'test' : 151,    # 測試用
    'short': 453,    # 9秒 × 50 = 450 tokens
    'medium': 753,   # 15秒 × 50 = 750 tokens
    'long': 1503     # 30秒 × 50 = 1500 tokens
}
t, s , m , l = duration_dict.values()
print(t,s,m,l)

151 453 753 1503


In [33]:
generate_and_predict(prompt[file_name], audio_path, duration_tokens=l)

已保存: music/disco_2100.wav

=== 音樂風格預測結果 ===
預測風格: hiphop
信心度: 0.8305

所有風格的機率分布:
       blues: 0.0000 ()
   classical: 0.0000 ()
     country: 0.0000 ()
       disco: 0.0747 (█)
      hiphop: 0.8305 (████████████████)
        jazz: 0.0000 ()
       metal: 0.0002 ()
         pop: 0.0001 ()
      reggae: 0.0945 (█)
        rock: 0.0000 ()

前3名預測結果:
  1.     hiphop: 0.8305
  2.     reggae: 0.0945
  3.      disco: 0.0747


### 音樂風格預測系統
單純用來預測指定音樂的風格
### Music Genre Prediction System
This system is solely for predicting the genre of a specified piece of music.

classical, hiphop, raggae, metal, country

In [35]:
# 進行預測
test_audio_path = r'style_test\country.wav'
predicted_genre, confidence, all_probs = predict_music_genre(test_audio_path, model, scaler)

if predicted_genre is not None:
    print("=== 音樂風格預測結果 ===")
    print(f"預測風格: {predicted_genre}")
    print(f"信心度: {confidence:.4f}")
    print()
    
    print("所有風格的機率分布:")
    for i, (genre, prob) in enumerate(zip(genres, all_probs)):
        print(f"  {genre:>10}: {prob:.4f} ({'█' * int(prob * 20)})")
    
    # 顯示前3名預測結果
    top_3_indices = all_probs.argsort()[-3:][::-1]
    print(f"\n前3名預測結果:")
    for i, idx in enumerate(top_3_indices):
        print(f"  {i+1}. {genres[idx]:>10}: {all_probs[idx]:.4f}")
        
else:
    print("預測失敗，請檢查音檔路徑和格式")

=== 音樂風格預測結果 ===
預測風格: country
信心度: 0.8971

所有風格的機率分布:
       blues: 0.0036 ()
   classical: 0.0001 ()
     country: 0.8971 (█████████████████)
       disco: 0.0010 ()
      hiphop: 0.0010 ()
        jazz: 0.0002 ()
       metal: 0.0001 ()
         pop: 0.0002 ()
      reggae: 0.0961 (█)
        rock: 0.0008 ()

前3名預測結果:
  1.    country: 0.8971
  2.     reggae: 0.0961
  3.      blues: 0.0036
