In [2]:
import os
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from keras.models import Sequential, load_model
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from keras.utils import to_categorical
import librosa
from pydub import AudioSegment
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import json






In [2]:
def extract_features(file_path):
    try:
        # Load MP3 file and convert to WAV
        audio = AudioSegment.from_mp3(file_path)
        audio = audio.set_channels(1)  # Convert stereo to mono
        audio.export("temp.wav", format="wav")
        audio, _ = librosa.load("temp.wav", res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=22050, n_mfcc=13)
        chroma = librosa.feature.chroma_stft(y=audio, sr=22050)
        spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=22050)
        tonnetz = librosa.feature.tonnetz(y=audio, sr=22050)
        features = np.vstack([mfccs, chroma, spectral_contrast, tonnetz])
        mean_features = np.mean(features.T, axis=0)
        return mean_features
    except Exception as e:
        print(f"Error encountered while parsing file '{file_path}': {e}")
        return None

# Load Data
data = []
labels = []
genres = os.listdir('./Data/genres_original')
# Extract features and labels
for genre in genres:
    genre_path = os.path.join('./Data/genres_original', genre)
    for file in os.listdir(genre_path):
        file_path = os.path.join(genre_path, file)
        feature = extract_features(file_path)
        if feature is not None:
            data.append(feature)
            labels.append(genre)


In [3]:
# Convert labels to numerical values
label_dict = {label: idx for idx, label in enumerate(set(labels))}
numeric_labels = np.array([label_dict[label] for label in labels])
# Convert data and labels to numpy arrays
X = np.array(data)
y = np.array(numeric_labels)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Reshape data for CNN input
X_train_cnn = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test_cnn = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
# Convert labels to one-hot encoding
y_train_onehot = to_categorical(y_train)
y_test_onehot = to_categorical(y_test)

with open('label_dict.json', 'w') as json_file:
    json.dump(label_dict, json_file)
np.save('y_test.npy', y_test)
np.save('X_test.npy', X_test)
np.save('X_test_cnn.npy', X_test_cnn)

In [4]:
# Build Improved CNN Model
def build_cnn_model(input_shape, filters=64, kernel_size=3, dropout_rate=0.5, l2_reg=0.001):
    model = Sequential()
    model.add(Conv1D(filters, kernel_size, activation='relu', input_shape=input_shape))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(filters*2, kernel_size, activation='relu'))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(filters*4, kernel_size, activation='relu'))
    model.add(MaxPooling1D(2))
    model.add(Flatten())
    model.add(Dense(256, activation='relu', kernel_regularizer='l2'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(len(label_dict), activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

improved_cnn_model = build_cnn_model(input_shape=(X_train_cnn.shape[1], 1))
improved_cnn_model.fit(X_train_cnn, y_train_onehot, epochs=50, batch_size=64, validation_data=(X_test_cnn, y_test_onehot))

improved_cnn_model.save('improved_cnn_model.h5')




Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [5]:
# SVM Classifier with Grid Search
svm_param_grid = {'C': [0.1, 1, 10, 100],
                  'kernel': ['linear', 'rbf', 'poly'],
                  'gamma': ['scale', 'auto']}

svm_grid_search = GridSearchCV(SVC(), svm_param_grid, cv=3)
svm_grid_search.fit(X_train.reshape(X_train.shape[0], -1), y_train)
best_svm_params = svm_grid_search.best_params_
svm_classifier_optimized = SVC(**best_svm_params)
svm_classifier_optimized.fit(X_train.reshape(X_train.shape[0], -1), y_train)

# Make predictions with optimized SVM
svm_predictions_optimized = svm_classifier_optimized.predict(X_test.reshape(X_test.shape[0], -1))
svm_accuracy_optimized = accuracy_score(y_test, svm_predictions_optimized)
print(f"Optimized SVM Model Accuracy: {svm_accuracy_optimized}")
joblib.dump(svm_classifier_optimized, 'optimized_svm_model.joblib')

Optimized SVM Model Accuracy: 0.615


['optimized_svm_model.joblib']

In [6]:
# Random Forest Classifier with Grid Search
rf_param_grid = {'n_estimators': [50, 100, 150],
                 'max_depth': [None, 10, 20, 30],
                 'min_samples_split': [2, 5, 10],
                 'min_samples_leaf': [1, 2, 4]}

rf_grid_search = GridSearchCV(RandomForestClassifier(), rf_param_grid, cv=3)
rf_grid_search.fit(X_train.reshape(X_train.shape[0], -1), y_train)
best_rf_params = rf_grid_search.best_params_
rf_classifier_optimized = RandomForestClassifier(**best_rf_params)
rf_classifier_optimized.fit(X_train.reshape(X_train.shape[0], -1), y_train)

# Make predictions with optimized Random Forest
rf_predictions_optimized = rf_classifier_optimized.predict(X_test.reshape(X_test.shape[0], -1))
rf_accuracy_optimized = accuracy_score(y_test, rf_predictions_optimized)
print(f"Optimized Random Forest Model Accuracy: {rf_accuracy_optimized}")
joblib.dump(rf_classifier_optimized, 'optimized_rf_model.joblib')

Optimized Random Forest Model Accuracy: 0.66


['optimized_rf_model.joblib']

In [7]:
# Load models
# ...

# Load models
svm_classifier_optimized = joblib.load('optimized_svm_model.joblib')
rf_classifier_optimized = joblib.load('optimized_rf_model.joblib')
improved_cnn_model = load_model('improved_cnn_model.h5')

# Make predictions with the individual models
svm_predictions_optimized = svm_classifier_optimized.predict(X_test.reshape(X_test.shape[0], -1))
rf_predictions_optimized = rf_classifier_optimized.predict(X_test.reshape(X_test.shape[0], -1))
cnn_predictions_optimized_probs = improved_cnn_model.predict(X_test_cnn)

# Extract the predicted classes from the probabilities
cnn_predictions_optimized = np.argmax(cnn_predictions_optimized_probs, axis=1)

# Stack the predictions vertically
ensemble_predictions_optimized = np.vstack([svm_predictions_optimized, rf_predictions_optimized, cnn_predictions_optimized])


# Experiment with adjusting weights for each model in majority voting
weights = [2, 1, 1]  # Adjust these weights based on individual model performance

# Use weighted majority voting to determine the final prediction
weighted_majority_voting_predictions_optimized = np.apply_along_axis(
    lambda x: np.argmax(np.bincount(x, weights=weights)), axis=0, arr=ensemble_predictions_optimized)

# Evaluate the ensemble accuracy
ensemble_accuracy_optimized = accuracy_score(y_test, weighted_majority_voting_predictions_optimized)
print(f"Optimized Ensemble Model Accuracy: {ensemble_accuracy_optimized}")

Optimized Ensemble Model Accuracy: 0.63


In [4]:
def extract_features(file_path):
    try:
        # Load MP3 file and convert to WAV
        audio = AudioSegment.from_mp3(file_path)
        audio = audio.set_channels(1)  # Convert stereo to mono
        audio.export("temp.wav", format="wav")
        audio, _ = librosa.load("temp.wav", res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=22050, n_mfcc=13)
        chroma = librosa.feature.chroma_stft(y=audio, sr=22050)
        spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=22050)
        tonnetz = librosa.feature.tonnetz(y=audio, sr=22050)
        features = np.vstack([mfccs, chroma, spectral_contrast, tonnetz])
        mean_features = np.mean(features.T, axis=0)
        return mean_features
    except Exception as e:
        print(f"Error encountered while parsing file '{file_path}': {e}")
        return None
# Load label_dict
with open('label_dict.json', 'r') as json_file:
    label_dict = json.load(json_file)

# Load X_test
X_test = np.load('X_test.npy')
X_test_cnn = np.load('X_test_cnn.npy')
# Load models
def optimize_cnn_model(testing_feature, svm_classifier_optimized, rf_classifier_optimized, improved_cnn_model, normalized_weights, label_dict):
    result_dict = {}
    # Reshape features for CNN input
    testing_feature_cnn = testing_feature.reshape(1, testing_feature.shape[0], 1)
    # Make predictions with the individual models
    svm_prediction = svm_classifier_optimized.predict(testing_feature.reshape(1, -1))
    rf_prediction = rf_classifier_optimized.predict(testing_feature.reshape(1, -1))
    cnn_prediction_probs = improved_cnn_model.predict(testing_feature_cnn)
    cnn_prediction = np.argmax(cnn_prediction_probs, axis=1)
    # Ensemble: Weighted Voting with normalized weights
    ensemble_prediction_probs = (
        normalized_weights[0] * to_categorical(svm_prediction, num_classes=len(label_dict)) +
        normalized_weights[1] * to_categorical(rf_prediction, num_classes=len(label_dict)) +
        normalized_weights[2] * cnn_prediction_probs
    )
    # Normalize ensemble predictions to ensure they sum up to 1
    normalized_ensemble_probs = ensemble_prediction_probs / sum(ensemble_prediction_probs[0])
    weighted_majority_voting_prediction = np.argmax(normalized_ensemble_probs)
    predicted_genre = list(label_dict.keys())[weighted_majority_voting_prediction]
    result_dict["predicted_genre"] = predicted_genre
    # Store the predicted percentages in the result dictionary
    result_dict["predicted_percentages"] = {genre: percentage.item() * 100 for genre, percentage in zip(label_dict.keys(), normalized_ensemble_probs[0])}
    return json.dumps(result_dict, indent=2)

def evaluate_cnn_model(testing_feature, loaded_model, label_dict):
    result_dict = {}
    # Check if testing_feature is not None
    if testing_feature is not None:
        print(f"Shape of extracted features: {testing_feature.shape}")
        # Reshape features for CNN input
        testing_feature_cnn = testing_feature.reshape(1, testing_feature.shape[0], 1)
        # Use the model to predict the genre
        prediction = loaded_model.predict(testing_feature_cnn)
        # Get the predicted percentages for each genre
        predicted_percentages = (prediction * 100).tolist()[0]
        # Create a list of tuples with genre and its percentage
        genre_percentage_list = [(genre, percentage) for genre, percentage in zip(label_dict.keys(), predicted_percentages)]
        # Sort the list based on percentage in descending order
        genre_percentage_list.sort(key=lambda x: x[1], reverse=True)
        # Store the predicted genre and percentage in the result dictionary
        result_dict["predicted_genre"] = genre_percentage_list[0][0]
        result_dict["predicted_percentages"] = {genre: percentage for genre, percentage in genre_percentage_list}
    else:
        result_dict["error_message"] = "Error extracting features from 'testing.wav'"

    return json.dumps(result_dict, indent=2)


svm_classifier_optimized = joblib.load('optimized_svm_model.joblib')
rf_classifier_optimized = joblib.load('optimized_rf_model.joblib')
improved_cnn_model = load_model('improved_cnn_model.h5')

# Make predictions with the individual models
svm_predictions_optimized = svm_classifier_optimized.predict(X_test.reshape(X_test.shape[0], -1))
rf_predictions_optimized = rf_classifier_optimized.predict(X_test.reshape(X_test.shape[0], -1))
cnn_predictions_optimized_probs = improved_cnn_model.predict(X_test_cnn)
cnn_predictions_optimized = np.argmax(cnn_predictions_optimized_probs, axis=1)
# Ensemble: Weighted Voting
weights = [0.2, 0.2, 0.6]  # Adjust these weights based on individual model performance
# Normalize weights to ensure they sum up to 1
normalized_weights = np.array(weights) / sum(weights)

# Extract features from testing.wav
testing_file_path = './testing.wav'  # Replace with the actual path
testing_feature = extract_features(testing_file_path)

result_optimize_cnn = optimize_cnn_model(testing_feature, svm_classifier_optimized, rf_classifier_optimized, improved_cnn_model, normalized_weights, label_dict)
result_cnn = evaluate_cnn_model(testing_feature, improved_cnn_model, label_dict)
print(result_optimize_cnn)
print(result_cnn)






Shape of extracted features: (38,)
{
  "predicted_genre": "disco",
  "predicted_percentages": {
    "blues": 20.056311786174774,
    "hiphop": 5.588407814502716,
    "country": 0.14554032823070884,
    "rock": 0.2600289648398757,
    "pop": 0.6814057938754559,
    "jazz": 1.336020976305008,
    "metal": 0.014798181655351073,
    "disco": 70.73978185653687,
    "reggae": 0.7425688672810793,
    "classical": 0.4351396579295397
  }
}
{
  "predicted_genre": "disco",
  "predicted_percentages": {
    "disco": 84.56629180908203,
    "hiphop": 9.314011573791504,
    "jazz": 2.226701259613037,
    "reggae": 1.237614631652832,
    "pop": 1.1356761455535889,
    "classical": 0.7252326011657715,
    "rock": 0.4333815574645996,
    "country": 0.24256718158721924,
    "blues": 0.09384933859109879,
    "metal": 0.024663634598255157
  }
}
