In [1]:
import os
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf




In [2]:
data_path = 'data/FULL'

In [3]:
voice_type_mapping = {
    'female1': 'soprano', 'female2': 'soprano', 'female3': 'soprano', 'female4': 'soprano',
    'female5': 'mezzo_soprano', 'female6': 'soprano', 'female7': 'soprano', 'female8': 'mezzo_soprano', 'female9': 'soprano',
    'male1': 'baritone', 'male2': 'tenor', 'male3': 'tenor', 'male4': 'bass',
    'male5': 'baritone', 'male6': 'baritone', 'male7': 'tenor', 'male8': 'bass',
    'male9': 'tenor', 'male10': 'bass', 'male11': 'tenor'
}

In [4]:
def load_and_preprocess_data(data_path, voice_type_mapping, sample_rate=22050, duration=5):
    X = []
    y = []
    for singer in os.listdir(data_path):
        singer_path = os.path.join(data_path, singer)
        if os.path.isdir(singer_path) and singer in voice_type_mapping:
            voice_type = voice_type_mapping[singer]
            for category in ['arpeggios', 'excerpts', 'long_tones', 'scales']:
                category_path = os.path.join(singer_path, category)
                if os.path.isdir(category_path):
                    for technique in os.listdir(category_path):
                        technique_path = os.path.join(category_path, technique)
                        if os.path.isdir(technique_path):
                            for file in os.listdir(technique_path):
                                if file.endswith('.wav'):
                                    file_path = os.path.join(technique_path, file)
                                    audio, _ = librosa.load(file_path, sr=sample_rate, duration=duration)
                                    if len(audio) < sample_rate * duration:
                                        audio = np.pad(audio, (0, sample_rate * duration - len(audio)))
                                    else:
                                        audio = audio[:sample_rate * duration]
                                    mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
                                    X.append(mfcc)
                                    y.append(voice_type)
    return np.array(X), np.array(y)

In [5]:
X, y = load_and_preprocess_data(data_path, voice_type_mapping)

In [6]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [8]:
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2], 1)

In [9]:
def build_model(input_shape, num_classes):
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    return model

In [10]:
model = build_model(X_train.shape[1:], len(np.unique(y_encoded)))





In [12]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])




In [13]:
history = model.fit(X_train, y_train, epochs=10, validation_split=0.2)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)
print(f'\nTest accuracy: {test_acc}')

23/23 - 1s - loss: 0.6553 - accuracy: 0.8465 - 603ms/epoch - 26ms/step

Test accuracy: 0.8464730381965637


In [16]:
import joblib

In [20]:
joblib.dump(model,'vocal_range_classifier.pkl')

['vocal_range_classifier.pkl']

In [21]:
model.save('vocal_range_classifier.h5')

  saving_api.save_model(


In [18]:
joblib.dump(label_encoder,'label_encoder.pkl')

['label_encoder.pkl']