# **Voice Gender Classification using AI**
#### **Author:** Emmanuel Oludare Ejifolabi
#### **Date:** July 2024

In [10]:
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from pydub import AudioSegment
import soundfile as sf
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [12]:
def convert_to_wav(path):
    """Convert an audio file to WAV, replacing the original, and return the WAV path.

    Files whose extension is already ``.wav`` (compared case-insensitively)
    are returned untouched. Any other file is decoded with
    ``AudioSegment.from_file``, exported as ``<same name>.wav`` next to the
    original, and the original file is then deleted.

    Args:
        path: Path to the audio file.

    Returns:
        The path of the WAV file (``path`` itself when no conversion was needed).
    """
    root, ext = os.path.splitext(path)
    if ext.lower() == '.wav':
        return path

    # Build the target from the split root instead of ``path.replace(ext, ...)``.
    # A string replace with the lower-cased extension (a) leaves "clip.MP3"
    # unchanged, so the export overwrote the source which was then deleted,
    # (b) rewrites the extension text anywhere it occurs in the path, and
    # (c) mangles extension-less names.
    wav_path = root + '.wav'

    sound = AudioSegment.from_file(path)
    sound.export(wav_path, format='wav')
    # Destructive by design: the source is removed only after the export succeeded.
    os.remove(path)
    return wav_path

In [13]:
def preprocess_audio(file_path, target_sample_rate=16000, duration=3):
    """Load an audio file and return its first ``duration`` seconds, or None.

    The file is first passed through ``convert_to_wav`` and then loaded with
    ``librosa.load`` at ``target_sample_rate``. Clips shorter than
    ``duration`` seconds are rejected; longer clips are cut to exactly
    ``duration`` seconds' worth of samples via ``librosa.util.fix_length``.

    Args:
        file_path: Path to the audio file.
        target_sample_rate: Sample rate passed to ``librosa.load``.
        duration: Required clip length in seconds (may be fractional).

    Returns:
        The fixed-length signal, or None when the clip is too short or any
        step raised (the error is printed, not propagated).
    """
    try:
        file_path = convert_to_wav(file_path)
        y, sr = librosa.load(file_path, sr=target_sample_rate)

        # fix_length needs an integer sample count; ``duration * sr`` is a
        # float for fractional durations, which previously raised inside the
        # try block and silently turned every file into None.
        n_samples = int(round(duration * sr))

        if len(y) < n_samples:
            return None

        return librosa.util.fix_length(y, size=n_samples)
    except Exception as e:
        # Best-effort loader: report the failing file and let the caller skip it.
        print(f"Error: {file_path} – {e}")
        return None

## FEATURE EXTRACTION

In [14]:
def extract_features(signal, sr=16000):
    """Summarise an audio signal as a single flat feature vector.

    Five librosa descriptors are computed. The MFCC (13 coefficients),
    chroma and spectral-contrast matrices are averaged along axis 1, giving
    one value per row; zero-crossing rate and RMS are each reduced to one
    overall mean. All parts are concatenated in that order.

    Args:
        signal: Audio samples, passed to librosa as ``y``.
        sr: Sample rate of ``signal``.

    Returns:
        1-D numpy array of the concatenated means.
    """
    feat = librosa.feature

    # Descriptors kept as one mean per row.
    row_wise = [
        feat.mfcc(y=signal, sr=sr, n_mfcc=13),
        feat.chroma_stft(y=signal, sr=sr),
        feat.spectral_contrast(y=signal, sr=sr),
    ]
    # Descriptors collapsed to a single scalar each.
    overall = [
        feat.zero_crossing_rate(y=signal),
        feat.rms(y=signal),
    ]

    parts = [matrix.mean(axis=1) for matrix in row_wise]
    parts.extend(matrix.mean() for matrix in overall)
    return np.hstack(parts)

## DATASET LOADING

In [16]:
# Colab-only setup: mount Google Drive at /content/drive so the dataset and
# model paths used later in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
# Folders holding the recordings for each class, located on the mounted Drive.
# The loading loop labels files from male_path as 0 and from female_path as 1.
male_path = "/content/drive/MyDrive/DSP_PROJECTS/Gender_Voice_Classification/Gender_Dataset/data/male"
female_path = "/content/drive/MyDrive/DSP_PROJECTS/Gender_Voice_Classification/Gender_Dataset/data/female"

In [18]:
# Build the feature matrix X and label vector y from the two class folders.
X, y = [], []

# Label encoding: 0 = male, 1 = female.
for folder, label in [(male_path, 0), (female_path, 1)]:
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore the seeded train/test split) reproducible.
    for file in sorted(os.listdir(folder)):
        full_path = os.path.join(folder, file)
        # Skip sub-directories and other non-file entries instead of handing
        # them to the audio converter.
        if not os.path.isfile(full_path):
            continue
        signal = preprocess_audio(full_path)
        # preprocess_audio returns None for clips that are too short or unreadable.
        if signal is not None:
            features = extract_features(signal)
            X.append(features)
            y.append(label)

X = np.array(X)
y = np.array(y)

print(f"Loaded {len(X)} samples.")


Loaded 9482 samples.


## MODEL TRAINING

In [19]:
# Hold out 20% of the samples for evaluation; the split is stratified on the
# labels and seeded so it can be repeated.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Seeded random forest with 100 trees, fitted on the training portion only.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

In [28]:
# Predict labels for the held-out test features.
pred = model.predict(X_test)

In [29]:
# Confusion matrix of test labels vs. predictions (label 0 = male, 1 = female).
# NOTE(review): rows are presumably true labels and columns predicted labels,
# per scikit-learn's convention — confirm against the docs.
print(confusion_matrix(y_test, pred))

[[1213    2]
 [   9  673]]


In [30]:
# Per-class precision / recall / F1 on the test set (label 0 = male, 1 = female).
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1215
           1       1.00      0.99      0.99       682

    accuracy                           0.99      1897
   macro avg       0.99      0.99      0.99      1897
weighted avg       0.99      0.99      0.99      1897



## SAVING MODEL

In [27]:
# Destination for the serialized classifier on the mounted Drive.
MODEL_PATH = "/content/drive/MyDrive/DSP_PROJECTS/Gender_Voice_Classification/voice_gender_model.pkl"

# Persist the fitted model with joblib.
# NOTE(review): joblib files are pickle-based; only load this file from a trusted location.
joblib.dump(model, MODEL_PATH)
# The message shows only the file name, not the full MODEL_PATH.
print("Model saved to voice_gender_model.pkl")

Model saved to voice_gender_model.pkl
