# **Voice Gender Classification using AI**
#### **Author:** Emmanuel Oludare Ejifolabi
#### **Date:** July 2024

In [22]:
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from pydub import AudioSegment
import soundfile as sf
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [23]:
def convert_to_wav(path):
    ext = os.path.splitext(path)[1].lower()
    if ext != '.wav':
        sound = AudioSegment.from_file(path)
        wav_path = path.replace(ext, '.wav')
        sound.export(wav_path, format='wav')
        os.remove(path)
        return wav_path
    return path

In [24]:
def preprocess_audio(file_path, target_sample_rate=16000, duration=3):
    try:
        file_path = convert_to_wav(file_path)
        y, sr = librosa.load(file_path, sr=target_sample_rate)

        if len(y) < duration * sr:
            return None

        y = librosa.util.fix_length(y, size=duration * sr)
        return y
    except Exception as e:
        print(f"Error: {file_path} – {e}")
        return None

## FEATURE EXTRACTION

In [25]:
def extract_features(signal, sr=16000):
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13)
    chroma = librosa.feature.chroma_stft(y=signal, sr=sr)
    contrast = librosa.feature.spectral_contrast(y=signal, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y=signal)
    rms = librosa.feature.rms(y=signal)

    features = np.hstack([
        mfcc.mean(axis=1),
        chroma.mean(axis=1),
        contrast.mean(axis=1),
        zcr.mean(),
        rms.mean()
    ])
    return features

## DATASET LOADING

In [15]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
male_path = "/content/drive/MyDrive/DSP_PROJECTS/Gender_Voice_Classification/Gender_Dataset/data/male"
female_path = "/content/drive/MyDrive/DSP_PROJECTS/Gender_Voice_Classification/Gender_Dataset/data/female"

In [26]:
X, y = [], []

for folder, label in [(male_path, 0), (female_path, 1)]:
    for file in os.listdir(folder):
        full_path = os.path.join(folder, file)
        signal = preprocess_audio(full_path)
        if signal is not None:
            features = extract_features(signal)
            X.append(features)
            y.append(label)

X = np.array(X)
y = np.array(y)

print(f"Loaded {len(X)} samples.")


Loaded 9482 samples.


In [27]:
import pandas as pd
signal = pd.DataFrame(X)
signal['label'] = y
print(signal.head())

            0           1          2          3          4          5  \
0 -252.147903  143.172363 -22.073370  26.665802   1.324313  -8.355433   
1 -251.239151   93.305138  -0.599273  44.870392  -6.116223   5.346209   
2 -230.780579   98.988564  12.839869  32.228050  -7.325163  14.489880   
3 -245.055069  109.032455   7.127656  16.977098 -12.258371  17.102945   
4 -237.725983  122.820351 -11.470632  34.082802   0.184801   0.806481   

           6          7         8          9  ...         25         26  \
0   6.371988 -12.993995 -5.271060   2.132325  ...  21.602986  18.287081   
1  -9.358086  -5.566604 -6.655022 -15.035367  ...  27.839877  19.168195   
2 -15.094665  -5.588855 -5.266789  -6.517415  ...  25.185741  17.870519   
3  -3.004932  -5.263079 -7.338573  -1.731696  ...  24.420761  17.261792   
4   4.998673 -12.071196  1.612485   3.892998  ...  19.139640  15.945619   

          27         28         29         30         31        32        33  \
0  19.564771  18.301664  16.65

In [29]:
signal.duplicated().sum()

np.int64(506)

In [30]:
signal.drop_duplicates(inplace=True)

In [31]:
signal['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,5924
1,3052


In [32]:
X = signal.drop('label', axis=1)
y = signal['label']

## DATA BALANCING

In [33]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=42)

X, y = smote.fit_resample(X, y)

In [34]:
print(X.shape)
print(y.shape)

(11848, 34)
(11848,)


## MODEL TRAINING

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [36]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(9478, 34)
(2370, 34)
(9478,)
(2370,)


In [37]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [38]:
pred = model.predict(X_test)

In [39]:
print(confusion_matrix(y_test, pred))

[[1182    3]
 [   9 1176]]


In [40]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1185
           1       1.00      0.99      0.99      1185

    accuracy                           0.99      2370
   macro avg       0.99      0.99      0.99      2370
weighted avg       0.99      0.99      0.99      2370



## SAVING MODEL

In [43]:
MODEL_PATH = "/content/drive/MyDrive/DSP_PROJECTS/Gender_Voice_Classification/voice_gender_model.pkl"

joblib.dump(model, MODEL_PATH)
print("Model saved to voice_gender_model.pkl")

Model saved to voice_gender_model.pkl
