In [None]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings(action='ignore')
import tqdm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
from sklearn.model_selection import train_test_split
import datetime


In [None]:
# Load the sample index: one row per audio sample, holding the path of its
# .npy file and its gender label.
df = pd.read_csv("/content/balanced-all.csv")
df.head()

Unnamed: 0,filename,gender
0,data/cv-other-train/sample-069205.npy,female
1,data/cv-valid-train/sample-063134.npy,female
2,data/cv-other-train/sample-080873.npy,female
3,data/cv-other-train/sample-105595.npy,female
4,data/cv-valid-train/sample-144613.npy,female


In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Peek at the last rows of the index to compare them with the first rows.
df.tail()

Unnamed: 0,filename,gender
86590,data/cv-valid-train/sample-171098.npy,male
86591,data/cv-other-train/sample-022864.npy,male
86592,data/cv-valid-train/sample-080933.npy,male
86593,data/cv-other-train/sample-012026.npy,male
86594,data/cv-other-train/sample-013841.npy,male


In [None]:
# Size of the dataset and of each gender class.
# Summing a boolean mask counts the rows where the comparison holds.
n_samples = df.shape[0]
n_male_samples = int((df['gender'] == 'male').sum())
n_female_samples = int((df['gender'] == 'female').sum())
print("Total samples:", n_samples)
print("Total male samples:", n_male_samples)
print("Total female samples:", n_female_samples)

Total samples: 86595
Total male samples: 53126
Total female samples: 33469


In [None]:
# Integer encoding of the gender labels used as the network's target.
label2int = dict(male=1, female=0)



In [None]:
!pip install keras

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
def split_data(X, y, test_size=0.1, valid_size=0.1):
    """Split (X, y) into training, validation and testing subsets.

    `test_size` is the fraction of the full data held out for testing.
    `valid_size` is the fraction of the *remaining* data (after the test
    split) that is held out for validation. Both splits use a fixed
    random_state of 7.

    Returns a dict with the keys X_train, X_valid, X_test, y_train,
    y_valid and y_test.
    """
    seed = 7
    # first cut: hold out the test set
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed
    )
    # second cut: carve the validation set out of what is left
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_rest, y_rest, test_size=valid_size, random_state=seed
    )
    return dict(
        X_train=X_train,
        X_valid=X_valid,
        X_test=X_test,
        y_train=y_train,
        y_valid=y_valid,
        y_test=y_test,
    )

In [None]:
# The CSV only stores the *paths* of the .npy files; feeding those strings to
# the network is what makes model.fit raise a ValueError. Load the array behind
# every path and stack them into one numeric matrix (one row per sample).
# Paths are used exactly as written in the CSV, i.e. relative to the current
# working directory.
# NOTE(review): assumes every .npy file holds a 1-D vector of the same length
# (np.stack raises otherwise) -- confirm against the feature-extraction step.
features = np.stack(
    [np.load(path) for path in tqdm.tqdm(df["filename"], desc="Loading features")]
)
# Keep X a DataFrame that shares df's index so its rows stay aligned with `y`.
X = pd.DataFrame(features, index=df.index)
y = df.gender.map(label2int)
# split the data into training, validation and testing sets
data = split_data(X, y, test_size=0.1, valid_size=0.1)

y.head()

0    0
1    0
2    0
3    0
4    0
Name: gender, dtype: int64

In [None]:
# Peek at the first training inputs; the index shows the original row ids after shuffling.
data["X_train"].head()

Unnamed: 0,filename
58658,data/cv-valid-train/sample-024816.npy
35215,data/cv-valid-train/sample-053299.npy
13305,data/cv-valid-train/sample-019566.npy
56092,data/cv-valid-train/sample-095984.npy
7231,data/cv-other-train/sample-121029.npy


In [None]:

# Peek at the last validation inputs.
data["X_valid"].tail()

Unnamed: 0,filename
66997,data/cv-valid-train/sample-015694.npy
67862,data/cv-valid-train/sample-053570.npy
37930,data/cv-valid-train/sample-016615.npy
69680,data/cv-other-train/sample-104370.npy
16780,data/cv-other-train/sample-039752.npy


In [None]:
def create_model(vector_length=256):
    """Build and compile the feed-forward gender classifier.

    The network takes a feature vector of length `vector_length`, passes it
    through five Dense layers (256, 256, 128, 128, 64 units), each followed by
    Dropout(0.3), and ends in a single sigmoid unit (0 = female, 1 = male).
    The first Dense layer has no activation; the other hidden layers use ReLU.

    The model is compiled with binary cross-entropy, the Adam optimizer and an
    accuracy metric; its summary is printed before it is returned.
    """
    dropout_rate = 0.3
    model = Sequential()
    # input layer: fixes the expected feature-vector length
    model.add(Dense(256, input_shape=(vector_length,)))
    model.add(Dropout(dropout_rate))
    # remaining hidden layers, each followed by dropout
    for units in (256, 128, 128, 64):
        model.add(Dense(units, activation="relu"))
        model.add(Dropout(dropout_rate))
    # single sigmoid output: probability of the "male" class
    model.add(Dense(1, activation="sigmoid"))
    # binary target -> binary cross-entropy
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    model.summary()
    return model

In [None]:
# Build the network for 128-dimensional inputs: the mel-spectrogram vector that
# extract_feature() produces for inference has 128 values (librosa's default
# number of mel bands), and the recorded summary (33024 parameters in the first
# layer = (128 + 1) * 256) was produced with this input size as well.
# NOTE(review): assumes the stored .npy training features are 128-long too -- confirm.
model = create_model(vector_length=128)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               33024     
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 256)               65792     
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_3 (Dense)             (None, 128)               1

In [None]:
# TensorBoard callback: every run logs into its own timestamped folder under "logs"
import os
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard = TensorBoard(log_dir=logdir)
# stop once the monitored quantity has not improved for 5 epochs and
# roll the model back to its best weights
early_stopping = EarlyStopping(patience=5, mode="min", restore_best_weights=True)

epochs = 100
batch_size = 64
# fit on the training split, validating on the validation split after each epoch
model.fit(
    data["X_train"],
    data["y_train"],
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(data["X_valid"], data["y_valid"]),
    callbacks=[tensorboard, early_stopping],
)

Epoch 1/100




ValueError: ignored

In [None]:
# Save the trained model to results/model.h5.
model.save("results/model.h5")

In [None]:
# Evaluate on the held-out testing split, which was not used for fitting or
# validation; evaluate() returns the loss followed by the compiled metric.
print(f"Evaluating the model using {len(data['X_test'])} samples...")
loss, accuracy = model.evaluate(data["X_test"], data["y_test"], verbose=0)
print(f"Loss: {loss:.4f}")
print(f"Accuracy: {accuracy*100:.2f}%")

In [None]:
# Load the TensorBoard notebook extension and open it on the "logs" directory
# that the training cell writes to.
%load_ext tensorboard
%tensorboard --logdir="logs"

In [None]:
import librosa
import numpy as np

def extract_feature(file_name, **kwargs):
    """Extract a 1-D feature vector from the audio file `file_name`.

    Supported keyword flags:
        mel: when truthy, append the mel spectrogram averaged over its
            time frames (one value per mel band).

    Returns a 1-D numpy array; it is empty when no feature flag is set.
    """
    mel = kwargs.get("mel")
    X, sample_rate = librosa.load(file_name)
    result = np.array([])

    if mel:
        # The signal must be passed by keyword: recent librosa releases reject
        # positional audio arguments, while `y=` works on old and new versions.
        mel_spectrogram = librosa.feature.melspectrogram(y=X, sr=sample_rate)
        # transpose to (frames, bands) and average over the frames axis
        mel_mean = np.mean(mel_spectrogram.T, axis=0)
        result = np.hstack((result, mel_mean))

    return result

In [None]:
 import pyaudio
 import os
 import wave
 import librosa
 import numpy as np
 from sys import byteorder
 from array import array
 from struct import pack



 # Amplitude level separating "silent" from "loud" samples (used by is_silent and trim)
 THRESHOLD = 500
 # Number of frames read from the stream per call; also passed as frames_per_buffer
 CHUNK_SIZE = 1024
 # Sample format handed to PyAudio when opening the stream
 FORMAT = pyaudio.paInt16
 # Sampling rate used for recording, for padding lengths and for the written WAV file
 RATE = 16000

 # record() stops once more than this many silent chunks were seen after sound started
 SILENCE = 30

 def is_silent(snd_data):
     """Return True when the largest sample value in `snd_data` is below THRESHOLD."""
     largest = max(snd_data)
     return largest < THRESHOLD

 def normalize(snd_data):
     """Scale `snd_data` so that its loudest sample reaches magnitude 16384.

     Every sample is multiplied by MAXIMUM / peak, where peak is the largest
     absolute sample value, and truncated to an int.

     Returns a new array('h'). Empty or all-zero input has no peak to scale
     to and is returned as an unchanged copy instead of raising.
     """
     MAXIMUM = 16384
     # default=0 makes empty input take the "nothing to scale" path below
     peak = max((abs(i) for i in snd_data), default=0)
     if peak == 0:
         # pure silence: dividing by the peak would raise ZeroDivisionError
         return array('h', snd_data)
     times = float(MAXIMUM) / peak

     return array('h', (int(i * times) for i in snd_data))

 def trim(snd_data):
     """Strip the quiet samples from both ends of `snd_data`.

     A sample counts as quiet while its absolute value does not exceed
     THRESHOLD. Returns a new array('h'); the input is not modified.
     """
     def _drop_quiet_head(samples):
         # Copy everything from the first loud sample onwards.
         kept = array('h')
         remaining = iter(samples)
         for sample in remaining:
             if abs(sample) > THRESHOLD:
                 kept.append(sample)
                 break
         # whatever follows the first loud sample is kept verbatim
         kept.extend(remaining)
         return kept

     # leading quiet section
     front_trimmed = _drop_quiet_head(snd_data)

     # trailing quiet section: flip, strip the (now leading) quiet part, flip back
     front_trimmed.reverse()
     both_trimmed = _drop_quiet_head(front_trimmed)
     both_trimmed.reverse()
     return both_trimmed

 def add_silence(snd_data, seconds):
     """Pad `snd_data` with `seconds` (float) of zero samples at both ends.

     The padding length is int(seconds*RATE) samples per side.
     Returns a new array('h').
     """
     pad_len = int(seconds*RATE)
     padded = array('h', [0]) * pad_len
     padded.extend(snd_data)
     padded.extend([0] * pad_len)
     return padded

 def record():
     """
     Record a word or words from the microphone and
     return the data as an array of signed shorts.

     Recording starts counting once a non-silent chunk is heard and stops
     after more than SILENCE silent chunks have been seen since then.
     Normalizes the audio, trims silence from the
     start and end, and pads with 0.5 seconds of
     blank sound to make sure VLC et al can play
     it without getting chopped off.

     Returns a (sample_width, samples) tuple.
     """
     p = pyaudio.PyAudio()
     try:
         stream = p.open(format=FORMAT, channels=1, rate=RATE,
                         input=True, output=True,
                         frames_per_buffer=CHUNK_SIZE)
         try:
             num_silent = 0
             snd_started = False

             r = array('h')

             while True:
                 # little endian, signed short
                 snd_data = array('h', stream.read(CHUNK_SIZE))
                 if byteorder == 'big':
                     snd_data.byteswap()
                 r.extend(snd_data)

                 silent = is_silent(snd_data)

                 if silent and snd_started:
                     num_silent += 1
                 elif not silent and not snd_started:
                     snd_started = True

                 if snd_started and num_silent > SILENCE:
                     break

             sample_width = p.get_sample_size(FORMAT)
         finally:
             # release the audio stream even if reading fails or is interrupted
             stream.stop_stream()
             stream.close()
     finally:
         p.terminate()

     r = normalize(r)
     r = trim(r)
     r = add_silence(r, 0.5)
     return sample_width, r

 def record_to_file(path):
     """Record from the microphone and write the result to `path` as a mono WAV file."""
     sample_width, data = record()
     # serialise the samples as little-endian signed 16-bit integers
     data = pack('<' + ('h'*len(data)), *data)

     # the context manager closes the file even if writing raises
     with wave.open(path, 'wb') as wf:
         wf.setnchannels(1)
         wf.setsampwidth(sample_width)
         wf.setframerate(RATE)
         wf.writeframes(data)


 # Interactive demo: record a short utterance from the microphone and classify
 # the speaker's gender with the Keras `model` trained earlier in this notebook.
 print("talk")
 file = "test.wav"
 # record the file (start talking)
 record_to_file(file)
 # extract features and reshape them into a batch containing one sample
 features = extract_feature(file, mel=True).reshape(1, -1)
 # predict the gender! The sigmoid output is the probability of "male"
 male_prob = model.predict(features)[0][0]
 female_prob = 1 - male_prob
 gender = "male" if male_prob > female_prob else "female"
 # show the result!
 print("Result:", gender)
 print(f"Probabilities:     Male: {male_prob*100:.2f}%    Female: {female_prob*100:.2f}%")