In [96]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math
import copy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Voice Gender
Gender Recognition by Voice and Speech Analysis

This database was created to identify a voice as male or female, based upon acoustic properties of the voice and speech. The dataset consists of 3,168 recorded voice samples, collected from male and female speakers. The voice samples are pre-processed by acoustic analysis in R using the seewave and tuneR packages, with an analyzed frequency range of 0 Hz–280 Hz (human vocal range).

## The Dataset
The following acoustic properties of each voice are measured and included within the CSV:

* meanfreq: mean frequency (in kHz)
* sd: standard deviation of frequency
* median: median frequency (in kHz)
* Q25: first quartile (in kHz)
* Q75: third quartile (in kHz)
* IQR: interquartile range (in kHz)
* skew: skewness (see note in specprop description)
* kurt: kurtosis (see note in specprop description)
* sp.ent: spectral entropy
* sfm: spectral flatness
* mode: mode frequency
* centroid: frequency centroid (see specprop)
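* peakf: peak frequency (frequency with highest energy) — listed in the original dataset description, but not present as a column in this CSV (which has 20 feature columns plus `label`)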
* meanfun: average of fundamental frequency measured across acoustic signal
* minfun: minimum fundamental frequency measured across acoustic signal
* maxfun: maximum fundamental frequency measured across acoustic signal
* meandom: average of dominant frequency measured across acoustic signal
* mindom: minimum of dominant frequency measured across acoustic signal
* maxdom: maximum of dominant frequency measured across acoustic signal
* dfrange: range of dominant frequency measured across acoustic signal
* modindx: modulation index. Calculated as the accumulated absolute difference between adjacent measurements of fundamental frequencies divided by the frequency range
* label: male or female

In [97]:
# Load the voice measurements; the path is relative to the notebook's working directory
voice = pd.read_csv('Resources/voice.csv')
# Preview the first rows to sanity-check the parsed columns
voice.head()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402906,0.893369,0.491918,...,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,0.0,0.0,male
1,0.066009,0.06731,0.040229,0.019414,0.092666,0.073252,22.423285,634.613855,0.892193,0.513724,...,0.066009,0.107937,0.015826,0.25,0.009014,0.007812,0.054688,0.046875,0.052632,male
2,0.077316,0.083829,0.036718,0.008701,0.131908,0.123207,30.757155,1024.927705,0.846389,0.478905,...,0.077316,0.098706,0.015656,0.271186,0.00799,0.007812,0.015625,0.007812,0.046512,male
3,0.151228,0.072111,0.158011,0.096582,0.207955,0.111374,1.232831,4.177296,0.963322,0.727232,...,0.151228,0.088965,0.017798,0.25,0.201497,0.007812,0.5625,0.554688,0.247119,male
4,0.13512,0.079146,0.124656,0.07872,0.206045,0.127325,1.101174,4.333713,0.971955,0.783568,...,0.13512,0.106398,0.016931,0.266667,0.712812,0.007812,5.484375,5.476562,0.208274,male


In [103]:
# Separate the target column (y) from the feature columns (X)
y = voice["label"]
X = voice.drop(columns="label")
print(X.shape, y.shape)

(3168, 20) (3168,)


In [104]:
# Encode the label as 1 for 'male' and 0 for anything else.
# y is derived from `voice` rather than from the previous value of `y`, so this
# cell can be re-run safely after `y` has already been replaced by an array.
y = (voice["label"] == "male").astype(np.int64).to_numpy()

# Convert the feature table to a plain NumPy array.
# np.asarray is a no-op when X is already an array, which keeps the cell idempotent.
X = np.asarray(X)

Split data into training and testing

In [106]:
# Hold out a test set. stratify=y keeps the class proportions the same in both
# splits, and random_state=1 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [107]:
# Fit a StandardScaler on the training features only, so that no statistics
# from the test set influence the scaling
X_scaler = StandardScaler()
X_scaler.fit(X_train)

# Apply the same training-derived scaling to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

Create a Logistic Regression Model

In [110]:
#Compute the sigmoid of z
def sigmoid(z):
    """Return the logistic function 1 / (1 + e^(-z)), element-wise.

    Args:
        z: a scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an ndarray with the shape of z.
        Floating-point inputs keep their dtype; other inputs are computed as float64.
    """
    z = np.asarray(z)
    if not np.issubdtype(z.dtype, np.floating):
        z = z.astype(np.float64)

    # Only ever exponentiate a non-positive number, so exp() cannot overflow
    # (the naive 1/(1+exp(-z)) overflows with a RuntimeWarning for very negative z).
    decay = np.exp(-np.abs(z))

    # z >= 0:  1 / (1 + e^-z)
    # z <  0:  e^z / (1 + e^z)   (algebraically the same value)
    g = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))

    # Indexing with () turns a 0-d array back into a scalar and leaves
    # higher-dimensional arrays unchanged.
    return g[()]

In [111]:
#Computes the cost over all examples
def compute_cost(X, y, w, b, lambda_):
    """Regularized logistic-regression cost (mean cross-entropy plus L2 penalty).

    Args:
        X: (m, n) array of features, one row per example.
        y: (m,) array of 0/1 targets.
        w: (n,) weight vector.
        b: scalar bias.
        lambda_: L2 regularization strength (the bias is not regularized).

    Returns:
        The scalar total cost.

    Raises:
        ZeroDivisionError: if X has no rows.
    """
    m, n = X.shape
    if m == 0:
        # An empty batch has no mean; fail loudly instead of returning NaN.
        raise ZeroDivisionError("cannot average the cost over zero examples")

    y = np.asarray(y)
    z = X @ w + b

    # With s = sigmoid(z):  -y*log(s) - (1-y)*log(1-s)  ==  log(1 + e^z) - y*z.
    # The right-hand form never evaluates log(0), so a saturated sigmoid
    # yields a finite loss instead of NaN/inf.
    losses = np.logaddexp(0, z) - y * z
    data_cost = np.sum(losses) / m

    # Add the regularization cost to get the total cost
    reg_cost = (lambda_ / (2 * m)) * np.sum(np.square(w))

    return data_cost + reg_cost

In [112]:
#Computes the gradient for logistic regression
def compute_gradient(X, y, w, b, lambda_):
    """Gradient of the regularized logistic-regression cost.

    Args:
        X: (m, n) array of features, one row per example.
        y: (m,) array of 0/1 targets.
        w: (n,) weight vector.
        b: scalar bias.
        lambda_: L2 regularization strength (applied to w only, not to b).

    Returns:
        (dj_db, dj_dw): the scalar gradient with respect to b and the (n,)
        gradient with respect to w, in that order.

    Raises:
        ZeroDivisionError: if X has no rows.
    """
    m, n = X.shape
    if m == 0:
        # An empty batch has no mean; fail loudly instead of returning NaN.
        raise ZeroDivisionError("cannot average the gradient over zero examples")

    y = np.asarray(y)
    z = X @ w + b

    # Model prediction sigmoid(z), evaluated as exp(-log(1 + e^-z)) so that
    # large |z| neither overflows nor emits warnings.
    f_wb = np.exp(-np.logaddexp(0, -z))

    # Per-example prediction error; every gradient term is built from it
    err = f_wb - y

    # Average over examples; the matrix product replaces the per-example,
    # per-feature accumulation loops.
    dj_db = np.sum(err) / m
    dj_dw = (X.T @ err) / m

    #regularization
    dj_dw = dj_dw + (lambda_ / m) * w

    return dj_db, dj_dw

In [132]:
#Performs batch gradient descent to learn the parameters. Takes num_iters gradient steps with learning rate alpha
def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters, lambda_, max_history=100000):
    """Run batch gradient descent and report progress roughly ten times.

    Args:
        X: training features, passed through to the cost and gradient functions.
        y: training targets, passed through to the cost and gradient functions.
        w_in: initial weights (not modified; each step builds a new value).
        b_in: initial bias.
        cost_function: callable (X, y, w, b, lambda_) -> scalar cost.
        gradient_function: callable (X, y, w, b, lambda_) -> (dj_db, dj_dw).
        alpha: learning rate.
        num_iters: number of gradient steps to take.
        lambda_: regularization strength forwarded to both callables.
        max_history: only the first max_history iterations are recorded in
            J_history, to bound memory on very long runs.

    Returns:
        (w, b, J_history, w_history): the final parameters, the recorded costs,
        and the weights captured at each progress report.
    """
    # An array to store cost J and w's at each iteration primarily for graphing later
    J_history = []
    w_history = []

    # Report about ten times over the run, or every iteration when num_iters < 10
    report_every = max(1, math.ceil(num_iters / 10))

    for i in range(num_iters):

        # Calculate the gradient and update the parameters
        dj_db, dj_dw = gradient_function(X, y, w_in, b_in, lambda_)

        # Update Parameters using w, b, alpha and gradient
        w_in = w_in - alpha * dj_dw
        b_in = b_in - alpha * dj_db

        record = i < max_history      # prevent resource exhaustion
        report = i % report_every == 0 or i == (num_iters - 1)

        # Evaluate the cost whenever it is stored or printed, so the progress
        # line always shows the current cost, even after history recording stops.
        if record or report:
            cost = cost_function(X, y, w_in, b_in, lambda_)

        if record:
            J_history.append(cost)

        if report:
            w_history.append(w_in)
            print(f"Iteration {i:4}: Cost {float(cost):8.2f}   ")

    return w_in, b_in, J_history, w_history #return w and J,w history for graphing

In [133]:
# Reproducible starting point: weights drawn uniformly from [-0.5, 0.5), bias fixed at 0.5
np.random.seed(1)
initial_w = np.random.rand(X_train.shape[1]) - 0.5
initial_b = 0.5

# Hyperparameters (worth experimenting with): regularization strength,
# number of gradient steps, and learning rate
lambda_ = 1
iterations = 2000
alpha = 0.2

# The fourth return value (the weight history) is not needed here and is discarded
w, b, J_history, _ = gradient_descent(
    X_train_scaled, y_train,
    initial_w, initial_b,
    compute_cost, compute_gradient,
    alpha, iterations, lambda_,
)

Iteration    0: Cost     0.63   
Iteration  200: Cost     0.12   
Iteration  400: Cost     0.11   
Iteration  600: Cost     0.11   
Iteration  800: Cost     0.11   
Iteration 1000: Cost     0.10   
Iteration 1200: Cost     0.10   
Iteration 1400: Cost     0.10   
Iteration 1600: Cost     0.10   
Iteration 1800: Cost     0.10   
Iteration 1999: Cost     0.10   


In [134]:
#Predict whether the label is 0 or 1 using learned logistic regression parameters w
def predict(X, w, b):
    """Classify every row of X as 0.0 or 1.0 with a 0.5 probability threshold.

    Args:
        X: (m, n) array of features, one row per example.
        w: (n,) weight vector.
        b: scalar bias.

    Returns:
        A float array of length m holding 1.0 where the modelled probability is
        at least 0.5 and 0.0 otherwise.
    """
    # Unpacking also enforces that X is two-dimensional
    m, n = X.shape

    def classify(row):
        # Logistic probability for a single example
        prob = 1/(1+np.exp(-(np.dot(w, row) + b)))
        return 0.0 if prob < 0.5 else 1.0

    return np.array([classify(X[i]) for i in range(m)], dtype=float)

In [135]:
# Classify the held-out, scaled test rows with the learned parameters
predictions = predict(X_test_scaled, w, b)

In [136]:
# Fraction of predictions that match the true labels
def score(predictions, y_test):
    """Return the share of positions where the prediction equals the label.

    Args:
        predictions: indexable sequence of predicted labels (compared after int()).
        y_test: indexable sequence of true labels, read at the same positions.

    Returns:
        Number of matching positions divided by len(y_test).
    """
    hits = sum(
        1
        for idx in range(len(predictions))
        if int(predictions[idx]) == y_test[idx]
    )
    return hits / len(y_test)
# Report the fraction of test examples that were classified correctly
print(f"Testing Data Score: {score(predictions, y_test)}")

Testing Data Score: 0.9734848484848485


In [137]:
# Compare the first ten predictions with the corresponding true labels
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0. 1. 1. 0. 1. 1. 0. 0. 1. 1.]
First 10 Actual labels: [0, 0, 1, 0, 1, 1, 0, 0, 1, 1]


In [138]:
# Put predictions and true labels side by side for inspection.
# predictions is a float array, hence 0.0/1.0 next to the integer labels.
df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
df.head(10)

Unnamed: 0,Prediction,Actual
0,0.0,0
1,1.0,0
2,1.0,1
3,0.0,0
4,1.0,1
5,1.0,1
6,0.0,0
7,0.0,0
8,1.0,1
9,1.0,1
