# importing needed libraries and reading dataset (common-voice)

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # visualizing data
import seaborn as sns # visualizing data with stunning default theme
import sklearn # contain algorithms
import warnings
warnings.filterwarnings('ignore')

# load dataset from input directory
df = pd.read_csv("../input/common-voice/cv-valid-train.csv") 
df[df['age'].notna()].head()

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration
5,cv-valid-train/sample-000005.mp3,a shepherd may like to travel but he should ne...,1,0,twenties,female,us,
8,cv-valid-train/sample-000008.mp3,put jackie right on the staff,3,0,seventies,male,us,
13,cv-valid-train/sample-000013.mp3,but he had found a guide and didn't want to mi...,1,0,thirties,female,us,
14,cv-valid-train/sample-000014.mp3,as they began to decorate the hallway a silhou...,1,0,sixties,male,england,
19,cv-valid-train/sample-000019.mp3,then they got ahold of some dough and went goofy,1,0,fifties,male,australia,


# data pre-processing


**we need only audio file-path and gender**

In [2]:
#we extract the columns that we think useful are
df = df[['filename','gender']]
#To clean the data we remove the sample with NaN attribute values.
data = df[df['gender'].notna()]
data.reset_index(inplace=True, drop=True)
data.head()

Unnamed: 0,filename,gender
0,cv-valid-train/sample-000005.mp3,female
1,cv-valid-train/sample-000008.mp3,male
2,cv-valid-train/sample-000013.mp3,female
3,cv-valid-train/sample-000014.mp3,male
4,cv-valid-train/sample-000019.mp3,male


**we have around 74000 samples**


**we will train on only 1000 samples**

In [3]:
data = data[:1000]

In [4]:
data.shape

(1000, 2)

# feature extraction

In [5]:
import librosa
ds_path = "/kaggle/input/common-voice/cv-valid-train/"

#this function is used to extract audio frequency features
def feature_extraction(filename, sampling_rate=48000):
    path = "{}{}".format(ds_path, filename)
    features = list()
    audio, _ = librosa.load(path, sr=sampling_rate)
    
    gender = data[data['filename'] == filename].gender.values[0]
    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sampling_rate))
    spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=audio, sr=sampling_rate))
    spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=audio, sr=sampling_rate))
    features.append(gender)
    features.append(spectral_centroid)
    features.append(spectral_bandwidth)
    features.append(spectral_rolloff)
    
    mfcc = librosa.feature.mfcc(y=audio, sr=sampling_rate)
    for el in mfcc:
        features.append(np.mean(el))
    
    return features
    
        
features = feature_extraction(data.iloc[0]['filename'])
print("features: ", features)

features:  ['female', 2147.6058803589067, 2430.4749711924064, 4428.830553016453, -625.28143, 111.306145, 6.3690877, 34.7671, 31.623457, -4.721562, -0.51193464, -4.9454904, -12.71285, -2.043672, -3.7277248, -10.708404, -11.206564, -12.003516, -8.506438, -5.4722967, -4.950396, -3.7100525, -6.3149858, -6.3280854]


In [6]:
#the function create dataframe to store the feature and label related to each other
def create_df_features(orig):
    new_rows = list()
    tot_rows = len(orig)-1
    stop_counter = 55001
    
    for idx, row in orig.iterrows():
        if idx >= stop_counter: break
        print("\r", end="")
        print("{}/{}".format(idx, tot_rows), end="", flush=True)
        features = feature_extraction(row['filename'])
        new_rows.append(features)

    return pd.DataFrame(new_rows, columns=["label", "spectral_centroid", "spectral_bandwidth", "spectral_rolloff",
                                    "mfcc1", "mfcc2", "mfcc3", "mfcc4", "mfcc5", "mfcc6", "mfcc7", "mfcc8",
                                   "mfcc9", "mfcc10", "mfcc11", "mfcc12", "mfcc13", "mfcc14", "mfcc15", "mfcc16",
                                   "mfcc17", "mfcc18", "mfcc19", "mfcc20"])

df_features = create_df_features(data)
df_features.head()

999/999

Unnamed: 0,label,spectral_centroid,spectral_bandwidth,spectral_rolloff,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,...,mfcc11,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20
0,female,2147.60588,2430.474971,4428.830553,-625.281433,111.306145,6.369088,34.767101,31.623457,-4.721562,...,-3.727725,-10.708404,-11.206564,-12.003516,-8.506438,-5.472297,-4.950396,-3.710052,-6.314986,-6.328085
1,male,2815.32544,2451.922347,4884.633819,-469.936646,126.283371,-16.548635,3.546783,2.184197,13.748073,...,-5.582068,-10.987885,1.132518,6.090083,-1.732454,-1.226424,-6.432127,-7.269328,-4.7966,-3.129157
2,female,1844.637736,1491.011525,3164.948048,-418.205475,147.66687,-49.973999,-2.286365,37.18531,-8.906046,...,-3.504616,-13.844883,-9.928528,-7.873624,-5.476491,-2.298847,-6.556987,-5.206552,-0.403855,-3.877069
3,male,2123.711334,2202.012929,4111.215965,-464.910706,118.437225,19.749664,27.143229,26.438824,2.309232,...,-7.230497,-5.461425,-2.908412,-4.496075,-2.716447,-0.080573,-5.294941,-5.868721,0.600507,-3.375833
4,male,2360.672043,2957.220239,4767.08005,-343.833008,157.15387,7.66116,41.898956,-15.152467,28.925102,...,0.814843,0.102648,6.319801,-1.603869,3.837118,-2.643296,2.323317,-1.382684,3.791498,-3.534382


In [7]:
df_features.shape

(1000, 24)

# feature scaling 

In [8]:
from sklearn.preprocessing import StandardScaler

def scale_features(data):
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(np.array(data.iloc[:, 1:], dtype = float))
    # with data.iloc[:, 0:-1] we don't consider the label column
        
    return scaled_data, scaler

x, scaler = scale_features(df_features)

In [9]:
print("Before scaling:", df_features.iloc[0].values[:-1])
print("\nAfter scaling:", x[0])

Before scaling: ['female' 2147.6058803589067 2430.4749711924064 4428.830553016453
 -625.28143 111.306145 6.3690877 34.7671 31.623457 -4.721562 -0.51193464
 -4.9454904 -12.71285 -2.043672 -3.7277248 -10.708404 -11.206564
 -12.003516 -8.506438 -5.4722967 -4.950396 -3.7100525 -6.3149858]

After scaling: [-0.65717137 -0.11331652 -0.39015442 -1.98720349 -0.25654415  0.51448453
  0.7476843   1.58296401 -0.83665492  0.12582597 -0.33562685 -1.06755255
  0.42177349 -0.07062813 -1.21350452 -1.7620354  -1.37549538 -1.17934871
 -0.64012494 -0.34820576 -0.51769144 -1.08458817 -0.63011711]


# label encoding

In [10]:
df_features.iloc[:, 0]

0      female
1        male
2      female
3        male
4        male
        ...  
995      male
996    female
997    female
998      male
999      male
Name: label, Length: 1000, dtype: object

In [11]:
from sklearn.preprocessing import LabelEncoder

def get_labels(data):
    labels = data.iloc[:, 0]
    encoder = LabelEncoder()
    labels = encoder.fit_transform(labels)
    return labels, encoder

y, encoder = get_labels(df_features)
classes = encoder.classes_
print("Before encoding:", df_features.iloc[0].values[0])
print("\nAfter encoding:", y[0])
print("\nClasses:", classes)

Before encoding: female

After encoding: 0

Classes: ['female' 'male' 'other']


# training the model

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

model = SVC(C=100, gamma='scale')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
acc

0.885

# model evaluation

In [13]:
y_test

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1])

In [14]:
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1])

In [15]:
from IPython.display import Audio
file_path = '/kaggle/input/common-voice/cv-valid-test/cv-valid-test/sample-000020.mp3'  # Replace with your file path
y, sr = librosa.load(file_path, sr=None)  # sr=None to keep the original sample rate

# Play the audio
Audio(data=y, rate=sr)

In [16]:
def feature_extraction_pred(audio, sampling_rate=48000):
    y_features=list()
    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sampling_rate))
    spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=audio, sr=sampling_rate))
    spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=audio, sr=sampling_rate))
    y_features.append(spectral_centroid)
    y_features.append(spectral_bandwidth)
    y_features.append(spectral_rolloff)
    
    mfcc = librosa.feature.mfcc(y=audio, sr=sampling_rate)
    for el in mfcc:
        y_features.append(np.mean(el))
    
    return y_features

In [17]:
pred=feature_extraction_pred(y)


In [18]:
pred=np.array(pred)

In [19]:
scaler = StandardScaler()
scaled_pred = scaler.fit_transform(pred.reshape(-1, 1))  # Reshape to 2D array
print("Original Features:", pred)
print("Scaled Features:", scaled_pred.flatten())

Original Features: [ 1.77206977e+03  1.66471802e+03  3.21346290e+03 -4.20309967e+02
  1.15845993e+02  4.24851799e+00  9.77571297e+00  1.40950222e+01
 -1.18696918e+01 -1.07429094e+01 -9.81115627e+00 -1.20861979e+01
  1.01288295e+00  3.48806906e+00 -5.00671625e+00 -9.99812031e+00
 -1.29763279e+01 -7.57334232e+00 -1.79154527e+00 -3.07536793e+00
 -2.27187490e+00 -5.97598886e+00 -9.54750347e+00]
Scaled Features: [ 1.87421458  1.74001079  3.67614534 -0.86654809 -0.19628271 -0.33579421
 -0.32888449 -0.32348479 -0.35594409 -0.35453547 -0.35337065 -0.35621475
 -0.33983918 -0.33674487 -0.34736447 -0.35360438 -0.35732753 -0.35057309
 -0.34334508 -0.34495003 -0.34394556 -0.34857619 -0.35304105]


In [20]:
scaled_pred=scaled_pred.reshape(1,23)

In [21]:
result = model.predict(scaled_pred)

In [22]:
result

array([0])