In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # visualizing data
import seaborn as sns # visualizing data with stunning default theme
import sklearn # contain algorithms
import warnings
warnings.filterwarnings('ignore')
import librosa
import numpy as np
import pandas as pd
from scipy.fftpack import fft
import dask.bag as db
from concurrent.futures import ProcessPoolExecutor
import multiprocessing



# Load the Common Voice "cv-valid-train" metadata table; later cells read its
# 'filename', 'age', 'gender' and 'accent' columns.
df = pd.read_csv("../input/common-voice/cv-valid-train.csv") 

In [3]:
# Preparing data
# Rows are usable only when both the age label and the gender are present.
has_age_and_gender = df['age'].notna() & df['gender'].notna()

# Informational only: how many rows would remain if accent were required too.
# (accent is NOT part of the filter that builds `data` below.)
start = df.shape
end = df[has_age_and_gender & df['accent'].notna()].shape
print("initial: {} final: {}".format(start, end))

# Cleaning data
# `df` is deliberately left untouched: the previous version overwrote it with a
# three-column projection, so re-running this cell failed with KeyError
# ('accent') and the raw table could not be recovered without reloading.
# Building `data` in a single chain also avoids calling reset_index(inplace=True)
# on a filtered slice of another frame.
data = (
    df.loc[has_age_and_gender, ['filename', 'age', 'gender']]
    .reset_index(drop=True)
)
data.head()

initial: (195776, 8) final: (63163, 8)


In [15]:
import librosa
import numpy as np
import pandas as pd
from scipy.fftpack import fft
from concurrent.futures import ProcessPoolExecutor #just ot make the code run faster by compiling in parallel

# Root directory that is prepended to each row's 'filename' when loading audio.
# NOTE(review): the metadata loaded earlier comes from cv-valid-train.csv while
# this root points at cv-other-train -- confirm the CSV and the audio root are
# meant to be paired for this run.
ds_path = "/kaggle/input/common-voice/cv-other-train/"

#function to extract useful features from audio data using librosa and scipy.fftpack for Fourier analysis
def feature_extraction(filename, sampling_rate=48000, gender=None):
    """Build the feature vector for a single audio clip.

    The clip at ``ds_path + filename`` is loaded with librosa using
    ``sr=sampling_rate``. The returned list contains, in order: the speaker's
    gender, the time-averaged spectral centroid, bandwidth and roll-off, and
    the coefficients produced by ``librosa.feature.mfcc`` from the dB-scaled
    FFT magnitude of the whole clip.

    Parameters
    ----------
    filename : str
        Clip path relative to ``ds_path``. Also used as the lookup key into the
        module-level ``data`` frame when ``gender`` is not supplied.
    sampling_rate : int, default 48000
        Sampling rate passed to ``librosa.load`` and to the feature functions.
    gender : optional
        Gender value to store in the feature vector. When ``None`` (the
        default) it is looked up in ``data`` by ``filename``, as before.
        Passing it avoids one full scan of ``data`` per clip.

    Returns
    -------
    list
        ``[gender, spectral_centroid, spectral_bandwidth, spectral_rolloff,
        coeff_1, ..., coeff_k]``.

    Raises
    ------
    IndexError
        If ``gender`` is ``None`` and ``filename`` has no row in ``data``.
    """
    path = "{}{}".format(ds_path, filename)
    audio, _ = librosa.load(path, sr=sampling_rate)

    # Fast Fourier Transform (magnitude spectrum of the whole clip)
    audio_fft = np.abs(fft(audio))

    if gender is None:
        # Fallback for callers that only know the filename: scans all of `data`.
        gender = data[data['filename'] == filename].gender.values[0]

    features = [
        gender,
        np.mean(librosa.feature.spectral_centroid(y=audio, sr=sampling_rate)),
        np.mean(librosa.feature.spectral_bandwidth(y=audio, sr=sampling_rate)),
        np.mean(librosa.feature.spectral_rolloff(y=audio, sr=sampling_rate)),
    ]

    # Using FFT to extract MFCC features.
    # NOTE(review): `S` is a single column holding the dB-scaled FFT magnitude
    # of the full clip rather than a framed log-power mel spectrogram, so these
    # coefficients are presumably a DCT of the whole spectrum and not
    # conventional per-frame MFCCs. Left unchanged because the exported feature
    # files and trained models depend on these exact values -- confirm intent.
    mfcc = librosa.feature.mfcc(S=librosa.power_to_db(audio_fft.reshape((-1, 1))), sr=sampling_rate)
    # One value per coefficient row (each row is averaged over its frames).
    features.extend(np.mean(coefficient) for coefficient in mfcc)

    return features

def extract_features_and_append_age(row):
    """Return the feature vector for ``row`` with its age label appended.

    ``row`` must provide ``'filename'`` and ``'age'``. When it also provides
    ``'gender'`` that value is forwarded to ``feature_extraction`` so the
    per-clip lookup in ``data`` is skipped; otherwise the lookup is used.
    """
    # Series and dict both expose .get(); a missing key yields None, which
    # makes feature_extraction fall back to its own lookup.
    gender = row.get('gender') if hasattr(row, 'get') else None
    features = feature_extraction(row['filename'], gender=gender)
    features.append(row['age'])
    return features

def create_df_features_parallel(orig, max_workers=4, stop_counter=60000):
    """Extract features for the first ``stop_counter`` rows of ``orig`` in parallel.

    Each row is submitted to a ``ProcessPoolExecutor`` as one call to
    ``extract_features_and_append_age``; results are collected in submission
    order, so the output rows follow the order of ``orig``.

    Parameters
    ----------
    orig : pandas.DataFrame
        Metadata rows to process (passed row by row to
        ``extract_features_and_append_age``).
    max_workers : int, default 4
        Number of worker processes.
    stop_counter : int or None, default 60000
        Maximum number of leading rows to process; ``None`` processes every
        row. Previously this cap was a constant buried in the function body.

    Returns
    -------
    pandas.DataFrame
        One row per processed clip with columns ``gender``, the three spectral
        statistics, ``fft1`` .. ``fft20`` and ``label`` (the appended age).
        An empty frame with these columns is returned when there is nothing
        to process.

    Raises
    ------
    Exception
        Any exception raised in a worker is re-raised by ``future.result()``.
    """
    columns = ["gender", "spectral_centroid", "spectral_bandwidth", "spectral_rolloff"]
    columns += [f"fft{i+1}" for i in range(20)]  # Assuming 20 MFCC features
    columns += ["label"]

    # Create a subset of the input DataFrame
    subset = orig if stop_counter is None else orig.head(stop_counter)
    total = len(subset)
    if total == 0:
        # Nothing to do: skip starting a process pool for an empty job list.
        return pd.DataFrame(columns=columns)

    new_rows = []
    # Extract features and append age in parallel
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(extract_features_and_append_age, row) for _, row in subset.iterrows()]
        for done, future in enumerate(futures, start=1):
            new_rows.append(future.result())
            # Progress is reported against the rows actually submitted, so the
            # counter ends at total/total instead of stopping short of an
            # unrelated denominator.
            print("\r{}/{}".format(done, total), end="", flush=True)

    return pd.DataFrame(new_rows, columns=columns)

# Run the parallel feature extraction over the cleaned metadata table; only
# the first `stop_counter` rows are processed by the function.
df_features = create_df_features_parallel(data)
# Preview the first rows of the extracted feature table.
df_features.head()

70000/73465

Unnamed: 0,gender,spectral_centroid,spectral_bandwidth,spectral_rolloff,zero_crossing_rate,chroma1,chroma2,chroma3,chroma4,chroma5,...,mel122,mel123,mel124,mel125,mel126,mel127,mel128,mel129,mel130,label
0,female,2679.939569,3347.669489,5745.486746,0.047586,0.258635,0.251495,0.315209,0.401681,0.461881,...,2.508107e-09,2.492743e-09,2.539437e-09,2.479796e-09,2.453964e-09,2.524172e-09,2.562106e-09,-14.771337,-7e-06,twenties
1,male,2859.467798,2576.661658,4912.241181,0.088031,0.32264,0.403628,0.523808,0.526183,0.554026,...,2.43872e-09,2.503767e-09,2.500356e-09,2.619601e-09,2.618112e-09,2.492591e-09,2.480642e-09,-12.683986,0.000427,seventies
2,female,1976.049163,1830.611037,3344.301008,0.055408,0.34813,0.423928,0.343355,0.313706,0.322862,...,2.532502e-09,2.478892e-09,2.470721e-09,2.427201e-09,2.509966e-09,2.485215e-09,2.530151e-09,-13.126539,3.1e-05,thirties
3,male,2333.782018,2533.27603,4398.731436,0.045556,0.446354,0.496136,0.539497,0.570656,0.541903,...,2.534119e-09,2.579378e-09,2.511387e-09,2.495975e-09,2.527928e-09,2.5406e-09,2.540989e-09,-9.571856,-1.5e-05,sixties
4,male,2351.958974,2974.758776,4743.172457,0.047405,0.553487,0.574426,0.596472,0.648865,0.674905,...,2.487033e-09,2.565779e-09,2.57549e-09,2.574453e-09,2.578162e-09,2.549932e-09,2.547036e-09,-4.583716,-2.6e-05,fifties


In [17]:
# Write the extracted features to disk (presumably so the slow extraction step
# need not be repeated); index=False keeps the row index out of the CSV.
df_features.to_csv('my_data.csv', index=False)

In [44]:
# Load the two previously exported feature tables.
df_features1 = pd.read_csv('/kaggle/input/voicefft/mydataku.csv')
df_features2 = pd.read_csv('/kaggle/input/voicefft2/dataset_other.csv')

# The first table carries one extra leading column (presumably a saved row
# index -- verify against the file); drop it so both tables share one schema.
first_column_name = df_features1.columns[0]
df_features1 = df_features1.drop(columns=first_column_name)

# ignore_index=True gives the combined table a unique 0..N-1 index. Without it
# both halves keep their own 0-based labels, so any later label-based operation
# (e.g. drop(index=...)) silently hits rows from BOTH tables.
df_features = pd.concat([df_features1, df_features2], ignore_index=True)
df_features.shape

(130001, 25)

In [45]:
# Encode gender numerically: male -> 1, female -> 0. "other" is mapped to the
# sentinel 0.5 purely so those rows can be filtered out right after.
cleanup_nums = {"gender": {"male":1,"female":0,"other":0.5}}
df_features = df_features.replace(cleanup_nums)

# Filter with a boolean mask instead of drop(<index labels>): when the index
# contains duplicate labels (as it does after concatenating two tables without
# resetting the index) dropping by label also removes unrelated male/female
# rows that happen to share a label with an "other" row.
df_features = df_features[df_features["gender"] != 0.5]
df_features.head()

Unnamed: 0,gender,spectral_centroid,spectral_bandwidth,spectral_rolloff,fft1,fft2,fft3,fft4,fft5,fft6,...,fft12,fft13,fft14,fft15,fft16,fft17,fft18,fft19,fft20,label
0,0.0,2679.939569,3347.669489,5745.486746,-7815.363281,0.067989,5590.484375,0.004343,2476.795898,-0.040136,...,0.039287,-208.588043,0.056004,461.177307,0.025915,586.334229,-0.009782,422.05304,-0.037146,twenties
1,1.0,2859.467798,2576.661658,4912.241181,-3652.987793,-0.074349,3694.873047,-0.252566,381.45105,-0.237524,...,-0.259256,-176.06781,-0.171367,137.595856,-0.205755,203.532379,-0.27617,-31.646547,-0.239039,seventies
2,0.0,1976.049163,1830.611037,3344.301008,-5910.616211,0.049991,5874.188477,-0.035536,2957.904541,-0.097991,...,-0.02062,-106.468979,-0.018917,459.807373,-0.089614,111.1614,-0.096511,-18.298313,-0.106024,thirties
3,1.0,2333.782018,2533.27603,4398.731436,-4862.352539,0.072208,6648.313965,-0.032831,509.296814,-0.021182,...,0.007381,164.312943,0.024873,606.313843,-0.013334,422.132385,-0.029056,209.511932,-0.023324,sixties
4,1.0,2351.958974,2974.758776,4743.172457,-1936.911621,0.041133,5767.038086,-0.117948,-1766.663818,0.010639,...,0.010957,445.105652,-0.00724,887.484985,-0.13337,-247.038513,-0.009954,305.451294,-0.035087,fifties


In [46]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

def get_labels(data):
    """Integer-encode the last column of ``data``.

    A fresh ``LabelEncoder`` is fitted on the values of the final column.

    Returns
    -------
    tuple
        ``(encoded_labels, fitted_encoder)``; the encoder is returned so the
        integer codes can be mapped back to the original label values.
    """
    label_encoder = LabelEncoder()
    target_column = data.iloc[:, -1]
    return label_encoder.fit_transform(target_column), label_encoder

# Features: every column except the last one, which holds the label.
X = df_features.iloc[:, :-1]

# Target: integer-encoded version of the last column; keep the fitted encoder
# so the codes can be translated back to the original class names.
y, encoder = get_labels(df_features)
classes = encoder.classes_

# Show one label before and after encoding, then the full class list.
print("Before:", df_features.iloc[0].values[-1])
print("\nAfter:", y[0])
print(classes)

# Hold out 10% of the rows for testing (fixed seed for a repeatable split).
# Note: the training features are named X_new, not X_train.
X_new, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=15,
)

# Standardise using statistics learned from the training rows only, then apply
# the same transformation to the held-out rows.
scaler = StandardScaler().fit(X_new)
X_new = scaler.transform(X_new)
X_test = scaler.transform(X_test)

Before: twenties

After: 7
['eighties' 'fifties' 'fourties' 'seventies' 'sixties' 'teens' 'thirties'
 'twenties']


### **Best Features**  

ANOVA (ANalysis Of VAriance) is used to select the best features.


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# our models
# Both estimators are seeded so the grid search and the final fit are
# repeatable; 15 is the same seed the train/test split in this notebook uses.
mlp = MLPClassifier(random_state=15)
xgb = XGBClassifier(random_state=15)

# parameters to gridsearch
mlp_params = {
    'hidden_layer_sizes': [(50, 50, 400), (100, ), (200,)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.00005, 0.0001]
}

xgb_params = {
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1],
}

# Gridsearching for best parameters
mlp_grid = GridSearchCV(mlp, mlp_params)
xgb_grid = GridSearchCV(xgb, xgb_params)

# Fit the grid search objects to the data
mlp_grid.fit(X_new, y_train)
xgb_grid.fit(X_new, y_train)

# Combine the two tuned models.
# Soft voting averages the predicted class probabilities. With only two
# estimators, hard (majority) voting ties whenever they disagree and the tie is
# resolved by picking the lowest class label, which biases predictions instead
# of combining the models.
voting_clf = VotingClassifier(estimators=[
    ('mlp', mlp_grid.best_estimator_),
    ('xgb', xgb_grid.best_estimator_)
], voting='soft')

# Training best model
voting_clf.fit(X_new, y_train)

# Predicting
y_pred = voting_clf.predict(X_test)

# Calculating accuracy of the model
# (`cm` holds the accuracy score, not a confusion matrix; name kept as-is.)
cm = accuracy_score(y_test, y_pred)
print(cm)

In [36]:
import joblib

#Running the best model on whole dataset
# The scaler is refitted on every feature row (train and test together) and X
# is replaced by its standardised version.
scaler_for_age = StandardScaler()
X = scaler_for_age.fit_transform(X)

# Hyper-parameters are written out by hand here rather than read from the
# grid-search objects.
model_for_age = MLPClassifier(
    hidden_layer_sizes=(50, 50, 400),
    activation='relu',
    solver='adam',
    alpha=0.00005,
)
model_for_age.fit(X, y)

# Saving the model and scaler that was used for data
# NOTE(review): the fitted label encoder is not written out in this cell, so
# mapping predicted codes back to age groups needs it from elsewhere -- confirm.
filename_model = 'model_age.sav'
filename_scaler = 'scaler_age.joblib'
joblib.dump(model_for_age, filename_model)
joblib.dump(scaler_for_age, filename_scaler)

['scaler_age4.joblib']