## Extraction des MFCC

In [205]:
import os
import numpy as np
import scipy.io.wavfile as wav
import python_speech_features
from sklearn.mixture import GaussianMixture
import pickle
# MFCC extraction parameters, passed to python_speech_features.mfcc below.
numcep = 13
nfilt = 26
# NOTE(review): nfft is declared here but the original mfcc() call never
# passed it; it is still not passed, so the library default keeps applying.
nfft = 512
lowfreq = 0
highfreq = None
preemph = 0.97
ceplifter = 22
winlen = 0.025
winstep = 0.01


lang_dir = 'dataset/Train/F/'
out_dir = 'mfcc/train/F/'
# Create the destination folder up front so the open(..., 'wb') below cannot
# fail just because the folder is missing.
os.makedirs(out_dir, exist_ok=True)
for filename in os.listdir(lang_dir):
    root, ext = os.path.splitext(filename)
    # os.listdir returns every directory entry; skip anything that is not a
    # .wav file instead of handing it to wav.read.
    if ext.lower() != '.wav':
        continue
    (rate, sig) = wav.read(lang_dir+filename)
    # Use the parameters declared above rather than repeating the literals, so
    # that editing a constant actually changes the extraction.
    mfcc_feat = python_speech_features.mfcc(
        sig, rate, winlen=winlen, winstep=winstep, numcep=numcep, nfilt=nfilt,
        lowfreq=lowfreq, highfreq=highfreq, preemph=preemph,
        ceplifter=ceplifter, appendEnergy=False)

    n_components = 2  # number of GMM components
    gmm = GaussianMixture(n_components=n_components)
    gmm.fit(mfcc_feat)  # fitted on this file's own frames
    log_prob = gmm.score_samples(mfcc_feat)
    # Keep the frames whose log-likelihood is at or above the 10th percentile,
    # i.e. drop roughly the 10% least likely frames.
    # NOTE(review): the result is named non_silent, presumably because this is
    # meant as a silence filter - confirm.
    threshold = np.percentile(log_prob, 10)
    non_silent = mfcc_feat[log_prob >= threshold]

    # One pickle per recording: <out_dir><name without extension>.mfcc
    with open(out_dir+root+'.mfcc', 'wb') as f:
        pickle.dump(non_silent, f)
    




F1.wav
F11.wav




F2.wav




F3 .wav
F4.wav




F5.wav




F6.wav
F7.wav




F8.wav
F9.wav




In [208]:


lang_dir = 'dataset/Train/H/'
out_dir = 'mfcc/train/H/'
# Create the destination folder up front so the open(..., 'wb') below cannot
# fail just because the folder is missing.
os.makedirs(out_dir, exist_ok=True)
for filename in os.listdir(lang_dir):
    root, ext = os.path.splitext(filename)
    # os.listdir returns every directory entry; skip anything that is not a
    # .wav file instead of handing it to wav.read.
    if ext.lower() != '.wav':
        continue
    (rate, sig) = wav.read(lang_dir+filename)
    # Reuse the MFCC parameters declared in the first cell (same values as the
    # literals this call used to repeat).
    mfcc_feat = python_speech_features.mfcc(
        sig, rate, winlen=winlen, winstep=winstep, numcep=numcep, nfilt=nfilt,
        lowfreq=lowfreq, highfreq=highfreq, preemph=preemph,
        ceplifter=ceplifter, appendEnergy=False)
    n_components = 2  # number of GMM components
    gmm = GaussianMixture(n_components=n_components)
    gmm.fit(mfcc_feat)  # fitted on this file's own frames
    log_prob = gmm.score_samples(mfcc_feat)
    # Keep the frames whose log-likelihood is at or above the 10th percentile,
    # i.e. drop roughly the 10% least likely frames.
    threshold = np.percentile(log_prob, 10)
    non_silent = mfcc_feat[log_prob >= threshold]
    # One pickle per recording: <out_dir><name without extension>.mfcc
    with open(out_dir+root+'.mfcc', 'wb') as f:
        pickle.dump(non_silent, f)
    



H1.wav




H2.wav
H4.wav




H5.wav




H6.wav




H7.wav




H8.wav




H9.wav


In [209]:

lang_dir = 'dataset/Test/H/'
out_dir = 'mfcc/test/H/'
# Create the destination folder up front so the open(..., 'wb') below cannot
# fail just because the folder is missing.
os.makedirs(out_dir, exist_ok=True)
for filename in os.listdir(lang_dir):
    root, ext = os.path.splitext(filename)
    # os.listdir returns every directory entry; skip anything that is not a
    # .wav file instead of handing it to wav.read.
    if ext.lower() != '.wav':
        continue
    (rate, sig) = wav.read(lang_dir+filename)
    # Reuse the MFCC parameters declared in the first cell (same values as the
    # literals this call used to repeat).
    mfcc_feat = python_speech_features.mfcc(
        sig, rate, winlen=winlen, winstep=winstep, numcep=numcep, nfilt=nfilt,
        lowfreq=lowfreq, highfreq=highfreq, preemph=preemph,
        ceplifter=ceplifter, appendEnergy=False)
    n_components = 2  # number of GMM components
    gmm = GaussianMixture(n_components=n_components)
    gmm.fit(mfcc_feat)  # fitted on this file's own frames
    log_prob = gmm.score_samples(mfcc_feat)
    # Keep the frames whose log-likelihood is at or above the 10th percentile,
    # i.e. drop roughly the 10% least likely frames.
    threshold = np.percentile(log_prob, 10)
    non_silent = mfcc_feat[log_prob >= threshold]
    # One pickle per recording: <out_dir><name without extension>.mfcc
    with open(out_dir+root+'.mfcc', 'wb') as f:
        pickle.dump(non_silent, f)
    



In [210]:
lang_dir = 'dataset/Test/F/'
out_dir = 'mfcc/test/F/'
# Create the destination folder up front so the open(..., 'wb') below cannot
# fail just because the folder is missing.
os.makedirs(out_dir, exist_ok=True)
for filename in os.listdir(lang_dir):
    root, ext = os.path.splitext(filename)
    # os.listdir returns every directory entry; skip anything that is not a
    # .wav file instead of handing it to wav.read.
    if ext.lower() != '.wav':
        continue
    (rate, sig) = wav.read(lang_dir+filename)
    # Reuse the MFCC parameters declared in the first cell (same values as the
    # literals this call used to repeat).
    mfcc_feat = python_speech_features.mfcc(
        sig, rate, winlen=winlen, winstep=winstep, numcep=numcep, nfilt=nfilt,
        lowfreq=lowfreq, highfreq=highfreq, preemph=preemph,
        ceplifter=ceplifter, appendEnergy=False)
    n_components = 2  # number of GMM components
    gmm = GaussianMixture(n_components=n_components)
    gmm.fit(mfcc_feat)  # fitted on this file's own frames
    log_prob = gmm.score_samples(mfcc_feat)
    # Keep the frames whose log-likelihood is at or above the 10th percentile,
    # i.e. drop roughly the 10% least likely frames.
    threshold = np.percentile(log_prob, 10)
    non_silent = mfcc_feat[log_prob >= threshold]
    # One pickle per recording: <out_dir><name without extension>.mfcc
    with open(out_dir+root+'.mfcc', 'wb') as f:
        pickle.dump(non_silent, f)
    



## Segmentation

In [4]:
import os
import pickle
def segment(filename, nbr, lang_dir, duration=60):
    """Split one pickled 2-D feature matrix into ``nbr`` equal row segments.

    The matrix is unpickled from ``lang_dir + filename``.  Segment ``i``
    (1-based) is pickled to
    ``Segments/<label>/<F|H>/<root>.<label>.<i>.mfcc`` where
    ``label = int(duration / nbr)`` and ``root`` is ``filename`` without its
    extension.  The ``F`` folder is used when ``'/F/'`` occurs in
    ``lang_dir``, ``H`` otherwise.  Rows left over after the last full
    segment (``num_frames % nbr``) are not written anywhere.

    Args:
        filename: Name of the pickle file inside ``lang_dir``.
        nbr: Number of segments to produce; must be at least 1.
        lang_dir: Directory prefix, concatenated as-is with ``filename``
            (so it has to end with a path separator).
        duration: Value divided by ``nbr`` to build the folder/file label.
            Defaults to 60, the constant previously hard-coded here
            (presumably the recording length in seconds - confirm).

    Raises:
        ValueError: If ``nbr`` is smaller than 1.
    """
    if nbr < 1:
        raise ValueError('nbr must be at least 1, got %r' % (nbr,))

    # NOTE(security): pickle.load can execute arbitrary code; only use this on
    # files produced by this pipeline.
    with open(lang_dir + filename, 'rb') as f:
        frames = pickle.load(f)

    root, _ = os.path.splitext(filename)
    frames_per_segment = frames.shape[0] // nbr
    label = str(int(duration / nbr))
    gender = 'F' if '/F/' in lang_dir else 'H'

    out_dir = 'Segments/' + label + '/' + gender + '/'
    # Create the destination so a missing folder does not abort the run.
    os.makedirs(out_dir, exist_ok=True)

    for i in range(nbr):
        start_frame = i * frames_per_segment
        chunk = frames[start_frame:start_frame + frames_per_segment, :]
        out_path = out_dir + root + '.' + label + '.' + str(i + 1) + '.mfcc'
        with open(out_path, 'wb') as out:
            pickle.dump(chunk, out)

In [212]:
# Cut every file of the F test set into 20 segments.
lang_dir = 'mfcc/test/F/'
n_segments = 20
for filename in os.listdir(lang_dir):
    segment(filename, n_segments, lang_dir)
    

In [7]:
# Cut every file of the F test set into 6 segments.
lang_dir = 'mfcc/test/F/'
n_segments = 6
for filename in os.listdir(lang_dir):
    segment(filename, n_segments, lang_dir)

In [8]:
# Cut every file of the F test set into 4 segments.
lang_dir = 'mfcc/test/F/'
n_segments = 4
for filename in os.listdir(lang_dir):
    segment(filename, n_segments, lang_dir)

In [12]:
# Cut every file of the F test set into 2 segments.
lang_dir = 'mfcc/test/F/'
n_segments = 2
for filename in os.listdir(lang_dir):
    segment(filename, n_segments, lang_dir)

In [216]:
# Cut every file of the H test set into 20 segments.
lang_dir = 'mfcc/test/H/'
n_segments = 20
for filename in os.listdir(lang_dir):
    segment(filename, n_segments, lang_dir)
    

In [6]:
import pickle

# Cut every file of the H test set into 6 segments.
lang_dir = 'mfcc/test/H/'
n_segments = 6
for filename in os.listdir(lang_dir):
    segment(filename, n_segments, lang_dir)

In [9]:
# Cut every file of the H test set into 4 segments.
lang_dir = 'mfcc/test/H/'
n_segments = 4
for filename in os.listdir(lang_dir):
    segment(filename, n_segments, lang_dir)

In [13]:
# Cut every file of the H test set into 2 segments.
lang_dir = 'mfcc/test/H/'
n_segments = 2
for filename in os.listdir(lang_dir):
    segment(filename, n_segments, lang_dir)

## Modélisation

In [221]:
def model(lang_dir, nbr):
    """Fit and pickle one full-covariance GMM per feature file in ``lang_dir``.

    Every entry of ``lang_dir`` is unpickled, a ``GaussianMixture`` with
    ``nbr`` components is fitted on it, and the fitted model is pickled to
    ``gmm/<F|H>/<nbr>/<root>.<nbr>.gmm`` where ``root`` is the input file
    name without its extension.  The ``F`` folder is used when ``'/F/'``
    occurs in ``lang_dir``, ``H`` otherwise.

    Args:
        lang_dir: Directory prefix, concatenated as-is with each file name
            (so it has to end with a path separator).
        nbr: Number of mixture components.
    """
    gender = 'F' if '/F/' in lang_dir else 'H'
    # Same 'gmm/<F|H>/<nbr>' layout that loadGmm reads; the previous code
    # produced 'gmm//F/...' with a doubled slash.
    out_dir = 'gmm/' + gender + '/' + str(nbr) + '/'
    # Create the destination so a missing folder does not abort the run.
    os.makedirs(out_dir, exist_ok=True)

    for filename in os.listdir(lang_dir):
        root, _ = os.path.splitext(filename)
        # NOTE(security): pickle.load can execute arbitrary code; only use
        # this on files produced by this pipeline.
        with open(lang_dir + filename, 'rb') as f:
            features = pickle.load(f)
        gmm = GaussianMixture(n_components=nbr, covariance_type='full')
        gmm.fit(np.vstack(features))
        with open(out_dir + root + '.' + str(nbr) + '.gmm', 'wb') as out:
            pickle.dump(gmm, out)
    

In [222]:
# Train one set of H models for each mixture size.
lang_dir = 'mfcc/train/H/'
for n_mix in (128, 256, 512, 1024):
    model(lang_dir, n_mix)

In [223]:
# Train one set of F models for each mixture size.
lang_dir = 'mfcc/train/F/'
for n_mix in (128, 256, 512, 1024):
    model(lang_dir, n_mix)

## Scores

In [None]:
def score(file, gmm):
    """Return the mean of ``gmm.score_samples(file)``.

    Args:
        file: Data handed unchanged to ``gmm.score_samples``.
        gmm: Object exposing a ``score_samples`` method.

    Returns:
        The arithmetic mean of the per-sample values, computed with
        ``np.mean``.
    """
    return np.mean(gmm.score_samples(file))

In [225]:
def loadGmm(path, nbr):
    """Unpickle every file stored under ``gmm/<F|H>/<nbr>``.

    Args:
        path: Any path string; it is only inspected for the substring
            ``'/F/'`` to choose the ``F`` folder (``H`` otherwise).
        nbr: Sub-folder name, converted with ``str``.

    Returns:
        A ``(gmms, roots)`` tuple of two parallel lists: the unpickled
        objects and the corresponding file names without their last
        extension, both in sorted file-name order.
    """
    gender = 'F' if '/F/' in path else 'H'
    model_dir = 'gmm/' + gender + '/' + str(nbr)

    gmms = []
    roots = []
    # os.listdir returns entries in arbitrary order; sorting keeps the model
    # order (and everything derived from it) reproducible across runs.
    for filename in sorted(os.listdir(model_dir)):
        root, _ = os.path.splitext(filename)
        # NOTE(security): pickle.load can execute arbitrary code; only use
        # this on model files produced by this pipeline.
        with open(model_dir + '/' + filename, 'rb') as f:
            gmms.append(pickle.load(f))
        roots.append(root)
    return gmms, roots

In [228]:
def calculate_gmm_scores(lang_dir, nbr):
    """Score every pickled file in ``lang_dir`` against every loaded model.

    Models are obtained from ``loadGmm(lang_dir, nbr)``; each file of
    ``lang_dir`` is unpickled and passed to ``score`` once per model.

    Args:
        lang_dir: Directory prefix, concatenated as-is with each file name
            (so it has to end with a path separator).
        nbr: Forwarded to ``loadGmm`` to select the model folder.

    Returns:
        A DataFrame with the columns ``'Recording Name'`` (file name
        without extension), ``'Model Name'`` and ``'Score'``, one row per
        (file, model) pair.
    """
    # Local import: the notebook only imports pandas in a later cell.
    import pandas as pd

    gmms, roots = loadGmm(lang_dir, nbr)
    results = []
    # Sorted so the row order does not depend on the arbitrary os.listdir order.
    for filename in sorted(os.listdir(lang_dir)):
        recording, _ = os.path.splitext(filename)
        # NOTE(security): pickle.load can execute arbitrary code; only use
        # this on files produced by this pipeline.
        with open(lang_dir + filename, 'rb') as f:
            features = pickle.load(f)
        for model_name, gmm in zip(roots, gmms):
            results.append({
                'Recording Name': recording,
                'Model Name': model_name,
                # Evaluated once per pair; it used to be computed twice.
                'Score': score(features, gmm),
            })
    # Explicit columns keep the frame usable even when nothing was scored.
    return pd.DataFrame(
        results, columns=['Recording Name', 'Model Name', 'Score'])

In [230]:
import pandas as pd

# Score the H segments stored under Segments/3 against the 128-component models.
lang_dir = 'Segments/3/H/'
n_mix = 128
# NOTE(review): gmms/roots are not used again in this cell;
# calculate_gmm_scores loads the models itself.
gmms, roots = loadGmm(lang_dir, n_mix)
df = calculate_gmm_scores(lang_dir, n_mix)

In [234]:
# Notebook display cell: show every row of the score table.
df[:]

Unnamed: 0,Recording Name,Model Name,Score
0,H1.3.1,H1.128,-50.886092
1,H1.3.1,H2.128,-454.76085
2,H1.3.1,H4.128,-549.383008
3,H1.3.1,H5.128,-590.224754
4,H1.3.1,H6.128,-257.973788
5,H1.3.1,H7.128,-88.847773
6,H1.3.1,H8.128,-73.618699
7,H1.3.1,H9.128,-73.161958


In [153]:
group_size = 20
num_groups = len(df) // group_size

# Highest 'Score' inside each consecutive run of `group_size` rows; rows after
# the last full run are ignored.
max_scores = [
    df['Score'][g * group_size:(g + 1) * group_size].max()
    for g in range(num_groups)
]

In [154]:
# Notebook display cell: show the per-group maxima computed above.
max_scores

[-49.442451629354345,
 -50.11722268253231,
 -45.6205295834607,
 -46.10805484438308]

In [166]:
# Write the H score table to an Excel file, without the index column.
nom_fichier_excel = 'score_hommes.xlsx'
df.to_excel(excel_writer=nom_fichier_excel, index=False)

In [None]:
# Score the F segments stored under Segments/3 against the 128-component models.
lang_dir = 'Segments/3/F/'
n_mix = 128
# NOTE(review): gmms/roots are not used again in this cell;
# calculate_gmm_scores loads the models itself.
gmms, roots = loadGmm(lang_dir, n_mix)
df2 = calculate_gmm_scores(lang_dir, n_mix)

In [None]:
# Write the F score table to an Excel file, without the index column.
nom_fichier_excel = 'score_femmes.xlsx'
df2.to_excel(excel_writer=nom_fichier_excel, index=False)

## Taux d'erreur :

In [None]:
def Error(df,nbr):
    score = df[]