<a href="https://colab.research.google.com/github/ffer200395/The-Joe-Rogan-Experience/blob/main/Tuned_threshold_GMM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.mixture import GaussianMixture
from scipy.io.wavfile import read, write
import numpy as np
import pandas as pd
import scipy.special
import math
import os
from tqdm import tqdm

In [None]:
def get_signal(file_name, samples_per_bin=16):
    """Load a WAV file and return its mean absolute amplitude per time bin.

    The waveform is rectified (absolute value) so that averaging does not
    cancel out to roughly zero, then averaged over consecutive groups of
    ``samples_per_bin`` samples.

    Args:
        file_name: Path of the ``.wav`` file to read.
        samples_per_bin: Number of samples averaged into one output value.
            The default of 16 corresponds to 1 ms only for audio sampled at
            16 kHz; the file's actual sample rate is not checked.

    Returns:
        1-D array with one mean absolute amplitude per bin. A trailing
        partial bin is dropped.

    Note:
        Assumes single-channel audio. A 2-D (multi-channel) array would be
        flattened by the reshape, mixing channels inside each bin.
    """
    # The sample rate returned by read() is not used.
    _, signal = read(file_name)
    # Widen signed integer PCM before rectifying: abs(-32768) does not fit
    # in int16 (same for the minimum of any signed type) and would wrap
    # around to a negative "magnitude".
    if np.issubdtype(signal.dtype, np.signedinteger):
        signal = signal.astype(np.float64)
    magnitude = np.abs(signal)
    # Keep only whole bins so the reshape below is valid.
    usable = len(magnitude) - len(magnitude) % samples_per_bin
    binned = magnitude[:usable].reshape(-1, samples_per_bin)
    return np.mean(binned, axis=1)
    
def get_gmm_result(s_log, random_state=None):
    """Fit a two-component Gaussian mixture to the log-amplitudes.

    Each sample is hard-assigned to one of the two fitted components and
    the plain mean of each group is computed.

    Args:
        s_log: 1-D array of log10 amplitudes.
        random_state: Optional seed forwarded to ``GaussianMixture`` so a
            run can be reproduced. ``None`` keeps the unseeded behaviour.

    Returns:
        Tuple ``(mu_1, mu_2, sigma)`` with ``mu_1 <= mu_2``. ``mu_1`` is the
        mean of the quieter group and ``mu_2`` the mean of the louder one.
        ``sigma`` is the standard deviation of the *whole* signal, used as a
        single shared spread for both groups.
    """
    samples = s_log.reshape(-1, 1)
    gm = GaussianMixture(n_components=2, random_state=random_state).fit(samples)
    labels = gm.predict(samples)
    # Means of the hard-assigned groups (not the mixture's own soft means).
    mean_a = np.mean(s_log[labels == 0])
    mean_b = np.mean(s_log[labels == 1])
    # Which component ends up as label 0 is arbitrary, but
    # get_noises_intervals treats the first mean as "silence" and the second
    # as "noise". Return them in ascending order so that always holds.
    if mean_a > mean_b:
        mean_a, mean_b = mean_b, mean_a
    sigma = np.std(s_log)
    return mean_a, mean_b, sigma

# Reference: https://stats.stackexchange.com/questions/275722/how-to-find-the-threshold-between-two-histograms-at-fixed-false-positive-rate
def area_under_gaussian_at_left(t, mu, sigma):
    """Return the area of a normal density lying to the left of ``t``.

    This is the Gaussian cumulative distribution function,
    ``0.5 * (1 + erf((t - mu) / (sqrt(2) * sigma)))``.

    Args:
        t: Cut point(s); a scalar or a NumPy array.
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution.

    Returns:
        Probability mass below ``t``, with the same shape as ``t``.
    """
    z = (t - mu) / (math.sqrt(2) * sigma)
    return 0.5 * (1 + scipy.special.erf(z))

def area_under_gaussian_at_right(t, mu, sigma):
    """Return the area of a normal density lying to the right of ``t``.

    This is the Gaussian survival function ``1 - CDF(t)``. It is evaluated
    with the complementary error function instead of subtracting the CDF
    from 1, because the subtraction rounds to exactly 0.0 once ``t`` is a
    few standard deviations above ``mu``.

    Args:
        t: Cut point(s); a scalar or a NumPy array.
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution.

    Returns:
        Probability mass above ``t``, with the same shape as ``t``.
    """
    z = (t - mu) / (math.sqrt(2) * sigma)
    return 0.5 * scipy.special.erfc(z)

def consecutive(data, stepsize=1):
    """Split ``data`` into runs whose neighbours differ by ``stepsize``.

    Args:
        data: 1-D array of values, e.g. sorted indices.
        stepsize: Difference between neighbours that keeps them in one run.

    Returns:
        List of arrays, one per run. An empty input yields a list holding
        one empty array.
    """
    # A run ends wherever the gap to the next element is not `stepsize`.
    is_break = np.diff(data) != stepsize
    # +1 turns "position of the last element of a run" into the position
    # at which the next run starts.
    split_points = np.nonzero(is_break)[0] + 1
    return np.split(data, split_points)

def hms(total_ms):
    """Format a duration in milliseconds as ``HH:MM:SS:mmm``.

    Args:
        total_ms: Non-negative duration in milliseconds. Python ints, NumPy
            integers and floats are accepted; a fractional part is
            truncated.

    Returns:
        String such as ``'01:02:03:004'``. Hours are not wrapped at 24, so
        durations of a day or more stay unambiguous (``'25:00:00:000'``).
    """
    # Coerce once so the 'd' format codes below also work for float input.
    total_ms = int(total_ms)
    # Exact integer arithmetic instead of float division.
    total_s, ms = divmod(total_ms, 1000)
    total_m, s = divmod(total_s, 60)
    h, m = divmod(total_m, 60)
    return '{:02d}:{:02d}:{:02d}:{:03d}'.format(h, m, s, ms)

def get_start_end_time(tup):
    """Format a ``(start, end)`` pair of millisecond offsets with ``hms``.

    Args:
        tup: Indexable whose first two items are the start and end offsets
            in milliseconds.

    Returns:
        Tuple ``(start_text, end_text)`` of formatted time strings.
    """
    start_ms = tup[0]
    end_ms = tup[1]
    start_text = hms(start_ms)
    end_text = hms(end_ms)
    return (start_text, end_text)

def get_noises_intervals(s_log, s_abs, mu_1, mu_2, sigma):
    """Pick a silence/noise threshold and list the intervals above it.

    Every observed log-amplitude is tried as a candidate threshold. The one
    minimising the sum of the two misclassification areas (silence mass
    above it plus noise mass below it) is kept.

    Args:
        s_log: 1-D array of log10 amplitudes, one per time bin.
        s_abs: 1-D array of linear amplitudes aligned with ``s_log``.
        mu_1: Mean log-amplitude of one component.
        mu_2: Mean log-amplitude of the other component.
        sigma: Standard deviation shared by both components.

    Returns:
        Tuple ``(noise_start_end, threshold)``. ``noise_start_end`` is a
        list of ``(start, end)`` index pairs into ``s_abs`` and is empty
        when nothing exceeds the threshold. ``threshold`` is on the linear
        amplitude scale.
    """
    # The lower mean must play the role of "silence". Mixture components
    # come out in arbitrary order, so normalise here rather than trust the
    # caller.
    if mu_1 > mu_2:
        mu_1, mu_2 = mu_2, mu_1
    # Silence mistaken for noise: silence mass above the candidate.
    false_positive = area_under_gaussian_at_right(s_log, mu_1, sigma)
    # Noise mistaken for silence: noise mass below the candidate.
    false_negative = area_under_gaussian_at_left(s_log, mu_2, sigma)
    total_error = false_negative + false_positive
    # Back from log10 to the linear scale of s_abs.
    threshold = 10 ** s_log[np.argmin(total_error)]
    # Indices of all bins louder than the threshold.
    idx_noise = np.where(s_abs > threshold)[0]
    if idx_noise.size == 0:
        # consecutive() would return one empty run, and indexing it below
        # would raise IndexError.
        return [], threshold
    # NOTE(review): a one-bin run is reported as (i, i + 1) but a longer run
    # as (first, last), so [i] and [i, i + 1] give the same pair. Kept as is
    # to leave existing outputs unchanged.
    noise_start_end = [
        (run[0], run[0] + 1) if len(run) == 1 else (run[0], run[-1])
        for run in consecutive(idx_noise)
    ]
    return noise_start_end, threshold

def run_all(path_audio):
    """Detect noise intervals for every audio file in a directory.

    For each file the amplitude envelope is computed, a two-component
    mixture is fitted to its log10 values, a threshold is derived and the
    intervals above it are written to ``data/noises/<name>.csv``. The
    thresholds of all successfully processed files are finally written to
    ``data/episode_threshold.csv``.

    Processing is best effort. A file that fails is reported on stdout with
    the error that occurred, and the loop moves on to the next file.

    Args:
        path_audio: Directory containing the ``.wav`` files.
    """
    ls_thresholds = []
    # Make sure the output directory exists before any CSV is written.
    os.makedirs('data/noises', exist_ok=True)
    # List of files to be processed
    files = os.listdir(path_audio)
    for file in tqdm(files):
        # join() also works when path_audio has no trailing separator.
        file_name = os.path.join(path_audio, file)
        # Part of the file name before the first dot.
        stem = file.split('.')[0]
        try:
            # Read .wav file
            s_abs = get_signal(file_name)
            # Replace exact zeros so log10 stays finite.
            s_abs[s_abs == 0] = 0.1
            s_log = np.log10(s_abs)
            # Apply Gaussian Mixture Model assuming 2 distributions
            mu_1, mu_2, sigma = get_gmm_result(s_log)
            # Get noise intervals
            noise_start_end, threshold = get_noises_intervals(s_log, s_abs, mu_1, mu_2, sigma)
            # Convert index pairs into formatted time strings.
            new_list = [get_start_end_time(interval) for interval in noise_start_end]
            print(len(new_list),threshold)
            # Create and save a dataframe of noise intervals for each episode
            df = pd.DataFrame(new_list, columns=['start', 'end'])
            df.to_csv('data/noises/'+stem+'.csv')
            # The episode number is whatever follows the first three
            # characters of the stem (presumably a prefix such as "JRE";
            # verify against the actual file names).
            ls_thresholds.append((int(stem[3:]), threshold))
        except Exception as exc:
            # Catch Exception rather than everything, so Ctrl-C still stops
            # the run, and show why the file was skipped.
            print(file, repr(exc))
    df_threshold = pd.DataFrame(ls_thresholds, columns=['Episode','Threshold'])
    df_threshold.to_csv('data/episode_threshold.csv',index=False)

In [None]:
# Directory holding the podcast episodes as .wav files.
path_audio = 'data/podcasts_wav/'
# Process every file in that directory. Results are written to
# data/noises/<name>.csv (one per file) and data/episode_threshold.csv.
run_all(path_audio)