In [1]:
# Because the valve samples are not continuous, but rather click with an interval in between, it makes
# sense to look at a different type of modeling.
# First idea: check whether peaks occur at regular intervals. If not, it might be an anomaly.
# Second idea: split the files by peak.

# Both need some form of peak detection.


In [83]:
import os
import librosa
import pandas as pd
import numpy as np
from tqdm import tqdm

In [3]:
# get the parent directory
# Project folders, all resolved relative to the parent of the working directory.
parent_dir = os.path.abspath('..')
test_dir = os.path.join(parent_dir, 'Testing')
meta_dir = os.path.join(parent_dir, 'Data', 'metadata')
sour_dir = os.path.join(parent_dir, 'Data', 'source')
# NOTE: data_dir ends up pointing at the EDA folder, not at Data.
data_dir = os.path.join(parent_dir, 'EDA')

In [7]:
# Load the valve subset table from the Testing folder.
valve_subset = pd.read_csv(os.path.join(test_dir, 'valve_subset.csv'))

In [74]:
# Pick one recording to try the check on (row 1598 is a fixed position, not a random draw).
random_file = os.path.join(sour_dir, valve_subset.iloc[1598]['file_rel_path'])


In [79]:
# NOTE(review): irregularIntervalCheck is defined in the cell after this one, so on a
# fresh kernel that cell has to be run first.
irregularIntervalCheck(random_file)

In [70]:
def irregularIntervalCheck(file, threshold=52.5):
    """
    Flag a valve recording as abnormal when its clicks are spaced too far apart.

    The recording is loaded with librosa, an onset-strength envelope is computed
    (hop length 512, median aggregation), peaks are picked from that envelope and
    the mean gap between consecutive peaks is compared with ``threshold``.

    :file: input soundfile comprising of at least 5 seconds of valve operations
    :threshold: largest mean peak-to-peak gap, measured in onset-envelope frames,
        that is still treated as normal. The default of 52.5 was chosen through
        testing: the max average interval of all normal samples plus a buffer of
        10%. According to those tests it flags roughly 8% of all abnormal
        recordings without flagging any normal one. Because the gap is counted
        in frames, the value is tied to the hop length and to the sample rate
        that librosa.load returns.
    :return: 1 when the mean gap exceeds ``threshold`` (abnormal), otherwise 0.
        A recording with fewer than two detected peaks has no gap to measure
        and is reported as 0 (not flagged).
    """
    # load sound file into librosa library
    y, sr = librosa.load(file)

    # calculate the onsets of the volume peaks that represent the valve operating.
    onset_env = librosa.onset.onset_strength(
        y=y,
        sr=sr,
        hop_length=512,
        aggregate=np.median
    )

    # frame indices of the picked peaks; the parameters are passed by keyword
    # because newer librosa releases no longer accept them positionally.
    peaks = librosa.util.peak_pick(
        onset_env,
        pre_max=3,
        post_max=3,
        pre_avg=3,
        post_avg=5,
        delta=0.5,
        wait=10
    )

    # gaps between each peak and the next one
    intervals = [later - earlier for earlier, later in zip(peaks, peaks[1:])]

    # fewer than two peaks: nothing to average, so do not flag the recording
    # (the previous code raised ZeroDivisionError here).
    if len(intervals) == 0:
        return 0

    # the recording is abnormal when the average gap is larger than the threshold
    average = sum(intervals) / len(intervals)
    return 1 if average > threshold else 0



In [76]:
# Load the table of all valve recordings; the cells below read its
# 'file_rel_path' and 'anomaly' columns.
all_valve_subset = pd.read_csv(os.path.join(test_dir, 'all_valve_subset.csv'))

In [86]:
# testing results on the full valve data
# (slow: every recording is loaded and analysed from disk)
correct_pred = 0
total_pred = 0

# total= lets tqdm show a real progress bar with an ETA instead of a bare counter
for _, row in tqdm(all_valve_subset.iterrows(), total=len(all_valve_subset)):
    path = os.path.join(sour_dir, row['file_rel_path'])
    result = irregularIntervalCheck(path)

    total_pred += 1
    if row['anomaly'] == result:
        correct_pred += 1

# share of recordings whose prediction equals the 'anomaly' label;
# NaN instead of a ZeroDivisionError when the table is empty
accuracy = correct_pred / total_pred if total_pred else float('nan')

4170it [27:10,  2.56it/s]


In [170]:
# Confusion-matrix counts for the average-interval rule over all valve recordings.
# NOTE(review): all_intervals is built by the getIntervals loop further down, so
# that cell has to run before this one.
true_positive = 0
false_positive = 0
true_negative = 0
false_negative = 0

threshold = 52.5
all_predictions = []

# fetch the labels once instead of doing an .iloc lookup per recording
anomaly_labels = all_valve_subset['anomaly'].tolist()

for idx, interval in enumerate(all_intervals):
    # A recording with fewer than two peaks has no interval to average; count it
    # as "not flagged" instead of dividing by zero.
    if len(interval) > 0:
        average = sum(interval) / len(interval)
    else:
        average = 0.0

    prediction = 1 if average > threshold else 0
    all_predictions.append(prediction)

    matches_label = prediction == anomaly_labels[idx]
    if prediction == 1:
        if matches_label:
            true_positive += 1
        else:
            false_positive += 1
    else:
        if matches_label:
            true_negative += 1
        else:
            false_negative += 1
    
    

In [168]:
# Confusion counts in the order (TP, FP, TN, FN).
true_positive, false_positive, true_negative, false_negative

(39, 0, 3691, 440)

In [172]:
# Attach the per-recording predictions as a new column; all_predictions must have
# one entry per row, in row order.
all_valve_subset['predictionsPeakLimiter'] = all_predictions

In [173]:
# Show the table, now including the predictionsPeakLimiter column.
all_valve_subset

Unnamed: 0,file_no,machine,model,anomaly,file_name,file_rel_path,duration_s,fs_Hz,SNR_dB,predictionsPeakLimiter
0,13849,valve,0,1,00000000.wav,6_dB_valve\valve\id_00\abnormal\00000000.wav,10.0,16000,6,1
1,13850,valve,0,1,00000001.wav,6_dB_valve\valve\id_00\abnormal\00000001.wav,10.0,16000,6,1
2,13851,valve,0,1,00000002.wav,6_dB_valve\valve\id_00\abnormal\00000002.wav,10.0,16000,6,1
3,13852,valve,0,1,00000003.wav,6_dB_valve\valve\id_00\abnormal\00000003.wav,10.0,16000,6,0
4,13853,valve,0,1,00000004.wav,6_dB_valve\valve\id_00\abnormal\00000004.wav,10.0,16000,6,1
...,...,...,...,...,...,...,...,...,...,...
4165,18014,valve,6,0,00000987.wav,6_dB_valve\valve\id_06\normal\00000987.wav,10.0,16000,6,0
4166,18015,valve,6,0,00000988.wav,6_dB_valve\valve\id_06\normal\00000988.wav,10.0,16000,6,0
4167,18016,valve,6,0,00000989.wav,6_dB_valve\valve\id_06\normal\00000989.wav,10.0,16000,6,0
4168,18017,valve,6,0,00000990.wav,6_dB_valve\valve\id_06\normal\00000990.wav,10.0,16000,6,0


In [89]:
def getIntervals(file):
    """
    Return the gaps between consecutive onset peaks of a valve recording.

    The recording is loaded with librosa, an onset-strength envelope is computed
    (hop length 512, median aggregation) and peaks are picked from it.

    :file: path of the soundfile to analyse
    :return: list with the distance from each picked peak to the next one,
        measured in onset-envelope frames; empty when fewer than two peaks
        are found.
    """
    # load sound file into librosa library
    y, sr = librosa.load(file)

    # calculate the onsets of the volume peaks that represent the valve operating.
    onset_env = librosa.onset.onset_strength(
        y=y,
        sr=sr,
        hop_length=512,
        aggregate=np.median
    )

    # frame indices of the picked peaks; the parameters are passed by keyword
    # because newer librosa releases no longer accept them positionally.
    peaks = librosa.util.peak_pick(
        onset_env,
        pre_max=3,
        post_max=3,
        pre_avg=3,
        post_avg=5,
        delta=0.5,
        wait=10
    )

    # pair every peak with its successor; this replaces the index loop that
    # relied on a swallowed IndexError to stop at the last peak.
    return [later - earlier for earlier, later in zip(peaks, peaks[1:])]

In [90]:
# collect the peak-to-peak intervals of every valve recording
# (slow: every recording is loaded and analysed from disk)
all_intervals = []
# total= lets tqdm show a real progress bar with an ETA instead of a bare counter
for _, row in tqdm(all_valve_subset.iterrows(), total=len(all_valve_subset)):
    path = os.path.join(sour_dir, row['file_rel_path'])
    all_intervals.append(getIntervals(path))

4170it [19:12,  3.62it/s]


In [175]:
# Sanity check: there should be one interval list per row of all_valve_subset.
len(all_intervals), len(all_valve_subset)

(4170, 4170)

In [176]:
# mean peak-to-peak gap of each recording, in the same order as all_intervals
all_averages = [sum(gaps) / len(gaps) for gaps in all_intervals]

In [177]:
# NOTE(review): this displays the whole list of per-recording averages; a slice
# such as all_averages[:10] would keep the output short.
all_averages

[71.5,
 67.66666666666667,
 53.714285714285715,
 47.0,
 53.57142857142857,
 28.153846153846153,
 65.16666666666667,
 57.0,
 38.0,
 28.4,
 66.66666666666667,
 50.0,
 41.857142857142854,
 66.33333333333333,
 45.111111111111114,
 31.125,
 62.166666666666664,
 38.5,
 28.4,
 23.5625,
 53.714285714285715,
 80.2,
 43.666666666666664,
 66.66666666666667,
 60.42857142857143,
 50.833333333333336,
 62.833333333333336,
 47.0,
 51.875,
 39.57142857142857,
 62.666666666666664,
 80.6,
 28.4,
 53.5,
 53.857142857142854,
 51.125,
 25.125,
 76.2,
 62.666666666666664,
 23.666666666666668,
 79.25,
 57.142857142857146,
 58.833333333333336,
 32.8,
 80.0,
 53.857142857142854,
 69.25,
 54.0,
 56.857142857142854,
 35.25,
 32.42857142857143,
 36.09090909090909,
 62.666666666666664,
 27.333333333333332,
 70.5,
 44.44444444444444,
 34.5,
 63.166666666666664,
 74.8,
 39.875,
 30.23076923076923,
 47.111111111111114,
 36.90909090909091,
 23.235294117647058,
 32.0,
 36.8,
 31.666666666666668,
 38.45454545454545,
 25.

In [179]:
import plotly.express as px


In [190]:
# Compare the distribution of the mean peak interval between the two anomaly labels.
fig = px.violin(
    x=all_valve_subset['anomaly'],
    y=all_averages,
    color=all_valve_subset['anomaly'],
)
# Title and axis labels so the figure can be read on its own.
fig.update_layout(
    title='Mean peak interval per recording, by anomaly label',
    xaxis_title='anomaly (0 = normal, 1 = abnormal)',
    yaxis_title='mean peak interval (onset-envelope frames)',
    legend_title_text='anomaly',
)
fig.show()