In [9]:
import os
import librosa
import sys
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import librosa.display

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score

from get_mel_spectrogram import get_mel_spectrogram

In [3]:
# get the parent directory
parent_dir = os.path.abspath('..')
test_dir = os.path.join(parent_dir, 'Testing')
data_dir = os.path.join(parent_dir, 'Data')
meta_dir = os.path.join(data_dir, 'metadata')
sour_dir = os.path.join(data_dir, 'source')
data_dir = os.path.join(parent_dir, 'EDA')

In [7]:
fan_data = pd.read_csv(os.path.join(test_dir, 'all_fan_subset.csv'))
pump_data = pd.read_csv(os.path.join(test_dir, 'all_pump_subset.csv'))
slider_data = pd.read_csv(os.path.join(test_dir, 'all_slider_subset.csv'))
valve_data = pd.read_csv(os.path.join(test_dir, 'all_valve_subset.csv'))

In [33]:

#best number of mels 64, best window size, 1.0, best channel = 3, best number of neighbours = 5

In [34]:
def modeler(machine):
    """
    Function that does the actual training of the models, using the parameters that have come up
    as optimal from the testing process. 
    :machine: string that represents which machine type we are currently training on
    :return: both the model and a dict containing all params used and results are returned.
    """

    # select the correct dataset
    if machine == 'fan':
        data = fan_data
    elif machine == 'pump':
        data = pump_data
    elif machine == 'slider':
        data = slider_data
    elif machine == 'valve':
        data = valve_data
        
    # setting a dict for storing all parameters and results
    results_knn = {
        'window' :  [],
        'window_length' :  [],
        'overlap' :  [],
        'overlap_length' :  [],
        'hop_length' :  [],
        'n_fft' :  [],
        'n_mels' :  [],
        'n_neighbor' :  [],
        'channel' :  [],
        'accuracy' :  [],
        'recall' :  [],
        'precision' :  [],
        'f1' :  [],
        'cross_val_scores' :  [],
        'cross_val_mean' :  [],
        'cross_val_stdev' :  []
    }

    
    # storing the results of the data preprocessing for all the files
    all_mel_spect = []
    all_mel_spect_db = []
    all_mfcc = []
    all_params = []

    # setting the classifiers parameters here    
    window_p = 1
    overlap_p = None
    n_fft_p = None
    
    if machine == 'fan':
        n_mels_p = 32
        n_neighbors_p = 5
        channel_p = 3
    elif machine == 'pump':
        n_mels_p = 512
        n_neighbors_p = 3
        channel_p = 1
    elif machine == 'slider':
        n_mels_p = 512
        n_neighbors_p = 3
        channel_p = 6
    elif machine == 'valve':
        # not yet optimized
        n_mels_p = 512
        n_neighbors_p = 3
        channel_p = 7

    # loop through all files in the data set to apply preprocessing
    for idx, row in data.iterrows():
        mel_spect, mel_spect_db, mfcc, params = get_mel_spectrogram(
            os.path.join(sour_dir, row['file_rel_path']), 
            no_channel = channel_p, 
            window = window_p,
            overlap = overlap_p, 
            n_fft = n_fft_p, 
            n_mels = n_mels_p
        )

        # storing the results
        all_mel_spect.append(mel_spect)
        all_mel_spect_db.append(mel_spect_db)
        all_mfcc.append(mfcc)
        # store the parameters
        params['n_neighbor'] = n_neighbors_p
        params['channel'] = channel_p
        all_params.append(params)

    # write the results to the dataframe
    data['mel_spect'] = all_mel_spect
    data['mel_spect_db'] = all_mel_spect_db
    data['all_params'] = all_params

    # define target
    y = data['anomaly']
    
    # define features
    X = []
    for idx, row in data.iterrows():
        #X.append(row['mel_spect_db'].reshape(1, total_size))
        X.append(np.array(row['mel_spect_db']).flatten())

    # split the data in 2 groups, 1 for training, 1 for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2)

    # initialize the knn classifier
    model = KNeighborsClassifier(n_neighbors=n_neighbors_p)
    
    # train the classifier
    model.fit(X_train,y_train)

    # get predictions
    predicted = model.predict(X_test) 

    # calculate results
    accuracy = metrics.accuracy_score(y_test, predicted)
    recall = metrics.recall_score(predicted, y_test, average='macro')
    precision = metrics.precision_score(predicted, y_test, average='macro')
    f1 = metrics.f1_score(predicted, y_test, average='macro')

    # cross-validate
    cross_val_scores = cross_val_score(model, X_test, y_test, cv=5) # cv is the number of folds (k)
    cross_val_mean = cross_val_scores.mean() * 100 
    cross_val_stdev = cross_val_scores.std() * 100

    # store results
    for key, val in params.items():
        results_knn[key].append(val)
    results_knn['accuracy'].append(accuracy)
    results_knn['recall'].append(recall)
    results_knn['precision'].append(precision)
    results_knn['f1'].append(f1)
    results_knn['cross_val_scores'].append(cross_val_scores)
    results_knn['cross_val_mean'].append(cross_val_mean)
    results_knn['cross_val_stdev'].append(cross_val_stdev)

    print(f"params: {params}")
    print(f"Accuracy: {accuracy}")
    print(f"recall: {recall}")
    print(f"precision: {precision}")
    print(f"f1: {f1}")
    print("cross_val mean and acc: {:.2f}% (+/- {:.2f})".format(cross_val_mean, cross_val_stdev))
    
    return model, results_knn


In [35]:
fan_model, fan_results = modeler('fan')

KeyboardInterrupt: 

In [None]:
pump_model, pump_results = modeler('pump')

In [None]:
slider_model, slider_results = modeler('slider')

In [None]:
valve_model, valve_results = modeler('valve')

In [None]:
results_knn

In [24]:
# storing the results
import joblib
joblib.dump(fan_model, 'Models/fan_model.pkl')
joblib.dump(pump_model, 'Models/pump_model.pkl')
joblib.dump(slider_model, 'Models/slider_model.pkl')
joblib.dump(valve_model, 'Models/valve_model.pkl')

['Models/valve_model.pkl']

In [32]:
pd.DataFrame(fan_results).to_csv('Models/fan_results.csv')
pd.DataFrame(pump_results).to_csv('Models/pump_results.csv')
pd.DataFrame(slider_results).to_csv('Models/slider_results.csv')
pd.DataFrame(valve_results).to_csv('Models/valve_results.csv')

In [30]:
fan_model_cv_test, fan_results_cv_test = modeler('fan')

params: {'window': 1, 'window_length': 16000, 'overlap': 0.5, 'overlap_length': 8000, 'hop_length': 8000, 'n_fft': 16384, 'n_mels': 32, 'n_neighbor': 5, 'channel': 3}
Accuracy: 0.990990990990991
recall: 0.994061757719715
precision: 0.9820143884892086
f1: 0.9878556330169233
cross_val mean and acc: 97.75% (+/- 1.07)


In [None]:
fan_model_cv_test, fan_results_cv_test = modeler('fan')