In [None]:
import numpy as np
import pandas as pd

from scipy.signal import stft #Short Time Fourier Transform
from scipy.spatial.distance import seuclidean #Standardized Euclidean Distance
from scipy.stats import chi2 #Chi square distribution

from dask.distributed import Client
import dask.bag as db
from glob import glob

from datetime import datetime

In [None]:
def getParams(fs, alphaMax=2000, dfreq=1500, fMin=5e+4):
    """Build the STFT parameters used by the cyclic spectrum estimation.

    Parameters
    ----------
    fs : float
        Sampling rate [sample/sec].
    alphaMax : float
        Maximal modulation frequency [Hz].
    dfreq : float
        Carrier frequency resolution [Hz].
    fMin : float
        Minimal carrier frequency [Hz]; passed through unchanged.

    Returns
    -------
    tuple
        ``(fMin, R, Nw, w, D)`` where ``R`` is the STFT hop [sample], ``Nw``
        the STFT window length [sample], ``w`` the Hanning window of length
        ``Nw`` and ``D`` the (real) Dirichlet kernel of length ``Nw``.
    """
    # hop between consecutive STFT windows [sample]
    hop = int(np.floor(fs / (2 * alphaMax)))
    # STFT window length [sample]
    winLen = int(fs / dfreq)
    hann = np.hanning(winLen)

    # order of the Dirichlet kernel
    order = int(np.round((winLen - 1) / (2 * hop)))

    # Dirichlet kernel: one complex exponential per harmonic (rows), summed
    # over the harmonics; only the real part is kept.
    harmonics = np.arange(-order, order + 1)[:, np.newaxis]
    centered = np.arange(winLen) - winLen / 2
    kernel = np.exp(2 * np.pi * 1j * harmonics * centered / winLen).sum(axis=0).real

    return fMin, hop, winLen, hann, kernel

In [None]:
def getCS(x, fs, approxCoh=True, normalization=False, alphaMax=2000, dfreq=1500, fMin=5e+4):
    """Estimate the cyclic spectral correlation and cyclic spectral coherence.

    Implementation according to
    Borghesani, P., and J. Antoni. "A faster algorithm for the calculation of
    the fast spectral correlation." Mechanical Systems and Signal Processing
    111 (2018): 113-118.

    Parameters
    ----------
    x : numpy.ndarray
        Signal.
    fs : float
        Sampling rate [sample/sec].
    approxCoh : bool
        Neglect the deviation of the carrier frequency due to the computation
        method when estimating the coherence, i.e. Sxx(f - 2 * alpha) = Sxx(f).
    normalization : bool
        Apply the normalization factor and keep only the modulation
        frequencies whose factor magnitude exceeds 95% of its maximum.
    alphaMax : float
        Maximal modulation frequency [Hz].
    dfreq : float
        Carrier frequency resolution [Hz].
    fMin : float
        Minimal carrier frequency [Hz]; lower carrier frequencies are dropped
        from the outputs.

    Returns
    -------
    CS : numpy.ndarray
        Cyclic spectrum, rows follow ``alpha`` and columns follow ``f``.
    CCoh : numpy.ndarray
        Cyclic coherence magnitude, same layout as ``CS``.
    f : numpy.ndarray
        Carrier frequencies (>= ``fMin``) [Hz].
    alpha : numpy.ndarray
        Non-negative modulation frequencies that were kept [Hz].

    Notes
    -----
    Another approximation is always applied: S(f, alpha) = S(f - 2*alpha, alpha).
    This part should not affect the detection and it is mainly cosmetic.
    """
    fMin, R, Nw, w, D = getParams(fs, alphaMax=alphaMax, dfreq=dfreq, fMin=fMin)

    # STFT with the Hanning window
    X_w = stft(x, fs=fs, window=w, nperseg=Nw, noverlap=Nw - R, nfft=Nw, return_onesided=True)[-1]
    # STFT with the Hanning window multiplied by the Dirichlet kernel
    f, _, X_w_d = stft(x, fs=fs, window=w * D, nperseg=Nw, noverlap=Nw - R, nfft=Nw, return_onesided=True)

    if approxCoh:
        # save some computation time by removing the frequencies below fMin
        # (the last time frame is dropped as well)
        X_w = X_w[f >= fMin, :-1]
        X_w_d = X_w_d[f >= fMin, :-1]

    # Cyclic spectrum: FFT along the time axis, transposed to (alpha, f)
    CS = np.fft.fft(np.conjugate(X_w) * X_w_d, axis=1).T
    # Modulation frequency vector
    alpha = np.fft.fftfreq(X_w_d.shape[1], R / fs)
    # remove the negative modulation frequency part
    positiveAlphaCond = alpha >= 0
    CS = CS[positiveAlphaCond, :]
    alpha = alpha[positiveAlphaCond]

    if normalization:
        # the normalization is implemented but was not found useful - the
        # improvement of the results is not impressive.
        normalizingFactor = np.fft.fft((w**2) * D, int(R * (1 + (x.size - Nw) / R)))[:np.sum(positiveAlphaCond)]
        normalizingFactor *= fs * X_w_d.shape[1]
        CS = (CS.T / normalizingFactor).T
        normalizingFactor_abs = np.abs(normalizingFactor)
        normalizingFactorCond = normalizingFactor_abs / np.max(normalizingFactor_abs) > 0.95
        CS = CS[normalizingFactorCond, :]
    else:
        normalizingFactorCond = np.ones(np.sum(positiveAlphaCond), dtype=bool)

    # Cyclic coherence
    CS_abs = np.abs(CS)
    if approxCoh:
        # second approximation - neglect the impact of the modulation frequency on the spectrum
        CCoh = CS_abs / CS_abs[0, :]
    else:
        # Original modulation-frequency index of every row that is still in CS.
        # Using the row position instead would be wrong once the normalization
        # filter removed rows; without normalization both are identical.
        alphaInds = np.arange(normalizingFactorCond.size)[normalizingFactorCond]
        # carrier bin shifted proportionally to the modulation frequency index
        inds = np.atleast_2d(np.arange(f.size)) - np.atleast_2d((alphaInds * Nw) / (R * alpha.size)).T
        inds = inds.astype(int)
        CCoh = CS_abs / np.sqrt(CS_abs[0, :] * CS_abs[0, inds])
        CS = CS[:, f >= fMin]
        CCoh = CCoh[:, f >= fMin]

    alpha = alpha[normalizingFactorCond]
    f = f[f >= fMin]

    return CS, CCoh, f, alpha

In [None]:
def getPvalEES(file, alphaMax=2000, dfreq=1500, alphaLims=(0.01, 0.51)):
    """Wrapper for getCS + extractor of the p-value of the 100Hz harmonics vs. the background.

    Parameters
    ----------
    file : str
        Name of a feather file. The first column is used as the time axis
        (``time_ms``) and every following column as one sampling channel.
    alphaMax : float
        Maximal modulation frequency [Hz].
    dfreq : float
        Carrier frequency resolution [Hz].
    alphaLims : sequence of two floats
        Relative limits (0:1) of the feature extractor with respect to the
        modulation frequency; only ``alphaMax * alphaLims[0] < alpha <
        alphaMax * alphaLims[1]`` is used.

    Returns
    -------
    dict
        ``channel`` (column indices), ``pValue`` and ``EESs`` (one entry per
        channel), ``alpha`` (modulation frequencies of the last processed
        channel) and ``file``.
    """
    df = pd.read_feather(file)
    # rounding sample length for precise frequency resolution
    df = df[df.time_ms < np.round(df.time_ms.values[-1] / 1000) * 1000]

    # sampling rate [sample/sec], from the spacing of the first two time
    # stamps so that it does not depend on the recording starting at t = 0
    fs = 1000 / (df.iloc[1, 0] - df.iloc[0, 0])

    channels = np.arange(1, df.shape[1])
    pValue = []
    EESs = []
    # keeps the result well defined when the file holds no channel column
    alpha = np.array([])
    # iteration over sampling channels
    for ch in channels:
        x = df.iloc[:, ch].values

        _, CCoh, _, alpha = getCS(x, fs, alphaMax=alphaMax, dfreq=dfreq, normalization=False)
        CCoh_abs = np.abs(CCoh.T)

        # cutting out with respect to the limits of the modulation frequency
        edgeCond = (alpha > alphaMax * alphaLims[0]) & (alpha < alphaMax * alphaLims[1])
        CCoh_abs = CCoh_abs[:, edgeCond]
        alpha = alpha[edgeCond]

        # Enhanced Envelope Spectrum: coherence summed over the carrier frequencies
        EES = CCoh_abs.sum(axis=0)

        # condition for harmonics of 100Hz; a small tolerance is used because
        # alpha comes from floating point arithmetic and an exact comparison
        # with 0 can silently miss harmonics
        cond100 = np.abs(alpha - 100 * np.round(alpha / 100)) < 1e-6
        cond360 = alpha > 360
        EES_PD = EES[cond100 & cond360]
        EESnot100 = EES[~cond100]

        # standardized Euclidean distance between the 100Hz harmonics and the
        # mean of the background spectrum, scaled by the background variance
        SED = seuclidean(
            EES_PD,
            EESnot100.mean() * np.ones_like(EES_PD),
            EESnot100.var() * np.ones_like(EES_PD)
        )
        # seuclidean returns the square root of the sum of the squared
        # standardized deviations; the chi-square statistic is that sum
        # itself, hence the distance is squared here
        pValue.append(chi2(EES_PD.size).sf(SED ** 2))
        EESs.append(EES)

    res = {
        'channel': channels,
        'pValue': pValue,
        'alpha': alpha,
        'EESs': EESs,
        'file': file
    }

    return res

### The following cell allows running on a smaller batch of data

In [None]:
# count = {a: 0 for a in list(set(f.split('/')[1].split('_16')[0] for f in files))}
# calcFiles = []
# for f in files[::-1]:
#     for k in count.keys():
#         if k in f:
#             count[k] += 1
#             calcFiles.append(f)
#             break
#     if count[k] == 200:
#         count.pop(k, None)
#     if len(count.keys()) == 0:
#         break
# len(calcFiles)

# files = calcFiles

### Calculation of the features

In [None]:
# Folder of the parquet dataset in which the computed features are stored
resFold = 'resPvalueHighHarmonicVSallSpect'

In [None]:
# Compute the features of every recording through a dask.distributed client
# and store them as a parquet dataset in `resFold`.
# glob returns the files in arbitrary order; sorting makes the row order of
# the stored dataset reproducible between runs.
dataFiles = sorted(glob('data/*.fthr'))
if not dataFiles:
    # fail early with a clear message instead of deep inside dask
    raise FileNotFoundError("no input file matches 'data/*.fthr'")

with Client() as client:
    display(client)
    d = db.from_sequence(dataFiles)
    df = d.map(getPvalEES).to_dataframe()
    df.to_parquet(resFold)

In [None]:
# Reload the stored features (one row per file, per-channel values kept as lists)
df = pd.read_parquet(resFold)
df

In [None]:
# Derive the sensor name and the acquisition time from the file name, then
# unfold the per-channel lists so that every row holds a single channel.
df = (
    df
    .assign(
        # file name up to the first '_16'
        sensor=lambda d: d.file.apply(lambda path: path.split('/')[-1].split('_16')[0]),
        # last '_'-separated token of the file name, without the extension
        time_num=lambda d: d.file.apply(lambda path: int(path.split('/')[-1].split('_')[-1].split('.')[0])),
    )
    .assign(time=lambda d: d.time_num.apply(datetime.fromtimestamp))
    .reset_index(drop=True)
    .explode(column=['channel', 'pValue', 'EESs'])
)
df

In [None]:
# Drop the recordings of two sensors that were taken before a given date
earlyTRF01 = (df.sensor == 'TRF01_IW976-0032') & (df.time < '2021-08-15')
earlyT7 = (df.sensor == 'T7_ANALOGMAX-1') & (df.time < '2021-11-01')
df = df[~(earlyTRF01 | earlyT7)]
df

In [None]:
# One p-value time series figure per (sensor, channel) pair
for (sensorName, channelId), channelDf in df.groupby(['sensor', 'channel']):
    ax = channelDf.plot(
        x='time',
        y='pValue',
        title=f'{sensorName!s} {channelId!s}',
        figsize=(19, 5),
    )
    ax.grid()

In [None]:
# Label column `PD`: False for the sensors whose name contains 'T7' or 'TRF10',
# True for all the others
df = df.assign(PD=df.sensor.map(lambda name: ('T7' not in name) and ('TRF10' not in name)))
df

In [None]:
import matplotlib.pyplot as plt

# ROC curve of the detector "pValue < threshold".
# For every threshold the detection rate is averaged per (sensor, channel) and
# then over the pairs, separately for the rows labelled PD (TP) and the
# remaining rows (FP).
TP = []
FP = []
for th in np.arange(0, 1.01, 0.01):
    dfTemp = df.assign(detectPD=df.pValue < th)
    TP.append(dfTemp.loc[dfTemp.PD].groupby(['sensor', 'channel'])['detectPD'].mean().mean())
    FP.append(dfTemp.loc[~dfTemp.PD].groupby(['sensor', 'channel'])['detectPD'].mean().mean())

# np.trapz is deprecated in NumPy 2 in favour of np.trapezoid
trapezoid = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz

plt.figure(figsize=[10, 10])
plt.plot(FP, TP)
plt.xlabel('FP')
plt.ylabel('TP')
plt.xticks(np.arange(0, 1, 0.05))
plt.yticks(np.arange(0, 1, 0.05))
plt.autoscale(enable=True, tight=True)
plt.title("ROC: AUC={:.2f}".format(trapezoid(TP, FP)))
plt.grid()
plt.show()

In [None]:
import plotly.express as px

# Empirical CDF of the p-values, one curve per sensor-channel pair;
# the figure is shown and also exported to 'cdf.html'.
ecdfData = df.assign(sensor=df.sensor + '-' + df.channel.map(str))
fig = px.ecdf(ecdfData, x='pValue', color='sensor', title='CDF')
fig.show()
fig.write_html('cdf.html')

In [None]:
def plotRow(row):
    """Plot the EES of one row against its modulation frequencies; the file name is the title."""
    fig, ax = plt.subplots(figsize=[19, 5])
    ax.plot(row['alpha'], row['EESs'])
    ax.grid()
    ax.set_title(row['file'])
    plt.show()


# plotting the false alarms: rows not labelled PD whose p-value is below 0.1,
# most significant first
isFalseAlarm = (~df.PD) & (df.pValue < 0.1)
dfFP = df[isFalseAlarm].sort_values(by='pValue')
_ = dfFP.apply(plotRow, axis=1)

In [None]:
# Table of the false alarms (rows not labelled PD with pValue < 0.1)
dfFP

### Code for avro files - runs significantly slower

In [None]:
# def getPvalEESavro(avroDict, alphaMax=2000, dfreq=1500, alphaLims=[0.01, 0.51]):
#     #wrapper for getCS + extracter of the p-value between the 100Hz harmonics to the background
    
#     #inputs:
#     # file - file name
#     # alphaMax - max modulation frequency [Hz]
#     # dfreq - carrier frequency resolution [Hz]
#     # alphaLims - relative limits (0:1) of the feature extractor with respect to the modulation frequency 
    
#     # sampling rate [sample/sec]
#     fs = avroDict['samplerate']
    
#     pValue = []
#     EESs = []
#     #iteration over sampling channels
#     for x in avroDict['adc_signal_mv']:
#         # rounding sample length for precise frequency resolution
#         x = x[:int(np.floor(len(x) / fs) * fs)]
        
#         CS, CCoh, f, alpha = getCS(x, fs, alphaMax=alphaMax, dfreq=dfreq, normalization=False)
#         CCoh_abs = np.abs(CCoh.T)
        
#         # cutting out with respect to the limits of the modulation frequency
#         edgeCond = (alpha > alphaMax * alphaLims[0]) & (alpha < alphaMax * alphaLims[1])
#         CCoh_abs = CCoh_abs[:, edgeCond]
#         alpha = alpha[edgeCond]
        
#         #Enhenced Envelope Spectrum
#         EES = CCoh_abs.sum(axis=0) 
    
#         # condition for harmonics of 100Hz
#         cond100 = np.mod(alpha, 100) == 0
#         cond360 = alpha > 360
#         EES_PD = EES[cond100 & cond360]
#         EESnot100 = EES[~cond100]
        
#         # standardized Euclidean distance between the distribution of the background spectrum to the 100Hz harmonics
#         SED = seuclidean(
#             EES_PD, 
#             EESnot100.mean() * np.ones_like(EES_PD), 
#             EESnot100.var() * np.ones_like(EES_PD)
#         )
#         pValue.append(chi2(EES_PD.size).sf(SED))
#         EESs.append(EES)
        
#     res = {
#         'channel': np.arange(len(avroDict['adc_signal_mv'])),  
#         'pValue': pValue, 
#         'alpha': alpha, 
#         'EESs': EESs, 
#         'timestamp': avroDict['timestamp'],
#         'id': avroDict['id']
#     }
    
#     return res

In [None]:
# with Client() as client:
#     display(client)
#     d = db.read_avro(glob('dataAvro/*'), blocksize=None)
#     df = d.map(getPvalEESavro).to_dataframe()
#     df.to_parquet(resFold+'_avro')