# Umgebungsvariablen

In [1]:
train_data_path = '''C:/studium/studium/CAS_PML/Projekt_Arbeit/earthquake/Daten/all/train.csv'''
feature_data_path = '''C:/studium/studium/CAS_PML/Projekt_Arbeit/earthquake/Daten/all/'''

# Import

In [2]:
import time

import pandas as pd
import numpy as np
#data visualization
import matplotlib.pyplot as plt
import librosa

from scipy.stats import kurtosis
from scipy.stats import skew

from scipy import signal

import warnings
warnings.filterwarnings('ignore')

pd.options.display.precision = 10

# Trainingsdaten laden

In [3]:
train_data = pd.read_csv(train_data_path, dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})

## Low-pass (noise-reducing) filter

In [329]:
def downsample(data, downsample_rate):
    trunc = int((data.shape[0] // downsample_rate) * downsample_rate)
    data = data[:trunc, :]
    x = signal.decimate(data[:, 0].ravel(), downsample_rate)
    y_ = data[:, 1].reshape((-1, downsample_rate)).T
    y = y_.mean(axis=0)
    return x, y

In [94]:
#The scipy.signal.decimate function applies a low-pass (noise-reducing) filter, and samples the data every n-th value based on the downsample rate
dsamprate = 4
train_data_lowpassfilter = signal.decimate(train_data.acoustic_data, dsamprate)
train_data_lowpassfilter['time_to_failure'] = train_data.time_to_failure[::dsamprate]

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [319]:
time_to_failure_list = list(train_data.time_to_failure)[::dsamprate]

# Feature Extraction

Die Testsegmente bestehen jeweils aus 150.000 Messpunkte. Aus diesem Grund setzten wir die Fenstergrösse auf 150000 Messpunkte.

## Feature Extraction Function

Extracting features
We will extract

 - statistical moments
 - root-mean-square
 - Zero Crossing Rate
 - Spectral Centroid
 - Chroma Frequencies
 - Spectral Rolloff
 - Mel-Frequency Cepstral Coefficients

In [4]:
def generate_featureRow_old(acoustic_data, time_to_failure, sample_rate):
    
    strain_feature = pd.DataFrame()
    strain_feature['mean'] = [acoustic_data.mean()]
    strain_feature['std'] = [acoustic_data.std()]
    strain_feature['kurt'] = [acoustic_data.kurtosis()]
    strain_feature['skew'] = [acoustic_data.skew()]
    strain_feature['min'] = [acoustic_data.min()]
    strain_feature['max'] = [acoustic_data.max()]
    strain_feature['1%q'] = [acoustic_data.quantile(0.01, interpolation='midpoint')]
    strain_feature['5%q'] = [acoustic_data.quantile(0.05,interpolation='midpoint')]
    strain_feature['25%q'] = [acoustic_data.quantile(0.25,interpolation='midpoint')]
    strain_feature['75%q'] = [acoustic_data.quantile(0.75,interpolation='midpoint')]
    strain_feature['95%q'] = [acoustic_data.quantile(0.95,interpolation='midpoint')]
    strain_feature['99%q'] = [acoustic_data.quantile(0.99,interpolation='midpoint')]
    strain_feature['absMax'] = [acoustic_data.abs().max()]
    strain_feature['absMean'] = [acoustic_data.abs().mean()]
    strain_feature['absStd'] = [acoustic_data.abs().std()]
    
    
    #Compute root-mean-square (RMS) value for each frame
    rms = librosa.feature.rms(y=np.array(acoustic_data,dtype=np.float64),frame_length=len(acoustic_data))
    rms = pd.Series(rms[0,:])
    strain_feature['rms_mean'] = [rms.mean()]
    strain_feature['rms_std'] = [rms.std()]
    strain_feature['rms_kurt'] = [rms.kurtosis()]
    strain_feature['rms_skew'] = [rms.skew()]
    strain_feature['rms_min'] = [rms.min()]
    strain_feature['rms_max'] = [rms.max()]
    strain_feature['rms_1%q'] = [rms.quantile(0.01, interpolation='midpoint')]
    strain_feature['rms_5%q'] = [rms.quantile(0.05,interpolation='midpoint')]
    strain_feature['rms_25%q'] = [rms.quantile(0.25,interpolation='midpoint')]
    strain_feature['rms_75%q'] = [rms.quantile(0.75,interpolation='midpoint')]
    strain_feature['rms_95%q'] = [rms.quantile(0.95,interpolation='midpoint')]
    strain_feature['rms_99%q'] = [rms.quantile(0.99,interpolation='midpoint')]
    strain_feature['rms_absMax'] = [rms.abs().max()]
    strain_feature['rms_absMean'] = [rms.abs().mean()]
    strain_feature['rms_absStd'] = [rms.abs().std()]
    
    #Zero Crossing Rate
    '''The zero crossing rate is the rate of sign-changes along a signal, i.e., 
    the rate at which the signal changes from positive to negative or back.'''
    zero_crossings = librosa.zero_crossings(np.array(acoustic_data,dtype=np.float64), pad=False)
    strain_feature['zero_crossings'] = [sum(zero_crossings)]
    
    #Spectral Centroid
    '''Each frame of a magnitude spectrogram is normalized and treated as a distribution
    over frequency bins, from which the mean (centroid) is extracted per frame.So spectral centroid for blues 
    song will lie somewhere near the middle of its spectrum while that for a metal song would be towards its end.'''
    spectral_centroids = librosa.feature.spectral_centroid(np.array(acoustic_data,dtype=np.float64), sr=sample_rate)[0]
    spectral_centroids = pd.Series(spectral_centroids)
    strain_feature['spec_cent_mean'] = [spectral_centroids.mean()]
    strain_feature['spec_cent_std'] = [spectral_centroids.std()]
    strain_feature['spec_cent_kurt'] = [spectral_centroids.kurtosis()]
    strain_feature['spec_cent_skew'] = [spectral_centroids.skew()]
    strain_feature['spec_cent_min'] = [spectral_centroids.min()]
    strain_feature['spec_cent_max'] = [spectral_centroids.max()]
    strain_feature['spec_cent_1%q'] = [spectral_centroids.quantile(0.01, interpolation='midpoint')]
    strain_feature['spec_cent_5%q'] = [spectral_centroids.quantile(0.05,interpolation='midpoint')]
    strain_feature['spec_cent_25%q'] = [spectral_centroids.quantile(0.25,interpolation='midpoint')]
    strain_feature['spec_cent_75%q'] = [spectral_centroids.quantile(0.75,interpolation='midpoint')]
    strain_feature['spec_cent_95%q'] = [spectral_centroids.quantile(0.95,interpolation='midpoint')]
    strain_feature['spec_cent_99%q'] = [spectral_centroids.quantile(0.99,interpolation='midpoint')]
    strain_feature['spec_cent_absMax'] = [spectral_centroids.abs().max()]
    strain_feature['spec_cent_absMean'] = [spectral_centroids.abs().mean()]
    strain_feature['spec_cent_absStd'] = [spectral_centroids.abs().std()]
    
    
    
    #power spectral density
    f, Pxx_den = signal.welch(acoustic_data, sample_rate)
    strain_feature['psd_mean'] = [np.mean(Pxx_den)]
    strain_feature['psd_std'] = [np.std(Pxx_den)]
    strain_feature['psd_kurt'] = [kurtosis(Pxx_den)]
    strain_feature['psd_skew'] = [skew(Pxx_den)]
    strain_feature['psd_min'] = [np.min(Pxx_den)]
    strain_feature['psd_max'] = [np.max(Pxx_den)]
    strain_feature['psd_1%q'] = [np.quantile(Pxx_den,0.01, interpolation='midpoint')]
    strain_feature['psd_5%q'] = [np.quantile(Pxx_den,0.05,interpolation='midpoint')]
    strain_feature['psd_25%q'] = [np.quantile(Pxx_den,0.25,interpolation='midpoint')]
    strain_feature['psd_75%q'] = [np.quantile(Pxx_den,0.75,interpolation='midpoint')]
    strain_feature['psd_95%q'] = [np.quantile(Pxx_den,0.95,interpolation='midpoint')]
    strain_feature['psd_99%q'] = [np.quantile(Pxx_den,0.99,interpolation='midpoint')]
   
    #Chroma Frequencies
    '''Chroma features are an interesting and powerful representation for music audio in which the entire 
    spectrum is projected onto 12 bins representing the 12 distinct semitones (or chroma) of the musical octave.'''
    chroma_stft = librosa.feature.chroma_stft(y=np.array(acoustic_data,dtype=np.float64), sr=sample_rate)
    strain_feature['chroma_stft_mean'] = [np.mean(chroma_stft)]
    strain_feature['chroma_stft_std'] = [np.std(chroma_stft)]
    strain_feature['chroma_stft_kurt'] = [kurtosis(chroma_stft[0])]
    strain_feature['chroma_stft_skew'] = [skew(chroma_stft[0])]
    strain_feature['chroma_stft_min'] = [np.min(chroma_stft)]
    strain_feature['chroma_stft_max'] = [np.max(chroma_stft)]
    strain_feature['chroma_stft_1%q'] = [np.quantile(chroma_stft,0.01, interpolation='midpoint')]
    strain_feature['chroma_stft_5%q'] = [np.quantile(chroma_stft,0.05,interpolation='midpoint')]
    strain_feature['chroma_stft_25%q'] = [np.quantile(chroma_stft,0.25,interpolation='midpoint')]
    strain_feature['chroma_stft_75%q'] = [np.quantile(chroma_stft,0.75,interpolation='midpoint')]
    strain_feature['chroma_stft_95%q'] = [np.quantile(chroma_stft,0.95,interpolation='midpoint')]
    strain_feature['chroma_stft_99%q'] = [np.quantile(chroma_stft,0.99,interpolation='midpoint')]
    
    #Spectral Rolloff
    '''It is a measure of the shape of the signal. 
    It represents the frequency below which a specified percentage of the total spectral energy, e.g. 85%, lies.'''
    rolloff = librosa.feature.spectral_rolloff(y=np.array(acoustic_data,dtype=np.float64), sr=sample_rate)[0]
    strain_feature['rolloff_mean'] = [np.mean(rolloff)]
    strain_feature['rolloff_std'] = [np.std(rolloff)]
    strain_feature['rolloff_kurt'] = [kurtosis(rolloff)]
    strain_feature['rolloff_skew'] = [skew(rolloff)]
    strain_feature['rolloff_min'] = [np.min(rolloff)]
    strain_feature['rolloff_max'] = [np.max(rolloff)]
    strain_feature['rolloff_1%q'] = [np.quantile(rolloff,0.01, interpolation='midpoint')]
    strain_feature['rolloff_5%q'] = [np.quantile(rolloff,0.05,interpolation='midpoint')]
    strain_feature['rolloff_25%q'] = [np.quantile(rolloff,0.25,interpolation='midpoint')]
    strain_feature['rolloff_75%q'] = [np.quantile(rolloff,0.75,interpolation='midpoint')]
    strain_feature['rolloff_95%q'] = [np.quantile(rolloff,0.95,interpolation='midpoint')]
    strain_feature['rolloff_99%q'] = [np.quantile(rolloff,0.99,interpolation='midpoint')]
    
    #spectral_bandwidth
    spec_bw = librosa.feature.spectral_bandwidth(y=np.array(acoustic_data,dtype=np.float64), sr=sample_rate)
    spec_bw = spec_bw[0,:]
    strain_feature['spec_bw_mean'] = [np.mean(spec_bw)]
    strain_feature['spec_bw_std'] = [np.std(spec_bw)]
    strain_feature['spec_bw_kurt'] = [kurtosis(spec_bw)]
    strain_feature['spec_bw_skew'] = [skew(spec_bw)]
    strain_feature['spec_bw_min'] = [np.min(spec_bw)]
    strain_feature['spec_bw_max'] = [np.max(spec_bw)]
    strain_feature['spec_bw_1%q'] = [np.quantile(spec_bw,0.01, interpolation='midpoint')]
    strain_feature['spec_bw_5%q'] = [np.quantile(spec_bw,0.05,interpolation='midpoint')]
    strain_feature['spec_bw_25%q'] = [np.quantile(spec_bw,0.25,interpolation='midpoint')]
    strain_feature['spec_bw_75%q'] = [np.quantile(spec_bw,0.75,interpolation='midpoint')]
    strain_feature['spec_bw_95%q'] = [np.quantile(spec_bw,0.95,interpolation='midpoint')]
    strain_feature['spec_bw_99%q'] = [np.quantile(spec_bw,0.99,interpolation='midpoint')]
    
                          
                
    #Time to failure
    strain_feature['time_to_failure'] = time_to_failure.values[-1]
  
    return strain_feature

Reduced according feature selection

In [4]:
def generate_featureRow(acoustic_data, time_to_failure, sample_rate):
    
    strain_feature = pd.DataFrame()
    strain_feature['mean'] = [acoustic_data.mean()]
    strain_feature['kurt'] = [acoustic_data.kurtosis()]
    strain_feature['skew'] = [acoustic_data.skew()]
    strain_feature['min'] = [acoustic_data.min()]
    strain_feature['max'] = [acoustic_data.max()]
   
    strain_feature['absMax'] = [acoustic_data.abs().max()]
    
    #Compute root-mean-square (RMS) value for each frame
    rms = librosa.feature.rms(y=np.array(acoustic_data,dtype=np.float64),frame_length=len(acoustic_data))
    rms = pd.Series(rms[0,:])
    strain_feature['rms_kurt'] = [rms.kurtosis()]
  
    #Zero Crossing Rate
    '''The zero crossing rate is the rate of sign-changes along a signal, i.e., 
    the rate at which the signal changes from positive to negative or back.'''
    zero_crossings = librosa.zero_crossings(np.array(acoustic_data,dtype=np.float64), pad=False)
    strain_feature['zero_crossings'] = [sum(zero_crossings)]
    
    #Spectral Centroid
    '''Each frame of a magnitude spectrogram is normalized and treated as a distribution
    over frequency bins, from which the mean (centroid) is extracted per frame.So spectral centroid for blues 
    song will lie somewhere near the middle of its spectrum while that for a metal song would be towards its end.'''
    spectral_centroids = librosa.feature.spectral_centroid(np.array(acoustic_data,dtype=np.float64), sr=sample_rate)[0]
    spectral_centroids = pd.Series(spectral_centroids)
    strain_feature['spec_cent_max'] = [spectral_centroids.max()]
    strain_feature['spec_cent_75%q'] = [spectral_centroids.quantile(0.75,interpolation='midpoint')]

    #power spectral density
    f, Pxx_den = signal.welch(acoustic_data, sample_rate)
    strain_feature['psd_kurt'] = [kurtosis(Pxx_den)]
    strain_feature['psd_skew'] = [skew(Pxx_den)]
    strain_feature['psd_min'] = [np.min(Pxx_den)]
    strain_feature['psd_5%q'] = [np.quantile(Pxx_den,0.05,interpolation='midpoint')]
    strain_feature['psd_25%q'] = [np.quantile(Pxx_den,0.25,interpolation='midpoint')]
    strain_feature['psd_75%q'] = [np.quantile(Pxx_den,0.75,interpolation='midpoint')]
   
    #Chroma Frequencies
    '''Chroma features are an interesting and powerful representation for music audio in which the entire 
    spectrum is projected onto 12 bins representing the 12 distinct semitones (or chroma) of the musical octave.'''
    chroma_stft = librosa.feature.chroma_stft(y=np.array(acoustic_data,dtype=np.float64), sr=sample_rate)
   
    strain_feature['chroma_stft_skew'] = [skew(chroma_stft[0])]
    strain_feature['chroma_stft_75%q'] = [np.quantile(chroma_stft,0.75,interpolation='midpoint')]
    
    #spectral_bandwidth
    spec_bw = librosa.feature.spectral_bandwidth(y=np.array(acoustic_data,dtype=np.float64), sr=sample_rate)
    spec_bw = spec_bw[0,:]
    strain_feature['spec_bw_max'] = [np.max(spec_bw)]
    strain_feature['spec_bw_99%q'] = [np.quantile(spec_bw,0.99,interpolation='midpoint')]
  
    #Time to failure
    strain_feature['time_to_failure'] = time_to_failure.values[-1]

    return strain_feature

Window wird durch das Trainingssignal geschoben mit einer Überlappung von step

In [5]:
def generate_Features(train,step,sample_rate):
    
    window_size=150000
    
    init_idx = 0
    features = pd.DataFrame()

    while (init_idx + window_size < len(train)):
        window = train.iloc[init_idx:init_idx + window_size]
        features = features.append(generate_featureRow(window.acoustic_data,window.time_to_failure,sample_rate), ignore_index=True)
        init_idx += step
   
    return features

In [6]:
# https://www.kaggle.com/flaport/linear-regression-on-180-features/notebook

## Feature extrahieren

### Testhalber eine Reihe extrahieren

In [7]:
step = 150000
train_data_test = train_data.iloc[0:151000,:] 

In [8]:
sample_rate = 4000000
features = generate_Features(train_data_test,step,sample_rate)

In [10]:
features

Unnamed: 0,mean,kurt,skew,min,max,absMax,rms_kurt,zero_crossings,spec_cent_max,spec_cent_75%q,...,psd_skew,psd_min,psd_5%q,psd_25%q,psd_75%q,chroma_stft_skew,chroma_stft_75%q,spec_bw_max,spec_bw_99%q,time_to_failure
0,4.8841133333,33.6624812935,-0.0240611666,-98,104,104,-1.9144601086,11112,779718.3984031264,733159.4642193337,...,4.6738462448,5.522e-07,1.1604e-06,1.8106e-06,5.1602e-06,0.3649612855,0.9961892727,593848.0949585186,589247.3423812583,1.4307971859


### Überlappung definieren

In [6]:
sample_rate = 4000000
step = 15000

## Feature extrahieren

In [7]:
features = generate_Features(train_data,step,sample_rate)

In [8]:
features.shape

(41934, 95)

### Generierte Feature sichern

In [9]:
# saving the dataframe
timestamp = time.time()
features.to_csv(feature_data_path+'earthquakeFeatures_'+str(timestamp)+'_.csv', header=True, index=True)

# Feature laden

In [None]:
feature_data_path = ''

In [None]:
features = pd.read_csv(feature_data_path,dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})

## New Feature

 - Sehr interessant für neue Feature: https://towardsdatascience.com/music-genre-classification-with-python-c714d032f0d8 
 - spectral_features https://musicinformationretrieval.com/spectral_features.html
 
 

- Signal: phase, frequency, angular frequency, wavelength and a period. --> new features for mean / std / skew / kurt / percentile https://pythontic.com/visualization/signals/magnitude%20spectrum

 - RMS: root mean square of a signal (Effektivwert): https://de.wikipedia.org/wiki/Effektivwert
 - https://librosa.github.io/librosa/feature.html (RMS/etc..) ()