# Imports

In [44]:
import pandas as pd
import numpy as np

import librosa

from scipy.stats import kurtosis
from scipy.stats import skew
from scipy import signal

import warnings
warnings.filterwarnings('ignore')

# Feature Extraction

In [45]:
# (column suffix, quantile level) pairs shared by every feature family below.
_QUANTILE_LEVELS = (
    ('1%q', 0.01),
    ('5%q', 0.05),
    ('25%q', 0.25),
    ('75%q', 0.75),
    ('95%q', 0.95),
    ('99%q', 0.99),
)


def _midpoint_quantile(values, q):
    """Return the q-quantile of ``values`` using midpoint interpolation."""
    try:
        return np.quantile(values, q, method='midpoint')
    except TypeError:
        # Older NumPy releases only know the keyword under its previous name.
        return np.quantile(values, q, interpolation='midpoint')


def _series_stats(values, prefix=''):
    """Summarize a pandas Series with pandas estimators (15 entries, keys start with ``prefix``)."""
    stats = {
        prefix + 'mean': values.mean(),
        prefix + 'std': values.std(),
        prefix + 'kurt': values.kurtosis(),
        prefix + 'skew': values.skew(),
        prefix + 'min': values.min(),
        prefix + 'max': values.max(),
    }
    for suffix, level in _QUANTILE_LEVELS:
        stats[prefix + suffix] = values.quantile(level, interpolation='midpoint')
    magnitudes = values.abs()
    stats[prefix + 'absMax'] = magnitudes.max()
    stats[prefix + 'absMean'] = magnitudes.mean()
    stats[prefix + 'absStd'] = magnitudes.std()
    return stats


def _array_stats(values, prefix, shape_values=None):
    """Summarize an array with NumPy/SciPy estimators (12 entries, keys start with ``prefix``).

    ``shape_values`` is the array handed to ``kurtosis``/``skew``; it defaults
    to ``values`` and only differs where the caller wants those two statistics
    taken from a sub-array.
    """
    if shape_values is None:
        shape_values = values
    stats = {
        prefix + 'mean': np.mean(values),
        prefix + 'std': np.std(values),
        prefix + 'kurt': kurtosis(shape_values),
        prefix + 'skew': skew(shape_values),
        prefix + 'min': np.min(values),
        prefix + 'max': np.max(values),
    }
    for suffix, level in _QUANTILE_LEVELS:
        stats[prefix + suffix] = _midpoint_quantile(values, level)
    return stats


def generate_featureRow(acoustic_data, sample_rate):
    """Compute a single-row feature table for one acoustic signal segment.

    Parameters
    ----------
    acoustic_data : pandas.Series
        The signal samples. Series methods (``kurtosis``, ``quantile``,
        ``abs`` ...) are called on it directly, so a plain array is not
        accepted.
    sample_rate : number
        Passed as ``sr`` to the librosa spectral features and as the sampling
        frequency to ``scipy.signal.welch``.

    Returns
    -------
    pandas.DataFrame
        One row. Columns, in order: statistics of the raw signal (no prefix),
        ``rms_*``, ``zero_crossings``, ``spec_cent_*``, ``psd_*``,
        ``chroma_stft_*``, ``rolloff_*`` and ``spec_bw_*``.
    """
    # librosa wants a floating point signal; convert once and reuse it.
    samples = np.array(acoustic_data, dtype=np.float64)

    # Collect everything in a dict (insertion-ordered) and build the frame
    # once at the end instead of inserting ~90 columns one at a time.
    features = {}

    # Plain descriptive statistics of the raw signal.
    features.update(_series_stats(acoustic_data))

    # Root-mean-square energy per frame, with the frame length set to the
    # length of the whole signal.
    rms = librosa.feature.rms(y=samples, frame_length=len(acoustic_data))
    features.update(_series_stats(pd.Series(rms[0, :]), 'rms_'))

    # Zero crossings: number of sign changes along the signal.
    features['zero_crossings'] = np.sum(librosa.zero_crossings(samples, pad=False))

    # Spectral centroid per frame. The signal is passed as ``y=`` like in all
    # other librosa feature calls here; newer librosa releases reject it as a
    # positional argument.
    spectral_centroids = librosa.feature.spectral_centroid(y=samples, sr=sample_rate)[0]
    features.update(_series_stats(pd.Series(spectral_centroids), 'spec_cent_'))

    # Power spectral density (Welch). The original Series is passed on purpose
    # so the estimate is computed from exactly the same input as before.
    _, psd = signal.welch(acoustic_data, sample_rate)
    features.update(_array_stats(psd, 'psd_'))

    # Chroma: spectrum projected onto chroma bins (one row per bin).
    chroma_stft = librosa.feature.chroma_stft(y=samples, sr=sample_rate)
    # NOTE(review): kurtosis/skew are taken from the first chroma row only,
    # while the other chroma statistics use the full matrix. Kept as is so the
    # feature values do not change; confirm whether this is intended.
    features.update(_array_stats(chroma_stft, 'chroma_stft_', shape_values=chroma_stft[0]))

    # Spectral roll-off frequency per frame.
    rolloff = librosa.feature.spectral_rolloff(y=samples, sr=sample_rate)[0]
    features.update(_array_stats(rolloff, 'rolloff_'))

    # Spectral bandwidth per frame.
    spec_bw = librosa.feature.spectral_bandwidth(y=samples, sr=sample_rate)[0, :]
    features.update(_array_stats(spec_bw, 'spec_bw_'))

    return pd.DataFrame({name: [value] for name, value in features.items()})

Das Fenster (mit der Grösse 150000) wird mit einer geplanten Überlappung von 80% durch das Trainingssignal geschoben.

# Submission

## Umgebungsvariablen

In [46]:
# File-system locations used by this notebook (absolute, machine-specific).
failure_datapath = "D:/jupyter-notebooks/LANL_Earthquake_Prediction/failure/"
train_data_path = "C:/studium/studium/CAS_PML/Projekt_Arbeit/earthquake/Daten/all/train.csv"
feature_data_path = "C:/studium/studium/CAS_PML/Projekt_Arbeit/earthquake/Daten/all/earthquakeFeatures.csv"

# Model dump that the "load model" cell hands to load_model().
cat_boost_path = "C:/studium/studium/CAS_PML/Projekt_Arbeit/earthquake/Daten/all/catboost.dump"

## load test data

### Function to convert test data into features

In [47]:
def load_test_features(sampleRate):
    """Load the raw test segments and turn them into one feature DataFrame.

    The segment ids are read from the CSV at the module-level
    ``submission_file_path`` (index column ``seg_id``). For every id the file
    ``<test_data_path><seg_id>.csv`` is loaded and its ``acoustic_data``
    column is converted to a feature row by ``generate_featureRow``.
    ``test_data_path`` is the module-level test directory and is expected to
    end with a path separator.

    Parameters
    ----------
    sampleRate : number
        Forwarded unchanged to ``generate_featureRow`` as its sample rate.

    Returns
    -------
    pandas.DataFrame
        One row per segment id, in the order of the submission file, with a
        fresh 0..n-1 index. Empty when the submission file lists no ids.
    """
    print("loading train segment ids...")
    seg_ids = pd.read_csv(submission_file_path, index_col="seg_id").index
    print("converting test segments into feature dataframes. ")

    # Collect the single-row frames and concatenate once at the end:
    # DataFrame.append copied the whole table on every iteration and no
    # longer exists in pandas 2.x.
    feature_rows = []
    for seg_id in seg_ids:
        segment = pd.read_csv(
            filepath_or_buffer=f"{test_data_path}{seg_id}.csv",
            dtype={"acoustic_data": np.int16, "time_to_failure": np.float32})
        feature_rows.append(generate_featureRow(segment.acoustic_data, sampleRate))

    # pd.concat refuses an empty list, so keep the old "empty frame" result.
    if feature_rows:
        features = pd.concat(feature_rows, ignore_index=True)
    else:
        features = pd.DataFrame()

    print("test feature dataframe created")
    return features

In [48]:
# Sample submission CSV; load_test_features reads the segment ids from it.
submission_file_path = "C:/studium/studium/CAS_PML/Projekt_Arbeit/earthquake/Daten/all/sample_submission.csv"
# Directory that holds one CSV per test segment.
test_data_path = "C:/studium/studium/CAS_PML/Projekt_Arbeit/earthquake/Daten/all/test/"

In [49]:
# Sample rate handed to the feature extraction (used there as librosa's `sr`
# and as the sampling frequency for the Welch estimate).
sampleRate = 4_000_000
test_features = load_test_features(sampleRate)

loading train segment ids...
converting test segments into feature dataframes. 


KeyboardInterrupt: 

## load model

In [14]:
# CatBoostRegressor is not imported by any other cell shown in this notebook;
# import it here so the cell also runs on a fresh kernel.
from catboost import CatBoostRegressor

# Restore the previously saved model from the dump at cat_boost_path.
model = CatBoostRegressor()
model.load_model(cat_boost_path);

In [None]:

# Start from the sample submission, indexed by seg_id, so the rows are in the
# same order as the segments that load_test_features iterated over.
submission = pd.read_csv(submission_file_path, index_col="seg_id")
# Predict with the model loaded above on the test feature table built above.
submission['time_to_failure'] = model.predict(test_features)
submission.to_csv('submission.csv')