In [1]:
import pandas as pd
import numpy as np
#data visualization
import matplotlib.pyplot as plt
import librosa

from scipy.stats import kurtosis
from scipy.stats import skew

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

pd.options.display.precision = 10

## Umgebungsvariablen

In [2]:
#feature generated 24.03.2019
earthquake_daten = '''C:/studium/studium/CAS_PML/Projekt_Arbeit/earthquake/Daten/earthquake_data/'''
feature_62900_94 = 'Features_62900-94.csv'
feature_41934_94 = 'Features_41934_94.csv'
Features_4194_94 = 'Features_4194_94.csv'

# Feature laden

In [3]:
train_data = pd.read_csv(earthquake_daten+Features_4194_94 )

In [4]:
train_data_reduced = train_data.filter(items=['spec_cent_75%q',
 'min',
 'mean',
 'max',
 'zero_crossings',
 'skew',
 'absMax',
 'rolloff_min',
 'chroma_stft_std',
 'kurt',
 'spec_cent_min',
 'spec_cent_95%q',
 'spec_bw_min',
 'spec_bw_75%q',
 'rolloff_75%q',
 'chroma_stft_skew',
 'chroma_stft_mean',
 'psd_skew',
 'chroma_stft_min',
 'chroma_stft_kurt',
 'psd_75%q',
 'psd_kurt',
 'rolloff_max',
 'psd_25%q',
 'chroma_stft_75%q',
 'spec_cent_max',
 'rolloff_95%q',
 'spec_cent_99%q',
 'psd_max',
 'spec_bw_99%q',
 'spec_bw_95%q',
 'absMean',
 'spec_bw_25%q',
 'chroma_stft_25%q',
 'spec_bw_max',
 'rolloff_99%q',
 'spec_cent_25%q',
 'psd_5%q',
 'spec_bw_kurt',
 'spec_cent_std',
 'time_to_failure'])

## Testdaten vorbereiten

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    train_data_reduced.iloc[:,0:40], train_data_reduced.iloc[:,-1], random_state=0, test_size=0.25)

# Create Model Workbench

## Imports

In [6]:
# Scaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Model selection
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

# Modell
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor, Pool
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Support Vector Regression (SVR) using linear and non-linear kernels

In [11]:
pipe = Pipeline([('preprocessing', StandardScaler()), ('regressor', SVR())])

In [20]:
param_grid = [
    {'regressor': [SVR(kernel='rbf')], 
     'preprocessing': [StandardScaler(),MinMaxScaler()],
     'regressor__epsilon': [0.1],
     'regressor__gamma': [0.008,0.01,0.03,0.05],
     'regressor__C': [ 0.9, 1, 1.2, 1,5]}]

In [21]:
param_grid = [
    {'regressor': [SVR(kernel='linear')],
     'preprocessing': [StandardScaler()],
     'regressor__gamma': [0.1, 1, 10],
      'regressor__C': [0.1,1, 10, 100]}]

In [24]:
param_grid = [
    {'regressor': [SVR(kernel='poly')],
      'preprocessing': [StandardScaler()],
     'regressor__gamma': [0.1, 1, 10],
     'regressor__degree': [1,2],
     'regressor__C': [0.1,1, 10, 100]}]

In [28]:
param_grid = [
    {'regressor': [SVR(kernel='rbf',epsilon=.1)], 
     'preprocessing': [StandardScaler()],
     'regressor__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
     'regressor__C': [0.001, 0.01, 0.1, 1, 10, 100]},
    {'regressor': [SVR(kernel='linear')],
     'preprocessing': [StandardScaler()],
     'regressor__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
     'regressor__C': [0.001, 0.01, 0.1, 1, 10, 100]},]

In [None]:
grid = GridSearchCV(pipe, param_grid, scoring='neg_mean_absolute_error', cv=5,n_jobs=-1 )
grid.fit(X_train, y_train)

In [18]:
param_grid = [
    {'regressor': [SVR(kernel='rbf')],
      'preprocessing': [StandardScaler()],
     'regressor__gamma': [0.1, 1, 10],
     'regressor__C': [0.1,1, 10, 100]}]

grid = GridSearchCV(pipe, param_grid, scoring='neg_mean_absolute_error', cv=5,n_jobs=-1 )
grid.fit(X_train, y_train)

#26.03.2018 mit 40 Features
print('SVR(kernel=rbf')
print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))
print("Test-set: mean absolute error : {:.2f}".format(mean_absolute_error(y_test, grid.predict(X_test))))

SVR(kernel=rbf
Best params:
{'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True), 'regressor': SVR(C=1, cache_size=200, coef0=0.0, degree=3, epsilon=0.5, gamma=0.01,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False), 'regressor__C': 1, 'regressor__epsilon': 0.5, 'regressor__gamma': 0.01}

Best cross-validation score: -2.05
Test-set score: -2.05
Test-set: mean absolute error : 2.05


In [23]:
param_grid = [
    {'regressor': [SVR(kernel='linear')],
      'preprocessing': [StandardScaler()],
     'regressor__gamma': [0.1, 1, 10],
     'regressor__degree': [1,2],
     'regressor__C': [0.1,1, 10, 100]}]

grid = GridSearchCV(pipe, param_grid, scoring='neg_mean_absolute_error', cv=5,n_jobs=-1 )
grid.fit(X_train, y_train)

#26.03.2018 mit 40 Features
print('SVR(kernel=linear')
print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))
print("Test-set: mean absolute error : {:.2f}".format(mean_absolute_error(y_test, grid.predict(X_test))))

SVR(kernel=linear
Best params:
{'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True), 'regressor': SVR(C=0.1, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False), 'regressor__C': 0.1, 'regressor__gamma': 0.1}

Best cross-validation score: -2.11
Test-set score: -2.09
Test-set: mean absolute error : 2.09


In [None]:
#26.03.2018 mit 40 Features
print('SVR(kernel=poly')
print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))
print("Test-set: mean absolute error : {:.2f}".format(mean_absolute_error(y_test, grid.predict(X_test))))

### print('SVR(kernel=poly')
print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))
print("Test-set: mean absolute error : {:.2f}".format(mean_absolute_error(y_test, grid.predict(X_test))))

# Erkenntnisse

# Model selection
 - Random forrest
 - Other GradientBoost (AdaBoost)

 - Feed Forward NN (with Karas)
 - Reccurent Neuronal Net: LSTM (with Karas)
 - Convolutional NN https://www.kaggle.com/michael422/spectrogram-convolution https://www.kaggle.com/michael422/spectrogram-convolution
 
 
 - CatBoost vs. Light GBM vs. XGBoost https://www.kdnuggets.com/2018/03/catboost-vs-light-gbm-vs-xgboost.html
 - CatBoost-GridSearch https://setscholars.net/2019/02/19/how-to-find-optimal-parameters-for-catboost-using-gridsearchcv-for-regression-in-python/