In [32]:
import pandas as pd
import numpy as np
#data visualization
import matplotlib.pyplot as plt
import librosa

from scipy.stats import kurtosis
from scipy.stats import skew

pd.options.display.precision = 10

## Umgebungsvariablen

In [33]:
feature_data_path = '''C:/studium/studium/CAS_PML/Projekt_Arbeit/earthquake/Daten/all/earthquakeFeatures_ext.csv'''

# Feature laden

In [40]:
train_data = pd.read_csv(feature_data_path, dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})

## Testdaten vorbereiten

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    features.iloc[:,1:93], features.iloc[:,-1], random_state=0, test_size=0.25)

# Create Model Workbench

## Imports

In [8]:
# Scaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

# Model selection
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

# Modell
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor, Pool
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Support Vector Regression (SVR) using linear and non-linear kernels

In [19]:
pipe = Pipeline([('preprocessing', StandardScaler()), ('regressor', SVR())])

In [25]:
param_grid = [
    {'regressor': [SVR(kernel='rbf',epsilon=.1)], 
     'preprocessing': [StandardScaler()],
     'regressor__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
     'regressor__C': [0.001, 0.01, 0.1, 1, 10, 100]}]

In [20]:
param_grid = [
    {'regressor': [SVR(kernel='linear')],
     'preprocessing': [StandardScaler(), RobustScaler()],
     'regressor__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
     'regressor__C': [0.001, 0.01, 0.1, 1, 10, 100]}]

In [15]:
param_grid = [
    {'regressor': [SVR(kernel='poly',epsilon=.1)],
     'preprocessing': [StandardScaler()],
     'regressor__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
     'regressor__degree': [1,2],
     'regressor__C': [0.001, 0.01, 0.1, 1, 10, 100]}]

In [38]:
param_grid = [
    {'regressor': [SVR(kernel='rbf',epsilon=.1)], 
     'preprocessing': [StandardScaler()],
     'regressor__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
     'regressor__C': [0.001, 0.01, 0.1, 1, 10, 100]},
    {'regressor': [SVR(kernel='linear')],
     'preprocessing': [StandardScaler(), RobustScaler()],
     'regressor__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
     'regressor__C': [0.001, 0.01, 0.1, 1, 10, 100]},
    {'regressor': [SVR(kernel='poly',epsilon=.1)],
     'preprocessing': [StandardScaler(), RobustScaler()],
     'regressor__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
     'regressor__degree': [1,2,3],
     'regressor__C': [0.001, 0.01, 0.1, 1, 10, 100]}]

In [None]:
grid = GridSearchCV(pipe, param_grid, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

In [28]:
print('SVR(kernel=rbf')
print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))
print("Test-set: mean absolute error : {:.2f}".format(mean_absolute_error(y_test, grid.predict(X_test))))

SVR(kernel=rbf
Best params:
{'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True), 'regressor': SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.01,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False), 'regressor__C': 100, 'regressor__gamma': 0.01}

Best cross-validation score: -2.11
Test-set score: -2.12
Test-set: mean absolute error : 2.12


In [33]:
print('SVR(kernel=linear')
print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))
print("Test-set: mean absolute error : {:.2f}".format(mean_absolute_error(y_test, grid.predict(X_test))))

SVR(kernel=linear
Best params:
{'preprocessing': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True), 'regressor': SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.001,
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False), 'regressor__C': 100, 'regressor__gamma': 0.001}

Best cross-validation score: -2.19
Test-set score: -2.21
Test-set: mean absolute error : 2.21


In [None]:
print('SVR(kernel=poly')
print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))
print("Test-set: mean absolute error : {:.2f}".format(mean_absolute_error(y_test, grid.predict(X_test))))

# Erkenntnisse

# Model selection
 - Random forrest
 - Other GradientBoost (AdaBoost)

 - Feed Forward NN (with Karas)
 - Reccurent Neuronal Net: LSTM (with Karas)
 - Convolutional NN https://www.kaggle.com/michael422/spectrogram-convolution https://www.kaggle.com/michael422/spectrogram-convolution
 
 
 - CatBoost vs. Light GBM vs. XGBoost https://www.kdnuggets.com/2018/03/catboost-vs-light-gbm-vs-xgboost.html
 - CatBoost-GridSearch https://setscholars.net/2019/02/19/how-to-find-optimal-parameters-for-catboost-using-gridsearchcv-for-regression-in-python/