# Vorarbeiten

In [1]:
import pandas as pd
import numpy as np

## Umgebungsvariablen

In [None]:
# Machine-specific file locations used by this notebook; adjust when running elsewhere.
# Directory of the "failure" data.
failure_datapath = "D:/jupyter-notebooks/LANL_Earthquake_Prediction/failure/"
# CSV with the raw training signal (columns acoustic_data / time_to_failure).
train_data_path = "C:/studium/studium/CAS_PML/Projekt_Arbeit/earthquake/Daten/all/train.csv"
# CSV file the extracted feature table is written to.
feature_data_path = "C:/studium/studium/CAS_PML/Projekt_Arbeit/earthquake/Daten/all/earthquakeFeatures.csv"

## Trainingsdaten laden

In [3]:
# Load the raw training signal; the column dtypes are given explicitly so that
# pandas does not have to infer them.
train_data = pd.read_csv(
    train_data_path,
    dtype={
        'acoustic_data': np.int16,
        'time_to_failure': np.float64,
    },
)

# Feature Extraction

Die Testsegmente bestehen jeweils aus 150000 Messpunkten. Aus diesem Grund setzen wir die Fenstergrösse ebenfalls auf 150000 Messpunkte.

In [42]:
def generate_featureRow(acoustic_data, time_to_failure):
    """Summarise one window of the signal as a flat list of statistics.

    Parameters
    ----------
    acoustic_data : pandas.Series
        Signal values of the window.
    time_to_failure : pandas.Series
        Target values belonging to the same window.

    Returns
    -------
    list
        Fourteen entries in fixed order: mean, standard deviation, kurtosis,
        skewness, minimum, maximum, the 1 %, 5 %, 95 % and 99 % quantiles
        (midpoint interpolation), then maximum, mean and standard deviation
        of the absolute signal, and finally the last ``time_to_failure``
        value of the window, which serves as the target.
    """
    # Absolute signal, shared by the three magnitude statistics below.
    magnitude = acoustic_data.abs()

    tail_quantiles = [
        acoustic_data.quantile(level, interpolation='midpoint')
        for level in (0.01, 0.05, 0.95, 0.99)
    ]

    return [
        acoustic_data.mean(),
        acoustic_data.std(),
        acoustic_data.kurtosis(),
        acoustic_data.skew(),
        acoustic_data.min(),
        acoustic_data.max(),
        *tail_quantiles,
        magnitude.max(),
        magnitude.mean(),
        magnitude.std(),
        # Target: time to failure at the end of the window.
        time_to_failure.values[-1],
    ]

Das Fenster (mit der Grösse 150000) wird mit einer geplanten Überlappung von 80% durch das Trainingssignal geschoben.

In [43]:
def generate_Features(train, step, window_size=150000):
    """Slide a fixed-size window over the training data and extract one feature row per window.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data with the columns ``acoustic_data`` and ``time_to_failure``.
    step : int
        Number of rows the window advances between two consecutive rows of
        the result. Must be positive.
    window_size : int, default 150000
        Number of rows per window.

    Returns
    -------
    pandas.DataFrame
        One row per complete window (float64), as produced by
        ``generate_featureRow``. Empty when ``train`` is shorter than
        ``window_size``.

    Raises
    ------
    ValueError
        If ``step`` is not positive (the window would never advance).
    """
    if step <= 0:
        raise ValueError("step must be a positive integer, got {!r}".format(step))

    # Last start index that still yields a complete window. The bound is
    # inclusive, so a window ending exactly on the final row is not dropped.
    last_start = len(train) - window_size

    rows = []
    for start in range(0, last_start + 1, step):
        window = train.iloc[start:start + window_size]
        rows.append(generate_featureRow(window.acoustic_data, window.time_to_failure))

    return pd.DataFrame(rows, dtype=np.float64)

In [67]:
# Advance the 150000-row window by 20 % of its length (30000 rows) so that
# consecutive windows overlap by the planned 80 %. A step of 100 would move the
# window by only 100 rows per feature row, which makes the extraction
# impractically slow.
step = 30000
features = generate_Features(train_data, step)

KeyboardInterrupt: 

In [58]:
# Sanity check: (number of extracted windows, number of columns) of the feature table.
features.shape

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,4.884113,5.101106,33.662481,-0.024061,-98,104,-8.0,-2.0,11.0,18.0,104,5.576567,4.333325
1,4.693913,6.586175,98.883014,0.400110,-154,181,-11.0,-2.0,12.0,20.0,181,5.708033,5.729643
2,4.830240,5.696757,33.914544,-0.169765,-115,111,-11.0,-2.0,12.0,21.0,115,5.725880,4.795674
3,4.927573,7.889741,81.215533,0.714251,-199,197,-16.0,-3.0,13.0,26.0,199,6.264413,6.876483
4,4.958067,6.640955,58.342651,0.163228,-125,145,-12.0,-2.0,12.0,22.0,145,5.965053,5.753501
5,4.990007,7.334238,59.370621,0.054065,-144,145,-15.0,-2.0,12.0,25.0,145,6.164487,6.378890
6,4.841033,5.379147,20.573419,0.022897,-71,107,-10.0,-2.0,12.0,20.0,107,5.679860,4.484413
7,4.728547,5.847489,30.998860,0.195492,-89,120,-12.0,-2.0,12.0,22.0,120,5.682480,4.925610
8,4.531267,6.114053,71.093381,-0.311078,-145,139,-11.0,-2.0,11.0,20.0,145,5.478947,5.281581
9,4.698693,7.280849,66.665490,-0.121489,-156,168,-15.0,-3.0,13.0,24.0,168,6.022293,6.230600


In [None]:
# Persist the extracted feature table (values only: no header row, no index column).
# The table is named `features` in this notebook; `df` is never defined.
features.to_csv(feature_data_path, header=False, index=False)

# Create Model Workbench

## Imports

In [59]:
# Scaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

# Model selection
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Modell
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

## RandomForest and GradientBoosting

In [60]:
# Two-step pipeline: a scaler followed by the regressor. The step names
# ('preprocessing', 'classifier') are the keys the parameter grid refers to.
pipe = Pipeline(steps=[
    ('preprocessing', StandardScaler()),
    ('classifier', GradientBoostingRegressor()),
])

In [63]:
# Search space for the grid search: a single gradient-boosting configuration
# (1000 depth-1 trees) with the preprocessing step switched off (None).
# NOTE(review): newer scikit-learn releases renamed loss='ls' to
# 'squared_error' - verify against the installed version before upgrading.
param_grid = [
    {
        'classifier': [
            GradientBoostingRegressor(
                n_estimators=1000,
                learning_rate=0.1,
                max_depth=1,
                random_state=0,
                loss='ls',
            ),
        ],
        'preprocessing': [None],
    },
]

In [61]:
# Inactive alternative grid, kept as a bare string literal: it is never
# assigned, so `param_grid` is unaffected by this cell. The names SVC and
# RandomForestClassifier it mentions are not imported in the import cell of
# this notebook.
'''param_grid = [
    {'classifier': [SVC()], 'preprocessing': [StandardScaler(), None],
     'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
     'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]},
    {'classifier': [RandomForestClassifier(n_estimators=100)],
     'preprocessing': [None], 'classifier__max_features': [1, 2, 3]}]'''

In [66]:
# Split the feature table into inputs and target. The target (time to failure)
# is the last column; every other column is a feature. Slicing relative to the
# end keeps the split correct if the number of extracted features changes
# (a hard-coded column count would silently drop a feature or pull the target
# into X).
# NOTE(review): the rows come from overlapping windows, so a random split
# probably puts near-duplicate windows into both train and test - confirm
# whether a contiguous / grouped split is more appropriate.
X_train, X_test, y_train, y_test = train_test_split(
    features.iloc[:, :-1], features.iloc[:, -1], random_state=0)

# 5-fold cross-validated grid search, scored by negative mean absolute error
# (values closer to 0 are better).
grid = GridSearchCV(pipe, param_grid, scoring='neg_mean_absolute_error', cv=5)
grid.fit(X_train, y_train)

print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))

Best params:
{'classifier': GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=1, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=0, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False), 'preprocessing': None}

Best cross-validation score: -2.15
Test-set score: -2.13
