# 30200 - Mini-Projekte

In [5]:
import pandas as pd
import numpy as np
import scipy
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

%matplotlib inline

In [6]:
# Some common functions
from keras import backend as K

def calculate_metrics(model, X_test, y_test):
    import numpy as np
    from sklearn import metrics

    predicted = np.rint(model.predict(X_test))

    metrics = {
        "Accuracy": metrics.accuracy_score(y_test, predicted),
        "Precession": metrics.precision_score(y_test, predicted),
        "Recall": metrics.recall_score(y_test, predicted),
        "F1-Score": metrics.f1_score(y_test, predicted)
    }

    return metrics

def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

## Erkennen von Rissen auf Betonwänden (Competition 1)

In [7]:
df_train = pd.read_csv('../data/comp1/data_risse.csv', sep=';', header=None)

In [8]:
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,512
0,0.0,3285.80396,1510.82166,2731.94043,0.0,3424.64282,4330.51367,3367.21899,0.0,0.0,...,2432.8252,0.0,0.0,0.0,0.0,2994.42871,0.0,3900.63892,3414.04224,0.0
1,1.0,3043.73804,1565.28137,2659.80981,0.0,3219.83789,4012.70532,3161.57178,0.0,0.0,...,2124.0415,0.0,0.0,0.0,0.0,2892.67065,0.0,3768.45215,3265.97778,0.0
2,0.0,4811.396,2309.01709,4087.53223,0.0,5035.70654,6315.00977,5001.66748,0.0,0.0,...,3479.72388,0.0,0.0,0.0,0.0,4452.56641,0.0,5834.60645,5091.24316,0.0
3,0.0,4507.62842,2074.94629,3750.32056,0.0,4704.63867,5941.25586,4621.97656,0.0,0.0,...,3328.27295,0.0,0.0,0.0,0.0,4115.44922,0.0,5367.67139,4688.30273,0.0
4,0.0,3785.34399,1756.06714,3171.94019,0.0,3948.11133,4985.46045,3906.12256,0.0,0.0,...,2784.45337,0.0,0.0,0.0,0.0,3464.22168,0.0,4522.1626,3963.88428,0.0


In [None]:
df_train[0].describe()

In [None]:
sns.distplot(df_train[0], kde=False, rug=True, bins=3);

### SciKit-Learn Baseline

#### SVM with Parameter Tuning

In [10]:
# Prepare Dataset
X = df_train.iloc[:, 1:]
y = df_train.iloc[:, :1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Construct Classifier
standard_scaler = StandardScaler().fit(X_train)
maxabs_scaler = MaxAbsScaler().fit(X_train)
classifier = SVC()
pipe = Pipeline([('StandardScaler', standard_scaler), ('MaxAbsScaler', maxabs_scaler), ('SVC', classifier)])
    
# Parameter Tuning
params = {
    'SVC__C': scipy.stats.expon(scale=100), 
    'SVC__gamma': scipy.stats.expon(scale=.1),
    'SVC__kernel': ['rbf', 'sigmoid'], 
    'SVC__class_weight': ['balanced', None]
}
tuned_pipe = RandomizedSearchCV(pipe, params, random_state=42, n_iter=50, verbose=1)

# Fit & Calculate Metrics
trained_classifier = tuned_pipe.fit(X_train, y_train.values.ravel())
metrics = calculate_metrics(trained_classifier, X_test, y_test)
metrics

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:   55.0s finished


{'Accuracy': 0.8208333333333333,
 'Precession': 0.8980582524271845,
 'Recall': 0.74,
 'F1-Score': 0.8114035087719298}

#### Trees with Parameter Tuning

In [11]:
# Prepare Dataset
X = df_train.iloc[:, 1:]
y = df_train.iloc[:, :1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Construct Classifier
standard_scaler = StandardScaler().fit(X_train)
maxabs_scaler = MaxAbsScaler().fit(X_train)
classifier = DecisionTreeClassifier()
pipe = Pipeline([('StandardScaler', standard_scaler), ('MaxAbsScaler', maxabs_scaler), ('Tree', classifier)])
    
# Parameter Tuning
params = {
    'Tree__min_weight_fraction_leaf': scipy.stats.expon(scale=.1)
}
tuned_pipe = RandomizedSearchCV(pipe, params, random_state=42, n_iter=5, verbose=1)

# Fit & Calculate Metrics
trained_classifier = tuned_pipe.fit(X_train, y_train.values.ravel())
metrics = calculate_metrics(trained_classifier, X_test, y_test)
metrics

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    1.7s finished


{'Accuracy': 0.7604166666666666,
 'Precession': 0.7424242424242424,
 'Recall': 0.8065843621399177,
 'F1-Score': 0.7731755424063116}

#### RandomForest with Parameter Tuning

* Funktioniert besser ohne Scaler

In [9]:
# Prepare Dataset
X = df_train.iloc[:, 1:]
y = df_train.iloc[:, :1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Construct Classifier
standard_scaler = StandardScaler().fit(X_train)
maxabs_scaler = MaxAbsScaler().fit(X_train)
classifier = RandomForestClassifier()
pipe = Pipeline([('Tree', classifier)])
    
# Parameter Tuning
params = {
    'Tree__min_weight_fraction_leaf': scipy.stats.expon(scale=.1),
    'Tree__max_features': [512]
}
tuned_pipe = RandomizedSearchCV(pipe, params, random_state=42, n_iter=5, verbose=3)

# Fit & Calculate Metrics
trained_classifier = tuned_pipe.fit(X_train, y_train.values.ravel())
metrics = calculate_metrics(trained_classifier, X_test, y_test)
metrics

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] Tree__max_features=512, Tree__min_weight_fraction_leaf=0.04692680899768591 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  Tree__max_features=512, Tree__min_weight_fraction_leaf=0.04692680899768591, score=0.804, total=   3.5s
[CV] Tree__max_features=512, Tree__min_weight_fraction_leaf=0.04692680899768591 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.5s remaining:    0.0s


[CV]  Tree__max_features=512, Tree__min_weight_fraction_leaf=0.04692680899768591, score=0.835, total=   3.6s
[CV] Tree__max_features=512, Tree__min_weight_fraction_leaf=0.04692680899768591 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    7.2s remaining:    0.0s


[CV]  Tree__max_features=512, Tree__min_weight_fraction_leaf=0.04692680899768591, score=0.790, total=   3.5s
[CV] Tree__max_features=512, Tree__min_weight_fraction_leaf=0.04692680899768591 
[CV]  Tree__max_features=512, Tree__min_weight_fraction_leaf=0.04692680899768591, score=0.759, total=   3.5s
[CV] Tree__max_features=512, Tree__min_weight_fraction_leaf=0.04692680899768591 
[CV]  Tree__max_features=512, Tree__min_weight_fraction_leaf=0.04692680899768591, score=0.812, total=   3.6s
[CV] Tree__max_features=512, Tree__min_weight_fraction_leaf=0.3010121430917521 
[CV]  Tree__max_features=512, Tree__min_weight_fraction_leaf=0.3010121430917521, score=0.826, total=   1.2s
[CV] Tree__max_features=512, Tree__min_weight_fraction_leaf=0.3010121430917521 
[CV]  Tree__max_features=512, Tree__min_weight_fraction_leaf=0.3010121430917521, score=0.830, total=   1.2s
[CV] Tree__max_features=512, Tree__min_weight_fraction_leaf=0.3010121430917521 
[CV]  Tree__max_features=512, Tree__min_weight_fraction

[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  1.2min finished


{'Accuracy': 0.8145833333333333,
 'Precession': 0.8695652173913043,
 'Recall': 0.743801652892562,
 'F1-Score': 0.8017817371937639}

In [None]:
RandomForestClassifier?

### Keras Baseline

In [None]:
model = Sequential()
model.add(Dense(units=112, activation='relu', input_dim=512))
model.add(Dense(units=1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='SGD', metrics=['accuracy'])

keras_pipe = Pipeline([
    ('StandardScaler', standard_scaler), 
    ('MaxAbsScaler', maxabs_scaler), 
    ('Model', model)])
keras_pipe = keras_pipe.fit(X_train, y_train, Model__epochs=1000, Model__batch_size=100)
metrics = calculate_metrics(keras_pipe, X_test, y_test)

In [None]:
metrics

In [None]:
import pydot
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

SVG(model_to_dot(model).create(prog='dot', format='svg'))

## Links

* https://machinelearningmastery.com/binary-classification-tutorial-with-the-keras-deep-learning-library/

In [None]:
np.rint(pipe.predict(X_test))

In [None]:
sns.distplot(pipe.predict(X_test));

In [None]:
sns.distplot(keras_pipe.predict(X_test));

In [None]:
X_test.shape

In [None]:
foo = { "hello": "world" }

In [None]:
foo

In [None]:
scipy.stats.expon(scale=100).rvs(size=10)

In [None]:
RandomizedSearchCV?