# Import libraries

In [None]:
import os
import numpy as np
import pandas as pd
import random
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import auc, roc_curve
from sklearn.metrics import classification_report
from glob import glob
import librosa
import librosa.display
import matplotlib.pyplot as plt
# Seed Python's and NumPy's global random generators so repeated runs of the
# notebook draw the same pseudo-random numbers.
random.seed(42)
np.random.seed(42)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Google Drive

In [None]:
# Mount Google Drive at /content/drive; the Vietnamese bee recordings read
# further down are addressed through /content/drive/MyDrive/...
from google.colab import drive
drive.mount('/content/drive')

# Feature extraction

In [None]:
# Functions to read all files and put into dictionaries
def read_data_vn(root_path='', labels=None):
    """Extract MFCC features from every ``.wav`` file found under ``root_path``.

    The directory tree is walked recursively and every file receives the
    same label, the string ``'0'``.

    Args:
        root_path: Directory to search for ``.wav`` files.
        labels: Unused; accepted only so existing callers keep working.

    Returns:
        A ``(features, outputs)`` tuple of equal-length lists holding one
        ``extract_mfcc`` feature vector and one ``'0'`` label per file, in
        ``os.walk`` order.
    """
    features = []
    outputs = []
    for root, _dirs, files in os.walk(root_path):
        for item in files:
            if item.endswith('.wav'):
                features.append(extract_mfcc(os.path.join(root, item)))
                # NOTE(review): the label is the *string* '0'; concatenating
                # it with integer label arrays yields a string array.
                # Confirm that this is intended.
                outputs.append('0')
    return features, outputs

In [None]:
# Functions to get extraction features

def extract_mfcc(filename, n_mfcc=40):
    """Summarise an audio file by the mean and std of its MFCCs.

    Args:
        filename: Path of the audio file, loaded with ``librosa.load``
            defaults.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        1-D array: the per-coefficient means over time followed by the
        per-coefficient standard deviations over time.
    """
    y, sr = librosa.load(filename)
    # The signal must be passed by keyword: librosa >= 0.10 made the audio
    # argument keyword-only, and ``y=`` is accepted by older releases too.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    # axis=1 collapses the time axis, leaving one statistic per coefficient.
    mfcc_mean = mfcc.mean(axis=1)
    mfcc_std = mfcc.std(axis=1)
    return np.hstack([mfcc_mean, mfcc_std])

def extract_spectral_contrast(filename, n_bands=3):
    """Summarise an audio file by the mean and std of its spectral contrast.

    Args:
        filename: Path of the audio file, loaded with ``librosa.load``
            defaults.
        n_bands: Number of frequency bands passed to
            ``librosa.feature.spectral_contrast``.

    Returns:
        1-D array: the row-wise means followed by the row-wise standard
        deviations of the spectral-contrast matrix.
    """
    signal, rate = librosa.load(filename)
    contrast = librosa.feature.spectral_contrast(y=signal, sr=rate,
                                                 n_bands=n_bands)
    # Reduce along axis 1, then stack the two summaries side by side.
    return np.hstack([contrast.mean(axis=1).T, contrast.std(axis=1).T])

In [None]:
# Functions to read all files and put into dictionaries
def read_data(root_path='E:\\Download\\sounds\\BUZZ1\\sounds\\BUZZ2\\validation',
              labels=None):
    """Collect the ``.wav`` files stored under one sub-directory per label.

    Args:
        root_path: Directory containing one sub-directory per label.
        labels: Iterable of sub-directory names to scan; each name is also
            the label given to the files found below it. ``None`` (the
            default) scans nothing.

    Returns:
        Dict with two parallel lists: ``'dir'`` (file paths) and
        ``'labels'`` (the label of each path).
    """
    data = {'dir': [],
            'labels': []}
    for label in labels or ():
        buzz_path = os.path.join(root_path, label)
        for root, _dirs, files in os.walk(buzz_path):
            for item in files:
                if item.endswith('.wav'):
                    data['dir'].append(os.path.join(root, item))
                    # Always record the label together with the path so the
                    # two lists cannot drift out of step for labels other
                    # than bee/noise/cricket.
                    data['labels'].append(label)
    return data

# When we have dictionary, we get extraction features
def get_extraction_data(root_path, subset='train', method='mfcc'):
    """Extract one feature vector per audio file of a dataset split.

    Args:
        root_path: Dataset root; files are read from
            ``root_path/subset/<label>/`` for the labels bee, cricket, noise.
        subset: Name of the split sub-directory.
        method: ``'mfcc'``, ``'spectral'`` or ``'melspectrogram'``; any other
            value falls back to chroma features.

    Returns:
        ``(features, outputs)``: a list of feature vectors and the list of
        their string labels, in the order returned by ``read_data``.
    """
    labels = ['bee', 'cricket', 'noise']
    data = read_data(os.path.join(root_path, subset), labels)
    features = []
    outputs = []

    for filename, label in zip(data['dir'], data['labels']):
        if method == 'mfcc':
            extract_feature = extract_mfcc(filename)
        elif method == 'spectral':
            extract_feature = extract_spectral_contrast(filename)
        elif method == 'melspectrogram':
            y, sr = librosa.load(filename)
            # Keyword ``y=`` is required: librosa >= 0.10 rejects positional
            # audio arguments.
            mel = librosa.feature.melspectrogram(y=y, sr=sr)
            extract_feature = np.mean(mel.T, axis=0)
        else:
            # Fallback: chroma features from the magnitude STFT.
            y, sr = librosa.load(filename)
            s = np.abs(librosa.stft(y))
            extract_feature = np.mean(librosa.feature.chroma_stft(S=s, sr=sr).T, axis=0)
        features.append(extract_feature)
        outputs.append(label)
    return features, outputs

In [None]:
from sklearn import preprocessing
# PUT IT ALL TOGETHER
def get_all_data(root_path='E:\\Download\\sounds\\BUZZ2\\'):
    """Extract features for the train/test/val splits, encode labels, save.

    The label encoder is fitted on the training labels only and reused for
    the other two splits. All six arrays are written with ``np.save`` under
    the ``2_*`` names before being returned.

    Returns:
        ``(X_train, y_train, X_test, y_test, X_val, y_val)``.
    """
    encoder = preprocessing.LabelEncoder()

    train_feats, train_labels = get_extraction_data(root_path, subset='train')
    encoder.fit(train_labels)
    X_train = np.asarray(train_feats)
    y_train = np.asarray(encoder.transform(train_labels))

    test_feats, test_labels = get_extraction_data(root_path, subset='test')
    X_test = np.asarray(test_feats)
    y_test = np.asarray(encoder.transform(test_labels))

    val_feats, val_labels = get_extraction_data(root_path, subset='val')
    X_val = np.asarray(val_feats)
    y_val = np.asarray(encoder.transform(val_labels))

    # Persist every array so later sessions can skip the slow extraction.
    to_save = (
        ('2_X_train', X_train),
        ('2_y_train', y_train),
        ('2_X_val', X_val),
        ('2_y_val', y_val),
        ('2_X_test', X_test),
        ('2_y_test', y_test),
    )
    for target, array in to_save:
        np.save(target, array)
    return X_train, y_train, X_test, y_test, X_val, y_val

# Read data

In [None]:
# If we do NOT have the saved data yet, run the full extraction over all files:
# X_train, y_train, X_test, y_test, X_val, y_val = get_all_data()

# OR, if the arrays were already saved, just load them.
# NOTE(review): get_all_data() saves '2_X_train.npy' etc. without the '_mfcc'
# suffix used here -- presumably the files were renamed by hand; confirm.
X_train = np.load('2_X_train_mfcc.npy')
X_test = np.load('2_X_test_mfcc.npy')
X_val = np.load('2_X_val_mfcc.npy')

y_train = np.load('2_y_train_mfcc.npy')
y_test = np.load('2_y_test_mfcc.npy')
y_val = np.load('2_y_val_mfcc.npy')

# MFCC features of the extra recordings on Google Drive; read_data_vn labels
# every file with the string '0' and ignores its `labels` argument.
X_train_vn, y_train_vn = read_data_vn(root_path='/content/drive/MyDrive/Dataset/Sounds/Bee_VN_add_BUZZ2/train_200', labels=['0'])
X_test_vn, y_test_vn = read_data_vn(root_path='/content/drive/MyDrive/Dataset/Sounds/Bee_VN_add_BUZZ2/test_800', labels=['0'])
X_val_vn, y_val_vn = read_data_vn(root_path='/content/drive/MyDrive/Dataset/Sounds/Bee_VN_add_BUZZ2/val_1600', labels=['0'])

# From here on the *_vn names hold the loaded split followed by the extra
# recordings (loaded rows first, extra rows appended).
# NOTE(review): the loaded y_* arrays are presumably integer-encoded while the
# appended labels are strings, so the concatenated y_*_vn arrays would come out
# as strings -- confirm that downstream code expects that.
X_train_vn = np.concatenate([X_train, X_train_vn], axis=0)
X_test_vn = np.concatenate([X_test, X_test_vn], axis=0)
X_val_vn = np.concatenate([X_val, X_val_vn], axis=0)
y_train_vn = np.concatenate([y_train, y_train_vn], axis=0)
y_test_vn = np.concatenate([y_test, y_test_vn], axis=0)
y_val_vn = np.concatenate([y_val, y_val_vn], axis=0)

# Sanity check: shapes of the loaded splits (without the extra recordings).
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
print(X_val.shape)
print(y_val.shape)

In [None]:
# Pool the train and test splits row-wise; the pooled arrays are what several
# models below are fitted on before being scored on the validation split.
X = np.concatenate((X_train, X_test))
y = np.concatenate((y_train, y_test))

# Feature selection

## SelectFromModel

In [None]:
# from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
# from sklearn.feature_selection import SelectFromModel
# clf = ExtraTreesClassifier(n_estimators=100)
# clf = clf.fit(X, y)
# model = SelectFromModel(estimator=clf, prefit=True,
                        # max_features=0.5)

# X_train = model.transform(X_train)
# X_val = model.transform(X_val)
# X_test = model.transform(X_test)
# X = model.transform(X)

## RFECV

In [None]:
# from sklearn.feature_selection import RFECV
# from sklearn.ensemble import ExtraTreesClassifier
# rfe = RFECV(estimator=ExtraTreesClassifier(),
#             cv=5, scoring='accuracy', step=0.3)
# rfe.fit(X_train, y_train)
# X_train = rfe.transform(X_train)
# X_val = rfe.transform(X_val)
# X_test = rfe.transform(X_test)

## Select KBest

In [None]:
# from sklearn.feature_selection import SelectKBest, chi2
# kbest = SelectKBest(chi2, k=20).fit(X, y)

# X = kbest.transform(X)
# X_train = kbest.transform(X_train)
# X_test = kbest.transform(X_test)
# X_val = kbest.transform(X_val)
# X_train = SelectKBest(chi2, k=20).fit_transform(X, y)

## Sequential Selection

In [None]:
# from sklearn.feature_selection import SequentialFeatureSelector

# clf = ExtraTreesClassifier(random_state=42)
# sfs = SequentialFeatureSelector(clf,
#                                 n_features_to_select=3)
# sfs.fit(X, y)

# X = sfs.transform(X)
# X_train = sfs.transform(X_train)
# X_test = sfs.transform(X_test)
# X_val = sfs.transform(X_val)

# Ensemble feature selection (NOT FINISHED)

In [None]:
# from sklearn.feature_selection import RFE
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import f_classif
# from sklearn.pipeline import Pipeline

# models = []
# fs = SelectKBest(score_func=f_classif, k=20)
# models.append(('fs', fs))
# rfe = RFE(estimator=ExtraTreesClassifier(), n_features_to_select=20)
# models.append(('rfe', rfe))

# Tuning models

In [None]:
def search_for_best_models(clf, params, X_train, y_train, X_test, y_test,
                           scoring='accuracy', cv=5, n_iter=10,
                           random_state=42):
    """Run a cross-validated randomized hyper-parameter search.

    The train and test splits are concatenated and the search cross-validates
    on the pooled data.

    Args:
        clf: Estimator to tune.
        params: Parameter distributions/lists for ``RandomizedSearchCV``.
        X_train, y_train, X_test, y_test: Splits pooled for the search.
        scoring: Scoring metric passed to the search.
        cv: Cross-validation setting passed to the search.
        n_iter: Number of parameter settings sampled (previously fixed at 10).
        random_state: Seed of the parameter sampler (previously fixed at 42).

    Returns:
        ``(best_params_, best_estimator_)`` of the fitted search.
    """
    X = np.concatenate([X_train, X_test], axis=0)
    y = np.concatenate([y_train, y_test], axis=0)
    optimal_models = RandomizedSearchCV(clf,
                                        param_distributions=params,
                                        cv=cv, n_jobs=-1,
                                        random_state=random_state,
                                        n_iter=n_iter,
                                        scoring=scoring)
    optimal_models.fit(X, y)

    return (optimal_models.best_params_,
            optimal_models.best_estimator_)

# SVM

In [None]:
import scipy
svm_params = {'C': scipy.stats.expon(scale=1.),
              'gamma': scipy.stats.expon(scale=.1),
              'kernel': ['rbf'], 'class_weight':[None]}
optimal_params, _ = search_for_best_models(SVC(),
                                           params=svm_params,
                                           X_train=X_train,
                                           y_train=y_train,
                                           X_test=X_test,
                                           y_test=y_test)


In [None]:
optimal_params

In [None]:
model = SVC(C=optimal_params['C'],
          gamma=optimal_params['gamma'],
          kernel=optimal_params['kernel'],
          class_weight=optimal_params['class_weight'],
          random_state=42)
model.fit(X_train, y_train)
y_predict = model.predict(X_val)
print('Accuracy Score is {:.5}'.format(accuracy_score(y_val, y_predict)))
print(classification_report(y_val, y_predict))

# Decision Tree

In [None]:
params = {"max_depth": ['None', 2, 3, 5, 10, 15],
        "min_samples_split": [2, 3, 5, 10],
        "min_samples_leaf": [1, 2, 3, 5],
      }

optimal_params, optimal_models = search_for_best_models(clf=DecisionTreeClassifier(),
                                                         params=params,
                                                         X_train=X_train,
                                                         y_train=y_train,
                                                        X_test=X_test,
                                                        y_test=y_test)

In [None]:
print(optimal_params)

In [None]:
model = DecisionTreeClassifier(max_depth=optimal_params['max_depth'],
                               min_samples_split=optimal_params['min_samples_split'],
                               min_samples_leaf=optimal_params['min_samples_leaf'],
                               random_state=42)

In [None]:
model.fit(X, y)

In [None]:
y_preds = model.predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_preds))

## ROC CURVE Decision Tree

In [None]:
from sklearn import metrics
import seaborn as sns
n_classes = 3

# Per-class scores of the fitted model on the validation split.
y_score = model.predict_proba(X_val)
# y_score = model.decision_function(X_test)

# One-vs-rest ROC: one indicator column per class, scored against the
# matching column of y_score.
y_val_dummies = pd.get_dummies(y_val, drop_first=False).values
fpr, tpr, roc_auc = {}, {}, {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_val_dummies[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

fig, ax = plt.subplots(figsize=(12, 9))
ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
ax.set(xlim=[0.0, 1.0], ylim=[0.0, 1.05],
       xlabel='False Positive Rate', ylabel='True Positive Rate',
       title='Receiver operating characteristic example')
for i in range(n_classes):
    curve_label = 'ROC curve (area = %0.2f) for label %i' % (roc_auc[i], i)
    ax.plot(fpr[i], tpr[i], label=curve_label)
ax.legend(loc="best")
ax.grid(alpha=.4)
sns.despine()
plt.show()

# Extra Tree

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
random.seed(42)
np.random.seed(42)
params = {
    'n_estimators':[50, 100, 200,
                    300, 500, 1000],
    "max_features": ['auto', 'sqrt', 'None', 2, 4, 5, 9, 10, 15, 18],
    'max_depth':[None, 2, 5, 8, 10],
    'min_samples_split': [2, 3, 5],
    'criterion': ['gini', 'entropy']
}

In [None]:
optimal_params, optimal_models = search_for_best_models(
    clf=RandomForestClassifier(), params=params,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test)
model = ExtraTreesClassifier(n_estimators=optimal_params['n_estimators'],
                              max_depth=optimal_params['max_depth'],
                              max_features=optimal_params['max_features'],
                               min_samples_leaf=optimal_params['min_samples_split'],
                               random_state=42)

In [None]:
print(optimal_params)

In [None]:
model.fit(X, y)

In [None]:
y_preds = model.predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_preds))

## ROC CURVE Extra Tree

In [None]:
from sklearn import metrics
import seaborn as sns
n_classes = 3

# Per-class scores of the fitted model on the validation split.
y_score = model.predict_proba(X_val)
# y_score = model.decision_function(X_test)

# One-vs-rest ROC: one indicator column per class, scored against the
# matching column of y_score.
y_val_dummies = pd.get_dummies(y_val, drop_first=False).values
fpr, tpr, roc_auc = {}, {}, {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_val_dummies[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

fig, ax = plt.subplots(figsize=(12, 9))
ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
ax.set(xlim=[0.0, 1.0], ylim=[0.0, 1.05],
       xlabel='False Positive Rate', ylabel='True Positive Rate',
       title='Receiver operating characteristic example')
for i in range(n_classes):
    curve_label = 'ROC curve (area = %0.2f) for label %i' % (roc_auc[i], i)
    ax.plot(fpr[i], tpr[i], label=curve_label)
ax.legend(loc="best")
ax.grid(alpha=.4)
sns.despine()
plt.show()

# Random Forest

In [None]:
random.seed(42)
np.random.seed(42)
params = {
    'n_estimators':[50, 100, 200,
                    300, 500, 1000],
    "max_features": ['auto', 'sqrt', 'None', 2, 4, 5, 9, 10, 15, 18],
    'max_depth':[None, 2, 5, 8, 10],
    'min_samples_split': [2, 3, 5],
    'criterion': ['gini', 'entropy']
}
optimal_params, optimal_models = search_for_best_models(
    clf=RandomForestClassifier(), params=params,
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
model = RandomForestClassifier(n_estimators=optimal_params['n_estimators'],
                              max_depth=optimal_params['max_depth'],
                              max_features=optimal_params['max_features'],
                               min_samples_leaf=optimal_params['min_samples_split'],
                               random_state=42)

model.fit(X, y)

In [None]:
print(optimal_params)

In [None]:
y_predict = model.predict(X_val)
print('Accuracy Score is {:.5}'.format(accuracy_score(y_val, y_predict)))
print(classification_report(y_val, y_predict))

## ROC CURVE RandomForest

In [None]:
from sklearn import metrics
import seaborn as sns
n_classes = 3

# Per-class scores of the fitted model on the validation split.
y_score = model.predict_proba(X_val)
# y_score = model.decision_function(X_test)

# One-vs-rest ROC: one indicator column per class, scored against the
# matching column of y_score.
y_val_dummies = pd.get_dummies(y_val, drop_first=False).values
fpr, tpr, roc_auc = {}, {}, {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_val_dummies[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

fig, ax = plt.subplots(figsize=(12, 9))
ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
ax.set(xlim=[0.0, 1.0], ylim=[0.0, 1.05],
       xlabel='False Positive Rate', ylabel='True Positive Rate',
       title='Receiver operating characteristic example')
for i in range(n_classes):
    curve_label = 'ROC curve (area = %0.2f) for label %i' % (roc_auc[i], i)
    ax.plot(fpr[i], tpr[i], label=curve_label)
ax.legend(loc="best")
ax.grid(alpha=.4)
sns.despine()
plt.show()

# XGBoost

In [None]:
param_grid = {
     'max_depth': [2, 4, 5, 8, 12, 15],
     'learning_rate': [0.001, 0.01, 0.1, 0.2],
     'gamma': [0.01, 0.1, 0.25, 0.5],
     'reg_lambda': [10.0, 20., 50., 100.],
      'scale_pos_weight': [1]}
optimal_params = RandomizedSearchCV(
    estimator=XGBClassifier(
    seed=42, subsample=0.9,
    colsample_bytree=0.5),
    param_distributions=param_grid,
    scoring='accuracy',
    n_jobs=-1, cv=5)
optimal_params.fit(X, y)
params = optimal_params.best_params_
model = XGBClassifier(
                        gamma=params['gamma'],
                        learn_rate=params['learning_rate'],
                        max_depth=params['max_depth'],
                        reg_lambda=params['reg_lambda'],
                        scale_pos_weight=1,
                        subsample=0.9,
                        colsample_bytree=0.5,
                        seed=42,
                        n_jobs=4)
model.fit(X_train, y_train)

In [None]:
print(params)

In [None]:
y_predict = model.predict(X_val)
print('Accuracy Score is {:.5}'.format(accuracy_score(y_val, y_predict)))
print(classification_report(y_val, y_predict))

# Logistic Regression

In [None]:
param_dist = {
    'penalty' : ['None', 'l1', 'l2'],
    'C' : [0.001, 0.01, 0.1, 0.5, 1.0, 1.5, 2.0]
}
lr_model = LogisticRegression(multi_class='multinomial',
                           solver='lbfgs')
optimal_params, optimal_model = search_for_best_models(clf=lr_model,
                                                       X_train=X_train,
                                                       y_train=y_train,
                                                       X_test=X_test,
                                                       y_test=y_test,
                                                       params=param_dist)
lr_model = LogisticRegression(multi_class='multinomial',
                           solver='lbfgs',
                           penalty=optimal_params['penalty'],
                           C=optimal_params['C'])
lr_model.fit(X, y)
y_preds = lr_model.predict(X_val)
print(classification_report(y_true=y_val,
                            y_pred=y_preds))


In [None]:
print(optimal_params)

## ROC CURVE Logistic Regression

In [None]:
from sklearn import metrics
import seaborn as sns
n_classes = 3

# Per-class scores of the fitted logistic regression on the validation split.
y_score = lr_model.predict_proba(X_val)
# y_score = model.decision_function(X_test)

# One-vs-rest ROC: one indicator column per class, scored against the
# matching column of y_score.
y_val_dummies = pd.get_dummies(y_val, drop_first=False).values
fpr, tpr, roc_auc = {}, {}, {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_val_dummies[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

fig, ax = plt.subplots(figsize=(12, 9))
ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
ax.set(xlim=[0.0, 1.0], ylim=[0.0, 1.05],
       xlabel='False Positive Rate', ylabel='True Positive Rate',
       title='Receiver operating characteristic example')
for i in range(n_classes):
    curve_label = 'ROC curve (area = %0.2f) for label %i' % (roc_auc[i], i)
    ax.plot(fpr[i], tpr[i], label=curve_label)
ax.legend(loc="best")
ax.grid(alpha=.4)
sns.despine()
plt.show()

# GradientBoosting

In [None]:
np.random.seed(42)
params = {
    'loss': ['deviance', 'exponential'],
    'learning_rate': [0.001, 0.01, 0.1, 0.5, 1., 2.],
    'n_estimators':[50, 100, 200,
                    300, 500, 1000],
    "max_features": ['auto', 'sqrt', 'None', 2, 4, 5, 9, 10, 15, 18],
    'max_depth':[None, 2, 5, 8, 10],
    'min_samples_split': [2, 3, 5],
    'criterion': ['friedman_mse', 'mse']
}
optimal_params, optimal_models = search_for_best_models(
    clf=GradientBoostingClassifier(), params=params,
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
model = GradientBoostingClassifier(n_estimators=optimal_params['n_estimators'],
                        loss=optimal_params['loss'],
                        learning_rate=optimal_params['learning_rate'],
                        max_depth=optimal_params['max_depth'],
                        max_features=optimal_params['max_features'],
                        min_samples_leaf=optimal_params['min_samples_split'],
                        random_state=42)
model.fit(X_train, y_train)
y_predict = model.predict(X_val)
print('Accuracy Score is {:.5}'.format(accuracy_score(y_val, y_predict)))
print(classification_report(y_val, y_predict))

In [None]:
print(optimal_params)

## ROC CURVE Gradient Boosting

In [None]:
from sklearn import metrics
import seaborn as sns
n_classes = 3

# Per-class scores of the fitted model on the validation split.
y_score = model.predict_proba(X_val)
# y_score = model.decision_function(X_test)

# One-vs-rest ROC: one indicator column per class, scored against the
# matching column of y_score.
y_val_dummies = pd.get_dummies(y_val, drop_first=False).values
fpr, tpr, roc_auc = {}, {}, {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_val_dummies[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

fig, ax = plt.subplots(figsize=(12, 9))
ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
ax.set(xlim=[0.0, 1.0], ylim=[0.0, 1.05],
       xlabel='False Positive Rate', ylabel='True Positive Rate',
       title='Receiver operating characteristic example')
for i in range(n_classes):
    curve_label = 'ROC curve (area = %0.2f) for label %i' % (roc_auc[i], i)
    ax.plot(fpr[i], tpr[i], label=curve_label)
ax.legend(loc="best")
ax.grid(alpha=.4)
sns.despine()
plt.show()

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
params = {
    'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'algorithm': ['auto', 'ball_tree', 'kd_tree'],
}
optimal_params, optimal_models = search_for_best_models(clf=KNeighborsClassifier(),
                                                        params=params,
                                                        X_train=X_train,
                                                        y_train=y_train,
                                                        X_test=X_test,
                                                        y_test=y_test)


In [None]:
print(optimal_params)

In [None]:
neigh = KNeighborsClassifier(
    n_neighbors=optimal_params['n_neighbors'],
    algorithm=optimal_params['algorithm']
)
neigh.fit(X, y)

In [None]:
y_predict = neigh.predict(X_val)
print('Accuracy Score is {:.5}'.format(accuracy_score(y_val, y_predict)))
print(classification_report(y_val, y_predict))

## ROC CURVE KNN

In [None]:
from sklearn import metrics
import seaborn as sns
n_classes = 3

# Score with the fitted KNN classifier (`neigh`). This cell used to read
# `model`, which at this point still refers to the previous section's
# estimator, so the "KNN" plot showed another model's curves.
y_score = neigh.predict_proba(X_val)
# y_score = model.decision_function(X_test)

fpr = dict()
tpr = dict()
roc_auc = dict()

# One-vs-rest ROC: one indicator column per class, scored against the
# matching column of y_score.
y_val_dummies = pd.get_dummies(y_val, drop_first=False).values
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_val_dummies[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

fig, ax = plt.subplots(figsize=(12, 9))
ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic example')
for i in range(n_classes):
    ax.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f) for label %i' % (roc_auc[i], i))
ax.legend(loc="best")
ax.grid(alpha=.4)
sns.despine()
plt.show()

# MLP

In [None]:
from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import backend as K
# from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers
from tensorflow.keras.utils import to_categorical

input_train = X_train
target_train = to_categorical(y_train)
input_val = X_val
target_val = to_categorical(y_val)
input_test = X_test
target_test = to_categorical(y_test)

input_layer = keras.layers.Input(shape=(input_train.shape[1]))
dense = keras.layers.Dense(256, activation = 'relu')(input_layer)
dense = keras.layers.Dense(256, activation = 'relu')(dense)
dense = keras.layers.Dropout(0.6)(dense)
dense = keras.layers.Dense(128, activation = 'relu')(dense)
dense = keras.layers.Dropout(0.5)(dense)
dense = keras.layers.Dense(3, activation = 'softmax')(dense)
model = Model(inputs=input_layer, outputs=dense)
model.compile(loss='categorical_crossentropy',
              metrics=['accuracy'], optimizer='adam')

In [None]:
model.fit(input_train,
          target_train,
          batch_size=256,
          validation_data=(input_test, target_test),
          epochs=80)

In [None]:
model.evaluate(input_val,
               target_val)

# VAE

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
# from tensorlow.keras import regularizers
from tensorflow.keras import regularizers
random.seed(42)
np.random.seed(42)

# Fit the min-max scaler on the training features only, then apply the same
# scaling to every other split.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_reshape = scaler.transform(X_train)
X_test_reshape = scaler.transform(X_test)
X_val_reshape = scaler.transform(X_val)

# Same scaler applied to the *_vn arrays (the loaded splits with the extra
# recordings appended).
X_train_vn_reshape = scaler.transform(X_train_vn)
X_test_vn_reshape = scaler.transform(X_test_vn)
X_val_vn_reshape = scaler.transform(X_val_vn)

In [None]:
# Layer sizes of the VAE: input width, hidden width and latent width.
original_dim = X_train_reshape.shape[1]
intermediate_dim = 32
latent_dim = 2

In [None]:
random.seed(42)
np.random.seed(42)

def sampling(args):
    """Sample a latent vector: ``z_mean + exp(z_log_sigma) * epsilon``.

    ``args`` is the pair ``(z_mean, z_log_sigma)``; ``epsilon`` is drawn from
    a normal distribution with mean 0 and stddev 0.1, one row per batch row.
    """
    z_mean, z_log_sigma = args
    epsilon = K.random_normal(shape=(K.shape(z_mean)[0], latent_dim),
                              mean=0., stddev=0.1)
    return z_mean + K.exp(z_log_sigma) * epsilon

# Encoder trunk: two dense layers feeding two parallel heads (z_mean and
# z_log_sigma), each of width latent_dim.
inputs = keras.Input(shape=(original_dim,))
h = layers.Dense(16, activation='relu')(inputs)
# h = layers.Dense(256, activation='relu')(h)
h = layers.Dense(intermediate_dim, activation='elu')(h)
z_mean = layers.Dense(latent_dim)(h)
z_log_sigma = layers.Dense(latent_dim)(h)

# Sampled latent vector, produced by the `sampling` function above.
z = layers.Lambda(sampling)([z_mean, z_log_sigma])

# Create encoder
# Outputs, in order: z_mean, z_log_sigma, z.
encoder = keras.Model(inputs, [z_mean, z_log_sigma, z], name='encoder')
# Create decoder
latent_inputs = keras.Input(shape=(latent_dim,), name='z_sampling')
x = layers.Dense(intermediate_dim, activation='elu')(latent_inputs)
# x = layers.Dense(256, activation='relu')(x)
x = layers.Dense(16, activation='elu')(x)
outputs = layers.Dense(original_dim, activation='linear')(x)
decoder = keras.Model(latent_inputs, outputs, name='decoder')
# instantiate VAE model
# Index 2 selects the sampled z from the encoder outputs.
outputs = decoder(encoder(inputs)[2])
vae = keras.Model(inputs, outputs, name='vae_mlp')

# reconstruction_loss = keras.losses.binary_crossentropy(inputs, outputs)
# Reconstruction term: mean squared error scaled by the input width.
reconstruction_loss = keras.losses.mean_squared_error(inputs, outputs)
reconstruction_loss *= original_dim
# KL term built from z_mean and z_log_sigma.
# NOTE(review): this expression has the usual log-variance form, while
# `sampling` multiplies epsilon by exp(z_log_sigma) as if it were a log
# standard deviation -- confirm which convention is intended.
kl_loss = 1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma)
kl_loss = K.sum(kl_loss, axis=-1)
kl_loss *= -0.5
vae_loss = K.mean(reconstruction_loss + kl_loss)
# The combined loss is attached with add_loss, so compile() gets no `loss`.
vae.add_loss(vae_loss)
vae.compile(optimizer='adam', metrics=['mse'])

In [None]:
# Train the autoencoder on the scaled training features (input == target),
# holding out 10% of the rows for validation; verbose=0 silences the log.
vae.fit(X_train_reshape, X_train_reshape,
        epochs=100,
        batch_size=256,
        validation_split=0.1,
        verbose=0)

In [None]:
# Display names for the integer class codes 0, 1 and 2.
labels = dict(enumerate(['Bee', 'Cricket', 'Noise']))

# Quick check: name of the first pooled sample's class.
first_code = int(y[0])
print(labels[first_code])

In [None]:
np.random.seed(42)
random.seed(42)

# encoder.predict returns [z_mean, z_log_sigma, z]; index 0 keeps z_mean.
X_encoded = encoder.predict(X_train_reshape)
X_encoded = np.asarray(X_encoded)
X_encoded = X_encoded[0,:, :]

plt.figure(figsize=(15, 10))
# Iterate over the labels of the split being plotted (y_train), not the
# pooled train+test labels `y`, so the loop and the masks always agree.
for label in np.unique(y_train):
    plt.scatter(X_encoded[y_train==label, 0],
                X_encoded[y_train==label, 1],
                label=str(labels[int(label)]))
# plt.title("Training data on 2 dimensions")
# plt.colorbar()
plt.legend()

In [None]:
np.random.seed(42)
random.seed(42)

# encoder.predict returns [z_mean, z_log_sigma, z]; index 0 keeps z_mean.
X_test_encoded = encoder.predict(X_test_reshape)
X_test_encoded = np.asarray(X_test_encoded)
X_test_encoded = X_test_encoded[0,:, :]

plt.figure(figsize=(15, 10))
# Iterate over the labels of the split being plotted (y_test); the loop used
# to take its labels from y_val while masking with y_test.
for label in np.unique(y_test):
    plt.scatter(X_test_encoded[y_test==label, 0],
                X_test_encoded[y_test==label, 1],
                label=str(labels[int(label)]))
plt.legend()
# plt.title("None churn Validation data - Latent vector")
# plt.colorbar()

In [None]:
np.random.seed(42)
random.seed(42)

# encoder.predict returns [z_mean, z_log_sigma, z]; index 0 keeps z_mean.
X_val_encoded = np.asarray(encoder.predict(X_val_reshape))[0, :, :]

# One scatter series per class in the 2-D latent space.
plt.figure(figsize=(15, 10))
for label in np.unique(y_val):
    mask = y_val == label
    plt.scatter(X_val_encoded[mask, 0], X_val_encoded[mask, 1],
                label=str(labels[int(label)]))
plt.legend()

## Adding Buzz 2

In [None]:
np.random.seed(42)
random.seed(42)

# encoder.predict returns [z_mean, z_log_sigma, z]; index 0 keeps z_mean.
X_encoded = np.asarray(encoder.predict(X_train_vn_reshape))[0, :, :]

# One scatter series per class in the 2-D latent space (training split with
# the extra recordings appended).
plt.figure(figsize=(15, 10))
for label in np.unique(y_train_vn):
    mask = y_train_vn == label
    plt.scatter(X_encoded[mask, 0], X_encoded[mask, 1],
                label=str(labels[int(label)]))
plt.legend()

In [None]:
np.random.seed(42)
random.seed(42)

# encoder.predict returns [z_mean, z_log_sigma, z]; index 0 keeps z_mean.
X_test_encoded = np.asarray(encoder.predict(X_test_vn_reshape))[0, :, :]

# One scatter series per class in the 2-D latent space (test split with the
# extra recordings appended).
plt.figure(figsize=(15, 10))
for label in np.unique(y_test_vn):
    mask = y_test_vn == label
    plt.scatter(X_test_encoded[mask, 0], X_test_encoded[mask, 1],
                label=str(labels[int(label)]))
plt.legend()

In [None]:
np.random.seed(42)
random.seed(42)

# encoder.predict returns [z_mean, z_log_sigma, z]; index 0 keeps z_mean.
X_val_encoded = np.asarray(encoder.predict(X_val_vn_reshape))[0, :, :]

# One scatter series per class in the 2-D latent space (validation split with
# the extra recordings appended).
plt.figure(figsize=(15, 10))
for label in np.unique(y_val_vn):
    mask = y_val_vn == label
    plt.scatter(X_val_encoded[mask, 0], X_val_encoded[mask, 1],
                label=str(labels[int(label)]))
plt.legend()