import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from matplotlib import pyplot as plt

from sklearn import preprocessing
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam, SGD, Adagrad
from tensorflow.keras.metrics import Accuracy, BinaryAccuracy, CategoricalAccuracy, mean_squared_error
from tensorflow.keras.layers import Conv2D, Conv1D, Dense, Flatten, Reshape, LeakyReLU, Dropout, UpSampling2D, UpSampling1D, Softmax, MaxPool1D

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '1'
tqdm.pandas()

print(pd.__version__)
print(tf.__version__)
print("GPUs Available: ", tf.config.list_physical_devices('GPU'))

In [5]:
data_path = save_path = os.path.join('..', '..', 'data', 'mfcc_data', 'split_samples')
data_path

'../../data/mfcc_data/split_samples'

In [7]:
phonon_dirs = os.listdir(data_path)
phonon_dirs

['aa', 'yy', 'ee', 'uu', 'oo']

In [31]:
dfs = {}
for _, phonon in tqdm(enumerate(phonon_dirs), total=5, desc='loading dataframes to environment'):
    dfp = {}
    dfp['train'] = pd.read_csv(os.path.join(data_path, phonon, 'train.csv'), sep=',', index_col=False)
    dfp['val'] = pd.read_csv(os.path.join(data_path, phonon, 'validation.csv'), sep=',', index_col=False)
    dfp['test'] = pd.read_csv(os.path.join(data_path, phonon, 'test.csv'), sep=',', index_col=False)
    dfs[phonon] = dfp

loading dataframes to environment: 100%|██████████| 5/5 [00:06<00:00,  1.31s/it]


In [32]:
def conv_to_32(df, cols): df[cols] = df[cols].astype(np.float32)

for key, dfp in tqdm(dfs.items(), total=5, desc='dropping columns'):
    for type_df, df in dfp.items():
        df.drop(['per_frame_idx', 'mb_name', 'phonon'], axis=1, inplace=True)

        # convert float64 to float32
        conv_to_32(df, ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12'])

        # convert catagory column to encoded
        df['asthma_status'] = df['asthma_status'].astype('category')
        df['asthma_status'] = df['asthma_status'].cat.codes

dropping columns:   0%|          | 0/5 [00:00<?, ?it/s]

dropping columns: 100%|██████████| 5/5 [00:00<00:00,  8.25it/s]


In [35]:
dfs['aa']['train']['asthma_status'].value_counts()

asthma_status
0    326244
1    204440
Name: count, dtype: int64

In [36]:
scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)

In [45]:
dfs_processed = {}

for key, dfp in tqdm(dfs.items(), total=5, desc='scaling and splitting'):
    dfp_processed = {}

    for type_df, df in dfp.items():
        dfp_processed[type_df] = {}
        X = df.loc[:, df.columns != 'asthma_status'].to_numpy()
        Y = df.loc[:, 'asthma_status'].to_numpy()
        X_scaled = scaler.fit_transform(X)
        
        dfp_processed[type_df]['x'] = X_scaled
        dfp_processed[type_df]['y'] = Y
    dfs_processed[key] = dfp_processed

scaling and splitting: 100%|██████████| 5/5 [00:00<00:00,  6.76it/s]


In [46]:
dfs_processed['uu']['train']

{'x': array([[-1.337415  , -0.6985663 ,  1.2311405 , ..., -0.3127998 ,
          0.19380844,  1.1113658 ],
        [-0.1615102 , -0.3692186 ,  0.3990377 , ..., -0.8130399 ,
         -1.4893736 , -0.66435325],
        [ 0.29784968, -0.31937256,  0.78876686, ..., -1.7560292 ,
         -1.1412911 , -0.5701466 ],
        ...,
        [-0.7438909 , -1.977828  , -2.2126198 , ..., -0.48026887,
          0.61447006,  1.0540202 ],
        [-0.5610471 , -1.8996087 , -1.708643  , ...,  0.28472131,
          0.5608368 ,  1.0788132 ],
        [-1.1814328 , -1.6038698 , -2.426512  , ...,  0.32641545,
          0.4170865 ,  0.83271617]], dtype=float32),
 'y': array([0, 0, 0, ..., 0, 0, 0], dtype=int8)}

In [49]:
# tuned model
def make_model_s(hp):                  
    hp_layer1 = hp.Int('layer1', min_value=64, max_value=512, step=64)
    hp_layer2 = hp.Int('layer2', min_value=64, max_value=512, step=64)
    hp_layer3 = hp.Int('layer3', min_value=64, max_value=512, step=64)

    ### model layout
    model = keras.Sequential(name='Phonon_to_Asthama_NN')
    model.add(keras.Input(shape=(12,)))
    model.add(Dense(units=hp_layer1, activation='relu'))
    model.add(Dropout(0.30))
    model.add(Dense(units=hp_layer2, activation='relu'))
    model.add(Dropout(0.30))
    model.add(Dense(units=hp_layer3, activation='relu'))
    model.add(Dropout(0.30))
    model.add(Dense(1, activation='sigmoid'))
    model.summary()

    ### model compiling
    model.compile(
        loss=BinaryCrossentropy(),
        optimizer='adam',
        metrics=[BinaryAccuracy()]
    )

    return model

In [50]:
tuner_4t = kt.Hyperband(
    make_model_s,
    objective='val_binary_accuracy',
    max_epochs=15,
    factor=3,
    directory='tuned_models',
    project_name='aa_s'
)

Model: "Phonon_to_Asthama_NN"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                832       
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 1)        

In [54]:
train_aa = dfs_processed['aa']['train']
val_aa = dfs_processed['aa']['val']

In [55]:
stop_early = keras.callbacks.EarlyStopping(monitor='val_binary_accuracy', patience=3)
tb_storage = keras.callbacks.TensorBoard("/tmp/tb_logs")
tuner_4t.search(
    train_aa['x'], train_aa['y'], 
    epochs=15, 
    validation_data=(val_aa['x'], val_aa['y']), 
    batch_size=128, 
    use_multiprocessing=True, 
    callbacks=[stop_early, tb_storage]
)


Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
320               |320               |layer1
384               |384               |layer2
448               |448               |layer3
2                 |2                 |tuner/epochs
0                 |0                 |tuner/initial_epoch
2                 |2                 |tuner/bracket
0                 |0                 |tuner/round

Model: "Phonon_to_Asthama_NN"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 320)               4160      
                                                                 
 dropout (Dropout)           (None, 320)               0         
                                                                 
 dense_1 (Dense)             (None, 384)               123264    
                                                                 
 dropout_1 (Dro

KeyboardInterrupt: 

In [175]:
best_hps = tuner_4t.get_best_hyperparameters()[0]
best_model = tuner_4t.hypermodel.build(best_hps)

Model: "Phonon_to_Asthama_NN"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_16 (Dense)            (None, 384)               4992      
                                                                 
 dropout_12 (Dropout)        (None, 384)               0         
                                                                 
 dense_17 (Dense)            (None, 448)               172480    
                                                                 
 dropout_13 (Dropout)        (None, 448)               0         
                                                                 
 dense_18 (Dense)            (None, 512)               229888    
                                                                 
 dropout_14 (Dropout)        (None, 512)               0         
                                                                 
 dense_19 (Dense)            (None, 1)        

In [190]:
# best_model.save(os.path.join('saved_models', 'supervised', 'aa1.h5'))

In [None]:
# model = best_model.fit(x_train, y_train, epochs=5, validation_split=0.2, use_multiprocessing=True)
# best_epoch = history.history['val_binary_accuracy'].index(max(history.history['val_binary_accuracy'])) + 1
# print('Best epoch: %d' % (best_epoch,))

In [None]:
# hypermodel = tuner.hypermodel.build(best_hps)
# hypermodel.fit(x_train, y_train, epochs=best_epoch, validation_split=0.2, callbacks=[
#     keras.callbacks.EarlyStopping(monitor='val_binary_accuracy', patience=3)
# ])

In [None]:
# summarize history for accuracy
plt.plot(best_model.history['accuracy'])
plt.plot(best_model.history['val_accuracy'])
plt.title('model binary_accuracy')
plt.ylabel('binary_accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(best_model.history['loss'])
plt.plot(best_model.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [207]:
y_pred = best_model.predict(x_test)

   1/2399 [..............................] - ETA: 48s



In [215]:
y_test.shape

(76768,)

In [213]:
def convert_to_binary(pred, threshold=0.5):
    return np.where(pred >= threshold, 1, 0)

# Example usage
y_pred_th = convert_to_binary(y_pred).T[0]
y_pred_th

array([0, 0, 1, ..., 1, 0, 1])

In [219]:
test_acc = 1 - sum(np.abs(y_pred_th - y_test))/y_test.shape[0]
print(f'Test accuracy: {test_acc:.2f}')

Test accuracy: 0.96
