In [33]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from matplotlib import pyplot as plt

from sklearn import preprocessing
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt
from tensorflow.keras.regularizers import L1L2
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam, SGD, Adagrad
from tensorflow.keras.metrics import Accuracy, BinaryAccuracy, CategoricalAccuracy, mean_squared_error
from tensorflow.keras.layers import Conv2D, Conv1D, Dense, Flatten, Reshape, LeakyReLU, Dropout, UpSampling2D, UpSampling1D, Softmax, MaxPool1D

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '1'
tqdm.pandas()

print(pd.__version__)
print(tf.__version__)
print("GPUs Available: ", tf.config.list_physical_devices('GPU'))

2.0.2
2.12.0
GPUs Available:  []


In [3]:
data_path = save_path = os.path.join('..', '..', 'data', 'mfcc_data', 'split_samples')
data_path

'../../data/mfcc_data/split_samples'

In [4]:
phonon_dirs = os.listdir(data_path)
phonon_dirs

['aa', 'yy', 'ee', 'uu', 'oo']

In [5]:
dfs = {}
for _, phonon in tqdm(enumerate(phonon_dirs), total=5, desc='loading dataframes to environment'):
    dfp = {}
    dfp['train'] = pd.read_csv(os.path.join(data_path, phonon, 'train.csv'), sep=',', index_col=False)
    dfp['val'] = pd.read_csv(os.path.join(data_path, phonon, 'validation.csv'), sep=',', index_col=False)
    dfp['test'] = pd.read_csv(os.path.join(data_path, phonon, 'test.csv'), sep=',', index_col=False)
    dfs[phonon] = dfp

loading dataframes to environment: 100%|██████████| 5/5 [00:07<00:00,  1.59s/it]


In [6]:
def conv_to_32(df, cols): df[cols] = df[cols].astype(np.float32)

for key, dfp in tqdm(dfs.items(), total=5, desc='dropping columns'):
    for type_df, df in dfp.items():
        df.drop(['per_frame_idx', 'mb_name', 'phonon'], axis=1, inplace=True)

        # convert float64 to float32
        conv_to_32(df, ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12'])

        # convert catagory column to encoded
        df['asthma_status'] = df['asthma_status'].astype('category')
        df['asthma_status'] = df['asthma_status'].cat.codes

dropping columns: 100%|██████████| 5/5 [00:00<00:00,  8.18it/s]


In [8]:
dfs['aa']['train']['asthma_status'].value_counts()

asthma_status
0    326244
1    204440
Name: count, dtype: int64

In [9]:
scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)

In [10]:
dfs_processed = {}
for key, dfp in tqdm(dfs.items(), total=5, desc='scaling and splitting'):
    dfp_processed = {}

    for type_df, df in dfp.items():
        dfp_processed[type_df] = {}
        X = df.loc[:, df.columns != 'asthma_status'].to_numpy()
        Y = df.loc[:, 'asthma_status'].to_numpy()
        X_scaled = scaler.fit_transform(X)
        
        dfp_processed[type_df]['x'] = X_scaled
        dfp_processed[type_df]['y'] = Y
    dfs_processed[key] = dfp_processed

scaling and splitting: 100%|██████████| 5/5 [00:00<00:00,  8.68it/s]


### Autoencoder

In [42]:
class AutoEncoder:
    def __init__(self, input_dim):
        self.input_dim = input_dim

        self.encoder = self.make_encoder()
        self.decoder = self.make_decoder()
        self.autoencoder = keras.Sequential([self.encoder, self.decoder], name='Phonon_Autoencoder')

        self.autoencoder.compile(optimizer='adam', loss='mse')

    def make_encoder(self):
        encoder = keras.Sequential([
            Dense(32, activation='relu', input_shape=(self.input_dim,)),
            Dense(64, activation='relu'),
            Dense(128, activation='relu')
        ], name='phonon_encoder')
        return encoder

    
    def make_decoder(self):
        decoder = keras.Sequential([
            Dense(64, activation='relu', input_shape=(128,)),
            Dense(32, activation='relu'),
            Dense(self.input_dim, activation='linear')
        ], name='phonon_decoder')
        return decoder

ae = AutoEncoder(12)

In [39]:
class AutoEncoder_Tuned:
    def __init__(self, input_dim):
        self.input_dim = input_dim        

    def make_encoder(self, hp_layers, l1r, l2r):
        encoder = keras.Sequential([
            Dense(hp_layers[0], activation='relu', activity_regularizer=L1L2(l1=l1r, l2=l2r), input_shape=(self.input_dim,)),
            Dropout(0.10),
            Dense(hp_layers[1], activation='relu', activity_regularizer=L1L2(l1=l1r, l2=l2r))
        ], name='phonon_encoder')
        return encoder

    
    def make_decoder(self, hp_layers, l1r, l2r):
        decoder = keras.Sequential([
            Dense(hp_layers[1], activation='relu', activity_regularizer=L1L2(l1=l1r, l2=l2r), input_shape=(hp_layers[0],)),
            Dropout(0.10),
            Dense(hp_layers[2], activation='relu', activity_regularizer=L1L2(l1=l1r, l2=l2r)),
            Dropout(0.10),
            Dense(self.input_dim, activation='linear')
        ], name='phonon_decoder')
        return decoder
    
    def make_model(self, hp):
        l1r, l2r = 0.005, 0.005
        hpl0 = hp.Int('layer0', min_value=64, max_value=512, step=64)
        hpl1 = hp.Int('layer1', min_value=64, max_value=512, step=64)
        hpl2 = hp.Int('layer2', min_value=64, max_value=512, step=64)
        hpl3 = hp.Int('layer3', min_value=64, max_value=512, step=64)

        self.encoder = self.make_encoder((hpl0, hpl1), l1r, l2r)
        self.decoder = self.make_decoder((hpl1, hpl2, hpl3), l1r, l2r)

        autoencoder = keras.Sequential([self.encoder, self.decoder], name='Phonon_Autoencoder')
        autoencoder.compile(optimizer='adam', loss='mse')

        return autoencoder


aet = AutoEncoder_Tuned(12)

In [40]:
ae_tuner1 = kt.Hyperband(
    aet.make_model,
    objective='val_loss',
    max_epochs=15,
    factor=3,
    directory='tuned_models',
    project_name='aa_us'
)

In [37]:
train_aa = dfs_processed['aa']['train']
val_aa = dfs_processed['aa']['val']

In [41]:
stop_early = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
# tb_storage = keras.callbacks.TensorBoard("/tmp/tb_logs")
ae_tuner1.search(
    train_aa['x'], train_aa['y'], 
    epochs=30, 
    validation_data=(val_aa['x'], val_aa['y']), 
    batch_size=128, 
    use_multiprocessing=True, 
    callbacks=[stop_early]
)

Trial 30 Complete [00h 01m 27s]
val_loss: 0.34715625643730164

Best val_loss So Far: 0.29748964309692383
Total elapsed time: 00h 41m 21s
INFO:tensorflow:Oracle triggered exit


In [None]:
metric_hist = ae.autoencoder.history.history

# summarize history for loss
plt.plot(metric_hist['loss'])
plt.plot(metric_hist['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

### Neural Network

In [83]:
def reconstructionScore(originalDF, reducedDF):
    loss = np.sum((np.array(originalDF) - np.array(reducedDF))**2, axis=0)
    # loss = pd.Series(data=loss,index=originalDF.index)
    loss = (loss-np.min(loss))/(np.max(loss)-np.min(loss))
    print('Mean for reconstruction scores: ', np.mean(loss))
    return loss