In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from matplotlib import pyplot as plt

from sklearn import preprocessing
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt
from tensorflow.keras.regularizers import L1L2
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam, SGD, Adagrad
from tensorflow.keras.metrics import Accuracy, BinaryAccuracy, CategoricalAccuracy, mean_squared_error
from tensorflow.keras.layers import Conv2D, Conv1D, Dense, Flatten, Reshape, LeakyReLU, Dropout, UpSampling2D, UpSampling1D, Softmax, MaxPool1D

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '1'
tqdm.pandas()

print(pd.__version__)
print(tf.__version__)
print("GPUs Available: ", tf.config.list_physical_devices('GPU'))

2023-07-10 21:23:09.078351: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-10 21:23:09.346278: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-10 21:23:09.425874: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-10 21:23:09.428901: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


2.0.2
2.12.0
GPUs Available:  []


2023-07-10 21:23:14.224703: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-07-10 21:23:15.755978: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [2]:
data_path = save_path = os.path.join('..', '..', 'data', 'mfcc_data', 'split_samples')
data_path

'../../data/mfcc_data/split_samples'

In [3]:
phonon_dirs = os.listdir(data_path)
phonon_dirs

['aa', 'yy', 'ee', 'uu', 'oo']

In [4]:
dfs = {}
for _, phonon in tqdm(enumerate(phonon_dirs), total=5, desc='loading dataframes to environment'):
    dfp = {}
    dfp['train'] = pd.read_csv(os.path.join(data_path, phonon, 'train.csv'), sep=',', index_col=False)
    dfp['val'] = pd.read_csv(os.path.join(data_path, phonon, 'validation.csv'), sep=',', index_col=False)
    dfp['test'] = pd.read_csv(os.path.join(data_path, phonon, 'test.csv'), sep=',', index_col=False)
    dfs[phonon] = dfp

loading dataframes to environment: 100%|██████████| 5/5 [00:11<00:00,  2.29s/it]


In [5]:
def conv_to_32(df, cols): df[cols] = df[cols].astype(np.float32)

for key, dfp in tqdm(dfs.items(), total=5, desc='dropping columns'):
    for type_df, df in dfp.items():
        df.drop(['per_frame_idx', 'mb_name', 'phonon'], axis=1, inplace=True)

        # convert float64 to float32
        conv_to_32(df, ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12'])

        # convert catagory column to encoded
        df['asthma_status'] = df['asthma_status'].astype('category')
        df['asthma_status'] = df['asthma_status'].cat.codes

dropping columns: 100%|██████████| 5/5 [00:01<00:00,  4.73it/s]


In [6]:
dfs['aa']['train']['asthma_status'].value_counts()

asthma_status
0    326244
1    204440
Name: count, dtype: int64

In [7]:
scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)

In [8]:
dfs_processed = {}
for key, dfp in tqdm(dfs.items(), total=5, desc='scaling and splitting'):
    dfp_processed = {}

    for type_df, df in dfp.items():
        dfp_processed[type_df] = {}
        X = df.loc[:, df.columns != 'asthma_status'].to_numpy()
        Y = df.loc[:, 'asthma_status'].to_numpy()
        X_scaled = scaler.fit_transform(X)
        
        dfp_processed[type_df]['x'] = X_scaled
        dfp_processed[type_df]['y'] = Y
    dfs_processed[key] = dfp_processed

scaling and splitting:   0%|          | 0/5 [00:00<?, ?it/s]

scaling and splitting: 100%|██████████| 5/5 [00:01<00:00,  4.83it/s]


### Autoencoder

In [26]:
class AutoEncoder:
    def __init__(self, input_dim):
        self.input_dim = input_dim

        self.encoder = self.make_encoder()
        self.decoder = self.make_decoder()
        self.autoencoder = keras.Sequential([self.encoder, self.decoder], name='Phonon_Autoencoder')

        self.autoencoder.compile(optimizer='adam', loss='mse')

    def make_encoder(self):
        encoder = keras.Sequential([
            Dense(12, activation='linear', input_shape=(self.input_dim,)),
            Dense(12, activation='linear'),
        ], name='phonon_encoder')
        return encoder

    
    def make_decoder(self):
        decoder = keras.Sequential([
            Dense(12, activation='linear', input_shape=(12,)),
            Dense(12, activation='linear'),
            Dense(self.input_dim, activation='linear')
        ], name='phonon_decoder')
        return decoder

ae = AutoEncoder(12)

In [20]:
train_aa = dfs_processed['aa']['train']
val_aa = dfs_processed['aa']['val']

In [27]:
ae.autoencoder.fit(train_aa['x'], train_aa['y'], validation_data=(val_aa['x'], val_aa['y']), batch_size=128, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30

KeyboardInterrupt: 

In [None]:
metric_hist = ae.autoencoder.history.history

# summarize history for loss
plt.plot(metric_hist['loss'])
plt.plot(metric_hist['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

### Neural Network

In [83]:
def reconstructionScore(originalDF, reducedDF):
    loss = np.sum((np.array(originalDF) - np.array(reducedDF))**2, axis=0)
    # loss = pd.Series(data=loss,index=originalDF.index)
    loss = (loss-np.min(loss))/(np.max(loss)-np.min(loss))
    print('Mean for reconstruction scores: ', np.mean(loss))
    return loss