In [1]:
import pandas as pd
import os
import json
from scipy.io import wavfile
import glob
from pydub import AudioSegment
from pydub.playback import play
from IPython.display import Audio
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
import librosa
from sklearn.model_selection import train_test_split
import wave
import matplotlib.pyplot as plt
import librosa.display



# Preprocessing

Load Data, going to try out Mel-frequency cepstral coefficients (MFCCs) and spectrograms
Then split data into train test split. (Hopefully will use a nested cross val)

In [2]:
df = pd.read_csv('cough_data.csv')
df.head()

Unnamed: 0,cough_detected,age,is_female,is_male,COVID-19,symptomatic,season_autumn,season_spring,season_summer,Africa,Asia,Europe,North America,Oceania,South America,file_path,wav_file_name
0,0.6911,31,1,0,0,1,0,1,0,0,0,1,0,0,0,C:\Users\drake\Documents\DL Machine Learning C...,00432f00-2477-4faf-a440-cc1276065b4a.wav
1,0.9751,44,0,1,0,1,0,1,0,0,0,1,0,0,0,C:\Users\drake\Documents\DL Machine Learning C...,008c1c9e-aeef-40c5-846c-24f1b964f884.wav
2,0.9775,17,0,1,0,1,1,0,0,0,0,1,0,0,0,C:\Users\drake\Documents\DL Machine Learning C...,00ac8cb0-3384-46b2-9db5-5b5e911615f0.wav
3,0.99,41,1,0,0,1,0,1,0,0,0,0,0,0,1,C:\Users\drake\Documents\DL Machine Learning C...,00ce5b06-c302-4387-bbd7-86355a4a8c12.wav
4,0.9632,39,1,0,1,0,1,0,0,0,0,1,0,0,0,C:\Users\drake\Documents\DL Machine Learning C...,01424527-9c3b-4b6e-96f1-9eea3150819b.wav


In [3]:
print(df.loc[2,'file_path'])

C:\Users\drake\Documents\DL Machine Learning Class\Final\coughvid_dataset\00ac8cb0-3384-46b2-9db5-5b5e911615f0.wav


In [4]:
# right now I am going to drop every columns besides 
# COVID-19 (target variable) and File_path (feature/ has the audio wav file)
df = df.drop(columns=[col for col in df.columns if col not in ['COVID-19',
                                                               'file_path']])
df.head(1)

Unnamed: 0,COVID-19,file_path
0,0,C:\Users\drake\Documents\DL Machine Learning C...


In [5]:
df['COVID-19'].value_counts()

0    1524
1     620
Name: COVID-19, dtype: int64

In [6]:
# setting up features and target variable
X = df.drop('COVID-19',axis=1)
y = df['COVID-19']

# Oversampling since we have few actual covid cases
from imblearn.over_sampling import RandomOverSampler

X_resampled, y_resampled = RandomOverSampler().fit_resample(X, y)

In [7]:
X_resampled.head()

Unnamed: 0,file_path
0,C:\Users\drake\Documents\DL Machine Learning C...
1,C:\Users\drake\Documents\DL Machine Learning C...
2,C:\Users\drake\Documents\DL Machine Learning C...
3,C:\Users\drake\Documents\DL Machine Learning C...
4,C:\Users\drake\Documents\DL Machine Learning C...


In [8]:
y_resampled.head()

0    0
1    0
2    0
3    0
4    1
Name: COVID-19, dtype: int64

In [9]:
y_resampled.value_counts()

0    1524
1    1524
Name: COVID-19, dtype: int64

# Mel Frequency Cepstral Coefficients (MFCCs)
## 8k Hz

# Reshaping Splits so they can fit in LSTM Model

In [10]:
# Define the desired number of MFCC coefficients and time steps
sr = 8000
n_mfcc = 13
hop_length = 512
time_steps = int(10 * sr / hop_length)

# Define a function to extract features from audio files
def extract_features(file_path):
    audio_file, sr = librosa.load(file_path, sr=8000)
    mfccs = librosa.feature.mfcc(y=audio_file, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length)
    return mfccs

# Apply the function to all audio files in the X_resampled dataframe
X_mfccs = X_resampled['file_path'].apply(extract_features)

# Pad or truncate the MFCCs to the desired number of time steps
X_mfccs_padded = []
for mfccs in X_mfccs:
    if mfccs.shape[1] < time_steps:
        padded_mfccs = np.pad(mfccs.T, ((0, time_steps - mfccs.shape[1]), (0, 0)), mode='constant')
        X_mfccs_padded.append(padded_mfccs[:time_steps, :])
    else:
        X_mfccs_padded.append(mfccs.T[:time_steps, :])
X_mfccs = np.array(X_mfccs_padded)

# Reshape the data to fit the input shape of a LSTM model
X_lstm = np.reshape(X_mfccs, (X_mfccs.shape[0], time_steps, n_mfcc))


In [11]:
# checking shapes to make sure above code actually worked
print("X_mfccs shape:", X_mfccs.shape)
print("X_lstm shape:", X_lstm.shape)

X_mfccs shape: (3048, 156, 13)
X_lstm shape: (3048, 156, 13)


In [13]:
# split into train test val split
from sklearn.model_selection import train_test_split
x_train_and_val, x_test, y_train_and_val, y_test = train_test_split(X_lstm, y_resampled, test_size = 0.20, random_state=42)

In [22]:
print("x_train_and_val shape", x_train_and_val.shape)
print("x test shape", x_test.shape)
print("y_train_and_val shape", y_train_and_val.shape)
print("y test shape", y_test.shape)

x_train_and_val shape (2438, 156, 13)
x test shape (610, 156, 13)
y_train_and_val shape (2438,)
y test shape (610,)


In [33]:
print("x")
print(type(x_train_and_val))
print(type(x_test))

print("y")
print(type(y_train_and_val))
print(type(y_test))

x
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
y
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [31]:
# now I will convert my two pandas df into a numpy array
y_train_and_val = y_train_and_val.to_numpy().flatten()
y_test = y_test.to_numpy().flatten()

print(type(y_train_and_val))
print(type(y_test))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [32]:
print("x_train_and_val shape", x_train_and_val.shape)
print("x test shape", x_test.shape)
print("y_train_and_val shape", y_train_and_val.shape)
print("y test shape", y_test.shape)

x_train_and_val shape (2438, 156, 13)
x test shape (610, 156, 13)
y_train_and_val shape (2438,)
y test shape (610,)


# Define KFOLD and Number of folds

In [39]:
from sklearn.model_selection import KFold
# Define the number of folds
num_folds = 5
# Split the data into K folds
kf = KFold(n_splits=num_folds, shuffle=True)

# LSTM MODEL

In [35]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Define the LSTM model architecture
model = Sequential()
model.add(LSTM(units=64, input_shape=(time_steps, n_mfcc)))
model.add(Dense(units=1, activation='sigmoid'))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [41]:
# Iterate over each fold
for train_index, val_index in kf.split(x_train_and_val, y_train_and_val):
    fold = 0
    fold += 1
    print(f"Fold {fold}:")

    # Split the data into training and validation sets for this fold
    x_train, x_val = x_train_and_val[train_index], x_train_and_val[val_index]
    y_train, y_val = y_train_and_val[train_index], y_train_and_val[val_index]

    # Train the model on this fold's training data
    model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=5, batch_size=32)

    # Evaluate the model on this fold's validation data
    loss, accuracy = model.evaluate(x_test, y_test)
    print(f"Test loss on this fold: {loss:.4f}, Test accuracy on this fold: {accuracy:.4f}")

# Evaluate the model on the test set
loss, accuracy = model.evaluate(x_test, y_test)
print(f"Test loss: {loss:.4f}, Test accuracy: {accuracy:.4f}")

Fold 1:
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss on this fold: 1.2498, Test accuracy on this fold: 0.7361
Fold 1:
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss on this fold: 1.0359, Test accuracy on this fold: 0.7426
Fold 1:
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss on this fold: 0.9196, Test accuracy on this fold: 0.7459
Fold 1:
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss on this fold: 0.9554, Test accuracy on this fold: 0.7443
Fold 1:
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss on this fold: 1.0608, Test accuracy on this fold: 0.7426
Test loss: 1.0608, Test accuracy: 0.7426


In [42]:
from sklearn.metrics import f1_score, recall_score, roc_auc_score

# Predict the labels for the test set
y_pred = model.predict(x_test)

# Convert the predicted probabilities to binary labels
y_pred = (y_pred > 0.5).astype(int)

# Calculate the F1 score, recall, and AUC ROC score
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred)

print(f"F1 score: {f1:.4f}, Recall: {recall:.4f}, AUC ROC score: {auc_roc:.4f}")


F1 score: 0.7535, Recall: 0.8108, AUC ROC score: 0.7446


In [45]:
print("y_pred shape:",y_pred.shape)

y_pred shape: (610, 1)


In [46]:
y_pred = y_pred.flatten()
y_pred.shape

(610,)

In [47]:
y_pred

array([1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,

In [48]:
import numpy as np

# Save the predicted values as a numpy array
np.save("lstm_pred_new.npy", y_pred)


In [49]:
y_test.shape

(610,)

In [50]:
np.save("y_test_for_voting", y_test)