In [54]:
import glob
import numpy as np
import pandas as pd
import csv
import tensorflow as tf
from keras.layers import Dense, Activation, Flatten, Dropout, LSTM, Conv1D, MaxPool1D
from keras.models import Sequential, Model
from keras import regularizers, optimizers
import keras
import sklearn
from sklearn.model_selection import train_test_split

In [2]:
cough_df = pd.read_csv("cough_dataset.csv")
del cough_df['filename']
cough_df['label']="Cough"
no_cough_df = pd.read_csv("nocough_dataset.csv")
del no_cough_df['filename']
no_cough_df['label']="No Cough"
no_cough_df = no_cough_df.sample(frac=1) #Shuffle dataset to ensure no-cough 
no_cough_df = no_cough_df.head(len(cough_df)) #The two datasets should be the same length 

In [3]:
full_df = pd.concat([cough_df,no_cough_df])
features = full_df.columns[0:-1]
labels = pd.get_dummies(full_df['label']).values
"""
I chose to one-hot-encode the labels; originally I had assigned 0 to Covid and 1 to No Covid, but this made interpreting results incredibly difficult.
pd.get_dummies() turns the results into two columns - if the original value was "Covid," the row is [1,0]. Otherwise, the value is [0,1].
Intepreting model predictions becomes much easier; if a higher value is in the first column, the model has predicted Cough, otherwise, the model has predicted No Cough
"""
input_data = full_df[features].values

In [9]:
input_data.shape

(12000, 56)

In [4]:
X_train, X_test, y_train, y_test =  train_test_split(input_data,labels,test_size=0.1)

In [78]:
"""
There isn't a whole lot of logic to this so far - the model needs improving. 
"""

def get_basic_model():
  model = Sequential()
  model.add(Conv1D(254, 3, padding='same',input_shape=(56,1),activation='relu'))
  model.add(Dense(136,kernel_regularizer='l1'))
  model.add(Dropout(.2))
  model.add(LSTM(105,return_sequences=True,dropout=0.2))
  model.add(Conv1D(75, 2, padding='valid'))
  model.add(Dropout(.2))
  model.add(Dense(20,kernel_regularizer='l1'))
  model.add(Flatten())
  model.add(Dense(2,activation="sigmoid")) #Output has two columns (Covid, Non-Covid). Sigmoid activation is also supposed to be best for class-based entropy

  model.compile(optimizer='rmsprop',
                loss=tf.keras.losses.CategoricalCrossentropy(),
                metrics=['accuracy',tf.keras.metrics.Precision(),tf.keras.metrics.Recall(), tf.keras.metrics.CategoricalAccuracy()])
  model.summary()
  return model

In [79]:
model = get_basic_model()

Model: "sequential_20"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_20 (Conv1D)          (None, 56, 254)           1016      
                                                                 
 dense_54 (Dense)            (None, 56, 136)           34680     
                                                                 
 dropout_39 (Dropout)        (None, 56, 136)           0         
                                                                 
 lstm_26 (LSTM)              (None, 56, 105)           101640    
                                                                 
 conv1d_21 (Conv1D)          (None, 55, 75)            15825     
                                                                 
 dropout_40 (Dropout)        (None, 55, 75)            0         
                                                                 
 dense_55 (Dense)            (None, 55, 20)          

In [81]:
model.fit(X_train,y_train,epochs=20,validation_split=0.1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x20398d9bfd0>

In [77]:
"""
Want to make sure that predictions are correct! More importantly, want to make sure that coughs are correctly classified as coughs. 
Overall accuracy isn't the best metric - we care more about f-score (precision/recall). 
Precision: What was correctly classified as cough?  True Coughs/(True Coughs + False Coughs)
Recall: Of what was actually a cough, what percentage was correctly classified?  True Coughs/(True Coughs + False Non-Coughs )
"""

predictions = model.predict(X_test)
reals = []
preds = []
for i in range(len(predictions)):
    reals.append(np.argmax(y_test[i]))
    preds.append(np.argmax(predictions[i]))
sklearn.metrics.precision_recall_fscore_support(reals,preds) #First column is cough, second column is non-cough



(array([0.68871595, 0.84848485]),
 array([0.8909396 , 0.60264901]),
 array([0.77688369, 0.70474347]),
 array([596, 604], dtype=int64))