START

In [55]:
# Colab-only helper used to expose the user's Google Drive inside the runtime
from google.colab import drive
# Mount the drive at /content/drive so files stored there can be read and written by path
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [56]:
import numpy as np
import librosa
import os
import pickle

In [57]:
# Root folder (in your Google Drive) that holds the cough recordings
data_path = '/content/drive/My Drive/Colab Notebooks/CoughAnalysis/audio_data'

# Class folders, derived from the root so the three paths cannot drift apart
covid_path = f'{data_path}/covid'
not_covid_path = f'{data_path}/not_covid'

In [105]:
# Feature container: every entry starts out as an empty NumPy array
data = {
    key: np.array([])
    for key in (
        "MFCC",    # Mel-Frequency Cepstral Coefficients (MFCC)
        "LPC",     # Linear Predictive Coding (LPC)
        # "ZCR",   # Zero-Crossing Rate (ZCR) -- disabled
        "energy",  # Energy
        # "SC",    # Spectral Centroid (SC) -- disabled
        # "SF",    # Spectral Flux (SF) -- disabled
        "SRo",     # Spectral Roll-off (SRo)
        "Cep",     # Cepstrum
        "labels",
    )
}


In [None]:
# Path of one example recording inside the covid folder, printed as a quick check
sample ='/content/drive/My Drive/Colab Notebooks/CoughAnalysis/audio_data/covid/-5dCv5_nvU8_ 200.000_ 210.000.wav'
print(sample)

In [106]:
def get_mfcc(filename, max_length=1000, n_mfcc=20):
    """Return the MFCCs of an audio file as a flat 1-D array.

    Args:
        filename: Path of the audio file, handed to ``librosa.load``.
        max_length: Accepted for interface compatibility; the current
            implementation does not use it (no padding or truncation is done).
        n_mfcc: Number of coefficients requested from ``librosa.feature.mfcc``.

    Returns:
        The MFCC matrix produced by librosa, flattened to one dimension.
    """
    signal, rate = librosa.load(filename)
    coefficients = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=n_mfcc)
    return coefficients.flatten()


In [61]:
def get_lpc(audio_file, lpc_order=12):
    """Return the Linear Predictive Coding coefficients of an audio file.

    Args:
        audio_file: Path of the audio file, handed to ``librosa.load``.
        lpc_order: Order of the linear filter passed to ``librosa.lpc``.

    Returns:
        The coefficient array returned by ``librosa.lpc``.
    """
    # Load audio signal (the sample rate is not needed for LPC)
    y, _ = librosa.load(audio_file)

    # `order` must be passed by keyword: recent librosa releases declare it
    # keyword-only, so the positional form raises TypeError there.
    return librosa.lpc(y, order=lpc_order)

In [62]:
def get_zcr(audio_file, frame_length=1024, hop_length=512):
    """Return the zero-crossing rate of an audio file.

    Args:
        audio_file: Path of the audio file; it is loaded as mono.
        frame_length: Frame length forwarded to librosa.
        hop_length: Hop length forwarded to librosa.

    Returns:
        The output of ``librosa.feature.zero_crossing_rate`` with
        length-one axes removed.
    """
    signal, _ = librosa.load(audio_file, mono=True)
    rates = librosa.feature.zero_crossing_rate(
        signal,
        frame_length=frame_length,
        hop_length=hop_length,
    )
    # Drop the singleton axes of the feature matrix
    return np.squeeze(rates)

In [63]:
def get_energy(audio_file):
    """Return the total energy of an audio file.

    Args:
        audio_file: Path of the audio file, handed to ``librosa.load``.

    Returns:
        Sum of the squared absolute sample values.
    """
    samples, _ = librosa.load(audio_file)
    magnitude = np.abs(samples)
    return np.sum(magnitude ** 2)

In [64]:
def get_sc(audio_file, frame_length=2048, hop_length=512, sr=22050):
    """Return the spectral centroid of an audio file.

    Args:
        audio_file: Path of the audio file; it is loaded as mono at rate ``sr``.
        frame_length: FFT size used for the STFT and the frequency grid.
        hop_length: Hop length of the STFT.
        sr: Sample rate requested from ``librosa.load``.

    Returns:
        The output of ``librosa.feature.spectral_centroid`` with
        length-one axes removed.
    """
    signal, _ = librosa.load(audio_file, sr=sr, mono=True)

    # Magnitude spectrogram and the centre frequency of each of its bins
    magnitude = np.abs(
        librosa.stft(signal, n_fft=frame_length, hop_length=hop_length)
    )
    bin_freqs = librosa.fft_frequencies(sr=sr, n_fft=frame_length)

    centroid = librosa.feature.spectral_centroid(S=magnitude, freq=bin_freqs)
    return np.squeeze(centroid)

In [65]:
def get_sf(audio_file_path, hop_length=512, sr=22050):
    """Return the spectral flux of an audio file.

    Args:
        audio_file_path: Path of the audio file, loaded at rate ``sr``.
        hop_length: Hop length of the STFT.
        sr: Sample rate requested from ``librosa.load``.

    Returns:
        One value per STFT frame: the sum of squared differences between the
        magnitude spectrum of that frame and the previous one. The first
        entry is 0 because frame 0 has no predecessor.
    """
    signal, sr = librosa.load(audio_file_path, sr=sr)

    # Magnitude spectrogram of the STFT
    magnitudes = np.abs(librosa.stft(signal, hop_length=hop_length))

    frame_count = magnitudes.shape[1]
    flux = np.zeros(frame_count)
    for frame in range(1, frame_count):
        step = magnitudes[:, frame] - magnitudes[:, frame - 1]
        flux[frame] = np.sum(np.square(step))

    return flux

In [66]:
def get_sro(audio_file, roll_percent=0.85):
    """Return the spectral roll-off frequency (in Hz) of a whole audio file.

    The roll-off is the lowest frequency below which ``roll_percent`` of the
    signal's total spectral power is contained.

    Args:
        audio_file: Path of the audio file, handed to ``librosa.load``.
        roll_percent: Fraction of the total power that must lie at or below
            the returned frequency. Values above 1 yield the highest
            available frequency.

    Returns:
        The roll-off frequency as a float; 0.0 for an empty or silent signal.
    """
    # Load audio signal
    y, sr = librosa.load(audio_file)
    samples = np.asarray(y, dtype=np.float64)
    if samples.size == 0:
        return 0.0

    # One-sided power spectrum: a real signal's full FFT is mirrored, so a
    # cumulative sum over it would place the roll-off in the redundant half.
    power_spectrum = np.abs(np.fft.rfft(samples)) ** 2
    cumulative_power = np.cumsum(power_spectrum)

    total_power = cumulative_power[-1]
    if not total_power > 0:
        # Silent (or non-finite) signal: no meaningful roll-off
        return 0.0

    # First bin whose cumulative power reaches the requested share
    sr_idx = int(np.searchsorted(cumulative_power, roll_percent * total_power))
    sr_idx = min(sr_idx, cumulative_power.size - 1)

    # Map the bin index to its frequency in Hz
    bin_freqs = np.fft.rfftfreq(samples.size, d=1.0 / sr)
    sr_value = float(bin_freqs[sr_idx])

    return sr_value


In [67]:
def get_cep(audio_file, num_coeffs=12):
    """Return the first cepstral coefficients of a whole audio file.

    The cepstrum is computed as the real part of the inverse FFT of the log
    power spectrum.

    Args:
        audio_file: Path of the audio file, handed to ``librosa.load``.
        num_coeffs: Number of leading coefficients to keep.

    Returns:
        Array with at most ``num_coeffs`` real-valued coefficients (fewer if
        the signal has fewer samples).
    """
    # Load audio signal
    y, sr = librosa.load(audio_file)

    # Compute the power spectrum of the signal
    power_spectrum = np.abs(np.fft.fft(y)) ** 2

    # Floor zero-power bins at the smallest positive normal float so the log
    # stays finite; otherwise silent signals turn the cepstrum into NaN/inf.
    floor = np.finfo(power_spectrum.dtype).tiny
    log_power_spectrum = np.log(np.maximum(power_spectrum, floor))

    # Inverse FFT of the log spectrum gives the (real) cepstrum
    cepstral_coeffs = np.real(np.fft.ifft(log_power_spectrum))

    # Only keep the first num_coeffs coefficients
    return cepstral_coeffs[:num_coeffs]


In [107]:

# Per-file accumulators: entry i of every populated list refers to the same recording
filenames = []
labels = []
mfccs = []
lpc = []
zcr = []     # ZCR extraction is disabled below; list kept so the name stays defined
energy = []
sc = []      # SC extraction is disabled below
sf = []      # SF extraction is disabled below
sro = []
cep = []

# Numeric class label assigned to each subfolder of data_path
label_ids = {'covid': 1, 'not_covid': 0}

# Iterate over the covid and not_covid subfolders
for label, label_id in label_ids.items():
    # Get the path to the subfolder
    subfolder_path = os.path.join(data_path, label)

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the extracted features reproducible between runs.
    for filename in sorted(os.listdir(subfolder_path)):
        # Skip any non-wav files
        if not filename.endswith('.wav'):
            continue

        audio_path = os.path.join(subfolder_path, filename)

        # Record the file, its label and every enabled feature
        filenames.append(os.path.join(label, filename))
        labels.append(label_id)
        mfccs.append(get_mfcc(audio_path))
        lpc.append(get_lpc(audio_path))
        #zcr.append(get_zcr(audio_path))
        energy.append(get_energy(audio_path))
        #sc.append(get_sc(audio_path))
        #sf.append(get_sf(audio_path))
        sro.append(get_sro(audio_path))
        cep.append(get_cep(audio_path))

#print(f"Found {len(filenames)} files and {len(labels)} labels.")

In [112]:
# Convert every populated feature list into a NumPy array
# (zcr, sc and sf are disabled and intentionally left untouched)
labels, mfccs, lpc, energy, sro, cep = (
    np.array(values)
    for values in (labels, mfccs, lpc, energy, sro, cep)
)

In [136]:
# Quick check of the dimensions of the MFCC array
print(mfccs.shape)

(13, 431)


In [116]:
# Store the extracted arrays under their keys in the `data` dictionary
data.update({
    'labels': labels,
    'MFCC': mfccs,
    'LPC': lpc,
    #'ZCR': zcr,
    'energy': energy,
    #'SC': sc,
    #'SF': sf,
    'SRo': sro,
    'Cep': cep,
})

In [None]:
# Summarise each entry instead of dumping the full feature arrays into the output
for key, value in data.items():
    print(f"{key}: shape={np.shape(value)}, dtype={getattr(value, 'dtype', type(value).__name__)}")

In [43]:
#data2=np.load('data.npz')

In [96]:
# Destination of the pickled feature dictionary
data_save_file_path = '/content/drive/My Drive/Colab Notebooks/CoughAnalysis/processed_data.pkl'

# Make sure the parent folder exists before writing
target_dir = os.path.dirname(data_save_file_path)
os.makedirs(target_dir, exist_ok=True)

# Serialise the dictionary to the file in binary mode
with open(data_save_file_path, 'wb') as pickle_file:
    pickle.dump(data, pickle_file)