In [1]:
import wave
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
%matplotlib inline
from pydub import AudioSegment
from tqdm import tqdm
import librosa
from scipy.io import wavfile

### EDA

In [2]:
# Single place to repoint this cell at a different copy of the dataset.
DATA_DIR = 'C:/Users/Christeena/Desktop/Respiratory_Sound_Database/Respiratory_Sound_Database'

# Demographics: space-delimited, read without a header row, so the column
# names are supplied explicitly.
df_no_diagnosis = pd.read_csv(
    DATA_DIR + '/demographic_info.txt',
    names=['Patient number', 'Age', 'Sex', 'Adult BMI (kg/m2)', 'Child Weight (kg)', 'Child Height (cm)'],
    delimiter=' ',
)

# Diagnosis table: two columns (patient number, diagnosis), also read without a header row.
diagnosis = pd.read_csv(
    DATA_DIR + '/patient_diagnosis.csv',
    names=['Patient number', 'Diagnosis'],
)

In [3]:
# Attach the diagnosis to each demographic row. The left join keeps every row
# of df_no_diagnosis, even when no diagnosis entry matches its patient number.
diagnosis_by_patient = diagnosis.set_index('Patient number')
df = df_no_diagnosis.join(diagnosis_by_patient, on='Patient number', how='left')

# Row count per diagnosis (shown as the cell output).
df['Diagnosis'].value_counts()

COPD              64
Healthy           26
URTI              14
Bronchiectasis     7
Pneumonia          6
Bronchiolitis      6
LRTI               2
Asthma             1
Name: Diagnosis, dtype: int64

In [4]:
root = 'C:/Users/Christeena/Desktop/Respiratory_Sound_Database/Respiratory_Sound_Database/audio_and_txt_files/'

# Stems (names without extension) of the annotation files found in `root`.
# endswith() selects on the real extension instead of any name that merely
# contains '.txt', and splitext() keeps a stem intact even if it contains a dot.
filenames = [os.path.splitext(s)[0] for s in os.listdir(path=root) if s.endswith('.txt')]

In [None]:
def Extract_Annotation_Data(file_name, root):
    """Parse one recording's file name and load its annotation table.

    Parameters
    ----------
    file_name : str
        File stem (no extension). It is split on '_' and must yield exactly
        five tokens, one per recording-info column.
    root : str
        Directory that contains ``file_name + '.txt'``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``recording_info``: a single row holding the five name tokens.
        ``recording_annotations``: the tab-separated annotation file, read
        without a header row, with columns Start, End, Crackles, Wheezes.
    """
    info_columns = ['Patient number', 'Recording index', 'Chest location','Acquisition mode','Recording equipment']
    annotation_columns = ['Start', 'End', 'Crackles', 'Wheezes']

    annotation_path = os.path.join(root, file_name + '.txt')
    name_tokens = file_name.split('_')

    recording_info = pd.DataFrame([name_tokens], columns=info_columns)
    recording_annotations = pd.read_csv(annotation_path, sep='\t', names=annotation_columns)
    return recording_info, recording_annotations

In [None]:
# Parse every annotation file once. Results are kept in listing order
# (i_list, rec_annotations) and keyed by file stem (rec_annotations_dict).
rec_annotations_dict = {}
i_list, rec_annotations = [], []
for stem in filenames:
    info, annotations = Extract_Annotation_Data(stem, root)
    i_list.append(info)
    rec_annotations.append(annotations)
    rec_annotations_dict[stem] = annotations

# Stack the single-row info frames into one table and preview it.
recording_info = pd.concat(i_list, axis=0)
recording_info.head()

In [None]:
# Per-file counts of respiratory cycles, split by which adventitious sounds
# are annotated. All lists are index-aligned with filename_list.
no_label_list = []
crack_list = []
wheeze_list = []
both_sym_list = []  # cycles annotated with both crackles and wheezes
filename_list = []
no_cycles_list = []
for f in tqdm(filenames):
    d = rec_annotations_dict[f]

    has_crackles = d['Crackles'] == 1
    no_crackles = d['Crackles'] == 0
    has_wheezes = d['Wheezes'] == 1
    no_wheezes = d['Wheezes'] == 0

    no_labels = int((no_crackles & no_wheezes).sum())
    n_crackles = int((has_crackles & no_wheezes).sum())
    n_wheezes = int((no_crackles & has_wheezes).sum())
    # Previously never computed, which made the total below raise NameError.
    both_sym = int((has_crackles & has_wheezes).sum())
    no_cycles = no_labels + n_crackles + n_wheezes + both_sym

    no_label_list.append(no_labels)
    crack_list.append(n_crackles)
    wheeze_list.append(n_wheezes)
    both_sym_list.append(both_sym)
    no_cycles_list.append(no_cycles)
    filename_list.append(f)

In [None]:
# One row per annotation file: cycle counts for each label combination plus the total.
file_label_df = pd.DataFrame(
    data={
        'filename': filename_list,
        'no label': no_label_list,
        'crackles only': crack_list,
        'wheezes only': wheeze_list,
        'crackles and wheezees': both_sym_list,
        'Total cycles': no_cycles_list,
    }
)

In [None]:
# Save the per-file counts to labels.csv in the current working directory, without the index column.
file_label_df.to_csv(
    'labels.csv',
    index=False,
    encoding='utf-8',
)


In [None]:
# Relative paths used after this point (the exported cycle .wav files and
# label.csv) resolve against this directory.
# NOTE(review): this path is under "Admin" while every other path in the
# notebook is under "Christeena" - confirm which location is intended.
split_output_dir = "C:/Users/Admin/Desktop/Respiratory_Sound_Database/Respiratory_Sound_Database"
os.chdir(split_output_dir)

### Splitting the audio into respiratory cycles and creating a CSV file to store class information

In [None]:
# Accumulators filled while splitting the recordings: one class label and one
# exported file stem per respiratory cycle.
label, sfname = [], []

In [None]:
# Names of all entries in the folder that holds the recordings.
audios_folder = 'C:/Users/Christeena/Desktop/Respiratory_Sound_Database/Respiratory_Sound_Database/audios'
aud_name = os.listdir(audios_folder)

In [None]:

# Start from empty accumulators so re-running this cell does not append
# duplicate rows to the label table.
label = []
sfname = []

audio_dir = "C:/Users/Christeena/Desktop/Respiratory_Sound_Database/Respiratory_Sound_Database/audios/"

# os.listdir() returns entries in arbitrary order, so each annotation file is
# paired with its recording by shared file stem rather than by list position.
audio_by_stem = {os.path.splitext(name)[0]: name for name in aud_name}

for f in tqdm(filenames):
    if f not in audio_by_stem:
        raise FileNotFoundError("No audio file in " + audio_dir + " matches annotation file " + f + ".txt")
    j = audio_by_stem[f]
    d = rec_annotations_dict[f]

    # Decode the recording once per file; every cycle is sliced from this segment.
    recording = AudioSegment.from_wav(audio_dir + j)

    for i in range(len(d)):
        s = d['Start'][i]
        e = d['End'][i]
        c = d['Crackles'][i]
        w = d['Wheezes'][i]

        # Annotation times are multiplied by 1000 because pydub slices in milliseconds.
        t1 = s * 1000
        t2 = e * 1000
        newAudio = recording[t1:t2]

        # Exported name is the file stem followed directly by the cycle index.
        cycle_name = f + str(i)
        newAudio.export(cycle_name + '.wav', format="wav")  # written to the current working directory
        sfname.append(cycle_name)

        if c == 0 and w == 0:
            label.append('Healthy')
        elif c == 1 and w == 0:
            label.append('Crackles')
        else:
            # NOTE(review): this branch also receives cycles annotated with both
            # crackles and wheezes - confirm that folding them into 'Wheezes' is intended.
            label.append('Wheezes')


In [None]:
# One row per exported cycle: its file stem and its class label.
label_df = pd.DataFrame(
    data={
        'Filename': sfname,
        'Class': label,
    }
)

In [None]:
# Save the cycle-level labels to label.csv in the current working directory, without the index column.
label_csv_name = 'label.csv'
label_df.to_csv(label_csv_name, index=False)

In [9]:
# Audio pre-processing function
def envelope(y, rate, threshold):
    """Build a per-sample keep/drop mask from the signal's amplitude envelope.

    The envelope is the centred rolling mean of ``abs(y)`` over a window of
    ``int(rate / 10)`` samples. Windows near the edges use whatever samples
    are available (``min_periods=1``).

    Parameters
    ----------
    y : sequence of numbers
        Signal samples.
    rate : number
        Sampling rate; only used to size the rolling window.
    threshold : number
        A sample is kept when its envelope value is strictly greater than this.

    Returns
    -------
    list of bool
        One entry per sample of ``y``; True where the envelope exceeds ``threshold``.
    """
    magnitude = pd.Series(y).abs()
    window = int(rate / 10)
    smoothed = magnitude.rolling(window=window, min_periods=1, center=True).mean()
    return (smoothed > threshold).tolist()

### Saving the pre-processed audio into a new folder

In [21]:
split_dir = 'C:/Users/Christeena/Desktop/Respiratory_Sound_Database/Respiratory_Sound_Database/splitted/'
clean_dir = 'C:/Users/Christeena/Desktop/Respiratory_Sound_Database/Respiratory_Sound_Database/clean/'

# Create the output folder on first use so the emptiness check and the writes
# below do not fail when it does not exist yet.
os.makedirs(clean_dir, exist_ok=True)

# Only run when the output folder is empty, so an already populated folder is left alone.
if len(os.listdir(clean_dir)) == 0:
    # NOTE(review): df1 is not defined in the visible part of the notebook; its
    # index is presumably the stems of the split cycle files - confirm.
    for f in tqdm(df1.index):
        # Load the cycle at a 16 kHz sampling rate.
        signal, rate = librosa.load(split_dir + f + '.wav', sr=16000)
        # Keep only the samples whose amplitude envelope exceeds 0.0005.
        mask = envelope(signal, rate, 0.0005)
        wavfile.write(filename=clean_dir + f + '.wav', rate=rate, data=signal[mask])

100%|██████████████████████████████████████████████████████████████████████████████| 6898/6898 [24:21<00:00,  4.72it/s]
