In [1]:
import pandas as pd 
import os 
import glob
import numpy as np
import IPython
from features import features
from tqdm import tqdm
import librosa as lb

In [2]:
# Load the compiled metadata CSV from the coughvid_20211012 folder.
# NOTE: the absolute Windows path is machine-specific.
data = pd.read_csv(r"C:\AI project\coughvid_20211012\metadata_compiled.csv")

In [3]:
# Column overview of the metadata table: non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34434 entries, 0 to 34433
Data columns (total 52 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             34434 non-null  int64  
 1   uuid                   34434 non-null  object 
 2   datetime               34434 non-null  object 
 3   cough_detected         34434 non-null  float64
 4   latitude               19431 non-null  float64
 5   longitude              19431 non-null  float64
 6   age                    19396 non-null  float64
 7   gender                 20664 non-null  object 
 8   respiratory_condition  20664 non-null  object 
 9   fever_muscle_pain      20664 non-null  object 
 10  status                 20664 non-null  object 
 11  status_SSL             8331 non-null   object 
 12  quality_1              820 non-null    object 
 13  cough_type_1           820 non-null    object 
 14  dyspnea_1              820 non-null    object 
 15  wh

In [4]:
# Missing-value count per metadata column.
data.isna().sum()

Unnamed: 0                   0
uuid                         0
datetime                     0
cough_detected               0
latitude                 15003
longitude                15003
age                      15038
gender                   13770
respiratory_condition    13770
fever_muscle_pain        13770
status                   13770
status_SSL               26103
quality_1                33614
cough_type_1             33614
dyspnea_1                33614
wheezing_1               33614
stridor_1                33614
choking_1                33614
congestion_1             33614
nothing_1                33614
diagnosis_1              33614
severity_1               33614
quality_2                33614
cough_type_2             33615
dyspnea_2                33614
wheezing_2               33614
stridor_2                33614
choking_2                33614
congestion_2             33614
nothing_2                33614
diagnosis_2              33614
severity_2               33614
quality_

In [5]:
# Number of recordings per value of the 'status' column
# (rows whose status is missing do not form a group).
data.groupby('status')['uuid'].count()

status
COVID-19        1315
healthy        15476
symptomatic     3873
Name: uuid, dtype: int64

In [6]:
# Gather every row that has at least one non-null diagnosis_<k> value.
# A row with several non-null diagnosis columns is appended once per column,
# so Labeled_data can contain the same uuid more than once.
labeled_frames = []
for element in ['diagnosis_1','diagnosis_2', 'diagnosis_3', 'diagnosis_4']:
    if labeled_frames:
        # Running total of rows collected from the previous diagnosis columns.
        print(sum(len(frame) for frame in labeled_frames))
    # dropna returns a new frame, so no defensive deep copy of `data` is needed.
    labeled_frames.append(data.dropna(subset=[element]))
    print('labels Used by Professional')
    print(data[element].unique())
    print('Number of Labeled instances : ',data[element].notnull().sum())
    print(data[element].value_counts())
    print('\n\n')
# Concatenate once instead of growing the frame inside the loop.
Labeled_data = pd.concat(labeled_frames)

labels Used by Professional
[nan 'healthy_cough' 'lower_infection' 'COVID-19' 'obstructive_disease'
 'upper_infection']
Number of Labeled instances :  820
COVID-19               279
healthy_cough          259
lower_infection        244
upper_infection         23
obstructive_disease     15
Name: diagnosis_1, dtype: int64



820
labels Used by Professional
[nan 'lower_infection' 'COVID-19' 'healthy_cough' 'obstructive_disease'
 'upper_infection']
Number of Labeled instances :  820
COVID-19               285
upper_infection        183
lower_infection        173
obstructive_disease    112
healthy_cough           67
Name: diagnosis_2, dtype: int64



1640
labels Used by Professional
[nan 'healthy_cough' 'lower_infection' 'upper_infection'
 'obstructive_disease' 'COVID-19']
Number of Labeled instances :  793
upper_infection        364
healthy_cough          199
lower_infection        194
obstructive_disease     35
COVID-19                 1
Name: diagnosis_3, dtype: int64



2433
labels Used

In [7]:
# Column overview of the expert-labelled subset (may still contain repeated uuids at this point).
Labeled_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3224 entries, 51 to 34425
Data columns (total 52 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             3224 non-null   int64  
 1   uuid                   3224 non-null   object 
 2   datetime               3224 non-null   object 
 3   cough_detected         3224 non-null   float64
 4   latitude               2039 non-null   float64
 5   longitude              2039 non-null   float64
 6   age                    2653 non-null   float64
 7   gender                 2832 non-null   object 
 8   respiratory_condition  2832 non-null   object 
 9   fever_muscle_pain      2832 non-null   object 
 10  status                 2832 non-null   object 
 11  status_SSL             1155 non-null   object 
 12  quality_1              1203 non-null   object 
 13  cough_type_1           1203 non-null   object 
 14  dyspnea_1              1203 non-null   object 
 15  wh

In [8]:
def multi_label_ids(df):
    """Return the uuids that occur in more than one row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'uuid'`` column.

    Returns
    -------
    list
        uuids whose row count is greater than one, in the order produced by
        ``value_counts()`` (most frequent first).
    """
    # Count on the argument rather than the notebook-level Labeled_data so the
    # function gives the right answer for whichever frame it is handed.
    repeated_counts = df['uuid'].value_counts()
    return repeated_counts[repeated_counts > 1].index.tolist()
# Record which uuids appear more than once BEFORE the duplicates are dropped.
multi_label_uuids = multi_label_ids(Labeled_data)
# NOTE: Labeled_data is rebound here, so re-running this cell on its own would
# look for repeated uuids in the already de-duplicated frame.
Labeled_data = Labeled_data.drop_duplicates(subset=['uuid'])
# Rows left after keeping one row per uuid.
print(len(Labeled_data))
# Number of uuids that had more than one row.
print(len(multi_label_uuids))
# Rows that also have a non-null 'status' value.
print(Labeled_data['status'].notnull().sum())

2841
130
2506


In [9]:
# How many labelled rows have a cough_detected value above 0.25.
print((Labeled_data['cough_detected'] > 0.25).sum())

2788


In [10]:
def validate_path(DATA_PTH: str, uuid:str)-> str:
    """Locate the audio file that belongs to ``uuid`` inside ``DATA_PTH``.

    The extensions are probed in the order ``.webm``, ``.wav``, ``.ogg`` and
    the first existing file wins.

    Returns
    -------
    The joined path of the first match, or ``None`` when no file exists.
    """
    for extension in ('.webm', '.wav', '.ogg'):
        candidate = os.path.join(DATA_PTH, uuid + extension)
        if os.path.exists(candidate):
            return candidate
    return None

In [11]:
def _most_frequent(values):
    """Return the most frequent entry of ``values`` (earliest entry wins ties), or None if empty."""
    if not values:
        return None
    # Iterating the list itself (not a set) makes tie-breaking follow the
    # column order instead of the per-process string hash order.
    return max(values, key=values.count)


def multi_label_selection(df):
    """Pick consensus annotations for a row that has several expert labels.

    Parameters
    ----------
    df : mapping or pandas.Series
        A single row providing the ``diagnosis_1..4``, ``quality_1..4``,
        ``severity_1..4`` and ``status`` entries.

    Returns
    -------
    tuple
        ``(expert_label, user_label, quality, severity)``. The expert label,
        quality and severity are each the most frequent non-missing value
        across the four columns; on a tie the lowest-numbered column wins.
        Each is ``None`` when all four columns are missing. ``user_label`` is
        ``df['status']`` unchanged.
    """
    expert_label_cols = ['diagnosis_1','diagnosis_2', 'diagnosis_3', 'diagnosis_4']
    quality_cols = ['quality_1', 'quality_2', 'quality_3', 'quality_4']
    severity_cols = ['severity_1', 'severity_2', 'severity_3', 'severity_4']

    def present(columns):
        # pd.isna catches NaN/None regardless of the concrete float type.
        return [df[col] for col in columns if not pd.isna(df[col])]

    expert_label = _most_frequent(present(expert_label_cols))
    quality = _most_frequent(present(quality_cols))
    severity = _most_frequent(present(severity_cols))
    user_label = df['status']
    return expert_label, user_label, quality, severity

In [12]:
def single_label_extraction(df):
    """Take the first available expert annotation from a row.

    For the diagnosis, quality and severity column groups, the columns are
    scanned in order 1..4 and the first value whose type is not ``float``
    (NaN cells are floats) is used; ``None`` is returned for a group with no
    such value.

    Returns
    -------
    tuple
        ``(expert_label, user_label, quality, severity)`` where ``user_label``
        is ``df['status']``.
    """
    def first_usable(columns):
        for column in columns:
            value = df[column]
            # Same float-type test as before; a None cell never ends the scan.
            if type(value) != float and value is not None:
                return value
        return None

    user_label = df['status']
    expert_label = first_usable(['diagnosis_1','diagnosis_2', 'diagnosis_3', 'diagnosis_4'])
    quality = first_usable(['quality_1', 'quality_2', 'quality_3', 'quality_4'])
    severity = first_usable(['severity_1', 'severity_2', 'severity_3', 'severity_4'])
    return expert_label, user_label, quality, severity

In [13]:
# Folder that holds the audio recordings (machine-specific absolute path).
DATA_PTH = r"C:\AI project\coughvid_20211012"
# Unused draft of the output schema, left commented out:
#transformed_data = df.DataFrame(columns=['Path', 'expert_label', 'user_label', 'cough_detected', 'quality', 'age', 'gender', 'respiratory_condition', 'fever_muscle_pain', 'status_SSL', 'severity'])
# Extractor from the project-local `features` module; label_extraction calls its MFCC and DF methods.
featrue_ext = features()
def label_extraction(df):
    """Build one feature/label record per uuid found in ``df``.

    For every unique uuid the audio file is located with ``validate_path``,
    loaded with librosa and passed to ``featrue_ext.MFCC`` and
    ``featrue_ext.DF``. Labels come from ``multi_label_selection`` when the
    uuid is listed in the notebook-level ``multi_label_uuids``, otherwise from
    ``single_label_extraction``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``'uuid'`` column plus the diagnosis/quality/severity/status
        and metadata columns read below. Assumes one row per uuid, so that
        ``df.loc[uuid]`` yields a single row.

    Returns
    -------
    pandas.DataFrame
        Indexed by uuid. Keys are inserted in the order: path, the MFCC
        features, D_frq, expert_label, user_label, quality, severity, age,
        gender, cough_detected, respiratory_condition, fever_muscle_pain,
        status_SSL. Later cells slice the frame by position, so this order
        must not change.

    Raises
    ------
    FileNotFoundError
        If no audio file exists for a uuid.
    """
    metadata_cols = ['age', 'gender', 'cough_detected',
                     'respiratory_condition', 'fever_muscle_pain', 'status_SSL']
    # multi_label_uuids is a list; a set makes the per-uuid membership test O(1).
    multi_label_set = set(multi_label_uuids)
    data_dict = {}
    uuids = df['uuid'].unique()
    df = df.set_index('uuid')
    for uuid in tqdm(uuids):
        temp_dict = {}
        data_sample = df.loc[uuid]
        path = validate_path(DATA_PTH, uuid)
        if path is None:
            # Fail with a clear message instead of handing None to librosa.
            raise FileNotFoundError(f"No .webm/.wav/.ogg file found for uuid {uuid!r} in {DATA_PTH!r}")
        temp_dict['path'] = path
        aud, fs = lb.load(path)
        f_mfcc = featrue_ext.MFCC([fs,aud])
        # f_mfcc[1] supplies the column names, f_mfcc[0] the matching values.
        for pos, feature in enumerate(f_mfcc[1]):
            temp_dict[feature] = f_mfcc[0][pos]
        temp_dict['D_frq'] = featrue_ext.DF([fs,aud])
        if uuid in multi_label_set:
            label_fields = multi_label_selection(data_sample)
        else:
            label_fields = single_label_extraction(data_sample)
        (temp_dict['expert_label'], temp_dict['user_label'],
         temp_dict['quality'], temp_dict['severity']) = label_fields
        # Copy the remaining metadata through unchanged.
        for col in metadata_cols:
            temp_dict[col] = data_sample[col]
        data_dict[uuid] = temp_dict
    return pd.DataFrame.from_dict(data_dict, orient='index')
# Run the extraction over all labelled recordings.
# NOTE: the recorded progress bar shows this took about 17 hours for 2841 files,
# so consider caching the result to disk before re-running the notebook.
Tnsfd_data = label_extraction(Labeled_data)

  return f(*args, **kwargs)
  return f(*args, **kwargs)
100%|███████████████████████████████████████████████████████████████████████████| 2841/2841 [17:14:25<00:00, 21.85s/it]


In [14]:
# NOTE(review): this cell sits after the extraction cell that loads the audio,
# so on a fresh Restart & Run All it runs too late to help that cell.
# NOTE(review): this installs the PyPI package named "ffmpeg"; decoding .webm
# files presumably needs the FFmpeg executable on PATH instead - confirm.
!pip install ffmpeg



In [15]:
# Column overview of the extracted feature/label table; the positions listed
# here are what the following cells slice by.
Tnsfd_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2841 entries, 006d8d1c-2bf6-46a6-8ef2-1823898a4733 to ffedc843-bfc2-4ad6-a749-2bc86bdac84a
Data columns (total 38 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   path                   2841 non-null   object 
 1   MFCC_mean0             2841 non-null   float32
 2   MFCC_mean1             2841 non-null   float32
 3   MFCC_mean2             2841 non-null   float32
 4   MFCC_mean3             2841 non-null   float32
 5   MFCC_mean4             2841 non-null   float32
 6   MFCC_mean5             2841 non-null   float32
 7   MFCC_mean6             2841 non-null   float32
 8   MFCC_mean7             2841 non-null   float32
 9   MFCC_mean8             2841 non-null   float32
 10  MFCC_mean9             2841 non-null   float32
 11  MFCC_mean10            2841 non-null   float32
 12  MFCC_mean11            2841 non-null   float32
 13  MFCC_mean12            2841 non-null   float32

In [16]:
# Feature matrix: columns 1..26 of Tnsfd_data by position (column 0 is 'path').
# Presumably these are the MFCC statistics - verify against Tnsfd_data.columns.
x = Tnsfd_data.to_numpy()[:, 1:27]

In [17]:
# Target column taken by position 28.
# Presumably 'expert_label' given the key order in label_extraction - TODO confirm.
y = Tnsfd_data.to_numpy()[:, 28]

In [18]:
def label_parser(df):
    """Binarise an iterable of labels: 1 for ``'COVID-19'``, 0 for anything else.

    Returns a list with one integer per input label, in input order.
    """
    return [1 if label == 'COVID-19' else 0 for label in df]

In [19]:
# Binary targets: 1 where the label is 'COVID-19', 0 for every other value.
labels = label_parser(y)

In [20]:
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold,StratifiedKFold, ShuffleSplit, StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import r2_score,mean_squared_error

In [21]:
# Mean imputer for NaN entries.
# NOTE(review): not referenced by RF_pipe in the cells shown here.
imput = SimpleImputer(missing_values=np.nan, strategy='mean')

In [22]:
# Min-max scaler used as the first step of RF_pipe.
Scaler = MinMaxScaler()

In [23]:
# 5-fold splitter passed to GridSearchCV.
# NOTE(review): plain KFold neither shuffles by default nor stratifies on the
# class label; StratifiedKFold is already imported if that is preferred.
cv = KFold(n_splits=5)

In [24]:
# Hold out 25% of the samples for testing; random_state=1 fixes the shuffle.
# NOTE(review): no `stratify` argument is given, so the class ratio may differ
# between the two splits.
X_train, X_test, y_train, y_test = train_test_split(x, labels, test_size=0.25, random_state=1)

In [25]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so that grid-search scores and the selected parameters are
# reproducible across runs (same seed value as the train/test split).
Rf_est = RandomForestClassifier(random_state=1)

In [26]:
# Scaling followed by the random forest; the step names 'scaler' and
# 'classifier' are the prefixes used by the keys of param_grid.
RF_pipe = Pipeline(steps=[('scaler', Scaler), ('classifier', Rf_est)])

In [27]:
# Search space for the 'classifier' step: 5 * 3 * 2 * 3 = 90 combinations.
param_grid = {
    # 10, 30, 50, 70, 90
    'classifier__n_estimators' : np.arange(10,100,20),
    # 20, 23, 26
    'classifier__max_depth': np.arange(20,27,3),
    'classifier__criterion':['gini','entropy'],
    # 2, 10, 18
    'classifier__min_samples_split':np.arange(2,20,8),   
}

In [28]:
# Exhaustive search over param_grid with the KFold splitter, using all cores
# (n_jobs=-1). No `scoring` is given, so the pipeline's own score method is used.
RF_grid = GridSearchCV(RF_pipe, param_grid,cv = cv, verbose = 2, n_jobs=-1)

In [29]:
# Fit every candidate on the training split; fit() returns the search object itself.
RF_grid = RF_grid.fit(X_train,y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


In [30]:
# best_score_ is the mean cross-validated score of the best candidate. With no
# `scoring` argument the classifier's own score is used, which is accuracy,
# so the label says accuracy rather than R2.
print('\nBest CV accuracy : %.2f'%RF_grid.best_score_, ' Best Params : ', str(RF_grid.best_params_))


Best R2 score : 0.80  Best Params :  {'classifier__criterion': 'entropy', 'classifier__max_depth': 20, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 90}


In [31]:
# Score of the best pipeline found by the search, evaluated on the held-out test split.
print(RF_grid.best_estimator_.score(X_test,y_test))

0.8171589310829818


In [33]:
from numpy import nan
# Play one example recording per status value.
dir_path = r"C:\AI project\coughvid_20211012"
sts = data['status'].unique()
paths = {}
for status in sts:
    # Skip the NaN entry that unique() returns for rows without a status.
    if pd.isna(status):
        continue
    # Only consider recordings whose cough_detected value is above 0.5.
    candidates = data[(data['status'] == status) & (data['cough_detected'] > 0.5)]
    if candidates.empty:
        continue
    # Reuse validate_path so .webm, .wav and .ogg are all checked on disk;
    # it returns None when no file exists for the uuid.
    paths[status] = validate_path(dir_path, candidates['uuid'].iloc[0])
for status in ['healthy', 'COVID-19', 'symptomatic']:
    print(status)
    path = paths.get(status)
    if path is None:
        print('no playable recording found')
    else:
        IPython.display.display(IPython.display.Audio(path))
    print('\n')

healthy




COVID-19




symptomatic




