In [1]:
import pandas as pd
import plotly.graph_objects as ply_go
import os 

In [2]:
# Current working directory of the kernel process; later cells build the
# dataset path by appending to this string.
root = os.getcwd()

## Referencia 

[Cough Analysis, parte 1: exploración de datos (Kaggle)](https://www.kaggle.com/code/buffyhridoy/cough-analysis-part-1-data-exploration)

## Cargando la metadata

In [3]:
# Location of the compiled metadata CSV, built from the notebook's working directory.
data_dir = root + '/dataset/subset_dataset/covid19-cough-audio-classification/'
metadata_file = "metadata_compiled.csv"
metadata = pd.read_csv(data_dir + metadata_file, sep=",")
print(metadata.columns)

# Flag columns that must hold genuine booleans (missing values are kept as NaN).
cols_to_boolean = ['respiratory_condition', 'fever_muscle_pain',
                   'dyspnea_1', 'wheezing_1', 'stridor_1', 'choking_1', 'congestion_1', 'nothing_1',
                   'dyspnea_2', 'wheezing_2', 'stridor_2', 'choking_2', 'congestion_2', 'nothing_2',
                   'dyspnea_3', 'wheezing_3', 'stridor_3', 'choking_3', 'congestion_3', 'nothing_3',
                   'dyspnea_4', 'wheezing_4', 'stridor_4', 'choking_4', 'congestion_4', 'nothing_4']


def _as_boolean(value):
    """Convert one non-null flag value to a genuine ``bool``.

    Strings are compared against "true" (case-insensitive) instead of being
    passed to ``bool()``, because ``bool("False")`` is ``True``. Any other
    type falls back to ``bool(value)``.
    """
    if isinstance(value, str):
        return value.strip().lower() == 'true'
    return bool(value)


# na_action='ignore' skips missing entries, so they stay NaN and are still
# counted in the null report printed below.
for c in cols_to_boolean:
    metadata[c] = metadata[c].map(_as_boolean, na_action='ignore')

print("NULL or NA records for each column:")
print(metadata.isnull().sum())

# Categorical text columns: replace missing values with an explicit 'n/a' label.
cols_to_fillna = ['gender', 'status', 'diagnosis_1', 'diagnosis_2', 'diagnosis_3', 'diagnosis_4']
metadata[cols_to_fillna] = metadata[cols_to_fillna].fillna('n/a')

metadata.head(5)

Index(['uuid', 'datetime', 'cough_detected', 'SNR', 'latitude', 'longitude',
       'age', 'gender', 'respiratory_condition', 'fever_muscle_pain', 'status',
       'quality_1', 'cough_type_1', 'dyspnea_1', 'wheezing_1', 'stridor_1',
       'choking_1', 'congestion_1', 'nothing_1', 'diagnosis_1', 'severity_1',
       'quality_2', 'cough_type_2', 'dyspnea_2', 'wheezing_2', 'stridor_2',
       'choking_2', 'congestion_2', 'nothing_2', 'diagnosis_2', 'severity_2',
       'quality_3', 'cough_type_3', 'dyspnea_3', 'wheezing_3', 'stridor_3',
       'choking_3', 'congestion_3', 'nothing_3', 'diagnosis_3', 'severity_3',
       'quality_4', 'cough_type_4', 'dyspnea_4', 'wheezing_4', 'stridor_4',
       'choking_4', 'congestion_4', 'nothing_4', 'diagnosis_4', 'severity_4'],
      dtype='object')
NULL or NA records for each column:
uuid                         0
datetime                     0
cough_detected               0
SNR                          0
latitude                 11466
longitude    

Unnamed: 0,uuid,datetime,cough_detected,SNR,latitude,longitude,age,gender,respiratory_condition,fever_muscle_pain,...,quality_4,cough_type_4,dyspnea_4,wheezing_4,stridor_4,choking_4,congestion_4,nothing_4,diagnosis_4,severity_4
0,00014dcc-0f06-4c27-8c7b-737b18a2cf4c,2020-11-25T18:58:50.488301+00:00,0.0155,7.326171,48.9,2.4,,,,,...,,,,,,,,,,
1,00039425-7f3a-42aa-ac13-834aaa2b6b92,2020-04-13T21:30:59.801831+00:00,0.9609,16.151433,31.3,34.8,15.0,male,False,False,...,,,,,,,,,,
2,0007c6f1-5441-40e6-9aaf-a761d8f2da3b,2020-10-18T15:38:38.205870+00:00,0.1643,16.217201,,,46.0,female,False,False,...,,,,,,,,,,
3,0009eb28-d8be-4dc1-92bb-907e53bc5c7a,2020-04-12T04:02:18.159383+00:00,0.9301,20.146058,40.0,-75.1,34.0,male,True,False,...,,,,,,,,,,
4,0012c608-33d0-4ef7-bde3-75a0b1a0024e,2020-04-15T01:03:59.029326+00:00,0.0482,0.0,-16.5,-71.5,,,,,...,,,,,,,,,,


## Visualización

In [4]:
# Base layout shared by every figure in this cell; each figure overrides the
# placeholder title / axis texts through update_layout.
my_title_layout = {"text": "my distribution", 'xanchor': 'center', 'x': 0.5, 'y': 0.9, 'font': {'size': 24}}
my_xaxis_layout = dict(title=dict(text="my x axis", font={'size': 16}))
my_layout = dict(title=my_title_layout,
                 xaxis=my_xaxis_layout)

# Histogram bin width and x-axis label for each plotted column.
bin_size_dict = dict(cough_detected=0.001, SNR=0.5, age=1, gender=1,
                     respiratory_condition=1, fever_muscle_pain=1, status=1)
xaxis_title_dict = dict(cough_detected="Cough Detection Score", SNR="Signal-to-Noise Ratio", age="Age",
                        gender="Gender", respiratory_condition="Resp. Condition",
                        fever_muscle_pain="Fever", status="Status")

# One histogram per column.
for c in ['cough_detected', 'SNR', 'age', 'gender', 'respiratory_condition', 'fever_muscle_pain', 'status']:
    hist_data = ply_go.Histogram(x=metadata[c], name=c, showlegend=False, xbins={'size': bin_size_dict[c]})
    fig = ply_go.Figure(data=[hist_data], layout=my_layout)
    fig.update_layout(title={'text': c + " distribution"}, xaxis={"title": {"text": xaxis_title_dict[c]}})
    fig.show()


def show_violin_by_status(value_col, title, yaxis_title, row_filter=None):
    """Show one violin per distinct ``status`` value for a numeric column.

    Parameters
    ----------
    value_col : str
        Column of ``metadata`` plotted on the y axis.
    title : str
        Figure title.
    yaxis_title : str
        Label of the y axis.
    row_filter : boolean Series, optional
        Extra row mask (aligned with ``metadata``) combined with the
        per-status selection, e.g. to drop extreme values.
    """
    fig = ply_go.Figure(layout=my_layout)
    # Iterate over every status present in the full table, so a status whose
    # rows are all filtered out still gets an (empty) trace.
    for status_value in metadata['status'].unique():
        selected = metadata['status'] == status_value
        if row_filter is not None:
            selected = selected & row_filter
        fig.add_trace(ply_go.Violin(x=metadata.loc[selected, 'status'],
                                    y=metadata.loc[selected, value_col],
                                    name=status_value,
                                    box_visible=True,
                                    meanline_visible=True))
    fig.update_layout(title={'text': title},
                      xaxis={"title": {"text": None}},
                      yaxis={"title": {"text": yaxis_title}})
    fig.show()


show_violin_by_status('age',
                      "Distribution of AGE by type of DIAGNOSYS",
                      "AGE [years]")

show_violin_by_status('cough_detected',
                      "Distribution of cough detection classifier by type of DIAGNOSYS",
                      "Cough Detection Score")

# Only rows with SNR below 100 are drawn in this figure.
show_violin_by_status('SNR',
                      "Distribution of SNR by type of DIAGNOSYS",
                      "Signal-to-Noise Ratio",
                      row_filter=metadata['SNR'] < 100)

### Resumen de los datos

In [6]:
import plotly.colors as ply_colors
import plotly.figure_factory as ply_ff

In [7]:
def summarise_pivot_df(df, xcols, ycols, valcol):
    """Count ``valcol`` entries per (xcols, ycols) group and pivot the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; it is not modified.
    xcols, ycols, valcol : list of str
        Column names used as pivot rows, pivot columns and counted values.

    Returns
    -------
    pandas.DataFrame
        Pivot table indexed by ``xcols`` whose columns are the distinct
        values of ``ycols``; cells hold the non-null count of ``valcol``
        (NaN where a combination does not occur).

    Notes
    -----
    Rows where the first column of ``xcols`` (resp. ``ycols``) is missing get
    the label 'n/a' in all of ``xcols`` (resp. ``ycols``). The grouped counts
    are printed as a side effect.
    """
    group_cols = xcols + ycols
    # Work on an explicit copy so the .loc assignments below never touch
    # (or warn about) the caller's frame.
    summary_df = df[group_cols + valcol].copy()
    for cols in (xcols, ycols):
        missing = summary_df[cols[0]].isnull()
        if missing.any():
            # Cast to object first: writing the string 'n/a' into a numeric
            # column is deprecated in pandas and will raise in the future.
            summary_df[cols] = summary_df[cols].astype(object)
            summary_df.loc[missing, cols] = 'n/a'  # replace NA with a default string
    summary_df = summary_df.groupby(group_cols).count().reset_index()
    print(summary_df)
    pivot_df = pd.pivot_table(data=summary_df, values=valcol, index=xcols, columns=ycols)
    # values is a list, so the columns come back as (value, ycol) pairs;
    # keep only the ycol label.
    pivot_df.columns = [c[1] for c in pivot_df.columns]
    return pivot_df

def pandas_to_plotly_heatdata(df):
    """Unpack a DataFrame into the x / y / z lists of a plotly heatmap.

    Column labels become ``x``, index labels become ``y`` and the cell
    values become ``z`` (a list of rows).
    """
    column_labels = df.columns.tolist()
    row_labels = df.index.tolist()
    cell_rows = df.values.tolist()
    return dict(x=column_labels, y=row_labels, z=cell_rows)

def show_status_heatmap(pivot_df, colorbar_title, annotation_format, yaxis_title, tickvals, ticktext):
    """Show an annotated heatmap of a status pivot table.

    Parameters
    ----------
    pivot_df : pandas.DataFrame
        Table whose columns go on the x axis and whose rows go on the y axis.
    colorbar_title : str
        Title displayed above the colour bar.
    annotation_format : str
        ``str.format`` template applied to every non-null cell; null cells
        are annotated with "NA".
    yaxis_title : str
        Title of the y axis.
    tickvals, ticktext : list
        Row positions (0-based) and the labels shown for them.
    """
    # The Heatmap object is only used as a validated container for the data,
    # colour scale and colour bar passed on to the annotated heatmap.
    # The colour-bar title uses the nested title.text / title.side form; the
    # flat 'titleside' key is deprecated.
    heat_data = ply_go.Heatmap(pandas_to_plotly_heatdata(pivot_df),
                               colorscale=ply_colors.sequential.Oranges,
                               colorbar={'title': {'text': colorbar_title, 'side': "top"}})
    annotation_text = [["NA" if pd.isnull(cell) else annotation_format.format(cell) for cell in row]
                       for row in heat_data['z']]
    fig = ply_ff.create_annotated_heatmap(z=heat_data['z'],
                                          x=heat_data['x'],
                                          # rows are addressed by position so that tickvals can relabel them
                                          y=list(range(len(heat_data['y']))),
                                          annotation_text=annotation_text,
                                          colorscale=heat_data['colorscale'],
                                          showscale=True,
                                          colorbar=heat_data['colorbar'])
    fig.update_layout(yaxis={"title": {"text": yaxis_title},
                             "tickmode": 'array', "tickvals": tickvals, "ticktext": ticktext})
    fig.show()


# Heatmap Fever vs status
meta_summary_df = summarise_pivot_df(metadata, ['fever_muscle_pain'], ['status'], ['uuid'])
meta_summary_df = meta_summary_df[['healthy', 'symptomatic', 'COVID-19', 'n/a']]
n = meta_summary_df.sum().sum()
print(meta_summary_df.head(5))

# Absolute counts.
show_status_heatmap(meta_summary_df, "Entries", "{:.0f}",
                    "Muscle Pain", [2, 1, 0], ['n/a', 'Yes', 'No'])

# Same table as a percentage of all counted entries.
show_status_heatmap(100.0 * meta_summary_df / n, "Percentage", "{:.2f}%",
                    "Muscle Pain", [2, 1, 0], ['n/a', 'Yes', 'No'])

# Heatmap RespCond vs status
meta_summary_df = summarise_pivot_df(metadata, ['respiratory_condition'], ['status'], ['uuid'])
meta_summary_df = meta_summary_df[['healthy', 'symptomatic', 'COVID-19', 'n/a']]
n = meta_summary_df.sum().sum()

show_status_heatmap(meta_summary_df, "Entries", "{:.0f}",
                    "REspiratory Condition", [2, 1, 0], ['n/a', 'Yes', 'No'])

  fever_muscle_pain       status   uuid
0             False     COVID-19    821
1             False      healthy  11545
2             False  symptomatic   1930
3              True     COVID-19    334
4              True      healthy    934
5              True  symptomatic    660
6               n/a          n/a  11326
                   healthy  symptomatic  COVID-19      n/a
fever_muscle_pain                                         
False              11545.0       1930.0     821.0      NaN
True                 934.0        660.0     334.0      NaN
n/a                    NaN          NaN       NaN  11326.0


  respiratory_condition       status   uuid
0                 False     COVID-19    761
1                 False      healthy  10842
2                 False  symptomatic   1800
3                  True     COVID-19    394
4                  True      healthy   1637
5                  True  symptomatic    790
6                   n/a          n/a  11326


## Resumen por grupo etario

In [8]:
# Heatmap Age vs status
# Encode age as a class: 0 = missing age, 1 = under 40, 2 = 40 to 59, 3 = 60 and over.
metadata['age_class'] = 0  # NAs will end up here (comparisons with NaN are False)
metadata.loc[(metadata['age'] < 40), 'age_class'] = 1
metadata.loc[(metadata['age'] >= 40) & (metadata['age'] < 60), 'age_class'] = 2
metadata.loc[(metadata['age'] >= 60), 'age_class'] = 3

meta_summary_df = summarise_pivot_df(metadata, ['age_class'], ['status'], ['uuid'])
meta_summary_df = meta_summary_df[['healthy', 'symptomatic', 'COVID-19', 'n/a']]
n = meta_summary_df.sum().sum()
print(meta_summary_df.head(5))

# The Heatmap object is only used as a validated container for the data,
# colour scale and colour bar passed on to the annotated heatmap.
# The colour-bar title uses the nested title.text / title.side form; the flat
# 'titleside' key is deprecated.
heat_data = ply_go.Heatmap(pandas_to_plotly_heatdata(meta_summary_df),
                           colorscale=ply_colors.sequential.Oranges,
                           colorbar={'title': {'text': "Entries", 'side': "top"}},
                           text=meta_summary_df.values)
rounded_annotation = [["NA" if pd.isnull(c) else "{:.0f}".format(c) for c in r] for r in heat_data['z']]
fig = ply_ff.create_annotated_heatmap(z=heat_data['z'],
                                      x=heat_data['x'],
                                      # the index already holds the class codes 0..3 used by tickvals
                                      y=heat_data['y'],
                                      annotation_text=rounded_annotation,
                                      colorscale=heat_data['colorscale'],
                                      showscale=True,
                                      colorbar=heat_data['colorbar'])
fig.update_layout(yaxis={"title": {"text": "Age"},
                         "tickmode": 'array', "tickvals": [3, 2, 1, 0],
                         "ticktext": ['> 60 yo', '40 - 60 yo', '< 40 yo', 'n/a']})
fig.show()

    age_class       status   uuid
0           0     COVID-19    190
1           0      healthy    747
2           0          n/a  11326
3           0  symptomatic     69
4           1     COVID-19    638
5           1      healthy   6947
6           1  symptomatic   1652
7           2     COVID-19    268
8           2      healthy   3754
9           2  symptomatic    762
10          3     COVID-19     59
11          3      healthy   1031
12          3  symptomatic    107
           healthy  symptomatic  COVID-19      n/a
age_class                                         
0            747.0         69.0     190.0  11326.0
1           6947.0       1652.0     638.0      NaN
2           3754.0        762.0     268.0      NaN
3           1031.0        107.0      59.0      NaN



Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'n/a' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.

