In [124]:
import os
import numpy as np
print("numpy version: ", np.__version__)
import matplotlib.pyplot as plt
import librosa
import audiofile as af
import json
import soundfile as sf
import pandas as pd

numpy version:  1.26.4


In [125]:
# go into each json file in the directory and extract the data into a df and create rows with each label at event level with id the id of the subject

# get the list of json files
def read_json_files(directory):
    json_files = [f for f in os.listdir(directory) if f.endswith('.json')]

    list_of_dicts = []
    

    # Loop through each JSON file and load its content into the dictionary
    for json_file in json_files:
        file_path = os.path.join(directory, json_file)
        with open(file_path, 'r', encoding='utf-8') as f:
            data_dict = {}
            data = json.load(f)
            data_dict[json_file] = data
            list_of_dicts.append(data_dict)

    return json_files,data_dict,list_of_dicts

def gender_transform(gender):
    if gender=='0':
        gender = 'Male'
    else:
        gender = 'Female'
    return gender

def location_transform(location):
    if location =='p1':
        return 'left posterior'
    elif location =='p2':
        return 'left lateral'
    elif location =='p3':
        return 'right posterior'
    else:
        return 'right lateral'
    
def record_level_df_create(list):

    # List to collect rows for DataFrame
    rows = []

    # Access and print the 'record_annotation' for each dictionary in the list
    for dictionary in list:
        for file_name, data in dictionary.items():
            parts = file_name.split('_')
            # Add a row with the filename and record_annotation
            rows.append({
                # the patient number is the file name until the first _
                'recording_number':parts[4].split('.')[0],
                'patient_number': parts[0],
                'age':float(parts[1]),
                'gender':gender_transform(parts[2]),
                'recording_location':location_transform(parts[3]),
                'record_annotation': data.get('record_annotation', 'N/A')
            })

    record_level_df = pd.DataFrame(rows)
    record_level_df.set_index('recording_number', inplace=True)
    return record_level_df


def event_level_df_create(list):

    # List to collect rows for DataFrame
    rows = []
    # Access and print the 'record_annotation' for each dictionary in the list
    for dictionary in list:
        for file_name, data in dictionary.items():
            parts = file_name.split('_')
            # Add a row with the filename and record_annotation
            if not data.get('event_annotation', []):
                rows.append({
                    # the patient number is the file name until the first _
                    'recording_number':parts[4].split('.')[0],
                    'patient_number': parts[0],
                    'age':float(parts[1]),
                    'gender':gender_transform(parts[2]),
                    'recording_location':location_transform(parts[3]),
                    'start': 'N/A',
                    'end': 'N/A',
                    'event_annotation': data.get('event_annotation', 'N/A')
                })
            else:
                for event in data.get('event_annotation', []):
                    rows.append({
                        # the patient number is the file name until the first _
                        'recording_number':parts[4].split('.')[0],
                        'patient_number': parts[0],
                        'age':float(parts[1]),
                        'gender':gender_transform(parts[2]),
                        'recording_location':location_transform(parts[3]),
                        'start': event.get('start', 'N/A'),
                        'end': event.get('end', 'N/A'),
                        'event_annotation': event.get('type', 'N/A'),
                    })

    event_level_df = pd.DataFrame(rows)
    event_level_df.set_index('recording_number', inplace=True)

    return event_level_df

In [126]:
# Specify your directory here
directory_path = 'train_classification_json'
_,_,train_json_list = read_json_files(directory_path)

In [127]:
train_record_level_df = record_level_df_create(train_json_list)
# count the number of males and females
print(train_record_level_df['record_annotation'].value_counts())

train_record_level_df

record_annotation
Normal          1303
DAS              248
Poor Quality     177
CAS              126
CAS & DAS         95
Name: count, dtype: int64


Unnamed: 0_level_0,patient_number,age,gender,recording_location,record_annotation
recording_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1340,42272778,7.5,Male,left posterior,Normal
2306,41166273,3.9,Male,right posterior,Normal
2178,41275381,2.6,Male,right lateral,Normal
3326,65091224,2.9,Male,right lateral,Normal
2410,65091359,10.4,Male,left lateral,Normal
...,...,...,...,...,...
2754,41267024,0.3,Male,right lateral,Normal
245,41260684,3.8,Female,left posterior,Normal
2022,41074892,10.0,Female,left lateral,Normal
3115,41161556,1.7,Male,left posterior,DAS


In [128]:
train_event_level_df = event_level_df_create(train_json_list)
print(train_event_level_df['event_annotation'].value_counts())
train_event_level_df

event_annotation
Normal            5159
Fine Crackle       912
Wheeze             452
[]                 177
Coarse Crackle      49
Rhonchi             39
Wheeze+Crackle      30
Stridor             15
Name: count, dtype: int64


Unnamed: 0_level_0,patient_number,age,gender,recording_location,start,end,event_annotation
recording_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1340,42272778,7.5,Male,left posterior,6422,7457,Normal
1340,42272778,7.5,Male,left posterior,8436,9169,Normal
1340,42272778,7.5,Male,left posterior,4409,5786,Normal
1340,42272778,7.5,Male,left posterior,79,4164,Normal
2306,41166273,3.9,Male,right posterior,796,2069,Normal
...,...,...,...,...,...,...,...
3115,41161556,1.7,Male,left posterior,6672,7176,Fine Crackle
3115,41161556,1.7,Male,left posterior,7991,8615,Fine Crackle
97,41112999,3.2,Male,left posterior,3776,5043,Normal
97,41112999,3.2,Male,left posterior,6456,7488,Normal


In [129]:
# Specify your directory here
directory_path = 'test_classification_json/2022/inter_test_json'
_,_,inter_test_json_list = read_json_files(directory_path)
inter_test_record_level_df = record_level_df_create(inter_test_json_list)
print(inter_test_record_level_df['record_annotation'].value_counts())
inter_test_record_level_df

record_annotation
Normal          241
CAS              65
DAS              24
CAS & DAS        17
Poor Quality      8
Name: count, dtype: int64


Unnamed: 0_level_0,patient_number,age,gender,recording_location,record_annotation
recording_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4147,41209060,4.4,Female,right posterior,Normal
3576,65109516,0.2,Male,right posterior,DAS
3579,64714172,3.6,Female,right lateral,Normal
4166,65118898,0.7,Male,right lateral,CAS
3581,64714172,3.6,Female,left lateral,Normal
...,...,...,...,...,...
4110,65121853,1.5,Male,right lateral,DAS
1146,40888395,3.4,Male,left posterior,Normal
3624,65071184,4.8,Male,left lateral,Normal
3664,65109516,0.2,Male,right lateral,Normal


In [130]:
inter_test_event_level_df = event_level_df_create(inter_test_json_list)
print(inter_test_event_level_df['event_annotation'].value_counts())
inter_test_event_level_df

event_annotation
Normal            1040
Wheeze             305
Fine Crackle        80
[]                   8
Coarse Crackle       3
Wheeze+Crackle       1
Name: count, dtype: int64


Unnamed: 0_level_0,patient_number,age,gender,recording_location,start,end,event_annotation
recording_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4147,41209060,4.4,Female,right posterior,81,2532,Normal
4147,41209060,4.4,Female,right posterior,4773,6877,Normal
4147,41209060,4.4,Female,right posterior,6934,9383,Normal
4147,41209060,4.4,Female,right posterior,9426,11178,Normal
4147,41209060,4.4,Female,right posterior,11254,13551,Normal
...,...,...,...,...,...,...,...
1146,40888395,3.4,Male,left posterior,8025,8803,Normal
3624,65071184,4.8,Male,left lateral,2637,4916,Normal
3664,65109516,0.2,Male,right lateral,2059,3319,Normal
4207,41225759,7.2,Female,left posterior,13677,15186,Normal


In [131]:
# Specify your directory here
directory_path = 'test_classification_json/2022/intra_test_json'
_,_,intra_test_json_list = read_json_files(directory_path)
intra_test_record_level_df = record_level_df_create(intra_test_json_list)
print(intra_test_record_level_df['record_annotation'].value_counts())
intra_test_record_level_df

record_annotation
Normal          241
DAS              75
CAS              42
CAS & DAS        19
Poor Quality      2
Name: count, dtype: int64


Unnamed: 0_level_0,patient_number,age,gender,recording_location,record_annotation
recording_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2793,65045423,5.2,Female,right lateral,DAS
2915,41281333,0.2,Female,left lateral,CAS & DAS
1280,63613821,9.2,Female,left lateral,DAS
2927,41279835,14.1,Male,left lateral,DAS
190,41159145,3.3,Male,left lateral,Normal
...,...,...,...,...,...
260,65044049,0.9,Female,right lateral,DAS
2624,64743918,7.0,Male,right posterior,CAS
1095,65064302,3.2,Female,right posterior,Normal
1268,65066754,9.0,Male,right posterior,Normal


In [132]:
intra_test_event_level_df = event_level_df_create(intra_test_json_list)
print(intra_test_event_level_df['event_annotation'].value_counts())
intra_test_event_level_df

event_annotation
Normal            688
Fine Crackle      175
Wheeze            108
Coarse Crackle     14
Rhonchi            14
Wheeze+Crackle      3
Stridor             2
[]                  2
Name: count, dtype: int64


Unnamed: 0_level_0,patient_number,age,gender,recording_location,start,end,event_annotation
recording_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2793,65045423,5.2,Female,right lateral,5547,6020,Coarse Crackle
2793,65045423,5.2,Female,right lateral,6842,7464,Coarse Crackle
2793,65045423,5.2,Female,right lateral,3578,4531,Coarse Crackle
2915,41281333,0.2,Female,left lateral,7269,7686,Wheeze
2915,41281333,0.2,Female,left lateral,7755,8400,Fine Crackle
...,...,...,...,...,...,...,...
898,40801342,4.0,Female,left lateral,3463,4239,Fine Crackle
898,40801342,4.0,Female,left lateral,6101,6957,Fine Crackle
898,40801342,4.0,Female,left lateral,2108,2830,Fine Crackle
898,40801342,4.0,Female,left lateral,4918,5575,Fine Crackle


In [133]:
def get_appropriate_n_fft(signal_length):
            return 2**int(np.floor(np.log2(signal_length)))

def extract_mfccs(wav_files, n_mfcc=13):
    mfccs_dict = {}
    
    for wav_file in wav_files:
        # Load the audio file
        y, sr = librosa.load(wav_file, sr=4000)
        

        n_fft = get_appropriate_n_fft(len(y))
        # Compute MFCCs
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc,n_fft=n_fft)
        
        # Store the MFCCs in the dictionary
        mfccs_dict[wav_file] = mfccs
    
    return mfccs_dict

train_wav_list = [os.path.join('splitted_training', f) for f in os.listdir('splitted_training') if f.endswith('.wav')]
test_wav_list = [os.path.join('test_classification_wav', f) for f in os.listdir('test_classification_wav') if f.endswith('.wav')]

train_mfccs_dict = extract_mfccs(train_wav_list)

In [134]:
test_wav_list = [os.path.join('splitted_inter_testing', f) for f in os.listdir('splitted_inter_testing') if f.endswith('.wav')]
test_mfccs_dict = extract_mfccs(test_wav_list)

In [135]:
print(len(train_mfccs_dict))
len(test_mfccs_dict)

6656


1429

In [136]:
# calculate the mean of the mfccs and the stddev for each wav file and save it to a numpy array
def get_mfccs_stats(mfccs_dict):
    mfccs_stats = []
    for wav_file, mfccs in mfccs_dict.items():
        mfccs_stats.append({
            'wav_file': wav_file,
            'mfccs_mean': mfccs.mean(axis=1),
            'mfccs_stddev': mfccs.std(axis=1)
        })
    return mfccs_stats

train_mfccs_stats = get_mfccs_stats(train_mfccs_dict)
test_mfccs_stats = get_mfccs_stats(test_mfccs_dict)
# convert the list of dictionaries to a DataFrame
train_mfccs_stats_df = pd.DataFrame(train_mfccs_stats)
train_mfccs_stats_df.set_index('wav_file', inplace=True)
# create separate columns for each MFCC
train_mfccs_stats_df = pd.concat([train_mfccs_stats_df.drop(['mfccs_mean', 'mfccs_stddev'], axis=1),
                                  train_mfccs_stats_df['mfccs_mean'].apply(pd.Series).add_prefix('mfccs_mean_'),
                                  train_mfccs_stats_df['mfccs_stddev'].apply(pd.Series).add_prefix('mfccs_stddev_')],
                                 axis=1)

train_mfccs_stats_df
test_mfccs_stats_df = pd.DataFrame(test_mfccs_stats)
test_mfccs_stats = get_mfccs_stats(test_mfccs_dict)
test_mfccs_stats_df.set_index('wav_file', inplace=True)
# create separate columns for each MFCC
test_mfccs_stats_df = pd.concat([test_mfccs_stats_df.drop(['mfccs_mean', 'mfccs_stddev'], axis=1),
                                  test_mfccs_stats_df['mfccs_mean'].apply(pd.Series).add_prefix('mfccs_mean_'),
                                  test_mfccs_stats_df['mfccs_stddev'].apply(pd.Series).add_prefix('mfccs_stddev_')],
                                 axis=1)

test_mfccs_stats_df

# add a column for the label
train_mfccs_stats_df['label'] = train_mfccs_stats_df.index.str.split('_').str[-1].str.split('.').str[0]
train_mfccs_stats_df

# # convert the first column of the dataframes to have only the label,which is the last word of the file name
# train_mfccs_stats_df.index = train_mfccs_stats_df.index.str.split('_').str[-1].str.split('.').str[0]
# # rename the column to label from wav_file
# train_mfccs_stats_df.index.name = 'label'
# test_mfccs_stats_df.index = test_mfccs_stats_df.index.str.split('_').str[-1].str.split('.').str[0]
# test_mfccs_stats_df.index.name = 'label'
# train_mfccs_stats_df


Unnamed: 0_level_0,mfccs_mean_0,mfccs_mean_1,mfccs_mean_2,mfccs_mean_3,mfccs_mean_4,mfccs_mean_5,mfccs_mean_6,mfccs_mean_7,mfccs_mean_8,mfccs_mean_9,...,mfccs_stddev_4,mfccs_stddev_5,mfccs_stddev_6,mfccs_stddev_7,mfccs_stddev_8,mfccs_stddev_9,mfccs_stddev_10,mfccs_stddev_11,mfccs_stddev_12,label
wav_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
splitted_training/64618861_9.0_0_p3_2621_seg3_Normal.wav,-457.997467,213.167435,83.412956,4.431201,-17.138275,-10.074658,-5.471889,-8.064613,-8.776545,-5.561886,...,4.133246,4.546383,4.641439,3.843536,1.073516,2.342763,1.202637,0.931060,1.455228,Normal
splitted_training/65099422_0.5_0_p1_2541_seg6_Normal.wav,-509.067810,218.815643,64.833534,-22.786715,-32.501263,-8.034741,2.702834,0.605102,-4.676734,-7.568634,...,10.340343,2.771592,2.691870,3.412967,1.718779,2.099576,3.412116,2.640931,1.870206,Normal
splitted_training/42272778_7.5_0_p4_1292_seg3_Normal.wav,-569.072754,187.846756,93.511215,24.347721,-11.183994,-12.672734,-6.614258,-6.549531,-8.923048,-10.004530,...,8.457924,1.929552,1.805965,1.802993,2.053389,3.191765,1.212181,2.833020,0.990349,Normal
splitted_training/65045423_5.2_1_p2_2769_seg2_Normal.wav,-504.975922,193.575638,72.182053,0.667532,-16.468887,-7.206603,-3.106101,-4.665320,-7.777528,-5.703880,...,12.325104,3.830586,1.989550,2.015229,3.431088,1.090628,1.710448,1.488339,1.419768,Normal
splitted_training/65097128_5.6_1_p1_2539_seg7_Normal.wav,-600.645325,208.621384,81.831795,-5.644009,-37.072651,-26.529625,-10.099458,-3.277571,-7.870863,-10.431726,...,16.543606,5.450306,3.271698,1.463916,4.148243,4.146766,2.257790,1.318144,0.934701,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
splitted_training/64552260_3.3_1_p4_2216_seg1_Normal.wav,-558.230774,205.445663,89.144798,16.067612,-11.971696,-13.987014,-11.575561,-7.590700,-4.901174,-2.978667,...,4.678942,5.837123,3.597932,2.761834,1.128865,1.561415,0.865321,1.282879,1.781661,Normal
splitted_training/65091224_2.9_0_p4_3330_seg5_Normal.wav,-601.828674,226.140213,72.007957,-15.530037,-17.882122,2.902568,-0.390579,-8.634903,-13.330599,-12.999675,...,6.709040,6.691598,5.363192,4.420913,3.157967,2.951305,3.726453,5.277624,3.785033,Normal
splitted_training/64960343_1.3_0_p3_579_seg5_Wheeze+Crackle.wav,-491.445648,263.389893,71.919334,-38.951969,-43.607185,-14.222660,-6.761450,-13.972475,-17.128479,-16.527920,...,7.541380,5.750318,4.645059,5.209085,3.924101,3.979548,4.029910,4.035165,3.898819,Wheeze+Crackle
splitted_training/64727249_7.1_1_p1_105_seg1_Normal.wav,-606.948303,211.462402,95.737877,11.062070,-22.475885,-15.475955,-3.013548,-0.904237,-9.925117,-12.348415,...,6.829253,2.840652,2.242581,1.004902,2.669898,1.959590,0.749147,1.559106,1.261001,Normal


In [137]:
test_mfccs_stats_df['label'] = test_mfccs_stats_df.index.str.split('_').str[-1].str.split('.').str[0]
test_mfccs_stats_df



Unnamed: 0_level_0,mfccs_mean_0,mfccs_mean_1,mfccs_mean_2,mfccs_mean_3,mfccs_mean_4,mfccs_mean_5,mfccs_mean_6,mfccs_mean_7,mfccs_mean_8,mfccs_mean_9,...,mfccs_stddev_4,mfccs_stddev_5,mfccs_stddev_6,mfccs_stddev_7,mfccs_stddev_8,mfccs_stddev_9,mfccs_stddev_10,mfccs_stddev_11,mfccs_stddev_12,label
wav_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
splitted_inter_testing/64585803_5.8_0_p3_3692_seg2_Normal.wav,-479.703064,214.473053,79.809425,-9.522080,-26.612986,-7.594539,2.778215,0.326426,-7.304316,-9.166107,...,4.258336,3.137140,3.011619,5.755418,4.240484,2.354336,1.523598,0.486989,2.309266,Normal
splitted_inter_testing/41223618_1.0_0_p4_3615_seg1_Normal.wav,-629.483643,203.876846,76.701248,-3.423374,-20.106846,-6.967263,-0.246494,-0.299674,-2.320928,-4.762024,...,12.649752,2.452117,2.839221,3.615216,4.453467,3.956262,0.568500,3.951740,4.747894,Normal
splitted_inter_testing/40890405_3.3_0_p3_3660_seg1_Normal.wav,-567.134521,189.524445,81.514267,8.856335,-11.193975,-0.940734,2.796327,-0.087908,-5.565511,-9.081986,...,6.110800,4.903756,2.372826,1.761667,2.315545,1.788912,1.720787,2.127307,1.487266,Normal
splitted_inter_testing/65049153_0.8_0_p3_3761_seg5_Fine Crackle.wav,-608.288025,237.732315,60.262318,-32.854992,-37.371658,-13.668091,-5.501730,-6.386058,-4.680010,-0.001670,...,0.577457,1.098034,2.582586,1.884762,0.250907,2.126797,3.308431,1.366743,2.618812,Fine Crackle
splitted_inter_testing/65049153_0.8_0_p2_3770_seg3_Normal.wav,-559.451416,235.866867,51.833706,-22.251730,-20.357319,-2.725971,2.537154,0.125572,-6.389750,-6.977366,...,6.990504,2.459715,2.323293,1.681737,1.034589,3.713703,3.786094,1.079826,0.461779,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
splitted_inter_testing/41225759_7.2_1_p4_4220_seg3_Normal.wav,-544.470581,208.821808,83.589783,0.668379,-22.559740,-11.337210,-3.172846,-3.173271,-7.155065,-7.826337,...,13.492688,2.045062,1.970033,2.839780,1.636077,1.062987,1.516556,1.199157,2.670312,Normal
splitted_inter_testing/63902360_5.5_0_p3_3512_seg1_Normal.wav,-496.620392,197.236343,81.168655,18.772299,3.244847,-2.021282,-9.768476,-9.914264,-9.755523,-10.166524,...,2.706494,3.822857,2.242374,4.147999,3.981291,1.962782,2.026108,4.284266,0.521609,Normal
splitted_inter_testing/65118898_0.7_0_p2_4161_seg7_Wheeze.wav,-505.970886,218.057831,25.551840,-56.475983,-36.869324,-4.881452,-6.779813,-18.915586,-21.945103,-13.397514,...,4.542900,7.241876,8.358851,9.152205,4.297548,1.396133,1.591756,1.584130,3.029173,Wheeze
splitted_inter_testing/40890405_3.3_0_p4_3679_seg4_Normal.wav,-529.056091,230.923248,79.339859,-3.684891,-30.200176,-28.423559,-21.474655,-11.595930,-10.944349,-9.830450,...,6.215632,10.507877,8.653880,3.056833,3.449218,3.587137,2.381405,1.879206,2.944812,Normal


In [138]:
# find the the instances of a specific name from the wav file column
def find_instances(df, name):
    return df[df.index.str.contains(name)]

find_instances(test_mfccs_stats_df, 'splitted_inter_testing/64585803_5.8_0_p3_3692_seg1_Normal.wav')

Unnamed: 0_level_0,mfccs_mean_0,mfccs_mean_1,mfccs_mean_2,mfccs_mean_3,mfccs_mean_4,mfccs_mean_5,mfccs_mean_6,mfccs_mean_7,mfccs_mean_8,mfccs_mean_9,...,mfccs_stddev_4,mfccs_stddev_5,mfccs_stddev_6,mfccs_stddev_7,mfccs_stddev_8,mfccs_stddev_9,mfccs_stddev_10,mfccs_stddev_11,mfccs_stddev_12,label
wav_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
splitted_inter_testing/64585803_5.8_0_p3_3692_seg1_Normal.wav,-446.588715,220.567825,74.083641,-10.361392,-19.758821,-1.511281,2.670626,-3.97762,-8.053535,-9.877378,...,2.928602,3.714105,3.49788,5.357415,4.427191,2.592401,1.269221,1.478379,1.988212,Normal


In [139]:
#  Can you make a proccess that checks the dataframe and in the name of the file it has the seg section, i want a column named group that for all files with the same recording number, the group is the same
def add_group_column(df):
    df['group'] = df.index.str.split('_').str[-3]
    return df

train_mfccs_stats_df = add_group_column(train_mfccs_stats_df)
test_mfccs_stats_df = add_group_column(test_mfccs_stats_df)

train_mfccs_stats_df

Unnamed: 0_level_0,mfccs_mean_0,mfccs_mean_1,mfccs_mean_2,mfccs_mean_3,mfccs_mean_4,mfccs_mean_5,mfccs_mean_6,mfccs_mean_7,mfccs_mean_8,mfccs_mean_9,...,mfccs_stddev_5,mfccs_stddev_6,mfccs_stddev_7,mfccs_stddev_8,mfccs_stddev_9,mfccs_stddev_10,mfccs_stddev_11,mfccs_stddev_12,label,group
wav_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
splitted_training/64618861_9.0_0_p3_2621_seg3_Normal.wav,-457.997467,213.167435,83.412956,4.431201,-17.138275,-10.074658,-5.471889,-8.064613,-8.776545,-5.561886,...,4.546383,4.641439,3.843536,1.073516,2.342763,1.202637,0.931060,1.455228,Normal,2621
splitted_training/65099422_0.5_0_p1_2541_seg6_Normal.wav,-509.067810,218.815643,64.833534,-22.786715,-32.501263,-8.034741,2.702834,0.605102,-4.676734,-7.568634,...,2.771592,2.691870,3.412967,1.718779,2.099576,3.412116,2.640931,1.870206,Normal,2541
splitted_training/42272778_7.5_0_p4_1292_seg3_Normal.wav,-569.072754,187.846756,93.511215,24.347721,-11.183994,-12.672734,-6.614258,-6.549531,-8.923048,-10.004530,...,1.929552,1.805965,1.802993,2.053389,3.191765,1.212181,2.833020,0.990349,Normal,1292
splitted_training/65045423_5.2_1_p2_2769_seg2_Normal.wav,-504.975922,193.575638,72.182053,0.667532,-16.468887,-7.206603,-3.106101,-4.665320,-7.777528,-5.703880,...,3.830586,1.989550,2.015229,3.431088,1.090628,1.710448,1.488339,1.419768,Normal,2769
splitted_training/65097128_5.6_1_p1_2539_seg7_Normal.wav,-600.645325,208.621384,81.831795,-5.644009,-37.072651,-26.529625,-10.099458,-3.277571,-7.870863,-10.431726,...,5.450306,3.271698,1.463916,4.148243,4.146766,2.257790,1.318144,0.934701,Normal,2539
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
splitted_training/64552260_3.3_1_p4_2216_seg1_Normal.wav,-558.230774,205.445663,89.144798,16.067612,-11.971696,-13.987014,-11.575561,-7.590700,-4.901174,-2.978667,...,5.837123,3.597932,2.761834,1.128865,1.561415,0.865321,1.282879,1.781661,Normal,2216
splitted_training/65091224_2.9_0_p4_3330_seg5_Normal.wav,-601.828674,226.140213,72.007957,-15.530037,-17.882122,2.902568,-0.390579,-8.634903,-13.330599,-12.999675,...,6.691598,5.363192,4.420913,3.157967,2.951305,3.726453,5.277624,3.785033,Normal,3330
splitted_training/64960343_1.3_0_p3_579_seg5_Wheeze+Crackle.wav,-491.445648,263.389893,71.919334,-38.951969,-43.607185,-14.222660,-6.761450,-13.972475,-17.128479,-16.527920,...,5.750318,4.645059,5.209085,3.924101,3.979548,4.029910,4.035165,3.898819,Wheeze+Crackle,579
splitted_training/64727249_7.1_1_p1_105_seg1_Normal.wav,-606.948303,211.462402,95.737877,11.062070,-22.475885,-15.475955,-3.013548,-0.904237,-9.925117,-12.348415,...,2.840652,2.242581,1.004902,2.669898,1.959590,0.749147,1.559106,1.261001,Normal,105


In [146]:
# count the number of instances of each label
print(train_mfccs_stats_df['label'].value_counts())
# drop from training the Rhonchi and Stridor instances
train_mfccs_stats_df = train_mfccs_stats_df.drop(train_mfccs_stats_df[train_mfccs_stats_df['label']=='Rhonchi'].index)
train_mfccs_stats_df = train_mfccs_stats_df.drop(train_mfccs_stats_df[train_mfccs_stats_df['label']=='Stridor'].index)
print(train_mfccs_stats_df['label'].value_counts())

label
Normal            5159
Fine Crackle       912
Wheeze             452
Coarse Crackle      49
Rhonchi             39
Wheeze+Crackle      30
Stridor             15
Name: count, dtype: int64
label
Normal            5159
Fine Crackle       912
Wheeze             452
Coarse Crackle      49
Wheeze+Crackle      30
Name: count, dtype: int64


In [145]:
print(test_mfccs_stats_df['label'].value_counts())


label
Normal            1040
Wheeze             305
Fine Crackle        80
Coarse Crackle       3
Wheeze+Crackle       1
Name: count, dtype: int64


In [141]:
# convert the label column from string to one hot encoding
# train_mfccs_stats_df = pd.get_dummies(train_mfccs_stats_df, columns=['label'], prefix='label')
# train_mfccs_stats_df

In [142]:
# test_mfccs_stats_df = pd.get_dummies(test_mfccs_stats_df, columns=['label'], prefix='label')
# test_mfccs_stats_df

In [143]:
# find all rows with group column equal to something
# def find_group(df, group):
#     return df[df['group'] == group]

# find_group(test_mfccs_stats_df, '3692')

In [147]:
# save both dataframes to csv files
train_mfccs_stats_df.to_csv('train_mfccs_stats.csv')
test_mfccs_stats_df.to_csv('test_mfccs_stats.csv')
