In [1]:
import os
from multiprocessing import Pool

import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn import preprocessing

tqdm.pandas()

In [2]:
data_folder = os.path.join('..', '..', 'data', 'mfcc_data')
data_folder

'../../data/mfcc_data'

In [3]:
data_path = os.path.join(data_folder, 'split_samples')
data_path

'../../data/mfcc_data/split_samples'

In [4]:
phonon_dirs = os.listdir(data_path)
phonon_dirs

['aa', 'yy', 'ee', 'uu', 'oo']

In [5]:
dfs = {}
for _, phonon in tqdm(enumerate(phonon_dirs), total=5, desc='loading dataframes to environment'):
    dfp = {}
    dfp['train'] = pd.read_csv(os.path.join(data_path, phonon, 'train.csv'), sep=',', index_col=False)
    dfp['val'] = pd.read_csv(os.path.join(data_path, phonon, 'validation.csv'), sep=',', index_col=False)
    dfp['test'] = pd.read_csv(os.path.join(data_path, phonon, 'test.csv'), sep=',', index_col=False)
    dfs[phonon] = dfp

loading dataframes to environment:   0%|          | 0/5 [00:00<?, ?it/s]

loading dataframes to environment: 100%|██████████| 5/5 [00:07<00:00,  1.56s/it]


In [6]:
dfs['aa']['test'].head()

Unnamed: 0,per_frame_idx,mb_name,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,phonon,asthma_status
0,0,rakeshchandir,60.66887,7.802181,10.811879,7.814297,7.754417,3.729708,2.2199,-0.375733,-1.652478,0.197033,1.516975,2.506025,aa_0,Control
1,1,rakeshchandir,94.439415,22.157387,18.656971,1.743329,7.241928,-6.788054,10.041304,10.452458,7.371104,-1.932848,1.436076,1.496013,aa_0,Control
2,2,rakeshchandir,105.45386,23.223387,15.914175,-8.980723,2.737044,-8.845027,9.470811,11.70737,1.294802,-4.36667,2.691685,1.936454,aa_0,Control
3,3,rakeshchandir,103.578804,20.992363,12.070422,-5.657372,3.078934,-8.174733,13.465215,13.941802,5.428069,-9.584655,1.801956,2.7293,aa_0,Control
4,4,rakeshchandir,102.34328,19.410513,12.802026,-4.302977,2.639948,-5.764059,16.228315,11.062158,5.537199,-2.830167,2.316537,4.929792,aa_0,Control


In [7]:
print(dfs.keys())
print(dfs['aa'].keys())

dict_keys(['aa', 'yy', 'ee', 'uu', 'oo'])
dict_keys(['train', 'val', 'test'])


In [8]:
min_cnts = {}
for key, dfp in dfs.items():
    min_cnts[key] = min([df.groupby('mb_name').count().min()[0] for _, df in dfp.items()])
min_cnts

{'aa': 419, 'yy': 1038, 'ee': 832, 'uu': 548, 'oo': 470}

In [15]:
dfs_sampled = {}
for key, dfp in tqdm(dfs.items(), total=5, desc='subsampling to minimum rows'):
    dfp_sampled = {}

    for type_df, df in dfp.items():
        df_grpd = df.groupby('mb_name')

        # df_sampled = df_grpd.apply(lambda group: group.sample(n=min_cnts[key], random_state=137))
        df_sampled = df_grpd.apply(lambda group: group.sample(n=400, random_state=42))
        df_sampled = df_sampled.reset_index(level=0, drop=True)
        dfp_sampled[type_df] = df_sampled
        
    dfs_sampled[key] = dfp_sampled

subsampling to minimum rows: 100%|██████████| 5/5 [00:01<00:00,  4.55it/s]


In [16]:
print('reduction in total number of rows per dataframe after subsampling: -')
for key, _ in dfs.items():
    train_d = (dfs[key]['train'].shape[0] - dfs_sampled[key]['train'].shape[0])
    val_d = (dfs[key]['val'].shape[0] - dfs_sampled[key]['val'].shape[0])
    test_d = (dfs[key]['test'].shape[0] - dfs_sampled[key]['test'].shape[0])

    print(f'{key}: train = {train_d}, val = {val_d}, test = {test_d}')

reduction in total number of rows per dataframe after subsampling: -
aa: train = 483884, val = 156741, test = 67601
yy: train = 467286, val = 157601, test = 56831
ee: train = 483922, val = 157455, test = 73737
uu: train = 458085, val = 157256, test = 61723
oo: train = 490982, val = 153991, test = 53980


In [17]:
def conv_to_32(df, cols): df[cols] = df[cols].astype(np.float32)

for key, dfp in tqdm(dfs_sampled.items(), total=5, desc='dropping columns'):
    for type_df, df in dfp.items():
        df.drop(['per_frame_idx', 'phonon'], axis=1, inplace=True)

        # convert float64 to float32
        conv_to_32(df, ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12'])

        # convert catagory column to encoded
        df['asthma_status'] = df['asthma_status'].astype('category')
        df['asthma_status'] = df['asthma_status'].cat.codes

dropping columns: 100%|██████████| 5/5 [00:00<00:00, 45.60it/s]


In [14]:
dfs_sampled['aa']['train'].head()

Unnamed: 0,mb_name,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,asthma_status
208968,aakash,111.881432,-6.761704,18.432259,-25.987196,-10.089402,-2.987719,5.414159,-0.438673,8.517943,-15.682688,-5.469613,4.987144,0
207590,aakash,106.462669,-17.036549,30.186279,-20.651434,-4.815188,-2.23413,7.631989,5.356962,1.254348,-13.375121,2.206365,6.864884,0
208826,aakash,117.627678,-5.613412,11.549555,-19.264484,-9.825359,-7.515401,6.382646,-2.680237,16.094612,-17.091774,-7.61035,6.943421,0
208188,aakash,114.597267,-17.889177,25.708261,-20.124599,-5.454746,-1.452647,6.379558,4.898445,3.003988,-14.524295,0.585186,-0.422372,0
208215,aakash,114.808334,-19.979095,25.328091,-19.652727,-3.730235,-2.999141,3.203736,5.934777,2.952255,-14.88531,0.763332,-0.670887,0


In [13]:
# scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)

In [18]:
def transformations(df):
    X = df.loc[:, df.columns != 'asthma_status'].to_numpy()
    Y = df.loc[:, 'asthma_status'].to_numpy()
    # X_scaled = scaler.fit_transform(X)

    df_x = pd.DataFrame(X, columns=['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12'])
    df_y = pd.DataFrame(Y, columns=['asthma_status'], dtype='int')
    result_df = pd.concat([df_x, df_y], axis=1)

    return result_df

In [19]:
save_folder = os.path.join(data_folder, 'grpd_samples')
save_folder

'../../data/mfcc_data/grpd_samples'

In [20]:
for key, dfp in tqdm(dfs_sampled.items(), total=5, desc='saving dfs'):
    phonon_folder = os.path.join(save_folder, key)
    for type_df, df in dfp.items():
        names = df['mb_name'].unique()
        type_df_folder = os.path.join(phonon_folder, type_df)
        for name in names:
            person_df = df.loc[df['mb_name'] == name, df.columns != 'mb_name']
            person_df = transformations(person_df)
            person_df.to_csv(os.path.join(type_df_folder, f'{name}.csv'), index=False)


saving dfs: 100%|██████████| 5/5 [00:06<00:00,  1.28s/it]


In [21]:
for key, type_dict in dfs_sampled.items():
    print(f'{key}: -')
    for type_df, part in type_dict.items():
        print(f'{type_df}: {part.shape}', end='\t')
    print()

aa: -
train: (46800, 14)	val: (13600, 14)	test: (6800, 14)	
yy: -
train: (46000, 14)	val: (13200, 14)	test: (5600, 14)	
ee: -
train: (46000, 14)	val: (13200, 14)	test: (6800, 14)	
uu: -
train: (43600, 14)	val: (13200, 14)	test: (6800, 14)	
oo: -
train: (46400, 14)	val: (13200, 14)	test: (6400, 14)	
