# Preprocessing

In [1]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.impute import KNNImputer

  from pandas.core import (


## For Audio Data

### Align the CSV rows with the .wav file IDs

In [3]:
# Collect the .wav recordings of each split and derive one ID per recording:
# the file name without its directory and without the ".wav" extension.
# NOTE(review): os.listdir returns entries in arbitrary order, and the sorted
# CSVs / label arrays produced later follow that order — confirm that any
# audio features are extracted in the same order.
train_data_folder = "data/training_data"
train_wav_files = [os.path.join(train_data_folder, f) for f in os.listdir(train_data_folder) if f.endswith(".wav")]
# basename/splitext instead of a "/"-anchored regex, so the IDs are also
# correct where os.path.join inserts a different separator (e.g. Windows).
train_file_ids = [os.path.splitext(os.path.basename(file))[0] for file in train_wav_files]

# Use "data/test_data_public" here to process the public test split instead.
test_data_folder = "data/test_data_private"

test_wav_files = [os.path.join(test_data_folder, f) for f in os.listdir(test_data_folder) if f.endswith(".wav")]
test_file_ids = [os.path.splitext(os.path.basename(file))[0] for file in test_wav_files]

In [4]:
# Read the metadata tables of both splits.
train_csv_file = "data/training_datalist.csv"
# Use "data/test_datalist_public.csv" here for the public test split.
test_csv_file = "data/test_datalist_private.csv"
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_csv_file, test_csv_file))


In [6]:
def _reorder_by_id(table, file_ids):
    """Return `table` with its rows selected by 'ID' in the order of `file_ids`."""
    return table.set_index('ID').loc[file_ids].reset_index()


# Put the rows of each metadata table in the same order as the audio file ids
train_csv_sorted = _reorder_by_id(df_train, train_file_ids)
test_csv_sorted = _reorder_by_id(df_test, test_file_ids)

# Persist the reordered tables
# (public split target: "data/test_datalist_public_sorted.csv")
for sorted_table, sorted_path in (
    (train_csv_sorted, "data/training_datalist_sorted.csv"),
    (test_csv_sorted, "data/test_datalist_private_sorted.csv"),
):
    sorted_table.to_csv(sorted_path, index=False)



### Save the labels as .npy

In [7]:
# Export the 'Disease category' column of each reordered table as a NumPy array.
train_label_data = train_csv_sorted['Disease category'].to_numpy()
test_label_data = test_csv_sorted['Disease category'].to_numpy()

# np.save does not create missing directories, so make sure the target exists
# (keeps the notebook runnable on a fresh checkout).
os.makedirs("use_data", exist_ok=True)
np.save("use_data/train_label.npy", train_label_data)
np.save("use_data/test_label2.npy", test_label_data)


### Get the order of disease (only for visualization)

In [9]:
display(train_csv_sorted)

# Order the training rows by disease category (used for visualization only).
train_csv_sorted_disease = train_csv_sorted.sort_values(by="Disease category")

disease_sort = train_csv_sorted_disease['Disease category']
disease_sort_index = train_csv_sorted_disease.index.tolist() # disease from 1 to 5

# np.save does not create missing directories, so make sure the target exists.
os.makedirs("mid_data", exist_ok=True)
# Save the row index in disease order
np.save("mid_data/disease_sort_index.npy", disease_sort_index)
# Save the sorted disease categories
np.save("mid_data/disease_sort.npy", disease_sort)

Unnamed: 0,ID,Sex,Age,Disease category,Narrow pitch range,Decreased volume,Fatigue,Dryness,Lumping,heartburn,...,Onset of dysphonia,Noise at work,Occupational vocal demand,Diabetes,Hypertension,CAD,Head and Neck Cancer,Head injury,CVA,Voice handicap index - 10
0,0702ulb,2,42,1,0,0,1,1,1,0,...,3,2,1,0,0,0,0,0,0,26.0
1,1101dr2,1,59,4,0,1,0,0,0,0,...,2,2,2,0,1,0,0,0,0,20.0
2,1101drs,2,50,2,1,1,1,1,1,1,...,2,2,1,0,0,0,0,0,0,33.0
3,1101drd,2,36,1,1,1,1,1,0,0,...,1,2,1,0,0,0,0,0,0,29.0
4,12028t2,2,36,1,1,0,0,0,0,0,...,1,2,2,0,0,0,0,0,0,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1100nun,1,31,1,1,0,1,1,0,0,...,2,3,1,0,0,0,0,0,0,22.0
996,0W01o99,1,27,4,1,1,1,0,1,0,...,1,2,1,0,0,0,0,0,0,37.0
997,1101qfc,2,59,2,0,0,0,0,1,0,...,3,2,3,0,0,0,0,0,0,13.0
998,1100k2s,2,47,1,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,32.0


## For csv Data

### Missing values

In [10]:
# Missing PPD is filled with 0 for people who never smoked or who have quit;
# everyone else gets the median of the non-zero PPD values in the training data.
ppd_train = train_csv_sorted['PPD']
non_zero_median = ppd_train[ppd_train != 0].median()

def fill_ppd(row):
    """Return the PPD value for one row, filling it in when it is missing.

    An existing (non-missing) PPD is always returned unchanged. A missing PPD
    is replaced according to the row's 'Smoking' code:

    * 0 or 1 -> 0
    * 2 or 3 -> the module-level `non_zero_median`
    * anything else -> left missing

    `row` only needs to support lookup by the keys 'Smoking' and 'PPD'.
    """
    smoking_code = row['Smoking']
    ppd_value = row['PPD']

    # Nothing to fill when a value is already present.
    if not pd.isna(ppd_value):
        return ppd_value

    if smoking_code in [0, 1]:
        return 0
    if smoking_code in [2, 3]:
        return non_zero_median
    # Unrecognised or missing smoking code: keep the missing value.
    return ppd_value

# Run fill_ppd over every training row and write the result back into 'PPD'.
train_ppd_filled = train_csv_sorted.apply(fill_ppd, axis=1)
train_csv_sorted['PPD'] = train_ppd_filled

In [11]:
# Drop the ID and Disease category columns so that only the remaining columns
# are imputed. errors='ignore' keeps this cell re-runnable: the variable is
# rebound to the reduced frame, so on a second run the columns are already gone.
train_csv_sorted = train_csv_sorted.drop(columns=['ID', 'Disease category'], errors='ignore')

# KNN imputer, fitted on the training data
# NOTE(review): the columns are passed in unscaled — confirm that mixing e.g.
# Age with small-valued coded columns in the neighbour distance is intended.
imputer = KNNImputer(n_neighbors=10)
imputed_data = imputer.fit_transform(train_csv_sorted)
train_csv_sorted_imputed = pd.DataFrame(imputed_data, columns=train_csv_sorted.columns)

# Number of remaining missing values per column
print(train_csv_sorted_imputed.isna().sum())


Sex                          0
Age                          0
Narrow pitch range           0
Decreased volume             0
Fatigue                      0
Dryness                      0
Lumping                      0
heartburn                    0
Choking                      0
Eye dryness                  0
PND                          0
Smoking                      0
PPD                          0
Drinking                     0
frequency                    0
Diurnal pattern              0
Onset of dysphonia           0
Noise at work                0
Occupational vocal demand    0
Diabetes                     0
Hypertension                 0
CAD                          0
Head and Neck Cancer         0
Head injury                  0
CVA                          0
Voice handicap index - 10    0
dtype: int64


In [12]:
# Impute the test data with the same PPD rule and the already fitted KNN imputer.
# errors='ignore' keeps this cell re-runnable: the variable is rebound to the
# reduced frame, so on a second run the dropped columns are already gone.
test_csv_sorted = test_csv_sorted.drop(columns=['ID', 'Disease category'], errors='ignore')
test_csv_sorted['PPD'] = test_csv_sorted.apply(fill_ppd, axis=1)
# transform only (no refit)
imputed_data = imputer.transform(test_csv_sorted)
test_csv_sorted_imputed = pd.DataFrame(imputed_data, columns=test_csv_sorted.columns)

In [13]:
# Write both imputed tables to disk (without the index column)
for imputed_table, history_path in (
    (train_csv_sorted_imputed, "use_data/train_history.csv"),
    (test_csv_sorted_imputed, "use_data/test_history2.csv"),
):
    imputed_table.to_csv(history_path, index=False)