In [1]:
import os

import pandas as pd

## Train and Test Data (Unlabelled)

In [3]:
def read_data(folder_path: str, col_names: list):
    """
    Function to read and merge different csv's into a single dataframe
    """
    df_lst = []
    for file in os.listdir(folder_path):
        file_name_split = file.split('.')
        file_type = file_name_split[1]
        file_name = file_name_split[0]
        if file_type == 'csv':
            id = int(file_name.split('_')[1])
            temp_df = pd.read_csv(folder_path + '/' + file, names=col_names)
            temp_df['Participant_ID'] = id
            df_lst.append(temp_df)
    return pd.concat(df_lst, axis=0, ignore_index=True)

In [4]:
# Paths of folders containing train and test data
TRAIN_FOLDER_PATH = "features_train"
TEST_FOLDER_PATH = "features_test"

In [6]:
# Names of features based on GeMAPS feature set
meta_data = pd.read_csv('feature_description.csv', encoding='ISO-8859-1', header=None)
col_names = list(meta_data[0])

In [7]:
train_data_unlabelled = read_data(TRAIN_FOLDER_PATH, col_names)
train_data_unlabelled.head()

Unnamed: 0,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope,...,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp,Participant_ID
0,33.88331,0.18847,31.197649,31.198807,31.237043,0.039394,39.990345,28.040092,100.79878,0.0,...,0.018833,0.003996,1.960784,2.061856,0.24,0.01,0.225,0.005,-53.52938,448
1,23.442284,0.015231,23.083265,23.519197,23.79966,0.716394,0.0,0.0,-0.109734,0.0,...,0.009756,0.031714,3.703704,2.040816,0.08,0.0,0.38,0.0,-47.32697,448
2,26.658195,0.095594,24.27549,27.404346,29.017082,4.741592,61.82953,67.67356,20.46129,11.70544,...,0.010351,0.022239,3.131991,2.036199,0.344444,0.235472,0.115,0.087321,-41.121784,448
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.015024,0.010878,3.333334,0.0,0.0,0.0,0.24,0.0,-56.265,448
4,34.614662,0.00598,34.433628,34.559757,34.76416,0.330532,8.279264,5.828207,8.20837,6.596444,...,-0.025174,0.031772,5.084746,1.886793,0.42,0.0,0.04,0.02,-33.531155,448


In [8]:
test_data_unlabelled = read_data(TEST_FOLDER_PATH, col_names)
test_data_unlabelled.head()

Unnamed: 0,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope,...,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp,Participant_ID
0,31.240383,0.007705,31.195211,31.19749,31.198797,0.003586,2.502873,2.195133,6.924655,0.0,...,0.014447,0.006939,2.352941,2.531646,0.27,0.01,0.2,0.0,-43.03735,474
1,25.895664,0.022401,25.307716,26.016266,26.448654,1.140938,0.185591,0.0,0.0,0.0,...,0.009397,0.038542,5.0,1.818182,0.08,0.0,0.22,0.13,-40.5895,474
2,25.333618,0.093532,23.29451,24.970371,27.738895,4.444386,62.35421,62.804974,11.951715,16.090214,...,0.015488,0.024067,3.004292,2.631579,0.196667,0.169181,0.16,0.110303,-36.45428,474
3,22.73154,0.048258,22.157387,22.6437,23.537909,1.380522,43.013565,45.249077,12.193955,4.822075,...,0.008454,0.042888,4.024768,2.65625,0.175882,0.155263,0.180588,0.118692,-39.491077,474
4,22.226995,0.036559,21.806602,22.638226,22.795425,0.988823,6.11248,2.537038,10.932255,0.0,...,-0.01383,0.068657,7.142857,3.174603,0.15,0.1,0.135,0.015,-40.291077,474


## Depression and Gender labels for each participant

In [9]:
labels = pd.read_csv('labels.csv', skipfooter=1, engine='python')
labels['Participant_ID'] = labels['Participant_ID'].astype(int)
labels.head()

Unnamed: 0,Participant_ID,Depression,Gender
0,303,0,0
1,304,0,0
2,305,0,1
3,310,0,1
4,312,0,1


In [10]:
train_data = pd.merge(train_data_unlabelled, labels, on="Participant_ID")
train_data.head()

Unnamed: 0,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope,...,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp,Participant_ID,Depression,Gender
0,33.88331,0.18847,31.197649,31.198807,31.237043,0.039394,39.990345,28.040092,100.79878,0.0,...,1.960784,2.061856,0.24,0.01,0.225,0.005,-53.52938,448,1,1
1,23.442284,0.015231,23.083265,23.519197,23.79966,0.716394,0.0,0.0,-0.109734,0.0,...,3.703704,2.040816,0.08,0.0,0.38,0.0,-47.32697,448,1,1
2,26.658195,0.095594,24.27549,27.404346,29.017082,4.741592,61.82953,67.67356,20.46129,11.70544,...,3.131991,2.036199,0.344444,0.235472,0.115,0.087321,-41.121784,448,1,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.333334,0.0,0.0,0.0,0.24,0.0,-56.265,448,1,1
4,34.614662,0.00598,34.433628,34.559757,34.76416,0.330532,8.279264,5.828207,8.20837,6.596444,...,5.084746,1.886793,0.42,0.0,0.04,0.02,-33.531155,448,1,1


In [11]:
test_data = pd.merge(test_data_unlabelled, labels, on="Participant_ID")
test_data.head()

Unnamed: 0,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope,...,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp,Participant_ID,Depression,Gender
0,31.240383,0.007705,31.195211,31.19749,31.198797,0.003586,2.502873,2.195133,6.924655,0.0,...,2.352941,2.531646,0.27,0.01,0.2,0.0,-43.03735,474,0,1
1,25.895664,0.022401,25.307716,26.016266,26.448654,1.140938,0.185591,0.0,0.0,0.0,...,5.0,1.818182,0.08,0.0,0.22,0.13,-40.5895,474,0,1
2,25.333618,0.093532,23.29451,24.970371,27.738895,4.444386,62.35421,62.804974,11.951715,16.090214,...,3.004292,2.631579,0.196667,0.169181,0.16,0.110303,-36.45428,474,0,1
3,22.73154,0.048258,22.157387,22.6437,23.537909,1.380522,43.013565,45.249077,12.193955,4.822075,...,4.024768,2.65625,0.175882,0.155263,0.180588,0.118692,-39.491077,474,0,1
4,22.226995,0.036559,21.806602,22.638226,22.795425,0.988823,6.11248,2.537038,10.932255,0.0,...,7.142857,3.174603,0.15,0.1,0.135,0.015,-40.291077,474,0,1


# Save final data into csv

In [12]:
train_data.to_csv("train_data_final.csv")
test_data.to_csv("test_data_final.csv")