In [1]:
import os

import numpy as np
import pandas as pd

import tensorflow as tf

In [2]:
# Paths of folders containing train and test data
TRAIN_FOLDER_PATH = "features_train"
TEST_FOLDER_PATH = "features_test"

In [3]:
# Names of features based on GeMAPS feature set
meta_data = pd.read_csv('feature_description.csv', encoding='ISO-8859-1', header=None)
col_names = list(meta_data[0])

In [4]:
df_dict = {'Participant_ID': [], 'features': []}  # Dict to store values
max_rows = 0  # Max number of rows present in the data (used for padding)

for file in os.listdir(TRAIN_FOLDER_PATH):
    
    file_name_split = file.split('.')
    file_type = file_name_split[1]
    file_name = file_name_split[0]

    if file_type == 'csv':
        # Fetch participant ID
        id = int(file_name.split('_')[1])
        df_dict['Participant_ID'].append(id)

        # Fetch data
        temp_df = pd.read_csv(TRAIN_FOLDER_PATH + '/' + file, names=col_names)

        # Remove null values
        if temp_df.isna().sum().values[0]>0:
            print(f"Removing null values present in {file}")
            temp_df = temp_df.dropna(axis=0)
        
        # Filter out rows where more than half of the feature values are zero
        zero_percentages = (temp_df == 0).mean(axis=1)  # Calculate the percentage of zero values in each row
        threshold = 0.5  # More than half
        temp_df = temp_df[zero_percentages <= threshold]
        
        # Add the features to dict
        df_dict['features'].append(temp_df)
        
        # Update max rows
        if temp_df.shape[0]>max_rows:
            max_rows = temp_df.shape[0] 

Removing null values present in spk_402.csv


In [5]:
max_rows

378

In [6]:
# Pad zeroes
for i in range(len(df_dict['features'])):
    df_dict['features'][i] = df_dict['features'][i].reindex(np.arange(max_rows), fill_value=0).values

In [7]:
# Convert the dict into dataframe
df_features = pd.DataFrame(df_dict)

In [8]:
# Read labels dict
df_labels = pd.read_csv('labels.csv', skipfooter=1, engine='python')
df_labels['Participant_ID'] = df_labels['Participant_ID'].astype(int)
df_labels.head()

Unnamed: 0,Participant_ID,Depression,Gender
0,303,0,0
1,304,0,0
2,305,0,1
3,310,0,1
4,312,0,1


In [9]:
merged_df = pd.merge(df_features, df_labels, on='Participant_ID')

In [10]:
merged_df.head()

Unnamed: 0,Participant_ID,features,Depression,Gender
0,448,"[[33.88331, 0.18847024, 31.197649, 31.198807, ...",1,1
1,475,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0,1
2,313,"[[22.360157, 0.071260795, 20.776451, 22.337982...",0,1
3,449,"[[31.251434, 0.008560931, 31.196766, 31.198511...",0,0
4,339,"[[19.374094, 0.07061678, 18.158512, 18.906977,...",1,1


In [11]:
# Extract Features and Labels for tensorflow
features = merged_df['features'].to_list()
depression_labels = merged_df['Depression'].to_list()
gender_labels = merged_df['Gender'].to_list()

In [12]:
# Convert features to tensor object
features_tensor = tf.convert_to_tensor(features)

In [13]:
features_tensor[0]

<tf.Tensor: shape=(378, 88), dtype=float64, numpy=
array([[ 3.38833100e+01,  1.88470240e-01,  3.11976490e+01, ...,
         2.25000000e-01,  4.99999940e-03, -5.35293800e+01],
       [ 2.34422840e+01,  1.52308615e-02,  2.30832650e+01, ...,
         3.80000000e-01,  0.00000000e+00, -4.73269700e+01],
       [ 2.66581950e+01,  9.55935050e-02,  2.42754900e+01, ...,
         1.14999995e-01,  8.73212500e-02, -4.11217840e+01],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])>