In [1]:
import pandas as pd
import os
import shutil

In [11]:
# Read the structured Q/A table once. `df` is the working handle and initially
# refers to the very same object as `df_original` (no copy is made).
df_original = df = pd.read_csv("structured_output.csv")

In [12]:
# Rows to exclude from the dataset, identified by their full file_id.
# NOTE(review): the four "GaYArbWL1Q_..." entries have no leading "-", while the
# file names parsed below show up as "-GaYArbWL1Q". Confirm these IDs really
# match rows; the check right after the list reports any that do not.
remove_ids = [
    "GaYArbWL1Q_chunk12_data1_task1",
    "GaYArbWL1Q_chunk12_data1_task2",
    "GaYArbWL1Q_chunk12_data1_task3",
    "GaYArbWL1Q_chunk12_data1_task4",
    "osROod3Hmpg_chunk33_data4_task1",
    "nFwAiO22g4Y_chunk25_data2_task3",
    "zA-fqvC02oM_chunk13_data2_task3",
    "zA-fqvC02oM_chunk13_data1_task1",
    "zA-fqvC02oM_chunk13_data1_task2"
]

# isin() silently ignores IDs that are absent from the table, so surface them.
unmatched_ids = sorted(set(remove_ids) - set(df_original['file_id']))
if unmatched_ids:
    print(f"remove_ids not found in file_id ({len(unmatched_ids)}): {unmatched_ids}")

# Remove those rows. Start from df_original (not df) so that re-running this
# cell works: the cell overwrites df with a frame that has no file_id column.
df = df_original[~df_original['file_id'].isin(remove_ids)]

# Split file_id into file / chunk / datapoint / task. The pattern is anchored
# and uses a greedy `.+` for the file part so that file names which themselves
# contain "_" are kept whole.
split_cols = df['file_id'].str.extract(
    r'^(?P<file>.+)_chunk(?P<chunk>\d+)_data(?P<datapoint>\d+)_task(?P<task>\d+)$'
)

# A non-matching file_id yields NaN in every group; fail with a readable
# message instead of an opaque cast error in astype(int) below.
malformed_ids = df.loc[split_cols.isna().any(axis=1), 'file_id']
if not malformed_ids.empty:
    raise ValueError(
        f"file_id values not matching the expected pattern: {malformed_ids.tolist()[:10]}"
    )

# Convert numeric columns to int
split_cols[['chunk', 'datapoint', 'task']] = split_cols[['chunk', 'datapoint', 'task']].astype(int)

# Drop old file_id and merge the new columns (both frames share df's index)
df = df.drop(columns=['file_id'])
df = pd.concat([split_cols, df], axis=1)


df = df[['file', 'chunk', 'datapoint', 'task', 'Question', 'Answer', 'Reasoning']]
# Save to new CSV (optional)
# df.to_csv("processed_file.csv", index=False)

df.head()

Unnamed: 0,file,chunk,datapoint,task,Question,Answer,Reasoning
0,0Ak1EonaL7g,1,1,3,What is the stress cause here?,This patient shows the stress cause related to...,The patient refers to a conversation with Debr...
1,0Ak1EonaL7g,1,2,3,What is the stress cause here?,This patient shows the stress cause related to...,The patient acknowledges a perception of annoy...
2,0Ak1EonaL7g,2,1,3,What is the stress cause here?,This patient shows the stress cause related to...,The text expresses a question about someone el...
3,0Ak1EonaL7g,4,1,3,What cause of depression does this show?,This patient shows causes of depression relate...,The patient expresses frustration about being ...
4,0Ak1EonaL7g,7,1,1,Does the patient suffer from stress?,"Yes, the patient suffers from stress.",The patient expresses feelings of anxiety rela...


In [13]:
# Distinct values per key column, followed by the number of distinct
# (file, chunk, datapoint) combinations.
print(df[['file', 'chunk', 'datapoint']].nunique())
print(f"Unique triplets: {len(df.drop_duplicates(subset=['file', 'chunk', 'datapoint']))}")

file         320
chunk         94
datapoint      7
dtype: int64
Unique triplets: 8106


In [14]:
# Number of task rows per (file, chunk, datapoint) triplet.
task_counts = (
    df.groupby(['file', 'chunk', 'datapoint'])
    .size()
    .rename('task_count')
    .reset_index()
)

# Print a short preview; the bare len() is the cell's displayed value.
print(task_counts.head())
len(task_counts)

          file  chunk  datapoint  task_count
0  -GaYArbWL1Q      1          1           3
1  -GaYArbWL1Q      3          1           1
2  -GaYArbWL1Q      3          2           2
3  -GaYArbWL1Q      4          1           1
4  -GaYArbWL1Q      4          2           1


8106

In [15]:
# Attach the per-triplet task_count to every row of df.
# NOTE: despite the variable name, nothing is filtered here. Every triplet in
# df also appears in task_counts, so the inner merge keeps all rows, including
# triplets with a single task. `validate` asserts that task_counts has one row
# per triplet, so the merge can never duplicate rows of df.
df_multi_task = df.merge(
    task_counts,
    on=['file', 'chunk', 'datapoint'],
    validate='many_to_one',
)
df_multi_task

Unnamed: 0,file,chunk,datapoint,task,Question,Answer,Reasoning,task_count
0,0Ak1EonaL7g,1,1,3,What is the stress cause here?,This patient shows the stress cause related to...,The patient refers to a conversation with Debr...,1
1,0Ak1EonaL7g,1,2,3,What is the stress cause here?,This patient shows the stress cause related to...,The patient acknowledges a perception of annoy...,1
2,0Ak1EonaL7g,2,1,3,What is the stress cause here?,This patient shows the stress cause related to...,The text expresses a question about someone el...,1
3,0Ak1EonaL7g,4,1,3,What cause of depression does this show?,This patient shows causes of depression relate...,The patient expresses frustration about being ...,1
4,0Ak1EonaL7g,7,1,1,Does the patient suffer from stress?,"Yes, the patient suffers from stress.",The patient expresses feelings of anxiety rela...,3
...,...,...,...,...,...,...,...,...
18098,zyIN61kQ6VY,20,1,3,What is the stress cause here?,This patient shows the stress cause related to...,The patient mentions feeling stressed with dea...,3
18099,zyIN61kQ6VY,21,1,1,Does the patient suffer from stress?,"Yes, the patient suffers from stress.","The patient expresses feelings of frustration,...",4
18100,zyIN61kQ6VY,21,1,2,What mental disorder symptoms does the patient...,This shows mental disorder symptoms related to...,The text reflects feelings of frustration but ...,4
18101,zyIN61kQ6VY,21,1,3,What is the stress cause here?,This patient shows the stress cause related to...,The patient expresses feelings of frustration ...,4


In [18]:
# Sample 1800 rows at random for the test split (roughly 10% of the 18103 rows).
# NOTE(review): sampling is per row, so other task rows of the same
# (file, chunk, datapoint) triplet can stay outside the sample — confirm this
# is the intended split granularity.
df_test = df_multi_task.sample(n=1800, random_state=42)  # fixed random_state makes the draw repeatable

# Preview: number of sampled rows, then the first few of them
print(len(df_test))
df_test.head()

1800


Unnamed: 0,file,chunk,datapoint,task,Question,Answer,Reasoning,task_count
10604,mfe8OzzArGc,29,1,3,What cause of depression does this show?,This patient shows causes of depression relate...,The text raises a question about feeling at ri...,4
8896,jm6PG989Q_0,78,1,1,Does the patient suffer from loneliness?,"Yes, the patient suffers from loneliness.",The patient expresses feelings of helplessness...,4
8451,jkKm5Cym-ZY,4,2,4,Does the emotional wellness dimension exist here?,"Yes, this wellness dimension exists here.",The individual expresses a recognition of fluc...,4
8594,jlXmVqhaMds,5,2,4,Does the emotional wellness dimension exist here?,"Yes, this wellness dimension exists here.",The patient expresses a sense of foreboding an...,4
5227,-GaYArbWL1Q,11,1,3,What is the stress cause here?,This patient shows the stress cause related to...,"The text indicates that the patient, Julie, ha...",2


In [19]:
# Drop task_count column; it was only needed for inspection.
df_test = df_test.drop(columns=['task_count'])

# Rebuild file_id in the original "<file>_chunk<c>_data<d>_task<t>" format.
# Vectorised string concatenation replaces the row-wise apply(axis=1): it
# yields the same strings, avoids a Python-level loop over rows, and also
# behaves sensibly on an empty frame.
df_test['file_id'] = (
    df_test['file']
    + '_chunk' + df_test['chunk'].astype(str)
    + '_data' + df_test['datapoint'].astype(str)
    + '_task' + df_test['task'].astype(str)
)

# Reorder columns to match original format
df_test = df_test[['file_id', 'Question', 'Answer', 'Reasoning']]

# Preview
df_test.head()

Unnamed: 0,file_id,Question,Answer,Reasoning
10604,mfe8OzzArGc_chunk29_data1_task3,What cause of depression does this show?,This patient shows causes of depression relate...,The text raises a question about feeling at ri...
8896,jm6PG989Q_0_chunk78_data1_task1,Does the patient suffer from loneliness?,"Yes, the patient suffers from loneliness.",The patient expresses feelings of helplessness...
8451,jkKm5Cym-ZY_chunk4_data2_task4,Does the emotional wellness dimension exist here?,"Yes, this wellness dimension exists here.",The individual expresses a recognition of fluc...
8594,jlXmVqhaMds_chunk5_data2_task4,Does the emotional wellness dimension exist here?,"Yes, this wellness dimension exists here.",The patient expresses a sense of foreboding an...
5227,-GaYArbWL1Q_chunk11_data1_task3,What is the stress cause here?,This patient shows the stress cause related to...,"The text indicates that the patient, Julie, ha..."


In [20]:
# Sorted view of df_test by file_id with a fresh 0..n-1 index (display only;
# df_test itself is left in sample order).
df_test_sorted = df_test.sort_values('file_id', ignore_index=True)
df_test_sorted

Unnamed: 0,file_id,Question,Answer,Reasoning
0,-GaYArbWL1Q_chunk11_data1_task3,What is the stress cause here?,This patient shows the stress cause related to...,"The text indicates that the patient, Julie, ha..."
1,-GaYArbWL1Q_chunk13_data1_task1,Does the patient suffer from loneliness?,"Yes, the patient suffers from loneliness.",The patient expresses a desire for connection ...
2,-GaYArbWL1Q_chunk15_data2_task1,Does the patient suffer from loneliness?,"Yes, the patient suffers from loneliness.",The patient expresses a desire for connection ...
3,-GaYArbWL1Q_chunk16_data1_task4,Does the social wellness dimension exist here?,"Yes, this wellness dimension exists here.",The patient reflects on their past social inte...
4,-GaYArbWL1Q_chunk18_data1_task3,What cause of depression does this show?,This patient shows causes of depression relate...,The patient reflects on the desire for a deep ...
...,...,...,...,...
1795,zdsaW_TSzHU_chunk13_data1_task1,Does the patient suffer from stress?,"Yes, the patient suffers from stress.",The patient expresses a sense of being overwhe...
1796,zdsaW_TSzHU_chunk3_data1_task1,Does the patient suffer from stress?,"Yes, the patient suffers from stress.",The patient describes a situation where they a...
1797,zyIN61kQ6VY_chunk21_data1_task3,What is the stress cause here?,This patient shows the stress cause related to...,The patient expresses feelings of frustration ...
1798,zyIN61kQ6VY_chunk3_data1_task3,What is the stress cause here?,This patient shows the stress cause related to...,The patient expresses concern about not handin...


In [22]:
# Persist the test split without the index column. This writes df_test
# (sample order), not the sorted view df_test_sorted.
df_test.to_csv("test_mentalhealth.csv", index=False)

In [23]:
# Reload the test split from disk so this section does not depend on the
# in-memory df_test built above.
df_test = pd.read_csv("test_mentalhealth.csv")

# Source and target directories
# source_dir is read as a global by copy_audio_files; target_dir is passed in.
source_dir = "final_audio_dataset"
target_dir = "test_mentalhealth"

# Create the target directory if it doesn't exist (no error if it already does)
os.makedirs(target_dir, exist_ok=True)

In [24]:
# Function to copy files based on file_id
def copy_audio_files(df, target_dir, audio_root=None):
    """Copy the ``.wav`` file of every ``file_id`` in ``df`` into ``target_dir``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``file_id`` column; each value names ``<file_id>.wav``.
    target_dir : str
        Destination directory; created if it does not exist yet.
    audio_root : str, optional
        Directory holding the source ``.wav`` files. When omitted, the
        notebook-level ``source_dir`` variable is used, exactly as before.

    Returns
    -------
    None
        Missing source files are reported with a ``Missing: <path>`` line
        rather than raising, so one absent file does not abort the copy.
    """
    # Resolved at call time so existing two-argument calls keep reading the
    # notebook global, while callers can pass the directory explicitly.
    root = source_dir if audio_root is None else audio_root
    os.makedirs(target_dir, exist_ok=True)
    for file_id in df['file_id']:
        wav_name = file_id + ".wav"
        src_path = os.path.join(root, wav_name)
        if os.path.exists(src_path):
            shutil.copy(src_path, os.path.join(target_dir, wav_name))
        else:
            print(f"Missing: {src_path}")
        
# Copy the .wav of every test-split file_id into target_dir; missing sources are printed.
copy_audio_files(df_test, target_dir)

In [25]:
# Zip the contents of the test_mentalhealth directory into test_mentalhealth.zip;
# the returned archive path is the cell's displayed value.
shutil.make_archive("test_mentalhealth", 'zip', "test_mentalhealth")

'/data/amey_2311cs10/debayan/test_mentalhealth.zip'

# Now sample the test dataset for doctor verification

In [27]:
# Reload the saved test split; `df` becomes a second name for the same frame
# and is reshaped in the next cell, while `df_test` keeps the file_id column.
df_test = df = pd.read_csv("test_mentalhealth.csv")

In [28]:
# Break each file_id into file / chunk / datapoint / task via named regex groups.
split_cols = df['file_id'].str.extract(
    r'^(?P<file>.+)_chunk(?P<chunk>\d+)_data(?P<datapoint>\d+)_task(?P<task>\d+)$'
)

# The three numeric parts come out of the regex as strings; cast them to int.
split_cols = split_cols.astype({'chunk': int, 'datapoint': int, 'task': int})

# Swap file_id for the parsed columns and fix the column order in one step.
df = pd.concat([split_cols, df.drop(columns=['file_id'])], axis=1)[
    ['file', 'chunk', 'datapoint', 'task', 'Question', 'Answer', 'Reasoning']
]

df.head()

Unnamed: 0,file,chunk,datapoint,task,Question,Answer,Reasoning
0,mfe8OzzArGc,29,1,3,What cause of depression does this show?,This patient shows causes of depression relate...,The text raises a question about feeling at ri...
1,jm6PG989Q_0,78,1,1,Does the patient suffer from loneliness?,"Yes, the patient suffers from loneliness.",The patient expresses feelings of helplessness...
2,jkKm5Cym-ZY,4,2,4,Does the emotional wellness dimension exist here?,"Yes, this wellness dimension exists here.",The individual expresses a recognition of fluc...
3,jlXmVqhaMds,5,2,4,Does the emotional wellness dimension exist here?,"Yes, this wellness dimension exists here.",The patient expresses a sense of foreboding an...
4,-GaYArbWL1Q,11,1,3,What is the stress cause here?,This patient shows the stress cause related to...,"The text indicates that the patient, Julie, ha..."


In [33]:
# Count how many task rows each (file, chunk, datapoint) triplet has in the test split.
task_counts = (
    df.groupby(['file', 'chunk', 'datapoint'])
    .size()
    .rename('task_count')
    .reset_index()
)

# Show a preview, then the number of distinct triplets, then a blank separator line.
print(task_counts.head())
print(len(task_counts))
print()



          file  chunk  datapoint  task_count
0  -GaYArbWL1Q      6          2           1
1  -GaYArbWL1Q      8          2           1
2  -GaYArbWL1Q     11          1           1
3  -GaYArbWL1Q     13          1           1
4  -GaYArbWL1Q     15          2           1
1623



In [34]:
# Order the test rows by file_id and renumber them 0..n-1, so the positional
# slices taken in the next cell are deterministic.
df_sample_sorted = df_test.sort_values('file_id', ignore_index=True)
df_sample_sorted

Unnamed: 0,file_id,Question,Answer,Reasoning
0,-GaYArbWL1Q_chunk11_data1_task3,What is the stress cause here?,This patient shows the stress cause related to...,"The text indicates that the patient, Julie, ha..."
1,-GaYArbWL1Q_chunk13_data1_task1,Does the patient suffer from loneliness?,"Yes, the patient suffers from loneliness.",The patient expresses a desire for connection ...
2,-GaYArbWL1Q_chunk15_data2_task1,Does the patient suffer from loneliness?,"Yes, the patient suffers from loneliness.",The patient expresses a desire for connection ...
3,-GaYArbWL1Q_chunk16_data1_task4,Does the social wellness dimension exist here?,"Yes, this wellness dimension exists here.",The patient reflects on their past social inte...
4,-GaYArbWL1Q_chunk18_data1_task3,What cause of depression does this show?,This patient shows causes of depression relate...,The patient reflects on the desire for a deep ...
...,...,...,...,...
1795,zdsaW_TSzHU_chunk13_data1_task1,Does the patient suffer from stress?,"Yes, the patient suffers from stress.",The patient expresses a sense of being overwhe...
1796,zdsaW_TSzHU_chunk3_data1_task1,Does the patient suffer from stress?,"Yes, the patient suffers from stress.",The patient describes a situation where they a...
1797,zyIN61kQ6VY_chunk21_data1_task3,What is the stress cause here?,This patient shows the stress cause related to...,The patient expresses feelings of frustration ...
1798,zyIN61kQ6VY_chunk3_data1_task3,What is the stress cause here?,This patient shows the stress cause related to...,The patient expresses concern about not handin...


In [35]:
# Two consecutive 180-row windows of the sorted test set (positions 180-359
# and 360-539), one per verification batch.
df_sample1 = df_sample_sorted.iloc[180:360]
df_sample2 = df_sample_sorted.iloc[360:540]

(len(df_sample1), len(df_sample2))

(180, 180)

In [38]:
# Write each verification batch to its own CSV, without the index column.
df_sample1.to_csv("test_sample_for_verification1.csv", index=False)
df_sample2.to_csv("test_sample_for_verification2.csv", index=False)

In [39]:
# Reload both verification batches from disk so this section can run on its own.
df_sample1 = pd.read_csv("test_sample_for_verification1.csv")
df_sample2 = pd.read_csv("test_sample_for_verification2.csv")

# Source and target directories
# source_dir is read as a global by copy_audio_files; each batch gets its own target.
source_dir = "final_audio_dataset"
target_dir_1 = "test_sample_for_verification1"
target_dir_2 = "test_sample_for_verification2"

# Create target directories if they don't exist (no error if they already do)
os.makedirs(target_dir_1, exist_ok=True)
os.makedirs(target_dir_2, exist_ok=True)

In [40]:
# Function to copy files based on file_id
def copy_audio_files(df, target_dir, audio_root=None):
    """Copy the ``.wav`` file of every ``file_id`` in ``df`` into ``target_dir``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``file_id`` column; each value names ``<file_id>.wav``.
    target_dir : str
        Destination directory; created if it does not exist yet.
    audio_root : str, optional
        Directory holding the source ``.wav`` files. When omitted, the
        notebook-level ``source_dir`` variable is used, exactly as before.

    Returns
    -------
    None
        Missing source files are reported with a ``Missing: <path>`` line
        rather than raising, so one absent file does not abort the copy.
    """
    # Resolved at call time so existing two-argument calls keep reading the
    # notebook global, while callers can pass the directory explicitly.
    root = source_dir if audio_root is None else audio_root
    os.makedirs(target_dir, exist_ok=True)
    for file_id in df['file_id']:
        wav_name = file_id + ".wav"
        src_path = os.path.join(root, wav_name)
        if os.path.exists(src_path):
            shutil.copy(src_path, os.path.join(target_dir, wav_name))
        else:
            print(f"Missing: {src_path}")
        

# Copy the audio for each verification batch into its own directory.
# The separator line splits the "Missing: ..." reports of batch 1 from batch 2.
copy_audio_files(df_sample1, target_dir_1)
print("################################################")
copy_audio_files(df_sample2, target_dir_2)

################################################


In [41]:
# Zip each verification directory into a same-named .zip archive; only the
# second call's returned path is displayed, as it is the cell's last expression.
shutil.make_archive("test_sample_for_verification1", 'zip', "test_sample_for_verification1")
shutil.make_archive("test_sample_for_verification2", 'zip', "test_sample_for_verification2")

'/data/amey_2311cs10/debayan/test_sample_for_verification2.zip'

# Build the corresponding training dataset

In [1]:
import pandas as pd

In [4]:
# Load the full structured table and the already-saved test split
structured_df = pd.read_csv('structured_output.csv')
test_df = pd.read_csv('test_mentalhealth.csv')

# List of file_ids to remove explicitly
# NOTE(review): the four "GaYArbWL1Q_..." entries have no leading "-", while
# file_ids elsewhere in this notebook appear as "-GaYArbWL1Q_...". Confirm
# these IDs really match rows; the check below reports any that do not.
remove_ids = [
    "GaYArbWL1Q_chunk12_data1_task1",
    "GaYArbWL1Q_chunk12_data1_task2",
    "GaYArbWL1Q_chunk12_data1_task3",
    "GaYArbWL1Q_chunk12_data1_task4",
    "osROod3Hmpg_chunk33_data4_task1",
    "nFwAiO22g4Y_chunk25_data2_task3",
    "zA-fqvC02oM_chunk13_data2_task3",
    "zA-fqvC02oM_chunk13_data1_task1",
    "zA-fqvC02oM_chunk13_data1_task2"
]

# isin() silently ignores IDs that are absent, so report anything that would
# have no effect on the filter below.
known_ids = set(structured_df['file_id'])
unmatched_ids = sorted(set(remove_ids) - known_ids)
if unmatched_ids:
    print(f"remove_ids not found in file_id ({len(unmatched_ids)}): {unmatched_ids}")
stray_test_ids = set(test_df['file_id']) - known_ids
if stray_test_ids:
    print(f"test file_ids not present in structured_output.csv: {len(stray_test_ids)}")

# Combine remove_ids with file_ids in test set
all_remove_ids = set(remove_ids) | set(test_df['file_id'])

# Filter the structured dataset to exclude unwanted and test file_ids
train_df = structured_df[~structured_df['file_id'].isin(all_remove_ids)]

In [5]:
# Number of rows left for training after excluding the removed and test file_ids
len(train_df)

16303

In [7]:
# Persist the training split without the index column.
train_df.to_csv('train_mentalhealth.csv', index=False)

In [2]:
# Reload the training split from disk so this section can run on its own.
df_train = pd.read_csv("train_mentalhealth.csv")

# Source and target directories
# source_dir is read as a global by copy_audio_files; target_dir is passed in.
source_dir = "final_audio_dataset"
target_dir = "train_mentalhealth"

# Create the target directory if it doesn't exist (no error if it already does)
os.makedirs(target_dir, exist_ok=True)

In [3]:
def copy_audio_files(df, target_dir, audio_root=None):
    """Copy the ``.wav`` file of every ``file_id`` in ``df`` into ``target_dir``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``file_id`` column; each value names ``<file_id>.wav``.
    target_dir : str
        Destination directory; created if it does not exist yet.
    audio_root : str, optional
        Directory holding the source ``.wav`` files. When omitted, the
        notebook-level ``source_dir`` variable is used, exactly as before.

    Returns
    -------
    None
        Missing source files are reported with a ``Missing: <path>`` line
        rather than raising, so one absent file does not abort the copy.
    """
    # Resolved at call time so existing two-argument calls keep reading the
    # notebook global, while callers can pass the directory explicitly.
    root = source_dir if audio_root is None else audio_root
    os.makedirs(target_dir, exist_ok=True)
    for file_id in df['file_id']:
        wav_name = file_id + ".wav"
        src_path = os.path.join(root, wav_name)
        if os.path.exists(src_path):
            shutil.copy(src_path, os.path.join(target_dir, wav_name))
        else:
            print(f"Missing: {src_path}")
        

# Copy the .wav of every training-split file_id into target_dir; missing sources are printed.
copy_audio_files(df_train, target_dir)