Task: merge the participant demographics in `HOPE_paper2_demographics.csv` with the `*_download_participants.csv` tables downloaded from the Beiwe server.
Why? Downloading data from the Beiwe server requires knowing which study (study ID) each Beiwe ID belongs to, so we need a Beiwe-ID-to-study-ID mapping.

In [23]:
import os
import glob
import pandas as pd

# Study name -> Beiwe study ID, copied manually from the Beiwe platform.
# NOTE: despite its identifier, this dict is keyed by study NAME and stores study IDs as values;
# the identifier is kept because later cells look it up as `study_id_to_name[study_name]`.
# NOTE(review): the "test2" ID is not a hex string like the other five -- confirm it was copied correctly.
study_id_to_name = {
    "test": "59c2b5b4388cd6715a958247",
    "test2": "5h7D9XT2vrN3BWkdcbYVNtpI",
    "trial": "588224eff4d48a76f488cdfd",
    "trial_phase2": "598365d5388cd66a62ac1f9e",
    "trial_phase2_passive_data_only": "5a2ae1dc03d3c425ef0ea752",
    "troubleshooting": "5a79f17d03d3c45080924ed4"
}

# Resolve the absolute path of the "download_participants" folder, relative to the
# current working directory of the kernel.
notebook_dir = os.getcwd()
data_dir = os.path.abspath(os.path.join(notebook_dir, "../../data/download_participants"))
print(f"Data directory: {data_dir}")

# Collect every CSV in that folder named "*_download_participants.csv".
# glob returns matches in arbitrary order, so sort them to keep the row order of the
# merged table identical from run to run.
csv_files = sorted(glob.glob(os.path.join(data_dir, "*_download_participants.csv")))


Data directory: /n/onnela_dp_l3/Lab/HOPE/beiwe/data/download_participants


In [24]:
# Load each per-study participant table and tag its rows with the study name and study ID.
# Files whose study name is not in `study_id_to_name`, or that lack a "Patient ID" column,
# are reported and skipped.
participants_suffix = "_download_participants.csv"

# One dataframe per successfully loaded file; reset here so re-running the cell starts clean.
dataframes = []

for file_path in csv_files:
    filename = os.path.basename(file_path)

    # The study name is whatever precedes the suffix. Strip the suffix only at the END of
    # the filename (str.replace would also remove it from the middle of a name).
    if filename.endswith(participants_suffix):
        study_name = filename[: -len(participants_suffix)]
    else:
        study_name = filename

    if study_name not in study_id_to_name:
        print(f"Skipping {filename} (Study name '{study_name}' not found in dictionary)")
        continue
    study_id = study_id_to_name[study_name]

    # Read "Patient ID" as text: it is used as a merge key later, so it must not be
    # converted to a number when every ID in a (small) file happens to look numeric.
    df = pd.read_csv(file_path, dtype={"Patient ID": str})

    if "Patient ID" not in df.columns:
        print(f"Skipping {filename} (Missing 'Patient ID' column)")
        continue

    # Record which study these participants belong to.
    df["Study_ID"] = study_id
    df["Study_Name"] = study_name
    dataframes.append(df)

    print(f"Loaded {filename} ({study_name}, ID: {study_id}) with {df.shape[0]} rows")

Loaded test2_download_participants.csv (test2, ID: 5h7D9XT2vrN3BWkdcbYVNtpI) with 4 rows
Loaded test_download_participants.csv (test, ID: 59c2b5b4388cd6715a958247) with 3 rows
Loaded trial_download_participants.csv (trial, ID: 588224eff4d48a76f488cdfd) with 16 rows
Loaded trial_phase2_download_participants.csv (trial_phase2, ID: 598365d5388cd66a62ac1f9e) with 70 rows
Loaded trial_phase2_passive_data_only_download_participants.csv (trial_phase2_passive_data_only, ID: 5a2ae1dc03d3c425ef0ea752) with 62 rows
Loaded troubleshooting_download_participants.csv (troubleshooting, ID: 5a79f17d03d3c45080924ed4) with 3 rows


In [25]:
# Stack the per-study tables into one dataframe and write it next to the input files.
if not dataframes:
    print("No valid CSV files to merge.")
else:
    # Row-wise concatenation with a fresh 0..n-1 index
    combined_df = pd.concat(dataframes, ignore_index=True)
    n_rows, n_cols = combined_df.shape
    print(f"\nFinal merged dataframe has {n_rows} rows and {n_cols} columns")

    # Persist the combined table (without the row index) in the data directory
    output_path = os.path.join(data_dir, "merged_participants.csv")
    combined_df.to_csv(output_path, index=False)
    print(f"Merged file saved to: {output_path}")


Final merged dataframe has 158 rows and 17 columns
Merged file saved to: /n/onnela_dp_l3/Lab/HOPE/beiwe/data/download_participants/merged_participants.csv


In [43]:
# Rename the participant key so it matches the column name used for the demographics merge.
# Reassigning (rather than inplace=True) keeps the cell safe to re-run.
combined_df = combined_df.rename(columns={"Patient ID": "Beiwe_ID"})

# Preview the result; this must be the last expression of the cell to be displayed.
combined_df.head()

In [44]:
# Load the demographics table, which sits one directory above `data_dir`.
DEMO_TB_NAME = "HOPE_paper2_demographics.csv"
demo_tb_path = os.path.abspath(os.path.join(data_dir, "../", DEMO_TB_NAME))

# Read "Beiwe ID" as text: it is the merge key, so its dtype must not depend on
# pandas' type inference.
demo_df = pd.read_csv(demo_tb_path, dtype={"Beiwe ID": str})

In [45]:
# Display the demographics table as loaded (before the "Beiwe ID" column is renamed)
demo_df

Unnamed: 0.1,Unnamed: 0,ID,Gender,Age,Height (cm),Weight (kg),BMI,Health status (Type of cancer / Stage / Performance status),base_disease_site,Unnamed: 8,Unnamed: 9,Unnamed: 10,OS,Beiwe ID
0,0,1001,Female,64,162.6,79.4,30.00,Ovarian,White,,,,Android,jy8yzsap
1,1,1002,Female,70,154.9,67.3,28.05,Ovarian,Other (specify),,1.0,,iOS,md7fnll7
2,2,1003,Female,56,165.4,62.5,22.85,Ovarian,Black or African American,,1.0,,iOS,wrb5oh7u
3,3,1004,Female,56,160.0,54.5,21.29,Cervical,White,,2.0,,iOS,wgs5rptp
4,4,1005,Female,57,157.5,68.5,27.61,Ovarian,Black or African American,,1.0,,iOS,q21jny47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,80,2081,Female,66,150.3,51.3,22.71,Ovarian,White,,1.0,,iOS,pwby6ex2
81,81,2083,Female,66,162.6,106.5,40.30,Ovarian,White,,,,iOS,6ogn9wsa
82,82,2085,Female,68,154.2,60.4,25.40,Ovarian,White,,1.0,,iOS,3tg2dbdl
83,83,2087,Female,58,156.0,62.6,25.70,Uterine,White,,,,iOS,n7g4xhkg


In [46]:
# Give the demographics key the same column name as in combined_df ("Beiwe_ID")
demo_df = demo_df.rename(columns={"Beiwe ID": "Beiwe_ID"})

In [51]:
# Show the column names of the combined participants table
combined_df.columns

Index(['Created On', 'Beiwe_ID', 'Status', 'OS Type',
       'First Registration Date', 'Last Registration', 'Last Upload',
       'Last Survey Download', 'Last Set Password', 'Last Push Token Update',
       'Last Device Settings Update', 'Last OS Version', 'App Version Code',
       'App Version Name', 'Last Heartbeat', 'Study_ID', 'Study_Name'],
      dtype='object')

In [52]:
# Attach each participant's study name and study ID to the demographics table,
# joining on the shared "Beiwe_ID" column.
study_lookup = combined_df[["Beiwe_ID", "Study_Name", "Study_ID"]]
demo_df_merged = pd.merge(demo_df, study_lookup, on="Beiwe_ID", how="inner")

# An inner join silently drops demographics rows that have no match and repeats rows whose
# key occurs more than once, so report both situations instead of hiding them.
unmatched_ids = demo_df.loc[~demo_df["Beiwe_ID"].isin(study_lookup["Beiwe_ID"]), "Beiwe_ID"]
if not unmatched_ids.empty:
    print(
        f"WARNING: {len(unmatched_ids)} demographics row(s) have no match in the "
        f"Beiwe participant tables and were dropped: {unmatched_ids.tolist()}"
    )

repeated_ids = demo_df_merged.loc[demo_df_merged["Beiwe_ID"].duplicated(), "Beiwe_ID"].unique()
if len(repeated_ids) > 0:
    print(
        f"WARNING: {len(repeated_ids)} Beiwe_ID value(s) appear more than once in the "
        f"merged table: {list(repeated_ids)}"
    )

In [53]:
# Display the demographics table with the Study_Name / Study_ID columns attached
demo_df_merged

Unnamed: 0.1,Unnamed: 0,ID,Gender,Age,Height (cm),Weight (kg),BMI,Health status (Type of cancer / Stage / Performance status),base_disease_site,Unnamed: 8,Unnamed: 9,Unnamed: 10,OS,Beiwe_ID,Study_Name,Study_ID
0,0,1001,Female,64,162.6,79.4,30.00,Ovarian,White,,,,Android,jy8yzsap,trial,588224eff4d48a76f488cdfd
1,1,1002,Female,70,154.9,67.3,28.05,Ovarian,Other (specify),,1.0,,iOS,md7fnll7,trial,588224eff4d48a76f488cdfd
2,2,1003,Female,56,165.4,62.5,22.85,Ovarian,Black or African American,,1.0,,iOS,wrb5oh7u,trial,588224eff4d48a76f488cdfd
3,3,1004,Female,56,160.0,54.5,21.29,Cervical,White,,2.0,,iOS,wgs5rptp,trial,588224eff4d48a76f488cdfd
4,4,1005,Female,57,157.5,68.5,27.61,Ovarian,Black or African American,,1.0,,iOS,q21jny47,trial,588224eff4d48a76f488cdfd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,80,2081,Female,66,150.3,51.3,22.71,Ovarian,White,,1.0,,iOS,pwby6ex2,trial_phase2_passive_data_only,5a2ae1dc03d3c425ef0ea752
81,81,2083,Female,66,162.6,106.5,40.30,Ovarian,White,,,,iOS,6ogn9wsa,trial_phase2_passive_data_only,5a2ae1dc03d3c425ef0ea752
82,82,2085,Female,68,154.2,60.4,25.40,Ovarian,White,,1.0,,iOS,3tg2dbdl,trial_phase2_passive_data_only,5a2ae1dc03d3c425ef0ea752
83,83,2087,Female,58,156.0,62.6,25.70,Uterine,White,,,,iOS,n7g4xhkg,trial_phase2_passive_data_only,5a2ae1dc03d3c425ef0ea752


In [55]:
# Save the demographics table with study info one directory above `data_dir`.
# index=False keeps the pandas row index out of the file; without it, every save/re-read
# cycle adds another "Unnamed: 0"-style column.
demo_df_merged.to_csv(
    os.path.abspath(os.path.join(data_dir, "../", "HOPE_paper2_demographics_w_study.csv")),
    index=False,
)

In [57]:
# Shareable version containing only the identifiers: Beiwe_ID, Study_Name, Study_ID
demo_df_sharing = demo_df_merged[["Beiwe_ID", "Study_Name", "Study_ID"]]

# index=False keeps the pandas row index out of the file, so it holds exactly the three
# columns above.
demo_df_sharing.to_csv(
    os.path.abspath(os.path.join(data_dir, "../", "HOPE_paper2_studies_and_ids.csv")),
    index=False,
)