In [1]:
import os
import pandas as pd
import pyreadstat

In [2]:
# Define input and output directories
raw_data_dir = "/Users/aakashsuresh/fairness/nhanes_2017_2020_raw_data/"
output_dir = "/Users/aakashsuresh/fairness/processed_data_nhanes_lab/"

In [3]:
# Define relevant files to process
files_to_process = {
    "fasting_questionnaire": "P_FASTQX.XPT",
    "fasting_glucose": "P_GLU.XPT",
    "glycohemoglobin": "P_GHB.XPT",
    "insulin": "P_INS.XPT",
    "biochemistry_profile": "P_BIOPRO.XPT",
    "iron_status": "P_FETIB.XPT",
    "c_reactive_protein": "P_HSCRP.XPT",
    "cotinine": "P_COT.XPT",
}

In [5]:
for label, file_name in files_to_process.items():
    file_path = os.path.join(raw_data_dir, file_name)
    
    if os.path.exists(file_path):
        try:
            # Read the XPT file
            df, meta = pyreadstat.read_xport(file_path)

            # Drop missing values and handle NaNs
            df = df.dropna(axis=0, how='any')  # Drop rows with NaN values
            df.columns = df.columns.str.strip().str.lower()  # Clean column names
            
            # Save as CSV in the processed_data_new folder
            output_file = os.path.join(output_dir, f"{label}_processed.csv")
            df.to_csv(output_file, index=False)
            print(f"Processed and saved: {output_file}")
        
        except UnicodeDecodeError as e:
            print(f"UnicodeDecodeError while processing {file_path}: {e}")
        except Exception as e:
            print(f"Error while processing {file_path}: {e}")
    else:
        print(f"File not found: {file_path}")

Processed and saved: /Users/aakashsuresh/fairness/processed_data_nhanes_lab/fasting_questionnaire_processed.csv
Processed and saved: /Users/aakashsuresh/fairness/processed_data_nhanes_lab/fasting_glucose_processed.csv
Processed and saved: /Users/aakashsuresh/fairness/processed_data_nhanes_lab/glycohemoglobin_processed.csv
UnicodeDecodeError while processing /Users/aakashsuresh/fairness/nhanes_2017_2020_raw_data/P_INS.XPT: 'utf-8' codec can't decode byte 0xb5 in position 9: invalid start byte
Processed and saved: /Users/aakashsuresh/fairness/processed_data_nhanes_lab/biochemistry_profile_processed.csv
Processed and saved: /Users/aakashsuresh/fairness/processed_data_nhanes_lab/iron_status_processed.csv
Processed and saved: /Users/aakashsuresh/fairness/processed_data_nhanes_lab/c_reactive_protein_processed.csv
Processed and saved: /Users/aakashsuresh/fairness/processed_data_nhanes_lab/cotinine_processed.csv
