Format cohort data into a matrix for logistic regression.

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

# Load the cohort table from CSV. pandas has no `pd.read`; the CSV reader
# is `pd.read_csv`.
cohort = pd.read_csv('cohort_18_to_90_df.csv')

In [None]:
# Distinct combinations of the person-level demographic columns.
# The trailing .copy() makes cohort_demo an independent frame, so adding
# derived columns to it afterwards does not raise SettingWithCopyWarning or
# depend on whether pandas returned a view of `cohort`.
cohort_demo = cohort[[
    'person_id', 'gender', 'date_of_birth', 'race', 'ethnicity', 'sex_at_birth',
    'birth_year', 'age_last_EHR', 'HS']
].drop_duplicates().copy()

In [None]:
# --- Standardize demographic values ---
# sex: 'Male' and 'Female' are kept as-is; the remaining listed answers are
# collapsed into a single 'Other sex' category. Values that are not keys of
# the mapping become NaN in 'sex_revised'.
sex_mapping = {label: label for label in ('Male', 'Female')}
sex_mapping.update(
    dict.fromkeys(
        ('I prefer not to answer', 'Intersex', 'No matching concept'),
        'Other sex',
    )
)

raw_sex = cohort_demo['sex_at_birth']
cohort_demo['sex_revised'] = raw_sex.map(sex_mapping)

# race: collapse the raw race answers into a smaller set of categories.
# Keys are written without surrounding whitespace, and the raw values are
# stripped before lookup, so a label such as
# 'Native Hawaiian or Other Pacific Islander' matches whether or not the
# source data carries a stray trailing space. (The original key had a
# trailing space, so the clean label never matched and became NaN.)
# Values that are not keys of the mapping become NaN in 'race_revised'.
race_mapping = {
    'White': 'White',
    'Black or African American': 'Black or African American',
    'Asian': 'Asian',
    'More than one population': 'More than one population',
    'None of these': 'Other race',
    'None Indicated': 'No answer race',
    'PMI: Skip': 'No answer race',
    'I prefer not to answer': 'No answer race',
    'Middle Eastern or North African': 'Other race',
    'Native Hawaiian or Other Pacific Islander': 'Other race',
}

cohort_demo['race_revised'] = cohort_demo['race'].str.strip().map(race_mapping)

# ethnicity: collapse the raw ethnicity answers into a smaller set of
# categories. 'No matching concept' maps to the same 'Other ethnicity' label
# as the "none of these" answer; the previous misspelling
# ('Other ethinicity') produced a second, separate category.
# Values that are not keys of the mapping become NaN in 'ethnicity_revised'.
ethnicity_mapping = {
    'Not Hispanic or Latino': 'Not Hispanic or Latino',
    'Hispanic or Latino': 'Hispanic or Latino',
    'What Race Ethnicity: Race Ethnicity None Of These': 'Other ethnicity',
    'No matching concept': 'Other ethnicity',
    'PMI: Prefer Not To Answer': 'No answer ethnicity',
    'PMI: Skip': 'No answer ethnicity',
}

cohort_demo['ethnicity_revised'] = cohort_demo['ethnicity'].map(ethnicity_mapping)

In [None]:
# --- Create binary matrices for categorical variables ---

def _indicator_matrix(frame, category_column):
    """Pivot `frame` into a person_id x category table.

    Each row of `frame` contributes a constant 1 for its
    (person_id, category) cell; cells with no rows are filled with 0.
    """
    flagged = frame.assign(value=1)
    return flagged.pivot_table(
        values='value',
        index='person_id',
        columns=category_column,
        fill_value=0,
    )


# One column per value of 'sex_revised' (Male / Female / Other sex)
sex_matrix = _indicator_matrix(cohort_demo, 'sex_revised')

# One column per value of 'race_revised' (e.g., White / Asian / Other race)
race_matrix = _indicator_matrix(cohort_demo, 'race_revised')

# One column per value of 'ethnicity_revised'
ethnicity_matrix = _indicator_matrix(cohort_demo, 'ethnicity_revised')



In [None]:
# --- Create final logistic regression matrix ---
def create_matrix(df):
    """Build a person-by-Phecode indicator matrix.

    Args:
        df: DataFrame with at least 'person_id' and 'Phecode' columns.

    Returns:
        DataFrame indexed by 'person_id' with one column per distinct
        'Phecode'; a cell is 1 where that (person_id, Phecode) pair occurs
        in `df` and 0 otherwise.
    """
    # Collapse repeated (person_id, Phecode) pairs to a single row each.
    unique_pairs = df.drop_duplicates(subset=['person_id', 'Phecode'])

    # Tag every remaining pair with a constant 1, then spread Phecodes
    # across columns; pairs that never occur are filled with 0.
    flagged = unique_pairs.assign(value=1)
    return flagged.pivot_table(
        values='value',
        index='person_id',
        columns='Phecode',
        fill_value=0,
    )

# Apply z-score normalization to age.
# nan_policy='omit' computes the mean/std from the non-missing ages, so a
# single missing age no longer turns the entire normalized column into NaN
# (the default policy propagates NaN to every element).
cohort_demo['age_normalized'] = stats.zscore(
    cohort_demo['age_last_EHR'], nan_policy='omit'
)

# Distinct (person_id, HS, age_normalized) rows.
df_features = cohort_demo[['person_id', 'HS', 'age_normalized']].drop_duplicates()

# Attach the race, sex and ethnicity indicator columns. Left joins keep every
# row of df_features even when a matrix has no row for that person_id.
df_demo_matrix = (
    df_features
    .merge(race_matrix, on='person_id', how='left')
    .merge(sex_matrix, on='person_id', how='left')
    # Was `ethinicity_matrix`, an undefined name (NameError).
    .merge(ethnicity_matrix, on='person_id', how='left')
)

# Phecode indicators joined with the demographic features, written to CSV.
df_phecode_matrix = create_matrix(cohort)
df_final_matrix = df_phecode_matrix.merge(df_demo_matrix, on='person_id', how='left')
df_final_matrix.to_csv('df_final_matrix.csv', index=False)