Cleaning data: remove patients with a short EHR observation period (< 1 year), missing sex at birth, or age outside 18–90 at their last EHR record

In [None]:
import pandas as pd
import numpy as np

# --- Load data ---
# The three code columns are read as text so pandas does not infer a numeric dtype.
code_dtypes = dict.fromkeys(['source_concept_code', 'ICDcode', 'Phecode'], str)
df_cohort = pd.read_csv('df_cohort.csv', dtype=code_dtypes)

demographic_data = pd.read_csv('demographic_dataset.csv')

# --- Compute first and last EHR record per person ---
# format='mixed' lets each timestamp string be parsed with its own inferred format.
df_cohort['condition_start_datetime'] = pd.to_datetime(
    df_cohort['condition_start_datetime'], format='mixed'
)

# One row per person_id holding the earliest and latest condition start.
condition_dates_by_person = df_cohort.groupby('person_id')['condition_start_datetime']
ehr_dates = condition_dates_by_person.agg(first_EHR='min', last_EHR='max')

# Broadcast the per-person dates back onto every condition row.
df_cohort = df_cohort.merge(ehr_dates, on='person_id', how='left')

# Observation span in years; a year is approximated as 365 days.
observation_span = df_cohort['last_EHR'] - df_cohort['first_EHR']
df_cohort['length_first_to_last'] = observation_span.dt.days / 365

In [None]:
# --- Filter out patients with less than 1 year of data ---

# Any row with a span below one year flags its person_id for removal.
has_short_history = df_cohort['length_first_to_last'] < 1
person_ids_to_remove = df_cohort.loc[has_short_history, 'person_id'].unique()

keep_row = ~df_cohort['person_id'].isin(person_ids_to_remove)
df_cohort_filter = df_cohort[keep_row]

# Count distinct patients (not rows) in each HS group.
is_case = df_cohort_filter['HS'] == 1
is_control = df_cohort_filter['HS'] == 0
num_case = df_cohort_filter.loc[is_case, 'person_id'].nunique()
num_control = df_cohort_filter.loc[is_control, 'person_id'].nunique()

print('Number of case patients (HS=1):', num_case)
print('Number of control patients (HS=0):', num_control)


In [None]:
# --- Add demographic data ---
# Start from the cohort produced by the observation-period filter
# (`df_cohort_filter`); the previously referenced `phe_condition_filter_df`
# is never defined in this notebook and fails on a fresh kernel.
cohort_df = df_cohort_filter.merge(demographic_data, on='person_id', how='left')

# date_of_birth comes from read_csv without date parsing, so convert it before
# subtracting it from the datetime column first_EHR.
cohort_df['date_of_birth'] = pd.to_datetime(cohort_df['date_of_birth'])

# Age in years at the first EHR record; a year is approximated as 365 days.
cohort_df['age_first_visit'] = (cohort_df['first_EHR'] - cohort_df['date_of_birth']).dt.days/365

# --- Remove patient sex missing ---
# A sex value counts as missing when it is:
#   * NaN  - read_csv parses the text "None" as NaN by default in pandas >= 2.0,
#            and a left merge leaves NaN for patients without a demographic row;
#   * the literal strings 'PMI: Skip' or 'None'.
sex_at_birth = cohort_df['sex_at_birth']
sex_is_missing = sex_at_birth.isna() | sex_at_birth.isin(['PMI: Skip', 'None'])

sub_sex = cohort_df[sex_is_missing]
remove_sex = set(sub_sex['person_id'])

# .copy() so that later column assignments on cohort_df do not write into a slice.
cohort_df = cohort_df[~cohort_df['person_id'].isin(remove_sex)].copy()

num_case = cohort_df[cohort_df['HS'] == 1]['person_id'].nunique()
num_control = cohort_df[cohort_df['HS'] == 0]['person_id'].nunique()

print('Number of patients in case and control after removing missing sex info:')
print('Number of case patients (HS=1):', num_case)
print('Number of control patients (HS=0):', num_control)

In [None]:
# --- Remove patients with age < 18 or > 90 at last EHR date ---
# Make sure both date columns are datetimes before subtracting them.
cohort_df['last_EHR'] = pd.to_datetime(cohort_df['last_EHR'], format='mixed')
cohort_df['date_of_birth'] = pd.to_datetime(cohort_df['date_of_birth'])

# Age in years at the last EHR record; a year is approximated as 365 days.
age_span = cohort_df['last_EHR'] - cohort_df['date_of_birth']
cohort_df['age_last_EHR'] = age_span.dt.days / 365

# Get patients outside 18–90 range (distinct person_ids, in order of first appearance).
# Rows whose age is NaN satisfy neither comparison and are therefore not listed.
is_over_90 = cohort_df['age_last_EHR'] > 90
is_under_18 = cohort_df['age_last_EHR'] < 18

age_over_90 = cohort_df.loc[is_over_90, 'person_id'].drop_duplicates().to_list()
age_less_18 = cohort_df.loc[is_under_18, 'person_id'].drop_duplicates().to_list()

# The two conditions are mutually exclusive, so concatenating adds no duplicates.
age_less_18_over_90 = age_less_18 + age_over_90

print('Number of patients less than 18:', len(age_less_18))
print('Number of patients over 90:', len(age_over_90))
print('Total removed due to age out of range:', len(age_less_18_over_90))

In [None]:
# --- Filter final cohort ---
# Keep only rows whose person_id was not flagged as out of the 18–90 age range.
within_age_range = ~cohort_df['person_id'].isin(age_less_18_over_90)
cohort_18_to_90_df = cohort_df[within_age_range]

# Distinct patients remaining in each HS group.
patients_per_group = cohort_18_to_90_df.groupby('HS')['person_id'].nunique()
print('\nRemaining patients by group (HS):')
print(patients_per_group)

# Persist the cleaned cohort without the row index.
cohort_18_to_90_df.to_csv('cohort_18_to_90_df.csv', index=False)