*This notebook aims to create a matrix for logistic regression. It follows the 'all_demographic_no_timewindow.ipynb' notebook and maps condition records from SNOMED codes to ICD codes, and then from ICD codes to Phecodes.*

In [None]:
import pandas as pd
import os
import numpy as np
import gc
import pickle

# --- Load data ---
# Read the code columns as strings, consistent with how the SNOMED/ICD/Phecode
# mapping tables are read in this notebook. Without an explicit dtype pandas
# infers the type, so all-numeric codes can be parsed as numbers (losing
# leading/trailing zeros, e.g. '041' or '250.00') and then fail to match the
# string keys used in the merges on 'source_concept_code' / 'ICDcode'.
condition_data = pd.read_csv(
    'condition_dataset.csv',
    dtype={'standard_concept_code': str, 'source_concept_code': str},
)

In [None]:
# --- SNOMED to ICD mapping ---
# Reference table that maps SNOMED concept codes to ICD-10-CM codes.
# Codes are read as strings. The column renamed to 'ICDcode' below is 'ICD10CM',
# so its dtype is pinned explicitly; the original 'ICD10' key is kept in case the
# file also carries a column with that name.
snomed2icd = pd.read_csv(
    'snomed_icd_mapping.csv',
    dtype={'SNOMED': str, 'ICD10': str, 'ICD10CM': str},
)
# Rename so the join key and ICD column match the names used for the condition data
snomed2icd = snomed2icd.rename(columns={'SNOMED': 'source_concept_code',
                                        'ICD10CM': 'ICDcode'})

# Select the columns of interest
df_condition_all = condition_data[['person_id', 'standard_concept_code', 'source_concept_code',
                                   'source_vocabulary', 'condition_start_datetime']]

# Subset records that already use ICD codes.
# .copy() makes this an independent frame so the column assignments further down
# do not write into a slice of df_condition_all (SettingWithCopyWarning).
df_condition_icd = df_condition_all.loc[
    df_condition_all['source_vocabulary'].isin(['ICD10CM', 'ICD9CM'])
].copy()

# Subset records that use SNOMED codes.
# The mask is built from df_condition_all itself; the previous code referenced an
# undefined name here and failed with NameError on a fresh kernel.
df_condition_snomed = df_condition_all.loc[df_condition_all['source_vocabulary'] == 'SNOMED']
df_condition_snomed = df_condition_snomed.reset_index(drop=True)

# Map SNOMED codes to ICD using the reference table
# (left join: every SNOMED record is kept, unmapped ones get a missing ICDcode)
df_condition_snomed_mapped = df_condition_snomed.merge(snomed2icd, how='left', on='source_concept_code')

# Remove records without ICD mapping
df_condition_snomed_mapped = df_condition_snomed_mapped.dropna(subset=['ICDcode'])

# Standardize ICD column format for both sources: records that were already
# ICD-coded use their source code and vocabulary directly.
# NOTE(review): SNOMED-mapped records are not given an 'ICDsource' value here, so
# that column is missing for them after the concat unless the mapping file
# provides one -- confirm whether they should be labelled 'ICD10CM'.
df_condition_icd['ICDcode'] = df_condition_icd['source_concept_code']
df_condition_icd['ICDsource'] = df_condition_icd['source_vocabulary']

# Concatenate both sources into a unified ICD-coded dataset
df_condition_with_icd = pd.concat([df_condition_icd, df_condition_snomed_mapped], ignore_index=True)
df_condition_with_icd.head()

In [None]:
# --- ICD to Phecode mapping ---
# Load the ICD-9-CM and ICD-10-CM -> Phecode reference tables with codes kept as
# strings, renaming 'ICD' to 'ICDcode' so it matches the condition records.
df_icd9_to_phe = (
    pd.read_csv('ICD_9CM_phecode.csv', dtype={'Phecode': str, 'ICD': str})
    .rename(columns={'ICD': 'ICDcode'})
)
df_icd10_to_phe = (
    pd.read_csv('ICD_10CM_phecode.csv', dtype={'Phecode': str, 'ICD': str})
    .rename(columns={'ICD': 'ICDcode'})
)

# Split the condition records by the vocabulary of their original source code
source_vocab = df_condition_with_icd['source_vocabulary']

# ICD-9-CM coded conditions
df_condition_icd9 = df_condition_with_icd.loc[source_vocab.eq('ICD9CM')]

# ICD-10-CM coded conditions; SNOMED records are included here as well
df_condition_icd10 = df_condition_with_icd.loc[source_vocab.isin(['ICD10CM', 'SNOMED'])]

# Left-join each subset to its Phecode table (rows without a match keep a missing Phecode)
df_phe_icd9 = pd.merge(df_condition_icd9, df_icd9_to_phe, how='left', on='ICDcode')
df_phe_icd10 = pd.merge(df_condition_icd10, df_icd10_to_phe, how='left', on='ICDcode')

# Stack all Phecode-mapped condition records
df_phe_condition = pd.concat([df_phe_icd9, df_phe_icd10])

# Report the distinct ICD codes that have no Phecode
is_unmapped = df_phe_condition['Phecode'].isna()
df_unmapped_icd = df_phe_condition.loc[is_unmapped, ['ICDcode', 'ICDsource']].drop_duplicates()
print('ICD codes not mapped to any Phecode:', df_unmapped_icd['ICDcode'].unique())

# Keep only mapped records and drop exact duplicate rows
df_phe_condition_mapped = (
    df_phe_condition
    .dropna(subset=['Phecode'])
    .drop_duplicates()
)

print('Number of unique Phecodes:', df_phe_condition_mapped['Phecode'].nunique())

In [None]:
# Save the Phecode-mapped condition records to CSV, leaving out the DataFrame index
df_phe_condition_mapped.to_csv(
    path_or_buf='df_phe_condition_mapped.csv',
    index=False,
)