### Generates additional condition-history covariates using concept ancestors for the following condition categories:

Arthritis

Autoimmune

Cancer

Chronic Kidney

Diabetes Mellitus

HIV

Hypertension

Menopause

Thyroid Disorder


In [None]:
# Table names for this notebook: `in_name` is the cohort table that is read,
# `out_name` is the table that is written at the end.
# TODO: consider merging covariate generation 1 and generation 2.
in_name, out_name = (
    "manuscript_covariates_2_final",
    "manuscript_covariates_3_final",
)

In [None]:
import sys
import time
import importlib
import sparse
import datetime
import pandas as pd
import numpy as np
import scipy.sparse
import matplotlib.pyplot as plt

#
sys.path.append("..")
import Utils.dbutils as dbutils
import Utils.data_utils as data_utils
import Generators.CohortGenerator as CohortGenerator
import Generators.FeatureGenerator as FeatureGenerator
import config
# Reload the project-local modules imported above so that a re-run of this
# cell goes through importlib.reload for each of them.
local_imports = (dbutils, data_utils, CohortGenerator, FeatureGenerator, config)
for module in local_imports:
    importlib.reload(module)

## Condition Name Creation

In [None]:
## Database connection parameters
# Peer authentication is used, so no credentials are needed here; in theory
# config.PG_USERNAME / config.PG_PASSWORD would be passed into config_path.
database_name = config.DB_NAME
print(database_name)
config_path = f"postgresql://{database_name}"
# connect_args to pass to sqlalchemy create_engine function
connect_args = dict(host='/var/run/postgresql/')

# Schemas
schema_name = 'eol_test_ncjones'  # all created tables will be created using this schema
cdm_schema_name = config.OMOP_CDM_SCHEMA  # the name of the schema housing your OMOP CDM tables
print(f"cdm schema: {cdm_schema_name}")

# Caching
reset_schema = False  # if true, rebuild all data from scratch

# Set up the database handle. Resetting / creating the schema is currently
# commented out:
# if reset_schema:
#     db.execute(
#         'drop schema if exists {} cascade'.format(schema_name)
#     )
# db.execute(
#     'create schema if not exists {}'.format(schema_name)
# )
db = dbutils.Database(config_path, schema_name, connect_args, cdm_schema_name)

In [None]:
%%time
# Load name, id and domain of every concept in the OMOP 'Condition' domain.
# The result (`diagnosis_names`) is what the name-based condition search
# iterates over.
condition_concept_query = """
    select
        c.concept_name as concept_name,
        c.concept_id as concept_id,
        c.domain_id as domain_id
    from
        {omop_schema}.concept c
    where 
        c.domain_id = 'Condition' 
"""
sql = condition_concept_query.format(omop_schema=config.OMOP_CDM_SCHEMA)
diagnosis_names = db.query(sql)


In [None]:
%%time
# Load every 'Condition' concept joined to the concept_ancestor table on
# concept_id = descendant_concept_id, i.e. one row per (concept, ancestor)
# pair. The result (`diagnosis_names_ca`) is used to collect the descendants
# of the hand-picked ancestor concepts.
condition_ancestor_query = """
    select
        c.concept_name as concept_name,
        c.concept_id as concept_id,
        c.domain_id as domain_id,
        ca.ancestor_concept_id as ancestor_concept_id,
        ca.descendant_concept_id as descendant_concept_id
    from
        {omop_schema}.concept c
    join {omop_schema}.concept_ancestor ca on c.concept_id = ca.descendant_concept_id
    where
       c.domain_id = 'Condition'
"""
sql = condition_ancestor_query.format(omop_schema=config.OMOP_CDM_SCHEMA)
diagnosis_names_ca = db.query(sql)


In [None]:
# Per condition category: (name-search keys, ancestor concept ids).
#   [0] keys produced by the name-based search that belong to the category
#   [1] concept ids whose descendants (via concept_ancestor) make up the
#       category's "<category>_new" condition set
category_to_cond_ca = {
    "Arthritis": (
        ["arthritis"],
        [4291025, 79109, 4117097, 4262590, 4167984, 80482, 80809],
    ),
    "Autoimmune": (
        [
            "sjögren",
            "rheumatoid_arthritis",
            "reactive_arthritis",
            "lupus_erythematosus",
            "dermatomyositis",
        ],
        [4182582, 4224624, 434621, 4318558, 4215792, 4051056, 4034815],
    ),
    "Cancer": (
        ["cancer", "carcinoma", "malignan", "leukemia", "lymphoma", "sarcoma"],
        [
            443392, 764422, 764225, 45757107, 45773534, 432851, 46270083,
            40493428, 4114222, 197506, 4153882, 40491001, 4312698, 36716620,
            4116238, 40492037, 4154630, 4147164,
        ],
    ),
    "Chronic Kidney": (
        ["chronic_kidney", "chronic_renal_failure"],
        [
            46271022, 45763854, 44782429, 443597, 443611, 43531578, 443601,
            443612, 443614, 198185, 443961, 4322556, 4128067, 193782,
        ],
    ),
    "Diabetes Mellitus": (
        ["diabetes_mellitus"],
        [201820, 4019513, 4024659, 36713275, 4212631],
    ),
    "Thyroid Disorder": (
        ["hashimoto_thyroiditis", "graves"],
        [135215, 4130018, 4100629, 4232076],
    ),
    "HIV": (
        ["hiv"],
        [4013106, 4235563, 36714516, 4171125],
    ),
    "Hypertension": (
        ["hypertension"],
        [316866, 37208172, 42709887, 4227517, 45768449, 43020424],
    ),
    "Menopause": (
        ["menopause"],
        [4128329],
    ),
}




## Getting names of the concept ancestors (optional)

In [None]:
import pandas as pd

# Function to get concept names from a specified table
def get_concept_names(concept_ids):
    """Look up the concept name for each id in ``concept_ids``.

    Names are read from the module-level ``diagnosis_names_ca`` frame by
    matching on ``descendant_concept_id``. When an id occurs on several rows,
    the ``concept_name`` of the first such row is used.

    Args:
        concept_ids: Iterable of concept ids to resolve.

    Returns:
        A list of concept names in the same order as ``concept_ids``. An id
        that does not occur in ``diagnosis_names_ca`` yields ``None`` instead
        of raising.
    """
    concept_ids = list(concept_ids)
    # Filter the table once for all requested ids instead of rescanning the
    # whole frame for every single id.
    matches = diagnosis_names_ca.loc[
        diagnosis_names_ca['descendant_concept_id'].isin(concept_ids),
        ['descendant_concept_id', 'concept_name'],
    ].drop_duplicates(subset='descendant_concept_id', keep='first')
    name_by_id = dict(zip(matches['descendant_concept_id'], matches['concept_name']))
    return [name_by_id.get(idx) for idx in concept_ids]

# Export the ancestor concepts of every category to one workbook, with one
# worksheet per category (ancestor concept id next to its concept name).
with pd.ExcelWriter('category_concept_ancestors.xlsx') as writer:
    for category, (_, condition_concept_ids) in category_to_cond_ca.items():
        condition_concept_names = get_concept_names(condition_concept_ids)
        sheet = pd.DataFrame(
            {
                'concept_ancestor_condition_id': condition_concept_ids,
                'concept_ancestor_name': condition_concept_names,
            }
        )
        sheet.to_excel(writer, sheet_name=category, index=False)

print("Excel file created with sheets for each condition category.")

### Deriving concept ancestors

In [None]:
%%time

# Name-based condition search.
#
# Every concept name in `diagnosis_names` is lower-cased and tested against
# the patterns in `condition_name_hits`, in order; the first pattern that
# matches claims the concept. A plain string matches as a substring. A list
# matches when all of its words occur in the name, and is recorded under the
# words joined with "_".
all_condition_ids = []
condition_name_map = dict()
condition_id_name_map = dict()      # concept id -> pattern key
condition_id_fullname_map = dict()  # concept id -> lower-cased concept name
pn_to_id_map = dict()               # pattern key -> set of concept ids
condition_ids = []

# Order matters: matching stops at the first pattern that hits.
condition_name_hits = [
    # Obesity, morbid obesity (disabled): ["morbid", "obesity"]
    # Malignancies / cancer
    "cancer",
    "malignan",
    # HIV
    "hiv",
    # Chronic kidney disease
    ["chronic", "kidney"],
    # Diabetes mellitus (with or without any complication)
    ["diabetes", "mellitus"],
    # Menopause
    "menopause",
    "hypertension",
    "carcinoma",
    "dermatomyositis",
    "graves",
    ["hashimoto", "thyroiditis"],
    # Arthritis
    ["reactive", "arthritis"],
    ["rheumatoid", "arthritis"],
    "arthritis",
    "sjögren",
    ["lupus", "erythematosus"],
    ["chronic", "renal", "failure"],
    "leukemia",
    "lymphoma",
    "sarcoma",
]


def addit(cid, name, pn):
    """Record concept ``cid`` with lower-cased ``name`` under pattern key ``pn``."""
    all_condition_ids.append(cid)
    condition_id_fullname_map[cid] = name
    condition_id_name_map[cid] = pn
    pn_to_id_map.setdefault(pn, set()).add(cid)


for name, cid in zip(diagnosis_names["concept_name"], diagnosis_names["concept_id"]):
    name = name.lower()
    for pn in condition_name_hits:
        if isinstance(pn, list):
            if all(word in name for word in pn):
                addit(cid, name, "_".join(pn))
                break
        elif isinstance(pn, str) and pn in name:
            addit(cid, name, pn)
            break
print(f"Found {len(all_condition_ids)} condition events")

condition_names = sorted(pn_to_id_map)

# ca_to_fullname = dict()
# for category,cond_ca in category_to_cond_ca.items():
#     concept_ancestor_fullnames = []
#     for concept_ancestor_id in cond_ca[1]:
#         concept_ancestor_fullname = (diagnosis_names_ca.loc[(diagnosis_names_ca.ancestor_concept_id == concept_ancestor_id),
#                                                                 'concept_name'].values[0])
#         ca_to_fullname[concept_ancestor_id] = concept_ancestor_fullname
        
        

In [None]:
def join_sets(sets):
    """Return the union of several sets as one new set.

    Args:
        sets: An iterable of sets (any iterables of hashable items are
            accepted).

    Returns:
        A new set containing every element of the given sets; an empty set
        when ``sets`` is empty. The inputs are not modified.
    """
    return set().union(*sets)

def create_new_condition_categories(p_to_id,id_to_p,cat_name):
    """Add a ``"<cat_name>_new"`` condition set derived from concept ancestors.

    The new set holds every ``concept_id`` in ``diagnosis_names_ca`` whose
    ``ancestor_concept_id`` is one of the category's ancestor concepts
    (``category_to_cond_ca[cat_name][1]``), minus the ids already collected
    for the category's name-search keys (``category_to_cond_ca[cat_name][0]``).

    Args:
        p_to_id: Dict mapping a condition key to a set of concept ids.
            Updated in place with the ``"<cat_name>_new"`` entry.
        id_to_p: Dict mapping a concept id to its condition key. Updated in
            place: every id of the new set is (re)assigned to
            ``"<cat_name>_new"``.
        cat_name: A key of ``category_to_cond_ca``.

    Returns:
        The same ``(p_to_id, id_to_p)`` objects that were passed in.
    """
    name_keys, concept_ancestor_ids = category_to_cond_ca[cat_name]

    # Ids already found by the name search for this category. A key that
    # matched no concept has no entry in p_to_id; treat it as an empty set
    # rather than failing with KeyError.
    condition_ids = join_sets([p_to_id.get(condition, set()) for condition in name_keys])

    # Every concept that has one of the category's ancestors as its ancestor.
    is_descendant = diagnosis_names_ca['ancestor_concept_id'].isin(concept_ancestor_ids)
    new_condition_ids = set(diagnosis_names_ca.loc[is_descendant, 'concept_id'])
    new_condition_ids = new_condition_ids.difference(condition_ids)

    # Update both dictionaries.
    new_cat_name = f"{cat_name}_new"
    p_to_id[new_cat_name] = new_condition_ids
    for cid in new_condition_ids:
        id_to_p[cid] = new_cat_name

    return p_to_id,id_to_p

In [None]:
# Add a "<category>_new" entry for every category. Both maps are updated by
# the call and rebound to the objects it returns.
for category in category_to_cond_ca:
    pn_to_id_map, condition_id_name_map = create_new_condition_categories(
        pn_to_id_map, condition_id_name_map, category
    )

In [None]:
# Per category, report how many ids the name search found versus how many
# the concept-ancestor search added on top.
for category, (old_codes, _) in category_to_cond_ca.items():
    new_codes = f"{category}_new"
    print(category)
    total_condition_ids = [cid for code in old_codes for cid in pn_to_id_map[code]]
    print(f"Original number of queried condition codes: {len(total_condition_ids)}. Concept ancestor search contributed {len(pn_to_id_map[new_codes])} codes")


In [None]:
# Restrict condition_names to the ancestor-derived keys (those ending in "_new").
condition_names = sorted(name for name in pn_to_id_map if name.endswith('_new'))

### Get Cohort

In [None]:
%%time
# Load the input cohort table (`in_name`) in full.
cohort_query = """
    select
        *
    from
        {omop_schema}.{in_name} c
"""
sql = cohort_query.format(omop_schema=config.OMOP_CDM_SCHEMA, in_name=in_name)
final_cohort_raw = db.query(sql)
final_cohort_raw

## Full condition item table

In [None]:
# Keep only the concept ids whose key is one of condition_names.
# TODO(refactor): build the map with only the needed codes in the first place.
wanted_names = set(condition_names)
condition_id_name_map = {
    condition_id: name
    for condition_id, name in condition_id_name_map.items()
    if name in wanted_names
}

In [None]:
%%time
# Pull every condition_occurrence row for the people in the input cohort
# table, then keep only the rows whose concept id is in condition_id_name_map.
person_ids = list(final_cohort_raw.person_id)
print(len(person_ids))
all_condition_ids = list(condition_id_name_map.keys())

# The restriction to cohort members is done by the inner join against the
# input table, so no id list has to be interpolated into the statement.
sql = """
    select
        c.*
    from
        {omop_schema}.condition_occurrence c
    inner join
        {omop_schema}.{in_name} u
    on
        c.person_id = u.person_id
""".format(
    omop_schema=config.OMOP_CDM_SCHEMA,
    in_name=in_name,
)
condition_candidates = db.query(sql)
# drop=False keeps person_id / condition_concept_id available as columns too;
# later cells filter on those columns.
condition_candidates.set_index(["person_id","condition_concept_id"], inplace=True, drop=False)
condition_candidates.sort_index(inplace=True)
filtered_condition_candidates = condition_candidates[
    condition_candidates.condition_concept_id.isin(all_condition_ids)
].copy()

## Creating new columns for time periods

In [None]:
# Time periods as (start_day, end_day, column suffix), in days.
time_periods = [
    (0, 180, "0_6_months"),
    (180, 365, "6_months_1_yr"),
    (365, 365 * 2, "1_2_yr")
]

# For every condition: one flag column per time period plus one
# "<condition>_full_condition_name" column. Each entry of new_columns is
# (column name, condition name, number of days).
new_columns = []
for name in condition_names:
    new_columns.extend(
        (name + time_name, name, time_period)
        for _, time_period, time_name in time_periods
    )
    new_columns.append((name + "_full_condition_name", name, 365*30))

# Flag columns start out False, full-name columns start out None.
initial_values = {
    col_name: (None if "full_condition_name" in col_name else False)
    for col_name, _, _ in new_columns
}
final_cohort = final_cohort_raw.copy().assign(**initial_values)
final_cohort = final_cohort.assign(confounder_found=False)
new_column_names = [entry[0] for entry in new_columns] + ["confounder_found"]
final_cohort

## Confounder_time_period generation

In [None]:
# One independent, initially empty list per condition name; filled with the
# full names of the conditions found while scanning the cohort.
condition_name_dictionary = {name: [] for name in condition_names}

In [None]:
# Give every id in all_condition_ids an entry in the full-name map. Note
# that the value is set to None for all of them, including ids that already
# had a name recorded.
condition_id_fullname_map.update(dict.fromkeys(all_condition_ids))

In [None]:
# For every cohort row, flag which tracked conditions were recorded for that
# person inside each time period before the row's condition_start_date.
c = 0
start = datetime.datetime.now()
from datetime import timedelta
l = len(final_cohort)
modified_count = 0
for index, row in final_cohort.iterrows():
    p = row.person_id
    d = row.condition_start_date
    conditions = filtered_condition_candidates[filtered_condition_candidates.person_id == p]
    c += 1
    if c % 1000 == 0:
        print(f"Time elapsed: {(datetime.datetime.now() - start)}")
        print(f"Iter: {c} / {l}")
    modified = False

    for start_day, days, postfix in time_periods:
        # Both edges are measured from the unshifted reference date d, so
        # each period is independent of the ones processed before it:
        #   0_6_months:     d - 180 < condition_start_date <= d
        #   6_months_1_yr:  d - 365 < condition_start_date <= d - 180
        #   1_2_yr:         d - 730 < condition_start_date <= d - 365
        window_start = d - timedelta(days=days)
        window_end = d - timedelta(days=start_day)

        mask = (conditions.condition_start_date > window_start) & (conditions.condition_start_date <= window_end)
        day_filtered_conditions = conditions.loc[mask]
        for found_condition_id in day_filtered_conditions.condition_concept_id:
            modified = True
            condition_name = condition_id_name_map[found_condition_id]
            condition_fullname = condition_id_fullname_map[found_condition_id]
            condition_name_dictionary[condition_name].append(condition_fullname)
            final_cohort.loc[index, condition_name + postfix] = True
            final_cohort.loc[index, condition_name + "_full_condition_name"] = condition_fullname
            final_cohort.loc[index, "confounder_found"] = True
    if modified:
        modified_count += 1
pd.set_option('display.max_columns', None)
print(f"Modified count: {modified_count}")
final_cohort.head(30)

In [None]:
# De-duplicate the full names collected for each condition name.
condition_final_name_dictionary = {
    name: set(fullnames) for name, fullnames in condition_name_dictionary.items()
}

In [None]:
# Collect every cohort column whose name starts with one of the keys of
# pn_to_id_map (grouped by key, in key order).
feat_cols = pn_to_id_map.keys()
features = [
    cohort_col
    for col in feat_cols
    for cohort_col in final_cohort.columns
    if cohort_col.startswith(col)
]
    

In [None]:
# Multiply by 1 so that boolean flag columns become 0/1 integers.
final_cohort[features] = final_cohort[features] * 1

In [None]:
# Number of cohort rows for which at least one tracked condition was found.
# Printed explicitly: a bare expression in the middle of a cell is not shown.
print("confounder_found", np.sum(final_cohort.confounder_found))

cols = final_cohort.columns

# Columns are grouped per time period by the suffix contained in their name.
time_6 = [x for x in cols if "0_6_months" in x]
time_1yr = [x for x in cols if "6_months_1_yr" in x]
time_1_2yr = [x for x in cols if "1_2_yr" in x]

years = {"first 6 months": time_6, "6 months to 1 year": time_1yr, "1 to 2 years": time_1_2yr}

# For each period, count the rows that have any of its columns set.
for name, period in years.items():
    cumulative = np.sum((final_cohort[period]).any(axis='columns'))
    print(name, cumulative)

### Rename the columns

In [None]:
from collections import defaultdict
# Map every condition name to its output column prefix: lower-cased,
# whitespace replaced by "_", and the trailing "_new" replaced by
# "_concept_ancestor", e.g. "Chronic Kidney_new" -> "chronic_kidney_concept_ancestor".
code_to_newcode = {}
for code in condition_names:
    snake = '_'.join(code.lower().split())
    # Only the "_new" suffix is rewritten, so a name that merely contains
    # the letters "new" elsewhere is left intact.
    if snake.endswith('_new'):
        snake = snake[:-len('new')] + 'concept_ancestor'
    code_to_newcode[code] = snake

#display it
code_to_newcode

In [None]:
# Rename the cohort columns: in a column that starts with one of the
# condition names, that name is replaced by its new code.
new_cohort_columns = []
for column in final_cohort.columns:
    renamed = column
    for code in condition_names:
        if renamed.startswith(code):
            renamed = renamed.replace(code, code_to_newcode[code])
    new_cohort_columns.append(renamed)

final_cohort.columns = new_cohort_columns

In [None]:
# Drop the 'level_0' column if it is present. errors="ignore" makes this a
# no-op when the column is absent, so the cell can be re-run safely.
# NOTE(review): 'level_0' presumably comes from an earlier index round trip
# of the input table — confirm.
final_cohort = final_cohort.drop(columns=['level_0'], errors="ignore")

In [None]:
# Preview the first rows of final_cohort.
final_cohort.head()

In [None]:
# Show the (rows, columns) dimensions of final_cohort.
final_cohort.shape

In [None]:
# Write the finished cohort to the database (replacing any existing table of
# the same name) and grant select on it.
# NOTE(review): the target schema / grantee "cdm_6871_21" is hard-coded here,
# while the input table is read from config.OMOP_CDM_SCHEMA — confirm the two
# are meant to be the same.
target = "cdm_6871_21"
final_cohort.to_sql(out_name, con=db.engine, if_exists="replace", schema=target)
cmd = f'grant select on table {target}.{out_name} to {target}'
db.execute(cmd)

## Save names (optional)

In [None]:
# %%time
# import pandas as pd
# path = r"condition_log.xlsx"
# writer = pd.ExcelWriter(path)
# fullname_to_conditionid = {v:k for k, v in condition_id_fullname_map.items()}
# #begin

# all_ids = []
# all_names = []
# for categorical_name in condition_names:
#     if '_' in categorical_name:
#         categorical_names = categorical_name.split("_")
#         cat_to_fullname = [name for name in list(condition_id_fullname_map.values()) if all(list(map(lambda x: x in name, categorical_names)))]
#         cat_to_ids = [fullname_to_conditionid[x] for x in cat_to_fullname]
#         all_ids += cat_to_ids
#         all_names += cat_to_fullname
#     else:
       
#         cat_to_fullname = [name for name in list(condition_id_fullname_map.values()) if categorical_name in name]
#         cat_to_ids = [fullname_to_conditionid[x] for x in cat_to_fullname]
#         all_ids += cat_to_ids
#         all_names += cat_to_fullname

#     data = {"condition_names":None,"condition_ids":None}

#     data["condition_names"] = cat_to_fullname
#     data["condition_ids"] = cat_to_ids

    
#     df = pd.DataFrame(data)

#     df.to_excel(writer, sheet_name = categorical_name,index=False)
# #end

# #outer loop
# writer.save()
# writer.close()

# all_df = pd.DataFrame({'condition_names':all_names,'condition_ids':all_ids})
# all_df.to_csv("condition_names.csv")

In [None]:
# temp_names = []
# for categorical_name in condition_names:
#     if '_' in categorical_name:
#         categorical_names = categorical_name.split("_")
#         cat_to_ids = [name for name in list(condition_id_fullname_map.keys()) if all(list(map(lambda x: x in name, categorical_names)))]
#         cat_to_fullname = [name for name in list(condition_id_fullname_map.values()) if all(list(map(lambda x: x in name, categorical_names)))]
#     else:
#         cat_to_ids = [name for name in list(condition_id_fullname_map.keys()) if categorical_name in name]
#         cat_to_fullname = [name for name in list(condition_id_fullname_map.values()) if categorical_name in name]
#     print(categorical_name)
    
    

## Visualize names (optional)

In [None]:
# temp_names = []
# for categorical_name in condition_names:
#     if '_' in categorical_name:
#         categorical_names = categorical_name.split("_")
#         cat_to_ids = [name for name in list(condition_id_fullname_map.keys()) if all(list(map(lambda x: x in name, categorical_names)))]
#         cat_to_fullname = [name for name in list(condition_id_fullname_map.values()) if all(list(map(lambda x: x in name, categorical_names)))]
#     else:
#         cat_to_ids = [name for name in list(condition_id_fullname_map.keys()) if categorical_name in name]
#         cat_to_fullname = [name for name in list(condition_id_fullname_map.values()) if categorical_name in name]
#     print(categorical_name)
#     print(f"length: {len(cat_to_fullname)}")
#     print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>.")
#     print()
#     for fullname in cat_to_fullname:
#         print(fullname)
#     print(".<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
#     print("\n\n")

In [None]:
# temp_names = []
# for categorical_name in condition_names:
#     if '_' in categorical_name:
#         categorical_names = categorical_name.split("_")
#         cat_to_fullname = [name for name in list(condition_id_fullname_map.values()) if all(list(map(lambda x: x in name, categorical_names)))]
#     else:
#         cat_to_fullname = [name for name in list(condition_id_fullname_map.values()) if categorical_name in name]
#     print(categorical_name)
#     print(f"length: {len(cat_to_fullname)}")
#     print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>.")
#     print()
#     for fullname in cat_to_fullname:
#         print(fullname)
#     print(".<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
#     print("\n\n")