### Generates 2-year condition-history features

Obesity, Morbid obesity

Malignancies/cancer

Prescription of corticosteroids

Transplant  (kidney, liver, pancreas, heart, lung, bone marrow)

Autoimmune diseases

HIV

Hypertension

Chronic Kidney disease stage 4 or 5

Hemodialysis 

Diabetes mellitus (with or without any complication)

Menopause

Incontinence 

Cancer of urinary tract or gynecologic malignancy (remove these patients)

Neurogenic bladder (remove these patients)

Spina Bifida (remove these patients)

Catheter (remove these patients)


In [None]:
# Name of the cohort table this notebook reads (looked up in config.OMOP_CDM_SCHEMA).
in_name = "manuscript_covariates_1_final"
# Name of the table the enriched cohort is written to at the end of the notebook.
out_name = "manuscript_covariates_2_final"

In [None]:
import sys
import time
import importlib
import sparse
import datetime
import pandas as pd
import numpy as np
import scipy.sparse
import matplotlib.pyplot as plt

sys.path.append("..")
import Utils.dbutils as dbutils
import Utils.data_utils as data_utils
import Generators.CohortGenerator as CohortGenerator
import Generators.FeatureGenerator as FeatureGenerator
import config
# Reload the project modules so that the versions currently on disk are the
# ones used by the rest of the notebook.
local_imports = (dbutils, data_utils, CohortGenerator, FeatureGenerator, config)
for module in local_imports:
    importlib.reload(module)

## Condition Name Creation

In [None]:
# --- Database connection ---------------------------------------------------
# Peer authentication is used, so no credentials go into the connection URL.
# With password authentication, config.PG_USERNAME / config.PG_PASSWORD would
# have to be added to config_path.
database_name = config.DB_NAME
print(database_name)
config_path = 'postgresql://{database_name}'.format(database_name=database_name)
# Extra keyword arguments forwarded to sqlalchemy's create_engine.
connect_args = {"host": '/var/run/postgresql/'}

# --- Schemas ---------------------------------------------------------------
# Schema intended for tables created by this project.
schema_name = 'eol_test_ncjones'
# Schema that houses the OMOP CDM tables.
cdm_schema_name = config.OMOP_CDM_SCHEMA
print(f"cdm schema: {cdm_schema_name}")

# --- Caching ---------------------------------------------------------------
# Meant to trigger a rebuild from scratch; it is not read anywhere while the
# schema reset below stays commented out.
reset_schema = False

db = dbutils.Database(
    config_path,
    schema_name,
    connect_args,
    cdm_schema_name,
)
# Schema (re)creation, currently disabled:
# if reset_schema:
#     db.execute(
#         'drop schema if exists {} cascade'.format(schema_name)
#     )
# db.execute(
#     'create schema if not exists {}'.format(schema_name)
# )

In [None]:
%%time
# Pull every concept of the 'Condition' domain from the OMOP vocabulary.
# The concept names are matched against the covariate keywords further down.
sql = """
    select
        c.concept_name as concept_name,
        c.concept_id as concept_id,
        c.domain_id as domain_id
    from
        {omop_schema}.concept c
    where 
        c.domain_id = 'Condition' 
"""
sql = sql.format(omop_schema=config.OMOP_CDM_SCHEMA)
diagnosis_names = db.query(sql)


In [None]:
%%time

# Build the lookup tables that map OMOP condition concepts to the covariate
# keyword groups of this notebook.
#
#   all_condition_ids          matched concept ids, in match order
#   condition_id_name_map      concept id  -> keyword key (e.g. "diabetes_mellitus")
#   condition_id_fullname_map  concept id  -> lower-cased concept name
#   pn_to_id_map               keyword key -> set of matched concept ids
all_condition_ids = []
condition_id_name_map = dict()
condition_id_fullname_map = dict()
pn_to_id_map = dict()

# Each entry is either one substring or a list of substrings that must ALL
# occur in the lower-cased concept name. A concept is assigned to the FIRST
# entry that matches, so the order of this list is significant and has been
# kept exactly as before.
#
# Removed because they could never match: the misspelled ["spina", "pifida"]
# (the correct ["spina", "bifida"] is further down) and a second "leukemia"
# entry that was shadowed by the earlier ["leukemia"].
#
# NOTE(review): first-match-wins also means ["cancer", "urinary"] and
# ["malignan", "gynecologic"] are never selected -- any name containing them
# already matches the earlier "cancer" / "malignan" entry. If the separate
# urinary / gynecologic categories are required (they are listed as exclusion
# criteria), the specific entries must be moved before the general ones; that
# changes the generated columns, so it is left as a deliberate follow-up.
condition_name_hits = [
    # Obesity, Morbid obesity
    ["morbid", "obesity"],
    # Malignancies/cancer
    "cancer",
    ["cancer", "urinary"],
    ["leukemia"],
    "malignan",
    ["malignan", "gynecologic"],
    # Prescription of corticosteroids
    "corticosteroid",
    # Transplant (kidney, liver, pancreas, heart, lung, bone marrow)
    "transplant",
    # HIV
    "hiv",
    # Chronic Kidney disease stage 4 or 5
    ["chronic", "kidney"],
    # Hemodialysis
    "hemodialysis",
    # Diabetes mellitus (with or without any complication)
    ["diabetes", "mellitus"],
    # Menopause
    "menopause",
    # Neurogenic bladder (remove these patients)
    ["neurogenic", "bladder"],
    # Incontinence
    ["urinary", "incontinence"],
    # Catheter (remove these patients)
    "catheter",
    # Hypertension
    "hypertension",
    # Autoimmune diseases and further malignancy terms
    "addison",
    "carcinoma",
    # Spina Bifida (remove these patients)
    ["spina", "bifida"],
    "celiac",
    "dermatomyositis",
    "graves",
    ["hashimoto", "thyroiditis"],
    "sclerosis",
    ["myasthenia", "gravis"],
    ["pernicious", "anemia"],
    ["reactive", "arthritis"],
    ["rheumatoid", "arthritis"],
    "arthritis",
    "sjögren",
    ["lupus", "erythematosus"],
    ["chronic", "renal", "failure"],
    "lymphoma",
    "sarcoma",
]


def addit(cid, name, pn):
    """Record that concept `cid` (lower-cased `name`) belongs to keyword key `pn`."""
    all_condition_ids.append(cid)
    condition_id_fullname_map[cid] = name
    condition_id_name_map[cid] = pn
    pn_to_id_map.setdefault(pn, set()).add(cid)


# Normalise every pattern once to (key, substrings): a list pattern gets the
# key "a_b", a plain string is its own key.
normalized_hits = [
    ("_".join(pn), tuple(pn)) if isinstance(pn, list) else (pn, (pn,))
    for pn in condition_name_hits
]

for name, cid in zip(diagnosis_names["concept_name"], diagnosis_names["concept_id"]):
    name = name.lower()
    for key, substrings in normalized_hits:
        if all(part in name for part in substrings):
            addit(cid, name, key)
            break  # first matching pattern wins
print(f"Found {len(all_condition_ids)} condition events")

condition_names = sorted(pn_to_id_map)

In [None]:
# Number of matched concept ids per keyword key, wrapped in a one-element list.
condition_dictionary = {
    keyword: [len(concept_ids)] for keyword, concept_ids in pn_to_id_map.items()
}

## Get the cohort table

In [None]:
%%time
# Load the complete input cohort table named by `in_name`.
sql = """
    select
        *
    from
        {omop_schema}.{in_name} c
"""
sql = sql.format(in_name=in_name, omop_schema=config.OMOP_CDM_SCHEMA)
final_cohort_raw = db.query(sql)
final_cohort_raw

## Full condition item table

In [None]:
%%time
# Fetch every condition_occurrence row that belongs to a cohort member, then
# keep only the rows whose concept matched one of the covariate keywords.
person_ids = list(final_cohort_raw.person_id)
print(len(person_ids))
all_condition_ids = list(condition_id_name_map.keys())

# The restriction to cohort members is done by the inner join below, so no
# person-id or concept-id list has to be rendered into the query text.
sql = """
    select
        c.*
    from
        {omop_schema}.condition_occurrence c
    inner join
        {omop_schema}.{in_name} u
    on
        c.person_id = u.person_id
""".format(
    omop_schema=config.OMOP_CDM_SCHEMA,
    in_name=in_name,
)
condition_candidates = db.query(sql)
# drop=False keeps person_id / condition_concept_id available as ordinary
# columns in addition to the sorted index.
condition_candidates.set_index(["person_id", "condition_concept_id"], inplace=True, drop=False)
condition_candidates.sort_index(inplace=True)
filtered_condition_candidates = condition_candidates[
    condition_candidates.condition_concept_id.isin(all_condition_ids)
].copy()

## Creating new columns for the time periods

In [None]:
# Look-back windows as (start_day, end_day, column suffix); days are counted
# backwards from each cohort row's condition_start_date.
time_periods = [
    (0, 180, "0_6_months"),
    (180, 365, "6_months_1_yr"),
    (365, 365 * 2, "1_2_yr")
]

# For every keyword key: one flag column per window plus one text column that
# holds a matched full concept name. Tuples are (column name, key, end day).
new_columns = []
for condition in condition_names:
    new_columns.extend(
        (condition + suffix, condition, end_day)
        for _start_day, end_day, suffix in time_periods
    )
    new_columns.append((condition + "_full_condition_name", condition, 365*30))

# Flags start out False, the full-name columns start out empty.
column_defaults = {
    column: (None if "full_condition_name" in column else False)
    for column, _, _ in new_columns
}
final_cohort = final_cohort_raw.copy().assign(**column_defaults)
final_cohort = final_cohort.assign(confounder_found=False)
new_column_names = [column for column, _, _ in new_columns] + ["confounder_found"]
final_cohort

## Confounder_time_period generation

In [None]:
# Per keyword key, a running list of the full concept names found in the cohort.
condition_name_dictionary = {keyword: [] for keyword in condition_dictionary}

In [None]:
import pdb
from datetime import timedelta

# For every cohort row, flag which covariate conditions were recorded for the
# same person in each look-back window before the row's condition_start_date
# (the "index date").
#
# Windows, relative to the index date:
#   0_6_months     index - 180 days <  event <= index
#   6_months_1_yr  index - 365 days <  event <= index - 180 days
#   1_2_yr         index - 730 days <  event <= index - 365 days
c = 0
start = datetime.datetime.now()
total_rows = len(final_cohort)
modified_count = 0
for index, row in final_cohort.iterrows():
    p = row.person_id
    index_date = row.condition_start_date
    conditions = filtered_condition_candidates[filtered_condition_candidates.person_id == p]
    c += 1
    if c % 1000 == 0:
        print(f"Time elapsed: {(datetime.datetime.now() - start)}")
        print(f"Iter: {c} / {total_rows}")
    modified = False

    for start_day, end_day, postfix in time_periods:
        # Both bounds are derived from the unmodified index date. The previous
        # version shifted the index date in place on every pass, so the offsets
        # accumulated and the last window covered (index-910, index-545]
        # instead of (index-730, index-365].
        window_end = index_date - timedelta(days=start_day)
        window_start = index_date - timedelta(days=end_day)

        in_window = (conditions.condition_start_date > window_start) & (
            conditions.condition_start_date <= window_end
        )
        for _, found_event in conditions.loc[in_window].iterrows():
            modified = True
            found_condition_id = found_event.condition_concept_id
            condition_name = condition_id_name_map[found_condition_id]
            condition_fullname = condition_id_fullname_map[found_condition_id]
            condition_name_dictionary[condition_name].append(condition_fullname)
            final_cohort.loc[index, condition_name + postfix] = True
            # Only the most recently processed match is kept as the full name.
            final_cohort.loc[index, condition_name + "_full_condition_name"] = condition_fullname
            final_cohort.loc[index, "confounder_found"] = True

    if modified:
        modified_count += 1
pd.set_option('display.max_columns', None)
print(f"Modified count: {modified_count}")
final_cohort.head(30)

### Add a complete-history menopause variable (optional)

In [None]:
# All condition records of cohort members whose concept matched the
# "menopause" keyword (taken from the unfiltered candidate table).
is_menopause = condition_candidates.condition_concept_id.isin(pn_to_id_map['menopause'])
menopause_candidates = condition_candidates[is_menopause].copy()

In [None]:
# Work on a copy so final_cohort stays unchanged until the result is assigned back.
menopause_cohort = final_cohort.copy()

In [None]:
import pdb
from datetime import timedelta

# Flag cohort rows whose person has ANY menopause record on or before the
# row's condition_start_date (no lower bound on how far back the record is).
#
# The flag column is created up front so that it always exists and holds
# False, rather than a missing value, for rows without a match.
menopause_cohort["menopause_any"] = False

c = 0
start = datetime.datetime.now()
total_rows = len(menopause_cohort)
modified_count = 0
for index, row in menopause_cohort.iterrows():
    p = row.person_id
    d = row.condition_start_date
    conditions = menopause_candidates[menopause_candidates.person_id == p]
    c += 1
    if c % 1000 == 0:
        print(f"Time elapsed: {(datetime.datetime.now() - start)}")
        print(f"Iter: {c} / {total_rows}")

    # The filter does not depend on a look-back window, so it is evaluated
    # once per row; looping over time_periods here only repeated the same
    # work and logged every match three times.
    prior_events = conditions.loc[conditions.condition_start_date <= d]
    for _, found_event in prior_events.iterrows():
        found_condition_id = found_event.condition_concept_id
        condition_name = condition_id_name_map[found_condition_id]
        condition_fullname = condition_id_fullname_map[found_condition_id]
        condition_name_dictionary[condition_name].append(condition_fullname)
        menopause_cohort.loc[index, condition_name + '_any'] = True
        menopause_cohort.loc[index, "confounder_found"] = True

    if not prior_events.empty:
        modified_count += 1
pd.set_option('display.max_columns', None)
print(f"Modified count: {modified_count}")

In [None]:
# Adopt the menopause-augmented frame as the cohort used by the remaining cells.
final_cohort = menopause_cohort

In [None]:
# Collect the covariate columns: every cohort column whose name starts with
# one of the keyword keys. The columns are scanned once, so a column is listed
# at most once even when one key is a prefix of another (e.g. "cancer" and
# "cancer_urinary"); the nested loop used before would have listed it twice.
# NOTE(review): any pre-existing cohort column that happens to start with a
# keyword key is selected as well, as before.
feat_cols = pn_to_id_map.keys()
feature_prefixes = tuple(feat_cols)
features = [
    cohort_col
    for cohort_col in final_cohort.columns
    if cohort_col.startswith(feature_prefixes)
]
    

In [None]:
# Multiply by 1 so that boolean flags become 0/1 numbers.
# NOTE(review): `features` also includes the *_full_condition_name text columns
# (their names start with a keyword key) -- confirm the multiplication leaves
# them as intended.
final_cohort[features] = 1*final_cohort[features]

In [None]:
# Total number of rows with at least one covariate hit. This is not the last
# expression of the cell, so the value is computed but not displayed.
np.sum(final_cohort.confounder_found)

cols = final_cohort.columns

# Group the flag columns by look-back window via the suffix in their name.
window_suffixes = {
    "first 6 months": "0_6_months",
    "6 months to 1 year": "6_months_1_yr",
    "1 to 2 years": "1_2_yr",
}
years = {
    label: [x for x in cols if suffix in x]
    for label, suffix in window_suffixes.items()
}
time_6 = years["first 6 months"]
time_1yr = years["6 months to 1 year"]
time_1_2yr = years["1 to 2 years"]

# Per window: number of rows with at least one flagged condition.
for label, window_columns in years.items():
    rows_with_any = final_cohort[window_columns].any(axis='columns')
    print(label, np.sum(rows_with_any))

In [None]:
# Remove the 'level_0' column before writing the table.
# NOTE(review): presumably a leftover index column from the input table --
# this raises KeyError if the column is absent; confirm against the upstream step.
final_cohort = final_cohort.drop(columns=['level_0'])

In [None]:
# Preview the first rows of the final table.
final_cohort.head()

In [None]:
# (rows, columns) of the final table.
final_cohort.shape

In [None]:
# Persist the enriched cohort, replacing any existing table of the same name,
# then grant read access on it.
# NOTE(review): the target schema is hard-coded here while the input table is
# read from config.OMOP_CDM_SCHEMA -- confirm both refer to the same schema.
# NOTE(review): to_sql is called with its default index handling, so the
# DataFrame index is written as an extra column -- confirm this is wanted.
final_cohort.to_sql(
    out_name,
    con=db.engine,
    schema="cdm_6871_21",
    if_exists="replace",
)
cmd = 'grant select on table cdm_6871_21.{out_name} to cdm_6871_21'.format(out_name=out_name)
db.execute(cmd)

## Save names (optional)

In [None]:
# %%time
# import pandas as pd
# path = r"condition_log.xlsx"
# writer = pd.ExcelWriter(path)
# fullname_to_conditionid = {v:k for k, v in condition_id_fullname_map.items()}
# #begin

# all_ids = []
# all_names = []
# for categorical_name in condition_names:
#     if '_' in categorical_name:
#         categorical_names = categorical_name.split("_")
#         cat_to_fullname = [name for name in list(condition_id_fullname_map.values()) if all(list(map(lambda x: x in name, categorical_names)))]
#         cat_to_ids = [fullname_to_conditionid[x] for x in cat_to_fullname]
#         all_ids += cat_to_ids
#         all_names += cat_to_fullname
#     else:
       
#         cat_to_fullname = [name for name in list(condition_id_fullname_map.values()) if categorical_name in name]
#         cat_to_ids = [fullname_to_conditionid[x] for x in cat_to_fullname]
#         all_ids += cat_to_ids
#         all_names += cat_to_fullname

#     data = {"condition_names":None,"condition_ids":None}

#     data["condition_names"] = cat_to_fullname
#     data["condition_ids"] = cat_to_ids

    
#     df = pd.DataFrame(data)

#     df.to_excel(writer, sheet_name = categorical_name,index=False)
# #end

# #outer loop
# writer.save()
# writer.close()

# all_df = pd.DataFrame({'condition_names':all_names,'condition_ids':all_ids})
# all_df.to_csv("condition_names.csv")

In [None]:
# temp_names = []
# for categorical_name in condition_names:
#     if '_' in categorical_name:
#         categorical_names = categorical_name.split("_")
#         cat_to_ids = [name for name in list(condition_id_fullname_map.keys()) if all(list(map(lambda x: x in name, categorical_names)))]
#         cat_to_fullname = [name for name in list(condition_id_fullname_map.values()) if all(list(map(lambda x: x in name, categorical_names)))]
#     else:
#         cat_to_ids = [name for name in list(condition_id_fullname_map.keys()) if categorical_name in name]
#         cat_to_fullname = [name for name in list(condition_id_fullname_map.values()) if categorical_name in name]
#     print(categorical_name)
    
    

## Visualize names (optional)

In [None]:
# temp_names = []
# for categorical_name in condition_names:
#     if '_' in categorical_name:
#         categorical_names = categorical_name.split("_")
#         cat_to_ids = [name for name in list(condition_id_fullname_map.keys()) if all(list(map(lambda x: x in name, categorical_names)))]
#         cat_to_fullname = [name for name in list(condition_id_fullname_map.values()) if all(list(map(lambda x: x in name, categorical_names)))]
#     else:
#         cat_to_ids = [name for name in list(condition_id_fullname_map.keys()) if categorical_name in name]
#         cat_to_fullname = [name for name in list(condition_id_fullname_map.values()) if categorical_name in name]
#     print(categorical_name)
#     print(f"length: {len(cat_to_fullname)}")
#     print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>.")
#     print()
#     for fullname in cat_to_fullname:
#         print(fullname)
#     print(".<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
#     print("\n\n")

In [None]:
# temp_names = []
# for categorical_name in condition_names:
#     if '_' in categorical_name:
#         categorical_names = categorical_name.split("_")
#         cat_to_fullname = [name for name in list(condition_id_fullname_map.values()) if all(list(map(lambda x: x in name, categorical_names)))]
#     else:
#         cat_to_fullname = [name for name in list(condition_id_fullname_map.values()) if categorical_name in name]
#     print(categorical_name)
#     print(f"length: {len(cat_to_fullname)}")
#     print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>.")
#     print()
#     for fullname in cat_to_fullname:
#         print(fullname)
#     print(".<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
#     print("\n\n")