In [2]:
### Imports
import matplotlib.pyplot as plt
import numpy as np
from typing import List, Dict, Optional, Union
import sys
import time
import importlib
import sparse
import datetime
import pandas as pd
import numpy as np
import scipy.sparse
import matplotlib.pyplot as plt
import os
#
sys.path.append("..")
import Utils.dbutils as dbutils
import Utils.data_utils as data_utils
import Generators.CohortGenerator as CohortGenerator
import Generators.FeatureGenerator as FeatureGenerator
import config
local_imports = (
    dbutils,
    data_utils,
    CohortGenerator,
    FeatureGenerator,
    config
)
for i in local_imports:
    i = importlib.reload(i)

In [3]:
### Define config
in_name_1 = "manuscript_covariates_4_v4"
#out_name tbd
cohort_name = in_name_1

In [4]:
### Setting up database
# username = config.PG_USERNAME #we use peer authentication so don't need use vars, but in theory would pass them into config_path
# password = config.PG_PASSWORD
database_name = config.DB_NAME
print(database_name)
config_path = 'postgresql://{database_name}'.format(
    database_name = database_name
)
connect_args = {"host": '/var/run/postgresql/'} # connect_args to pass to sqlalchemy create_engine function

# schemas 
schema_name = 'eol_test_ncjones' # all created tables will be created using this schema
cdm_schema_name = config.OMOP_CDM_SCHEMA # the name of the schema housing your OMOP CDM tables
print(f"cdm schema: {cdm_schema_name}")
# caching
reset_schema = False # if true, rebuild all data from scratch

# set up database, reset schemas as needed
db = dbutils.Database(config_path, schema_name, connect_args, cdm_schema_name)
# if reset_schema:
#     db.execute(
#         'drop schema if exists {} cascade'.format(schema_name)
#     )
# db.execute(
#     'create schema if not exists {}'.format(schema_name)
# )

localhost/omop_v6
cdm schema: cdm_6871_21


In [5]:
%%time
### Loading cohort
# Get the full condition item table
sql = """
    select
        *
    from
        {omop_schema}.{cohort_name} c
""".format(
    omop_schema=config.OMOP_CDM_SCHEMA,
    cohort_name=cohort_name
)
cohort = db.query(sql)
filtered_cohort = cohort.copy().loc[cohort.antibiotic_type!='inappropriate']

CPU times: user 9.9 s, sys: 1.2 s, total: 11.1 s
Wall time: 13.4 s


In [188]:
cols_to_exclude_for_missing_analysis = [
'no_previous_180_excluded_event',
'no_previous_180_day_event',
'no_two_previous_365_day_event',
 'no_previous_excluded_event_ever',
'previous_uti_condition_occurence_id',
 'multi',
 'year_of_birth',
'level_0',
 'index',
 'condition_occurrence_id',
 'person_id',
 'condition_concept_id',
 'condition_start_date',
 'condition_start_datetime',
 'condition_end_date',
 'condition_end_datetime',
 'visit_occurrence_id',
 'visit_detail_id',
 'drug_concept_id',
 'drug_name',
 'antibiotic_name',
#  'antibiotic_type',
 'visit_provider_id',
 'drug_exposure_id',
 'drug_exposure_start_date',
 'drug_exposure_start_datetime',
 'provider_id',
 'provider_name',
 'npi',
 'post_UTI_codes',
 'recurrent_uti',
 'first_uti',
 'followup_time',
 't_sum',
 't_bin',
 't_uti_bin',
 't_neph_bin',
 'AE_c_diff',
 'AE_GI',
 'AE_skin',
 'AE_AKI',
 'AE_other',
 't_sepsis_sum',
 't_i_sepsis_sum',
 't_i_uti_sum',
 't_i_neph_sum',
 't_sepsis_bin',
 't_i_uti_bin',
 't_i_neph_bin',
 't_i_sepsis_bin',
 't_i_sum',
 'previous_utis', #excluding previous utis
 't_uti_sum',
 't_neph_sum',
 't_i_bin','AE_any','less_15','less_30','less_90','followup_time'] + \
 [x for x in filtered_cohort.columns if 'full_condition_name' in x] + \
 [x for x in cohort_features.columns if x.endswith('_0')] + \
 ['age','years_since_diagnosis','days_since_previous_uti'] 

#we additionally exclude lab values
#NOAH EXCLUDED THESE FEATURES BECAUSE ONLY RUNNING GRID SEARCH ON NON OMOP FEATURES
#  ['fibro_' + m + '_mon_outcome' for m in ['1','3','6']] + \
#  ['hernia_' + m + '_mon_outcome' for m in ['1','3','6']] + \
#  ['fracture_' + m + '_mon_outcome' for m in ['1','3','6']]
cohort_features = filtered_cohort.drop(columns=cols_to_exclude_for_missing_analysis)


In [189]:
outcome_cols = ['antibiotic_type'] + [x for x in filtered_cohort.columns if x.endswith('bin') or x.startswith('AE')] # we need antibiotic type in the dataframe
outcome_vars = filtered_cohort[outcome_cols]


In [190]:
censor_cols =  ['less_15','less_30','less_90'] # we need antibiotic type in the dataframe
censor_vars = 1 - filtered_cohort[censor_cols]  #importantly am changing how these are defined to indicate where we DON'T have data for certain variables
censor_vars['antibiotic_type'] = filtered_cohort['antibiotic_type']

In [191]:
def compute_antibiotic_stratified_missing_dataframe(feature_set):
    """Assumes the dataframe includes the column antibiotic_type
       Assumes the features are boolean
       
       Computes the amount of False values in a set of features.
       returns the list of features, the first_line/second_line/alternatives featureset, and missing dataframe stratified by antibiotic group"""
    #compute amount of False values in boolean features
    cohort_features = feature_set.copy()
    all_features = cohort_features.drop(['antibiotic_type'],axis=1)
    all_features_prevalence = pd.Series(1 - np.mean(all_features))

    cohort_features.loc[cohort_features.antibiotic_type.isin(['nitrofurantoin','trimethoprim-sulfamethoxazole']),'antibiotic_type'] = 'first_line'
    first_line_features = cohort_features.loc[cohort_features['antibiotic_type'] == 'first_line']
    first_line_features = first_line_features.drop(['antibiotic_type'],axis=1)
    first_line_features_prevalence = pd.Series(1 - np.mean(first_line_features))
    first_line_features_sum = first_line_features.sum()

    second_line_features = cohort_features.loc[cohort_features['antibiotic_type'] == 'second_line']
    second_line_features = second_line_features.drop(['antibiotic_type'],axis=1)
    second_line_features_prevalence = pd.Series(1 - np.mean(second_line_features))
    second_line_features_sum = second_line_features.sum()

    alternatives_features = cohort_features.loc[cohort_features['antibiotic_type'] == 'alternatives']
    alternatives_features = alternatives_features.drop(['antibiotic_type'],axis=1)
    alternatives_features_prevalence = pd.Series(1 - np.mean(alternatives_features))
    alternatives_features_sum = alternatives_features.sum()

    missing_dataframe = pd.DataFrame(index=all_features_prevalence.index)
    missing_dataframe['first_line'] = first_line_features_prevalence
    missing_dataframe['second_line'] = second_line_features_prevalence
    missing_dataframe['alternatives'] = alternatives_features_prevalence
 
    missing_dataframe['first_line_presence_count'] = first_line_features_sum
    missing_dataframe['second_line_presence_count'] = second_line_features_sum
    missing_dataframe['alternatives_presence_count'] = alternatives_features_sum
    
    feature_names = list(all_features.columns)
    return feature_names, first_line_features, second_line_features, alternatives_features, missing_dataframe


# Compute p values in comparison to first_line
from scipy.stats import ttest_ind
def compute_p_value_array(df,base):
    p_values = []
    for col in df.columns:
        p_values.append(compute_p_value(df,col,base))
        
    return p_values
    
def compute_p_value(df,col,base):
    """Take a,b columns and compute the p value"""
    
    args,p = ttest_ind(df[col],base[col])
    if np.isnan(p):
        p = 1
    return p

def compute_flag(p_value):
    if p_value < .001:
        return '**'
    if p_value < .05:
        return "*"

def add_p_value_info(df,f_features,s_features,a_features):
    """Takes a set of """
    missing_dataframe = df.copy()
    second_line_ps = compute_p_value_array(s_features,f_features)
    alternatives_ps = compute_p_value_array(a_features,f_features)
    
    missing_dataframe['second_line_pv'] = second_line_ps
    missing_dataframe['alternatives_pv'] = alternatives_ps
    missing_dataframe['second_line_flag'] = missing_dataframe['second_line_pv'].apply(compute_flag)
    missing_dataframe['alternatives_flag'] = missing_dataframe['alternatives_pv'].apply(compute_flag)
    missing_dataframe['second_diff'] = missing_dataframe['first_line'] - missing_dataframe['second_line']
    missing_dataframe['alternatives_diff'] = missing_dataframe['first_line'] - missing_dataframe['alternatives']
    return missing_dataframe



def run_missing_analysis(feature_set):
    feature_names, f_features,s_features,a_features,df = compute_antibiotic_stratified_missing_dataframe(feature_set)
    new_df = add_p_value_info(df,f_features,s_features,a_features)
    return new_df
    
# def cohen_d(x,y):
#     nx = len(x)
#     ny = len(y)
#     dof = nx + ny - 2
#     return (mean(x) - mean(y)) / sqrt(((nx-1)*std(x, ddof=1) ** 2 + (ny-1)*std(y, ddof=1) ** 2) / dof


In [192]:
missing_features_dataframe = run_missing_analysis(cohort_features)

In [193]:
missing_features_dataframe.to_csv("Missing analysis with feature variables.csv")

In [194]:
missing_outcome_dataframe = run_missing_analysis(outcome_vars)

In [195]:
missing_outcome_dataframe.to_csv("Missing analysis with outcome variables.csv")

In [196]:
missing_censor_dataframe = run_missing_analysis(censor_vars)

In [197]:
missing_censor_dataframe.to_csv("Missing analysis with censor variables.csv")

### Old way

In [None]:
# #compute amount of False values in boolean features
# all_features = cohort_features.drop(['antibiotic_type'],axis=1)
# all_features_count = pd.Series(1 - np.mean(all_features))

# cohort_features.loc[cohort_features.antibiotic_type.isin(['nitrofurantoin','trimethoprim-sulfamethoxazole']),'antibiotic_type'] = 'first_line'
# first_line_features = cohort_features.loc[cohort_features['antibiotic_type'] == 'first_line']
# first_line_features = first_line_features.drop(['antibiotic_type'],axis=1)
# first_line_features_count = pd.Series(1 - np.mean(first_line_features))

# second_line_features = cohort_features.loc[cohort_features['antibiotic_type'] == 'second_line']
# second_line_features = second_line_features.drop(['antibiotic_type'],axis=1)
# second_line_features_count = pd.Series(1 - np.mean(second_line_features))

# alternatives_features = cohort_features.loc[cohort_features['antibiotic_type'] == 'alternatives']
# alternatives_features = alternatives_features.drop(['antibiotic_type'],axis=1)
# alternatives_features_count = pd.Series(1 - np.mean(alternatives_features))

# missing_dataframe = pd.DataFrame(index=all_features_count.index)
# missing_dataframe['first_line'] = first_line_features_count
# missing_dataframe['second_line'] = second_line_features_count
# missing_dataframe['alternatives'] = alternatives_features_count

# second_line_ps = compute_p_value_array(second_line_features,first_line_features)
# alternatives_ps = compute_p_value_array(alternatives_features,first_line_features)

# missing_dataframe['second_line_pv'] = second_line_ps
# missing_dataframe['alternatives_pv'] = alternatives_ps
# missing_dataframe['second_line_flag'] = missing_dataframe['second_line_pv'].apply(compute_flag)
# missing_dataframe['alternatives_flag'] = missing_dataframe['alternatives_pv'].apply(compute_flag)

### Other features that were skipped

In [None]:
# Figure out how these are imputed

cohort_features['age']
cohort_features['previous_utis']
cohort_features['days_since_previous_uti']
cohort_features['years_since_diagnosis']

### Pearson Correlation for years since diagnosis and condition start date

In [120]:
filtered_cohort['condition_start_date'].dt.year,

0        2019
1        2015
2        2016
3        2019
4        2021
         ... 
68497    2018
68498    2019
68499    2020
68501    2019
68502    2018
Name: condition_start_date, Length: 65881, dtype: int64

In [124]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Sample data for two Pandas Series
data_x = filtered_cohort['condition_start_date'].dt.year
data_y = filtered_cohort['years_since_diagnosis']

# Setting the style for a cleaner look
sns.set(style="ticks")

# Creating the scatter plot with a regression line
plt.figure(figsize=(8, 6))
sns.regplot(x=data_x, y=data_y, scatter_kws={'s':50}, line_kws={'color':'green', 'lw':2})

# Calculating Pearson correlation coefficient
pearson_corr = data_x.corr(data_y)

# Annotating the plot with the Pearson correlation coefficient
plt.text(3, 1, f'Pearson Correlation: {pearson_corr:.2f}', fontsize=12, color='blue', 
         ha='center', va='center')

# Adding title and labels
plt.title('Correlation between Condition Start Year and Years Since Diagnosis', fontsize=16, color='darkred')
plt.xlabel('X Series', fontsize=13)
plt.ylabel('Y Series', fontsize=13)

# Displaying the plot
plt.show()


ValueError: Image size of 91155x395 pixels is too large. It must be less than 2^16 in each direction.

<Figure size 576x432 with 1 Axes>

In [127]:
pearson_corr

-1.0