# Predict patient conditions

* Use current patient conditions to predict which conditions they would have in the future
* Data source: Synthea (synthetic open source) or Optum (proprietary) claims data (this code is able to handle both)
* Use case: by adding medications to the feature vector, we could measure the effects of medications on future conditions

# Research agenda


* Mostly centered around applications of Variational Auto-Encoders (VAE)
* Main issue: what is a good measure of performance for a VAE?
* Solution: use encoding in some downstream task.
* How this project fits in: it's a first pass at the downstream task.

In [445]:
!pip install pyTigerGraph



## Connect to a Synthea TigerGraph server


In [2]:
import getpass

# URL of the TigerGraph server used for the connection below.
server = 'https://yaniv.i.tgcloud.io'
# Prompt interactively so the password is never stored in the notebook source.
password = getpass.getpass()

········


In [442]:
import pyTigerGraph as tg 

# Connect to the "synthea" graph with the server URL and password collected
# in the previous cell.
conn = tg.TigerGraphConnection(
    host=server, 
    graphname="synthea", 
    username="tigergraph",
    password=password,
)

# GSQL shell bound to the connection. The recorded output shows that creating
# it downloads the gsql client jar and an SSL certificate.
shell = tg.Gsql(conn, client_version="2.6.0")

Downloading gsql client Jar
Downloading SSL Certificate


In [443]:
# List the directory the GSQL shell reports as its jar location.
!ls -l $shell.jarLocation

total 1432
-rw-r--r--  1 ybenami  ybenami  671144 Jul 23 11:27 gsql_client.jar
-rw-r--r--  1 ybenami  ybenami    8518 Jul 23 11:27 my-cert.txt


In [5]:
#!rm -rf $shell.jarLocation

In [6]:
# Create a secret through the GSQL shell and use it to request a token.
# NOTE(review): the returned token tuple is echoed as cell output; clear or
# redact that output before sharing the notebook.
secret = shell.createSecret()
conn.getToken(secret=secret)

('<token redacted>', 1597950428, '2020-08-20 19:07:08')

In [7]:
# GSQL script that drops, re-creates and installs `get_all_patients`.
# The query attaches to every Patient vertex a bag of
# (condition description, condition startDate) tuples gathered over
# PATIENT_HAS_CONDITION edges, then prints all patients.
query = '''
drop query get_all_patients
create query get_all_patients() for graph synthea{
    TYPEDEF TUPLE <description STRING, date DATETIME> CondTup;

    BagAccum <CondTup> @conditions;

    patients = {Patient.*};

    patients = select pat from patients:pat;

    x = select pat from patients:pat-(PATIENT_HAS_CONDITION)-Condition:cond
            accum pat.@conditions += CondTup(cond.description, cond.startDate);

    print patients;
}
install query get_all_patients
'''

# Send the script through the GSQL shell and show the server's response.
print(shell.gsql(query))

Trying version: v2_6_0
Connecting to yaniv.i.tgcloud.io:14240
If there is any relative path, it is relative to tigergraph/dev/gdk/gsql
The query get_all_patients is dropped.
The query get_all_patients has been added!
Start installing queries, about 1 minute ...
get_all_patients query: curl -X GET 'https://127.0.0.1:9000/query/synthea/get_all_patients'. Add -H "Authorization: Bearer TOKEN" if authentication is enabled.




In [8]:
# Run the installed query. The very large sizeLimit is presumably there so the
# response is not truncated -- confirm against the pyTigerGraph documentation.
# NOTE: from here on `query` holds the query RESULT, not the GSQL text above.
query = conn.runInstalledQuery('get_all_patients', sizeLimit=10**10)

In [52]:
# Number of patient records returned by the query.
len(query[0]['patients'])

109321

In [88]:
%%writefile getFeatures.py

import numpy as np
import pandas as pd
from datetime import datetime
import math

def yaniv_to_ed_query(query):
    """Rename a ``get_all_patients`` result, in place, to the ``people.*`` schema.

    The first result set's ``'patients'`` list is moved to ``'people'`` and, for
    every patient, the attribute keys are renamed:

    * ``birth``  -> ``people.dateOfBirth``
    * ``death``  -> ``people.dateOfDeath``
    * ``gender`` -> ``people.@gender`` (a one-element list: ``['F']`` when the
      original value is ``'female'``, otherwise ``['M']``)
    * ``@conditions`` -> ``people.@diagData``, with each entry's ``date`` and
      ``description`` renamed to ``diagnosisDate`` and ``diagnosis``

    The ``name`` and ``patient_id`` attributes are removed.

    Args:
        query: list whose first element is a dict with a ``'patients'`` list.

    Returns:
        The same ``query`` object, mutated.

    Raises:
        KeyError: if an expected key is missing -- in particular when the
            function is called a second time on an already converted result.
    """
    result = query[0]
    result['people'] = result.pop('patients')

    for person in result['people']:
        attrs = person['attributes']

        attrs['people.dateOfBirth'] = attrs.pop('birth')
        attrs['people.dateOfDeath'] = attrs.pop('death')

        # Gender is stored as a single-item list holding 'F' or 'M'.
        if attrs.pop('gender') == 'female':
            attrs['people.@gender'] = ['F']
        else:
            attrs['people.@gender'] = ['M']

        diagnoses = attrs.pop('@conditions')
        attrs['people.@diagData'] = diagnoses
        for entry in diagnoses:
            entry['diagnosisDate'] = entry.pop('date')
            entry['diagnosis'] = entry.pop('description')

        # These two attributes are dropped from the converted record.
        del attrs['name']
        del attrs['patient_id']

    return query


def get_conditions(query, startDate='1900-01-01', endDate='2019-12-31'):
    """Count how often each diagnosis occurs inside a date window.

    Args:
        query: converted query result; ``query[0]['people']`` is iterated and
            each person's ``attributes['people.@diagData']`` entries are read.
        startDate: inclusive lower bound, formatted ``YYYY-MM-DD``.
        endDate: inclusive upper bound, formatted ``YYYY-MM-DD``. It is parsed
            as midnight at the start of that day, so a diagnosis time-stamped
            later on the end date falls outside the window.

    Returns:
        pandas Series produced by ``value_counts()``: diagnosis name -> number
        of occurrences, most frequent first.
    """
    window_start = datetime.strptime(startDate, '%Y-%m-%d')
    window_end = datetime.strptime(endDate, '%Y-%m-%d')

    diagnoses = []
    for person in query[0]['people']:
        for record in person['attributes']['people.@diagData']:
            diagnosed_on = datetime.strptime(
                record['diagnosisDate'], '%Y-%m-%d %H:%M:%S')
            if window_start <= diagnosed_on <= window_end:
                diagnoses.append(record['diagnosis'])

    return pd.Series(diagnoses).value_counts()


def get_live_patients(query, startDate='1900-01-01', endDate='2019-12-31'):
    """Return the ids of patients whose lifetime overlaps a date window.

    A patient is kept when the date of birth is on or before ``endDate`` and
    the date of death is on or after ``startDate``.

    Args:
        query: converted query result; ``query[0]['people']`` is iterated.
        startDate: window start, formatted ``YYYY-MM-DD``.
        endDate: window end, formatted ``YYYY-MM-DD`` (parsed as midnight at
            the start of that day).

    Returns:
        List of ``v_id`` values, in the order the patients appear in ``query``.

    Note:
        ``people.dateOfDeath`` is always parsed, so it must be a timestamp
        string for every patient (presumably a far-future placeholder for
        living patients -- verify against the data source).
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    window_start = datetime.strptime(startDate, '%Y-%m-%d')
    window_end = datetime.strptime(endDate, '%Y-%m-%d')

    live_ids = []
    for person in query[0]['people']:
        attrs = person['attributes']

        born = datetime.strptime(attrs['people.dateOfBirth'], timestamp_format)
        if born > window_end:
            # Not born yet at the end of the window.
            continue

        died = datetime.strptime(attrs['people.dateOfDeath'], timestamp_format)
        if died < window_start:
            # Already dead before the window opened.
            continue

        live_ids.append(person['v_id'])

    return live_ids

def make_age_groups(years = 5, top_year = 100):
    """Build non-overlapping age brackets and their column titles.

    Brackets are ``years`` wide, start at 0 and stop below ``top_year``; one
    final open-ended bracket ``(top_year, 140)`` collects everyone older.

    Args:
        years: width of each regular bracket, in years.
        top_year: first age that belongs to the open-ended last bracket.

    Returns:
        Tuple ``(ranges, titles)``: ``ranges`` is a list of inclusive
        ``(low, high)`` pairs and ``titles`` the matching ``'Age low-high'``
        strings.
    """
    age_groups_ranges = [
        # Cap at top_year - 1: age `top_year` belongs to the open-ended last
        # bracket only. Capping at top_year made the two brackets overlap
        # whenever top_year is not a multiple of `years`.
        (start, min(start + years - 1, top_year - 1))
        for start in range(0, top_year, years)
    ] + [(top_year, 140)]

    age_group_titles = [
        'Age {}-{}'.format(start, end) for start, end in age_groups_ranges]

    return age_groups_ranges, age_group_titles


def get_feature_vec(query, conditions, startDate, endDate, age_groups):
    """Build one row of demographic and condition indicators per patient.

    Args:
        query: converted query result; ``query[0]['people']`` is iterated.
        conditions: pandas Series whose index lists the condition names to
            encode (one output column each); membership of a diagnosis is
            tested with ``in conditions``.
        startDate: inclusive window start, formatted ``YYYY-MM-DD``.
        endDate: inclusive window end, formatted ``YYYY-MM-DD``. Parsed as
            midnight at the start of that day; the same value is the reference
            date handed to ``concat_features``.
        age_groups: ``(ranges, titles)`` pair as returned by
            ``make_age_groups``.

    Returns:
        The DataFrame returned by ``concat_features``, indexed by patient
        ``v_id``.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    startDate = datetime.strptime(startDate, '%Y-%m-%d')
    endDate = datetime.strptime(endDate, '%Y-%m-%d')

    people = query[0]['people']

    # Demographic attributes, one row per patient, indexed by vertex id.
    demog_df = pd.DataFrame([person['attributes'] for person in people])
    demog_df.index = [person['v_id'] for person in people]
    demog_df = demog_df[[
        'people.@gender',
        'people.dateOfBirth',
        'people.dateOfDeath',
    ]]

    # All-zero indicator matrix: patients x conditions.
    condition_flags = pd.DataFrame(
        np.zeros((len(demog_df.index), len(conditions.index))),
        index=demog_df.index,
        columns=conditions.index,
    )

    for person in people:
        observed = []
        for record in person['attributes']['people.@diagData']:
            diagnosed_on = datetime.strptime(
                record['diagnosisDate'], timestamp_format)
            if not (startDate <= diagnosed_on <= endDate):
                continue
            if record['diagnosis'] in conditions:
                observed.append(record['diagnosis'])

        # Flag every condition this patient was diagnosed with in the window.
        condition_flags.loc[person['v_id'], observed] = 1

    return concat_features(
        conditions_df=condition_flags,
        demog_df=demog_df,
        date=endDate,
        age_groups=age_groups,
    )

def concat_features(conditions_df, demog_df, date, age_groups):
    """Join the demographic indicator columns with the condition indicators.

    Args:
        conditions_df: per-patient condition indicator frame.
        demog_df: frame with ``people.@gender``, ``people.dateOfBirth`` and
            ``people.dateOfDeath`` columns.
        date: reference ``datetime`` used for the deceased flag and for ages.
        age_groups: ``(ranges, titles)`` pair as returned by
            ``make_age_groups``.

    Returns:
        DataFrame with the columns in this order: ``Female``, ``Deceased``,
        the age-group columns, then the condition columns.
    """
    deceased_flag = deceased(demog_df, date)
    female_flag = gender(demog_df)
    age_one_hot = age_group_df(
        df=demog_df, date_for_age=date, age_groups=age_groups)

    feature_blocks = [female_flag, deceased_flag, age_one_hot, conditions_df]
    return pd.concat(feature_blocks, axis=1)


def age_group_df(df, date_for_age, age_groups):
    """One-hot encode each patient's age bracket on a reference date.

    Age is the whole number of 365.25-day years between
    ``people.dateOfBirth`` and ``date_for_age``.

    Args:
        df: frame with a ``people.dateOfBirth`` column of
            ``'%Y-%m-%d %H:%M:%S'`` strings.
        date_for_age: ``datetime`` at which the age is evaluated.
        age_groups: ``(ranges, titles)`` pair; ``ranges`` holds inclusive
            ``(low, high)`` bounds and ``titles`` the output column names.

    Returns:
        Float DataFrame indexed like ``df`` with one column per bracket,
        holding 1 in the bracket(s) containing the age and 0 elsewhere.
    """
    ranges, titles = age_groups[0], age_groups[1]

    one_hot = pd.DataFrame(
        np.zeros((len(df), len(ranges))),
        index=df.index,
        columns=titles,
    )

    for row, birth in enumerate(df['people.dateOfBirth']):
        age = math.floor(
            (date_for_age
             - datetime.strptime(birth, '%Y-%m-%d %H:%M:%S')
             ).days / 365.25
        )

        for col, (low, high) in enumerate(ranges):
            if low <= age <= high:
                # Single-step positional write. The previous chained form
                # (`.loc[i].iloc[j] = 1`) assigns into a temporary row object,
                # which pandas does not guarantee to propagate to the frame.
                one_hot.iloc[row, col] = 1

    return one_hot


def deceased(df, date):
    """Flag patients who died at least one full day before ``date``.

    Args:
        df: frame with a ``people.dateOfDeath`` column of
            ``'%Y-%m-%d %H:%M:%S'`` strings.
        date: reference ``datetime``.

    Returns:
        Series named ``'Deceased'``, indexed like ``df``: 1.0 when
        ``(date - death).days`` is positive, otherwise 0.0.
    """
    def _died_before(death_timestamp):
        died_on = datetime.strptime(death_timestamp, '%Y-%m-%d %H:%M:%S')
        if (date - died_on).days > 0:
            return 1.0
        return 0.0

    return df['people.dateOfDeath'].apply(_died_before).rename('Deceased')


def gender(df):
    """Encode gender as a 0/1 ``Female`` indicator.

    Args:
        df: frame with a ``people.@gender`` column whose values are indexable
            and carry the gender code in position 0 (``yaniv_to_ed_query``
            stores ``['F']`` or ``['M']``).

    Returns:
        Series named ``'Female'``, indexed like ``df``: 1.0 where the first
        element equals ``'F'``, otherwise 0.0.
    """
    def _is_female(value):
        if value[0] == 'F':
            return 1.0
        return 0.0

    female = df['people.@gender'].apply(_is_female)
    return female.rename('Female')

Overwriting getFeatures.py


In [9]:
import getFeatures
import importlib

# Reload so that a getFeatures.py rewritten by the %%writefile cell above is
# picked up without restarting the kernel.
importlib.reload(getFeatures)

<module 'getFeatures' from '/Users/ybenami/EasyAsPie.ai/MedGraphML/getFeatures.py'>

In [10]:
# Rename the TigerGraph result to the 'people.*' schema the feature code reads.
# NOTE: the conversion mutates `query` in place, so running this cell a second
# time raises KeyError ('patients' has already been renamed to 'people').
query = getFeatures.yaniv_to_ed_query(query)

In [51]:
%%writefile feature_weighted_mse.py
import tensorflow as tf

def make_feature_weighted_mse(feature_weights):
    """Build a loss that weights each output column's squared error.

    Args:
        feature_weights: one weight per output column (anything ``tf.cast``
            accepts, e.g. a NumPy array or pandas Series).

    Returns:
        A function ``feature_weighted_mse(y_true, y_pred)`` usable as a Keras
        loss. For each sample it returns the weighted sum of squared errors
        over the output columns, with shape ``(batch, 1)``.
    """
    # Column vector (n_outputs, 1) so it can right-multiply a
    # (batch, n_outputs) matrix of squared errors.
    feature_weights = tf.reshape(tf.cast(feature_weights, 'float32'), (-1,1))

    def feature_weighted_mse(y_true, y_pred):
        """Return the per-sample weighted sum of squared errors."""
        y_true = tf.cast(y_true, 'float32')
        y_pred = tf.cast(y_pred, 'float32')

        # NOTE(review): the original return expression was lost (only a stray
        # backtick remained, which is a syntax error). It is reconstructed
        # here from the (-1, 1) weight reshape above and from the caller,
        # which normalises the weights so that the weighted label variances
        # sum to 1 -- confirm against the author's intent.
        return tf.matmul(tf.square(y_true - y_pred), feature_weights)

    return feature_weighted_mse

Overwriting feature_weighted_mse.py


## Get the set of all conditions

In [514]:
%%writefile predictConditions.py
import getFeatures
import feature_weighted_mse
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import tensorflow.keras as keras
import matplotlib.pyplot as plt
import seaborn as sns
import json
from scipy.stats import t    
from statsmodels.stats.multitest import multipletests

def predictConditions(query):
    """Fit a Lasso-regularised linear benchmark predicting 2020 from 2019.

    Steps, in order:

    1. Collect every condition diagnosed between 2019-01-01 and 2020-12-31.
    2. Keep the patients ``getFeatures.get_live_patients`` reports for
       2019-12-31.
    3. Build the feature frame from the 2019 window and the label frame from
       the 2020 window.
    4. Split the patients 75/25 into train and test sets, then drop columns
       with fewer than ``filter_below`` positive training rows.
    5. Weight each label by roughly its inverse training variance and train a
       single dense layer with an L1 kernel penalty under repeated K-fold
       cross-validation, evaluating each fold's model on the test set.
    6. Write the per-fold biases to ``constant_full.csv`` and the per-fold
       coefficients (one JSON-encoded list per cell) to ``coef_full.csv``.

    Args:
        query: query result already converted by
            ``getFeatures.yaniv_to_ed_query``.

    Returns:
        None. Progress and summary statistics are printed; the two CSV files
        are written to the current working directory.
    """
    # Imported locally, as in the original cell, so the module's import block
    # is left untouched.
    from sklearn.model_selection import RepeatedKFold

    print('Collecting all conditions:\n')

    conditions = getFeatures.get_conditions(
        query=query, startDate='2019-01-01', endDate='2020-12-31')

    print(conditions)

    patients = getFeatures.get_live_patients(
        query=query, startDate='2019-12-31', endDate='2019-12-31')

    print('\nNumber of patients', len(patients))

    age_groups = getFeatures.make_age_groups()

    print('\nAge groups\n', age_groups)

    print('\nCompute features: ')

    x_df = getFeatures.get_feature_vec(
        query,
        conditions=conditions,
        startDate='2019-01-01',
        endDate='2019-12-31',
        age_groups=age_groups)

    print('\nx_df.shape ', x_df.shape)

    print('\nCompute labels: ')

    y_df = getFeatures.get_feature_vec(
        query,
        conditions=conditions,
        startDate='2020-01-01',
        endDate='2020-12-31',
        age_groups=age_groups)

    print('\ny_df.shape ', y_df.shape)

    train, test = train_test_split(patients, test_size=0.25, random_state=42)

    x_train_df = x_df.loc[train]
    y_train_df = y_df.loc[train]
    x_test_df = x_df.loc[test]
    y_test_df = y_df.loc[test]

    print('\n\nTrain set:', len(train), 'Test set: ', len(test))

    print(
        '\n\nSorted x_train means:\n\n',
        x_train_df.mean().sort_values(ascending=False),
        '\n\nSorted y_train means:\n\n',
        y_train_df.mean().sort_values(ascending=False)
    )

    filter_below = 20
    print('\nFiltereing conditions with less than {} cases:'.format(filter_below))

    # Rarity is judged on the training rows only; the same columns are then
    # removed from the test frames. (The original built a union of two sets
    # that were both derived from the training counts, i.e. the same set.)
    x_drop_list = list(x_train_df.columns[x_train_df.sum() < filter_below])

    x_train_df = x_train_df.drop(columns=x_drop_list)
    x_test_df = x_test_df.drop(columns=x_drop_list)

    y_drop_list = list(y_train_df.columns[y_train_df.sum() < filter_below])

    y_train_df = y_train_df.drop(columns=y_drop_list)
    y_test_df = y_test_df.drop(columns=y_drop_list)

    print(
        '\n\nSorted x_train means:\n\n',
        x_train_df.mean().sort_values(ascending=False),
        '\n\nSorted y_train means:\n\n\n\n',
        y_train_df.mean().sort_values(ascending=False)
    )

    print(
        '\n\nSorted x_test means:\n\n',
        x_test_df.mean().sort_values(ascending=False),
        '\n\nSorted y_test means:\n\n\n\n',
        y_test_df.mean().sort_values(ascending=False)
    )

    # Inverse-variance weights (1e-3 guards against division by ~0), rescaled
    # so that the weighted label variances sum to 1.
    y_weights = 1 / (y_train_df.var() + 1e-3)
    y_weights = y_weights/(y_train_df.var()*y_weights).sum()

    print(
        '\n',
        pd.DataFrame(
            [y_train_df.var(), y_weights, y_weights*y_train_df.var()],
             index=['y_train var', 'y_weights', 'var*weight']
        ).transpose()
    )

    wmse = feature_weighted_mse.make_feature_weighted_mse(y_weights)

    # Reference point: loss obtained by always predicting the training means.
    print(
        '\nBasic benchmark - y means\n',
        'Train loss',
        wmse(
            y_true=y_train_df.values,
            y_pred=y_train_df.values.mean(axis=0)
        ).numpy().mean(),
    )

    n_splits = 4
    n_repeats = 2
    alpha=0.00001
    learning_rate=0.001
    patience=30

    print('\nTrain linear model using Lasso alpha {} {}-fold CV repeated {} times.\n'.format(
        alpha, n_splits, n_repeats,
    ))

    rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42)

    models=[]
    history=[]
    performance=[]

    for fold, (train_index, validate_index) in enumerate(rkf.split(x_train_df), start=1):

        print('\n\nFold {} out of {}\n\n'.format(fold, n_splits*n_repeats))

        x_train, x_validate = x_train_df.iloc[train_index], x_train_df.iloc[validate_index]
        y_train, y_validate = y_train_df.iloc[train_index], y_train_df.iloc[validate_index]

        # `shape` must be a tuple (batch dimension excluded); a bare int is
        # not a documented input and is rejected by newer Keras releases.
        inputs = keras.layers.Input(shape=(x_train_df.shape[1],))
        outputs = keras.layers.Dense(
            units=y_train_df.shape[1],
            kernel_regularizer=keras.regularizers.l1(l=alpha),
        )(inputs)

        models.append(keras.Model(inputs=inputs, outputs=outputs))

        models[-1].compile(loss=wmse, optimizer=keras.optimizers.Adam(learning_rate=learning_rate))

        history.append(models[-1].fit(
            x=x_train,
            y=y_train,
            batch_size=128,
            epochs=1000,
            validation_data=(x_validate, y_validate),
            callbacks=[
                keras.callbacks.EarlyStopping(patience=patience, restore_best_weights=True),
            ]
        ))

        # NOTE(review): the same held-out test set is scored once per fold, so
        # the std reported below reflects fold-to-fold model variation only.
        print('\nEvaluate on test set:\n')
        performance.append(models[-1].evaluate(x=x_test_df, y=y_test_df))
        print(performance[-1],'\n')

    print('Test loss mean', np.mean(performance), 'std' , np.std(performance, ddof=1))

    # Dense-layer biases: one row per label, one column per fold.
    constant_full = pd.DataFrame(
        np.array([model.layers[1].get_weights()[1] for model in models]).transpose(),
        index=y_train_df.columns,
        columns=['Fold {}'.format(i) for i in range(1, 1+n_splits*n_repeats)],
    )
    constant_full.to_csv('constant_full.csv')

    # Dense-layer kernels stacked per fold, then reordered to
    # (input feature, label, fold).
    coef_mat = np.array([model.layers[1].get_weights()[0] for model in models]).transpose((1, 2, 0))

    # Each cell holds the JSON list of per-fold coefficients; after the
    # transpose, rows are labels and columns are input features.
    coef_full = pd.DataFrame(
        [[json.dumps(coef_mat[i,j].tolist())
          for j in range(coef_mat.shape[1])]
         for i in range(coef_mat.shape[0])],
        columns=y_train_df.columns,
        index=x_train_df.columns
    ).transpose()

    coef_full.to_csv('coef_full.csv')

Overwriting predictConditions.py


In [515]:
import predictConditions
import importlib

# Reload so that a predictConditions.py rewritten by the %%writefile cell
# above is picked up without restarting the kernel.
importlib.reload(predictConditions)

<module 'predictConditions' from '/Users/ybenami/EasyAsPie.ai/MedGraphML/predictConditions.py'>

In [516]:
# Run the full pipeline on the converted query result. It prints progress and
# writes constant_full.csv and coef_full.csv to the working directory.
predictConditions.predictConditions(query)

Collecting all conditions:

Suspected COVID-19                                      81485
COVID-19                                                78851
Fever (finding)                                         72574
Cough (finding)                                         55183
Loss of taste (finding)                                 41275
                                                        ...  
Lupus erythematosus                                         1
Cystic Fibrosis                                             1
Chronic paralysis due to lesion of spinal cord              1
History of amputation of foot (situation)                   1
Blindness due to type 2 diabetes mellitus (disorder)        1
Length: 173, dtype: int64

Number of patients 92258

Age groups
 ([(0, 4), (5, 9), (10, 14), (15, 19), (20, 24), (25, 29), (30, 34), (35, 39), (40, 44), (45, 49), (50, 54), (55, 59), (60, 64), (65, 69), (70, 74), (75, 79), (80, 84), (85, 89), (90, 94), (95, 99), (100, 140)], ['Age 0-4', 'A

Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000
Epoch 74/1000
Epoch 75/1000
Epoch 76/10

Epoch 79/1000
Epoch 80/1000
Epoch 81/1000
Epoch 82/1000
Epoch 83/1000
Epoch 84/1000
Epoch 85/1000
Epoch 86/1000
Epoch 87/1000
Epoch 88/1000

Evaluate on test set:

0.8375591516701487 



Fold 2 out of 8


Train on 51895 samples, validate on 17298 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/100

Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000

Evaluate on test set:

0.837689121550447 



Fold 3 out of 8


Train on 51895 samples, validate on 17298 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000

Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000
Epoch 74/1000
Epoch 75/1000
Epoch 76/1000
Epoch 77/1000
Epoch 78/1000
Epoch 79/1000
Epoch 80/1000
Epoch 81/1000
Epoch 82/1000
Epoch 83/1000
Epoch 84/1000
Epoch 85/1000
Epoch 86/1000
Epoch 87/1000
Epoch 88/1000
Epoch 89/1000
Epoch 90/1000
Epoch 91/1000
Epoch 92/1000
Epoch 93/1000
Epoch 94/1000
Epoch 95/1000
Epoch 96/1000
Epoch 97/1000
Epoch 98/1000
Epoch 99/1000
Epoch 100/1000

Evaluate on test set:

0.8375638149938742 



Fold 4 out of 8


Train on 51895 samples, validate on 17298 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/10

Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000
Epoch 74/1000
Epoch 75/1000
Epoch 76/1000
Epoch 77/1000
Epoch 78/1000
Epoch 79/1000
Epoch 80/1000
Epoch 81/1000
Epoch 82/1000
Epoch 83/1000
Epoch 84/1000
Epoch 85/1000
Epoch 86/1000
Epoch 87/1000
Epoch 88/1000
Epoch 89/1000
Epoch 90/1000
Epoch 91/1000
Epoch 92/1000
Epoch 93/1000
Epoch 94/1000
Epoch 95/1000
Epoch 96/1000
Epoch 97/1000
Epoch 98/1000
Epoch 99/1000
Epoch 100/1000
Epoch 101/1000
Epoch 102/1000
Epoch 103/1000
Epoch 104/1000
Epoch 105/1000
Epoch 106/1000
Epoch 107/1000
Epoch 108/10

Epoch 113/1000
Epoch 114/1000
Epoch 115/1000
Epoch 116/1000
Epoch 117/1000
Epoch 118/1000
Epoch 119/1000
Epoch 120/1000
Epoch 121/1000
Epoch 122/1000
Epoch 123/1000
Epoch 124/1000
Epoch 125/1000
Epoch 126/1000
Epoch 127/1000
Epoch 128/1000
Epoch 129/1000
Epoch 130/1000

Evaluate on test set:

0.8370330784601476 



Fold 5 out of 8


Train on 51894 samples, validate on 17299 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45

Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000

Evaluate on test set:

0.837742801930262 



Fold 6 out of 8


Train on 51895 samples, validate on 17298 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000

Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000
Epoch 74/1000
Epoch 75/1000
Epoch 76/1000
Epoch 77/1000
Epoch 78/1000
Epoch 79/1000
Epoch 80/1000
Epoch 81/1000
Epoch 82/1000
Epoch 83/1000
Epoch 84/1000
Epoch 85/1000
Epoch 86/1000
Epoch 87/1000
Epoch 88/1000
Epoch 89/1000
Epoch 90/1000
Epoch 91/1000
Epoch 92/1000
Epoch 93/1000
Epoch 94/1000
Epoch 95/1000
Epoch 96/1000
Epoch 97/1000
Epoch 98/1000
Epoch 99/1000
Epoch 100/1000
Epoch 101/1000
Epoch 102/1000
Epoch 103/1000
Epoch 104/1000
Epoch 105/1000
Epoch 106/1000
Epoch 107/1000
Epoch 108/1000
Epoch 109/1000
Epoch 110/1000

Evaluate on test set:

0.8376636498511393 



Fold 7 out of 8


Train on 51895 samples, validate on 17298 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000


Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000
Epoch 74/1000
Epoch 75/1000
Epoch 76/1000
Epoch 77/1000
Epoch 78/1000
Epoch 79/1000
Epoch 80/1000
Epoch 81/1000
Epoch 82/1000
Epoch 83/1000
Epoch 84/1000
Epoch 85/1000
Epoch 86/1000
Epoch 87/1000
Epoch 88/1000
Epoch 89/1000
Epoch 

Epoch 93/1000
Epoch 94/1000
Epoch 95/1000
Epoch 96/1000
Epoch 97/1000
Epoch 98/1000
Epoch 99/1000
Epoch 100/1000
Epoch 101/1000
Epoch 102/1000
Epoch 103/1000
Epoch 104/1000
Epoch 105/1000
Epoch 106/1000
Epoch 107/1000
Epoch 108/1000
Epoch 109/1000
Epoch 110/1000

Evaluate on test set:

0.8373866184709285 



Fold 8 out of 8


Train on 51895 samples, validate on 17298 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
E

Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000
Epoch 74/1000
Epoch 75/1000
Epoch 76/1000
Epoch 77/1000
Epoch 78/1000
Epoch 79/1000
Epoch 80/1000
Epoch 81/1000
Epoch 82/1000
Epoch 83/1000
Epoch 84/1000
Epoch 85/1000
Epoch 86/1000
Epoch 87/1000
Epoch 88/1000
Epoch 89/1000
Epoch 90/1000
Epoch 91/1000
Epoch 92/1000
Epoch 93/1000
Epoch 94/1000
Epoch 95/1000
Epoch 96/1000
Epoch 97/1000
Epoch 98/1000
Epoch 99/1000
Epoch 100/1000
Epoch 101/1000
Epoch 102/1000
Epoch 103/1000
Epoch 104/1000
Epoch 105/1000
Epoch 106/1000
Epoch 107/1000
Epoch 108/1000
Epoch 109/1000
Epoch 110/1000
Epoch 111/1000

Evaluate on test set:

0.8374659780460558 

Test loss mean 0.8375130268716253 std 0.00022669780995632688


# Label weights

* Features and labels are encoded as dummy vectors (0 or 1)
* I'm using Keras to do a linear probability benchmark (with Lasso penalties)
* Main issue: some conditions are common while others are rare
* Why is this a problem: common conditions have higher variance and would dominate the Mean Square Error
* Solution: Define a custom loss that takes weights for features


In [517]:
%%writefile predictConditions.py
import getFeatures
import feature_weighted_mse
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import tensorflow.keras as keras
import matplotlib.pyplot as plt
import seaborn as sns
import json
from scipy.stats import t    
from statsmodels.stats.multitest import multipletests

Overwriting predictConditions.py


In [518]:
# Load the saved per-fold constants table; the first CSV column becomes the row index.
constant_full = pd.read_csv(filepath_or_buffer='constant_full.csv', index_col=0)

constant_full

Unnamed: 0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Fold 6,Fold 7,Fold 8
Female,0.000308,0.000166,0.000264,0.000288,0.000308,0.000309,0.000286,0.000244
Deceased,0.056281,0.074205,0.056274,0.049803,0.065216,0.050973,0.040013,0.057133
Age 0-4,0.000758,0.007184,-0.000243,-0.000186,0.000477,0.004553,0.001207,0.003928
Age 5-9,0.002589,0.007931,0.004926,0.000620,-0.003293,-0.006274,0.003479,-0.004181
Age 10-14,-0.003525,0.074625,-0.008108,0.004676,0.009623,-0.000126,0.000315,0.004645
...,...,...,...,...,...,...,...,...
"Localized, primary osteoarthritis of the hand",-0.000286,-0.000097,-0.000244,0.000259,-0.000232,0.001267,-0.000418,-0.000295
Osteoarthritis of hip,0.000749,-0.000209,0.000118,-0.000347,0.010369,0.000776,-0.000426,0.000154
Contact dermatitis,0.002194,0.000347,0.000281,0.000198,0.003015,0.000574,0.000282,-0.000201
Concussion injury of brain,0.000651,0.000167,-0.000390,0.000718,-0.000330,-0.000162,-0.000357,0.001793


In [519]:
# Load the saved per-fold coefficient table; the first CSV column becomes the row index.
# Each cell is a JSON-encoded list string (it is decoded with json.loads further down).
coef_full = pd.read_csv(filepath_or_buffer='coef_full.csv', index_col=0)

coef_full

Unnamed: 0,Female,Age 0-4,Age 5-9,Age 10-14,Age 15-19,Age 20-24,Age 25-29,Age 30-34,Age 35-39,Age 40-44,...,Tubal pregnancy,Injury of anterior cruciate ligament,Overlapping malignant neoplasm of colon,Malignant tumor of colon,Chronic obstructive bronchitis (disorder),Seasonal allergic rhinitis,Pulmonary emphysema (disorder),Suspected lung cancer (situation),Non-small cell lung cancer (disorder),"Non-small cell carcinoma of lung, TNM stage 1 (disorder)"
Female,"[0.9994238615036011, 0.9994780421257019, 0.999...","[0.0001972150639630854, -0.0001622617419343441...","[1.843123754952103e-05, 0.00015203049406409264...","[3.70907764590811e-05, 0.00022644212003797293,...","[8.786928083281964e-05, 0.00017806136747822165...","[5.59463442186825e-05, -3.954165367758833e-05,...","[7.175697828643024e-05, 0.00018811860354617238...","[0.00022584674297831953, -0.000144424717291258...","[-2.543208665883867e-06, 0.0001201063860207796...","[0.00030227735987864435, -3.263737016823143e-0...",...,"[6.230683356989175e-05, 0.00017113138164859265...","[8.534589142072946e-05, 2.2143518435768783e-05...","[3.0813360353931785e-05, 0.0001030616258503869...","[3.618368646129966e-05, -3.6144774639979005e-0...","[-5.5237600463442504e-05, -0.00020721020700875...","[-9.439473797101527e-05, -0.000240332490648143...","[4.4941873056814075e-05, 0.0001336292625637725...","[2.8802787710446864e-05, 4.975481715518981e-06...","[1.5613528375979513e-05, 0.0002526500029489398...","[0.00027655003941617906, 4.7199137043207884e-0..."
Deceased,"[-0.0073789022862911224, -0.012394324876368046...","[-0.04644082486629486, -0.05925685167312622, -...","[-0.046548955142498016, -0.0644829049706459, -...","[-0.04596250504255295, -0.06376709789037704, -...","[-0.04789375886321068, -0.06498119235038757, -...","[-0.04321202263236046, -0.061850663274526596, ...","[-0.03989005461335182, -0.06104419752955437, -...","[-0.0453440360724926, -0.06076515093445778, -0...","[-0.03650955110788345, -0.06314641237258911, -...","[-0.04088449478149414, -0.05222180113196373, -...",...,"[-3.854333044728264e-05, -4.0612816519569606e-...","[-0.003969011828303337, -5.996434629196301e-05...","[-0.004762034863233566, -0.005148023366928101,...","[0.03961325064301491, 0.0869751125574112, 0.00...","[-0.001616183202713728, 0.01365602295845747, 0...","[-0.00025207287399098277, -0.00021494799875654...","[0.054832253605127335, 0.05059337615966797, -0...","[0.00010571414168225601, 0.008106790482997894,...","[-0.0004772221145685762, 0.009110445156693459,...","[-0.00043111812556162477, 0.009283681400120258..."
Age 0-4,"[0.0010205016005784273, 0.0007051500724628568,...","[0.8044350147247314, 0.8107116222381592, 0.800...","[-0.0012565191136673093, -0.007942652329802513...","[-0.00014739990001544356, -0.00813709292560815...","[-0.00156314333435148, -0.006744459271430969, ...","[-0.0010858167661353946, -0.007710026111453772...","[-0.0017585817258805037, -0.007662032265216112...","[-0.0015511447563767433, -0.007595157716423273...","[-0.0009803103748708963, -0.00642688013613224,...","[-0.0008303343201987445, -0.006820457056164741...",...,"[-0.00010418239253340289, -8.746171806706116e-...","[-0.00012988319213036448, -0.00012213023728691...","[1.5436758985742927e-05, 5.116549436934292e-05...","[-6.384904554579407e-05, -0.000182560746907256...","[0.00020524815772660077, -3.196402394678444e-0...","[-0.005745735019445419, 0.0030566726345568895,...","[9.111770486924797e-05, -4.3333740904927254e-0...","[2.0044150005560368e-05, -7.79762485763058e-05...","[-2.4963184841908514e-05, 9.122402843786404e-0...","[7.959279173519462e-05, 2.37040949286893e-05, ..."
Age 5-9,"[-0.0001450422714697197, -0.003817510325461626...","[0.18661227822303772, 0.16298121213912964, 0.1...","[0.790812075138092, 0.772051215171814, 0.78967...","[-0.0021195514127612114, -0.006445011589676142...","[-0.001565655809827149, -0.008606037124991417,...","[-0.0013227805029600859, -0.006895039230585098...","[-0.00283366022631526, -0.006881819572299719, ...","[-0.0026112745981663465, -0.007918590679764748...","[-0.0021946874912828207, -0.008293031714856625...","[-0.001248957822099328, -0.007753648329526186,...",...,"[5.547000182559714e-05, 5.1014358177781105e-05...","[4.393993003759533e-05, 5.642154792440124e-05,...","[-6.674565520370379e-05, -7.024520891718566e-0...","[-0.00017504340212326497, 7.379589806078002e-0...","[-0.00014997925609350204, 5.5411575885955244e-...","[0.09004131704568863, 0.02460954338312149, 0.0...","[-0.00022522362996824086, -7.880295015638694e-...","[-5.0778726290445775e-06, -0.00015093643742147...","[-0.0001637983659747988, -0.000127524952404201...","[2.1451884094858542e-05, 7.306937186513096e-05..."
Age 10-14,"[0.0006140173063613474, 0.002419772557914257, ...","[0.00940742902457714, -0.06880028545856476, 0....","[0.20845471322536469, 0.13275732100009918, 0.2...","[0.7966035008430481, 0.7261127829551697, 0.797...","[0.005332523956894875, -0.07040298730134964, 0...","[0.002567313378676772, -0.07207359373569489, 0...","[0.001824431587010622, -0.07202185690402985, 0...","[0.0017383192898705602, -0.0722074955701828, 0...","[0.002454928820952773, -0.07243503630161285, 0...","[-8.161207369994372e-05, -0.0725749060511589, ...",...,"[-0.0002536014071665704, 0.0002187804202549159...","[1.7134691006504e-05, -9.455869076191448e-06, ...","[2.1324773115338758e-05, -0.000193423547898419...","[1.1227479262743145e-05, -0.000174921544385142...","[0.0006369359907694161, -0.0001766627101460471...","[-0.01898382417857647, -0.013885690830647945, ...","[0.00010976282646879554, 0.0002489946200512349...","[-0.000309803057461977, 1.3241893611848354e-05...","[-3.913743421435356e-05, 0.0001052936713676899...","[8.966513996711001e-05, 1.4823002857156098e-05..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Localized, primary osteoarthritis of the hand","[4.0090417314786464e-05, -3.598283001338132e-0...","[0.0007900833152234554, 9.194639278575778e-05,...","[0.0006746338913217187, 1.4725615983479656e-05...","[0.0008109706686809659, 0.00018210304551757872...","[0.0006439480930566788, 4.529744910541922e-05,...","[0.0009293500916101038, 2.3999793484108523e-05...","[0.0005423248512670398, 9.972017141990364e-05,...","[0.0003574419824872166, 8.24040180305019e-05, ...","[0.00014537754759658128, 0.0001186129375128075...","[-0.0008519451366737485, 0.0003385589516256004...",...,"[0.0027758947107940912, -0.0003293281188234687...","[-0.0020137368701398373, 0.0002050920156762004...","[0.001335489796474576, -3.5961853427579626e-05...","[0.0007358953589573503, -0.0001927909179357811...","[0.0003047981590498239, 0.00013267577742226422...","[0.00010799425945151597, 2.0460440282477066e-0...","[-0.00446255411952734, -7.863252540118992e-05,...","[-9.623550431570038e-05, -4.9505597417009994e-...","[-0.0033802681136876345, -2.0989449694752693e-...","[-0.0033734729513525963, -2.1381056285463274e-..."
Osteoarthritis of hip,"[0.0018328026635572314, 0.0006813855725340545,...","[-0.001603428041562438, 0.0001015524030663073,...","[-0.0021901633590459824, 0.0005741319619119167...","[-0.0021721082739531994, -0.000202900817384943...","[-0.0018341697286814451, 0.0002177444985136389...","[-0.002637826604768634, 0.0013090919237583876,...","[-0.0028795532416552305, 0.0008902658591978252...","[-0.001949777826666832, 0.000385000224923715, ...","[-0.000444955425336957, 0.003943155519664288, ...","[7.8570083132945e-05, 0.0010121414670720696, -...",...,"[-6.9282978074625134e-06, -0.00082595791900530...","[-0.0006372103234753013, 9.738650078361388e-06...","[-6.474513065768406e-05, 0.0002560231951065361...","[-0.0014436754863709211, 6.479215517174453e-05...","[-0.0019352996023371816, 0.0003861443838104605...","[-6.485779158538207e-05, -0.006658607162535191...","[5.9520047216210514e-05, -0.001136936945840716...","[8.844854892231524e-05, -0.008777087554335594,...","[0.00021759145602118224, -0.007528015878051519...","[0.00014735921286046505, -0.007548378314822912..."
Contact dermatitis,"[-0.00017066835425794125, 0.000632971816230565...","[-0.00020512868650257587, -0.00122592132538557...","[-0.0019964457023888826, -0.000957437208853662...","[-0.0018378443783149123, -0.001827104948461055...","[-0.0014855304034426808, -0.000428536208346486...","[-0.002441136399284005, -0.0006554509745910764...","[0.0022610826417803764, 0.004413180518895388, ...","[0.007243082392960787, -0.000956884934566915, ...","[-0.001958079868927598, -0.0013045298401266336...","[-0.0013815563870593905, -0.001288508879952132...",...,"[0.0001599916722625494, 3.4388620406389236e-05...","[0.0022971571888774633, -4.095547046745196e-05...","[-5.0423244829289615e-05, -0.00567299779504537...","[1.2320218957029283e-05, -0.001254665665328502...","[-0.004087189212441444, 0.0007802926702424884,...","[0.0016642072005197406, 0.0017202284652739763,...","[-0.0008222736069001257, -2.2323685698211193e-...","[-0.0002447800652589649, -3.9056845707818866e-...","[0.0008785197860561311, -3.6071360227651894e-0...","[0.0009270607843063772, -3.8051686715334654e-0..."
Concussion injury of brain,"[-0.0011314250295981765, 0.0002293234283570200...","[-0.00028582316008396447, -0.00014765167725272...","[-0.00017394739552401006, -0.00024959706934168...","[-0.0007169736782088876, -0.000335843622451648...","[0.00744012463837862, -0.00026659801369532943,...","[-0.0005306768580339849, 0.0001470526331104338...","[-0.0006025409093126655, 0.0001876642199931666...","[-0.00037407371564768255, 0.000141168464324437...","[-0.0003051667008548975, -0.000366283406037837...","[-6.489855877589434e-05, 0.0006841637659817934...",...,"[6.630386633332819e-05, -0.0001359621528536081...","[0.000304557936033234, 4.499410351854749e-05, ...","[-1.341301958746044e-05, 0.0002068174362648278...","[0.00022644942509941757, 0.0002223839255748316...","[0.0005827371496707201, -9.003715240396559e-05...","[1.3064207450952381e-05, 0.0007816119468770921...","[0.00011892397742485628, 2.0466359273996204e-0...","[7.948465281515382e-06, 8.696505392435938e-05,...","[5.8566285588312894e-05, 1.5649844499421306e-0...","[3.5549295716919005e-05, 5.673987470800057e-05..."


In [521]:
# Summarise the per-fold estimates held in `constant_full` and `coef_full`:
# mean, sample standard deviation (ddof=1), two-sided t-test p-value and
# Benjamini-Hochberg ('fdr_bh') adjusted p-value. Each result is written to
# its own CSV file.
#
# `n_splits` and `n_repeats` are not defined in this cell; they must already
# exist in the kernel (presumably the cross-validation settings -- confirm).
# NOTE(review): the test statistic below is mean/std, with no sqrt(n) scaling
# of the std -- confirm this is the intended test.
t_dof = n_splits * n_repeats - 1

# --- constants ---------------------------------------------------------------
constant_name = pd.Series(constant_full.index, index=constant_full.index, name='name')

constant_mean = constant_full.mean(axis=1)
constant_mean.name = 'mean'

constant_std = constant_full.std(axis=1, ddof=1)
constant_std.name = 'std'

constant_df = pd.concat(
    [constant_name, constant_mean, constant_std], axis=1)

# t.sf(x) evaluates the upper tail (1 - cdf(x)) directly, so very small
# p-values are not lost to floating-point cancellation.
constant_df['p-value'] = t.sf(
    abs(constant_mean / constant_std).values, df=t_dof) * 2

constant_df['FDR adj p-value'] = multipletests(constant_df['p-value'], method='fdr_bh')[1]

constant_df.to_csv('constant.csv', index=False)

# --- coefficients ------------------------------------------------------------
# Every cell of `coef_full` is a JSON list string; decode it and reduce it to a scalar.
coef_mean = coef_full.applymap(lambda x: np.mean(json.loads(x)))
coef_mean.to_csv('coef_mean.csv')

coef_std = coef_full.applymap(lambda x: np.std(json.loads(x), ddof=1))
coef_std.to_csv('coef_std.csv')

coef_p_value = pd.DataFrame(
    t.sf(abs(coef_mean / coef_std).values, df=t_dof) * 2,
    index=coef_full.index,
    columns=coef_full.columns
)
coef_p_value.to_csv('coef_p_value.csv')

# Adjust all coefficient p-values jointly (flattened), then restore the matrix shape.
coef_p_value_adj_flat = multipletests(
    coef_p_value.values.reshape((-1,)),
    method='fdr_bh')[1]

# Label the adjusted matrix exactly like `coef_p_value` instead of relying on
# training frames that this cell never loads.
coef_p_value_adj = pd.DataFrame(
    coef_p_value_adj_flat.reshape(coef_p_value.shape),
    index=coef_p_value.index,
    columns=coef_p_value.columns)
coef_p_value_adj.to_csv('coef_p_value_adj.csv')

In [523]:
# Optional: uncomment to reload previously saved results instead of recomputing them.
# constant_df = pd.read_csv('constant.csv', index_col=0)
# constant_df
# coef_mean = pd.read_csv('coef_mean.csv', index_col=0)
# coef_mean
# coef_std = pd.read_csv('coef_std.csv', index_col=0)
# coef_std
# coef_p_value = pd.read_csv('coef_p_value.csv', index_col=0)
# coef_p_value.head(50).iloc[:,:20]
# coef_p_value_adj = pd.read_csv('coef_p_value_adj.csv', index_col=0)
# coef_p_value_adj

# One test per cell of the coefficient matrix (rows x columns).
print('Number of tests: ', coef_mean.size)

Number of tests:  15678


In [524]:
# Share of coefficient tests rejected at the 1% level, before and after the
# FDR adjustment. The label is built from the cutoff so the two cannot disagree.
p_value_cutoff = 0.01
print(f'Regular p-value {p_value_cutoff:.0%} rejection rate',
      (coef_p_value.values < p_value_cutoff).mean())
print(f'Adjusted p-value {p_value_cutoff:.0%} rejection rate',
      (coef_p_value_adj.values < p_value_cutoff).mean())

Regular p-value %5 rejection rate 0.022962112514351322
Adjusted p-value %5 rejection rate 0.006569715524939406


In [525]:
# (row positions, column positions) of every coefficient whose unadjusted
# p-value is at most 0.01.
nonzero = np.nonzero(coef_p_value.le(0.01).values)
tuple(positions.shape for positions in nonzero)

((360,), (360,))

In [529]:
# Build one record per position listed in `nonzero`:
# (row label, column label, mean, std, p-value, FDR-adjusted p-value).
tuples = []
for row_pos, col_pos in zip(*nonzero):
    tuples.append((
        coef_mean.index[row_pos],
        coef_mean.columns[col_pos],
        coef_mean.iat[row_pos, col_pos],
        coef_std.iat[row_pos, col_pos],
        coef_p_value.iat[row_pos, col_pos],
        coef_p_value_adj.iat[row_pos, col_pos],
    ))

# Assemble the long-format table, save it, and display it.
coef_df = pd.DataFrame(
    tuples,
    columns=['y', 'x', 'mean', 'std', 'p-value', 'FDR adj p-value'],
)
coef_df.to_csv('coef.csv', index=False)
coef_df

Unnamed: 0,y,x,mean,std,p-value,FDR adj p-value
0,Female,Female,0.999540,0.000068,0.000000,0.000000
1,Deceased,Age 0-4,-0.045816,0.009080,0.001487,0.107157
2,Deceased,Age 5-9,-0.049527,0.008855,0.000822,0.069650
3,Deceased,Age 10-14,-0.049112,0.008841,0.000855,0.071693
4,Deceased,Age 15-19,-0.049202,0.008364,0.000610,0.055868
...,...,...,...,...,...,...
355,Chronic intractable migraine without aura,Impacted molars,0.107712,0.020580,0.001207,0.092779
356,Perennial allergic rhinitis,Childhood asthma,0.041448,0.008088,0.001362,0.099788
357,Hyperglycemia (disorder),Diabetes,0.046885,0.009537,0.001721,0.119936
358,Diabetic renal disease (disorder),Neuropathy due to type 2 diabetes mellitus (di...,0.066444,0.016437,0.004919,0.267770
