In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import networkx as nx
import datetime
import os
import sys

# Make the parent directory importable so the local `utils` package can be found
sys.path.append(os.path.join(os.getcwd(), '..'))
from utils import utils

Read in the discharges and notes data, merge them on the discharge id, and drop duplicate rows (same author, same visit).

In [2]:
# Load the discharge and note tables; the listed columns are parsed as datetimes.
discharge_table = pd.read_csv('../data/discharges_w_disposition_new.csv', parse_dates=['arrive_date','discharge_date'])
notes_table = pd.read_csv('../data/notes_w_disposition_new.csv', parse_dates=['date'])
# The notes file names the visit key `discharge_id`; rename it to `id` so it can
# be merged against the discharge table below.
notes_table = notes_table.rename(columns={'discharge_id': 'id'})

# authors who wrote notes within 48 hours of arrival date are in index team for that patient
INDEX_DELTA = np.timedelta64(2, 'D')
# notes written within previous 90 days are considered when calculating collaborative experience
TEAMWORK_DELTA = np.timedelta64(90, 'D')

# match in admission datetime indexing on visit id from discharge table
# (how='right' keeps every discharge row, including visits that have no notes)
notes_table = notes_table.merge(discharge_table, on='id', how='right')

# create new column for normalized date by ignoring time of day.
# .dt.normalize() sets the time component to midnight; the previous
# astype('datetime64[D]') is not a supported resolution in pandas >= 2.0.
notes_table['normalized_date'] = notes_table['arrive_date'].dt.normalize()

# Keep one row per (arrival day, author, visit).
notes_table = notes_table.drop_duplicates(['normalized_date','dr','id'], keep="first")

# Sort ascending by arrival so the first row carries the earliest arrival date.
notes_table = notes_table.sort_values('arrive_date')
FIRST_DATE = notes_table['arrive_date'].iloc[0]

notes_table.head()

Unnamed: 0,id,dr,date,patient,arrive_date,discharge_date,disposition,age,normalized_date
531,97,Nuria Cano,2020-03-07,Alexander Castillo,2020-03-04,2020-03-07,0,72,2020-03-04
530,97,Bryan Jones,2020-03-07,Alexander Castillo,2020-03-04,2020-03-07,0,72,2020-03-04
529,97,Kathy Walker,2020-03-06,Alexander Castillo,2020-03-04,2020-03-07,0,72,2020-03-04
2467,452,Calvin Daniels,2020-03-08,Lorraine Sims,2020-03-05,2020-03-08,0,66,2020-03-05
2470,452,Roger Porter,2020-03-07,Lorraine Sims,2020-03-05,2020-03-08,0,66,2020-03-05


Self-join the notes table on the discharge id to create an edge list of author pairs. Keeping only the pairs whose first author name sorts before the second removes self-pairs and mirrored duplicates.

In [3]:
# Pair every note author on a visit with every author on the same visit
# (self-join on the visit id); the overlapping columns get _x / _y suffixes.
edges_table = notes_table.merge(notes_table[['dr','id','date']], how='left', on='id')

# Keep each unordered pair once: dr_x < dr_y drops self-pairs and mirrored
# duplicates. The explicit .copy() makes the filtered frame independent, so the
# column assignment below does not raise SettingWithCopyWarning.
edges_table = edges_table[edges_table['dr_x'] < edges_table['dr_y']].copy()

# Key identifying the author pair, used to match the same pair across visits.
# NOTE(review): the two names are concatenated without a delimiter, so two
# different pairs could in principle produce the same key - consider adding a
# separator if author names are not guaranteed to make this impossible.
edges_table['edge'] = edges_table['dr_x'] + edges_table['dr_y']
edges_table.head()

Unnamed: 0,id,dr_x,date_x,patient,arrive_date,discharge_date,disposition,age,normalized_date,dr_y,date_y,edge
3,97,Bryan Jones,2020-03-07,Alexander Castillo,2020-03-04,2020-03-07,0,72,2020-03-04,Nuria Cano,2020-03-07,Bryan JonesNuria Cano
5,97,Bryan Jones,2020-03-07,Alexander Castillo,2020-03-04,2020-03-07,0,72,2020-03-04,Kathy Walker,2020-03-06,Bryan JonesKathy Walker
6,97,Kathy Walker,2020-03-06,Alexander Castillo,2020-03-04,2020-03-07,0,72,2020-03-04,Nuria Cano,2020-03-07,Kathy WalkerNuria Cano
10,452,Calvin Daniels,2020-03-08,Lorraine Sims,2020-03-05,2020-03-08,0,66,2020-03-05,Roger Porter,2020-03-07,Calvin DanielsRoger Porter
11,452,Calvin Daniels,2020-03-08,Lorraine Sims,2020-03-05,2020-03-08,0,66,2020-03-05,Philip Holland,2020-03-06,Calvin DanielsPhilip Holland


Define a function that takes a team's network graph, computes the network measures, and returns them as a dictionary (one output row per discharge).

In [4]:
def get_output_for_row(g, visit_id):
    """Summarise one team's collaboration graph as a flat record.

    Parameters
    ----------
    g
        Graph passed to ``nx.clustering``; edge weights are read from the
        ``'weight'`` edge attribute.
    visit_id
        Identifier stored unchanged under the ``'discharge_id'`` key.

    Returns
    -------
    dict
        Keys: ``discharge_id``, ``avg_clust``, ``sum_clust``, ``team_size``,
        ``team_edge_size``, ``cumulative_experience`` and
        ``avg_cumulative_experience``.
    """
    # Weighted clustering coefficient of every node, keyed by node.
    per_node_clustering = nx.clustering(g, weight='weight')
    clustering_total = sum(per_node_clustering.values())
    clustered_nodes = len(per_node_clustering)

    # Guard the mean against an empty graph (no nodes -> 0).
    if clustered_nodes > 0:
        clustering_mean = clustering_total / clustered_nodes
    else:
        clustering_mean = 0

    node_count = g.number_of_nodes()
    edge_count = g.number_of_edges()

    # Experience is the sum of edge weights minus the number of edges.
    # NOTE(review): subtracting the edge count removes one unit per edge -
    # presumably to offset a +1 added to each weight by the caller; confirm
    # this still matches how the weights are built.
    total_weight = g.size(weight='weight')
    experience_total = total_weight - edge_count

    # Per-member experience, again guarded against an empty team.
    if node_count > 0:
        experience_per_member = experience_total / node_count
    else:
        experience_per_member = 0

    return {
        'discharge_id': visit_id,
        'avg_clust': clustering_mean,
        'sum_clust': clustering_total,
        'team_size': node_count,
        'team_edge_size': edge_count,
        'cumulative_experience': experience_total,
        'avg_cumulative_experience': experience_per_member,
    }

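Get the index teams, then loop through the teams by discharge id: for each pair of team members, count their shared visits in the preceding 90 days to get the edge weight, build a network graph for the team, and collect the per-team output.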

In [5]:
# An edge belongs to the index team when both authors' note dates are less than
# INDEX_DELTA after the arrival date (notes dated before arrival also pass).
is_index_edge = ((edges_table["date_x"] - edges_table["arrive_date"] < INDEX_DELTA)
                 & (edges_table["date_y"] - edges_table["arrive_date"] < INDEX_DELTA))

# Only keep visits arriving more than TEAMWORK_DELTA after the first arrival in
# the data, so each one has a full look-back window.
is_after_delta = edges_table["arrive_date"] > FIRST_DATE + TEAMWORK_DELTA

index_table = edges_table[is_index_edge & is_after_delta]

experience_columns = ['discharge_id',
                      'avg_clust',
                      'cumulative_experience',
                      'avg_cumulative_experience',
                      'team_edge_size',
                      'team_size',
                      'sum_clust']

# Rows are collected in plain lists and turned into DataFrames once:
# DataFrame.append was removed in pandas 2.0 and is quadratic when used in a loop.
experience_records = []

for visit_id in index_table['id'].unique():
    team = index_table[index_table['id'] == visit_id]

    weighted_edges = []
    for dr_x, dr_y, pair_key, visit_day in zip(team['dr_x'], team['dr_y'],
                                               team['edge'], team['normalized_date']):
        # Look-back window: [visit_day - TEAMWORK_DELTA, visit_day); the strict
        # upper bound excludes the current visit's day.
        window_start = visit_day - TEAMWORK_DELTA
        is_in_window = ((edges_table['normalized_date'] >= window_start)
                        & (edges_table['normalized_date'] < visit_day))
        # Weight = number of earlier visits in the window shared by this pair.
        weight = int(((edges_table['edge'] == pair_key) & is_in_window).sum())
        # Pairs with no shared history are left out of the graph. This is
        # needed as long as 1 is not added to every weight (see TODO below).
        if weight < 1:
            continue
        weighted_edges.append({'source': dr_x, 'target': dr_y, 'weight': weight})

    # Explicit columns keep the frame well-formed even when no edge qualified.
    edge_list = pd.DataFrame(weighted_edges, columns=['source', 'target', 'weight'])

    # TODO: discuss. Adding 1 to every weight here would close the triangles;
    # otherwise zero-weight edges have to be left out (as done above).
    g = nx.from_pandas_edgelist(edge_list, source='source', target='target', edge_attr='weight')
    experience_records.append(get_output_for_row(g, visit_id))

experience_data_list = pd.DataFrame(experience_records, columns=experience_columns)

experience_data_list.head(20)

    discharge_id  avg_clust  cumulative_experience  avg_cumulative_experience  \
0          268.0   0.000000                    0.0                   0.000000   
1            8.0   0.353329                    4.0                   1.000000   
2          356.0   0.000000                    0.0                   0.000000   
3           28.0   0.000000                    0.0                   0.000000   
4          172.0   0.000000                    0.0                   0.000000   
5          407.0   0.000000                    0.0                   0.000000   
6          163.0   0.000000                    0.0                   0.000000   
7          201.0   0.000000                    0.0                   0.000000   
8          395.0   0.000000                    1.0                   0.500000   
9          241.0   0.000000                    4.0                   1.333333   
10          21.0   0.000000                    2.0                   1.000000   
11         156.0   0.000000 

Create a dataframe from the experience output, and join it with the discharge data on the discharge id to get disposition and age for further analysis.

In [6]:
# Select the columns named in utils.columns from the experience output
# (the list itself is defined in the project's utils module) and drop repeated rows.
experience_df = pd.DataFrame(experience_data_list, columns=utils.columns)
experience_df = experience_df.drop_duplicates()

# Attach the discharge fields to each team by matching discharge_id to the
# discharge table's id.
experience_master_df = pd.merge(
    experience_df,
    discharge_table,
    left_on='discharge_id',
    right_on='id',
    copy=False,
)

print(experience_master_df.shape)

(191, 12)


In [7]:
def get_model(var, data=None):
    """Build (without fitting) a binomial GLM of ``disposition`` on ``var`` and ``age``.

    Parameters
    ----------
    var : str
        Name of the predictor column; it is inserted into the model formula
        ``disposition ~ <var> + age``.
    data : DataFrame, optional
        Table holding the ``disposition``, ``age`` and ``var`` columns.
        Defaults to the notebook-level ``experience_master_df`` so existing
        calls keep working; pass a frame explicitly to model a different table.

    Returns
    -------
    The unfitted model returned by ``sm.GLM.from_formula``; call ``.fit()`` on it.
    """
    if data is None:
        # Resolved at call time, so the cell that builds it must have run first.
        data = experience_master_df
    formula = f'disposition ~ {var} + age'
    return sm.GLM.from_formula(formula, family=sm.families.Binomial(), data=data)

# Model disposition on the team's average clustering coefficient, with age as a covariate.
# NOTE(review): in the stored output the avg_clust standard error is ~6.6e4,
# which may indicate (quasi-)separation - interpret the coefficient with care.
model = get_model(var='avg_clust')
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,disposition,No. Observations:,191.0
Model:,GLM,Df Residuals:,188.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-55.621
Date:,"Fri, 05 Mar 2021",Deviance:,111.24
Time:,04:44:07,Pearson chi2:,143.0
No. Iterations:,24,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,2.1325,3.849,0.554,0.580,-5.412,9.677
avg_clust,-70.8588,6.62e+04,-0.001,0.999,-1.3e+05,1.3e+05
age,-0.0560,0.054,-1.039,0.299,-0.162,0.050


In [8]:
# Model disposition on the team's cumulative experience, with age as a covariate.
model = get_model(var='cumulative_experience')
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,disposition,No. Observations:,191.0
Model:,GLM,Df Residuals:,188.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-45.662
Date:,"Fri, 05 Mar 2021",Deviance:,91.323
Time:,04:44:07,Pearson chi2:,89.5
No. Iterations:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,3.5790,4.094,0.874,0.382,-4.445,11.603
cumulative_experience,-22.2521,1.4e+04,-0.002,0.999,-2.75e+04,2.74e+04
age,-0.0681,0.057,-1.192,0.233,-0.180,0.044


In [9]:
# Model disposition on the team's average cumulative experience, with age as a covariate.
model = get_model(var='avg_cumulative_experience')
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,disposition,No. Observations:,191.0
Model:,GLM,Df Residuals:,188.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-45.662
Date:,"Fri, 05 Mar 2021",Deviance:,91.323
Time:,04:44:07,Pearson chi2:,89.5
No. Iterations:,26,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,3.5790,4.094,0.874,0.382,-4.445,11.603
avg_cumulative_experience,-92.9223,8.19e+04,-0.001,0.999,-1.61e+05,1.6e+05
age,-0.0681,0.057,-1.192,0.233,-0.180,0.044
