In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import networkx as nx
import datetime
import os
import sys
from tqdm import tqdm

# # Import from the parent directory
sys.path.append(os.path.join(os.getcwd(), '..'))
from utils import utils

Read in discharges and notes data, merge on discharge_id, drop duplicates

In [2]:
discharge_table = pd.read_csv('../data/discharges_w_disposition_abdul_test.csv', parse_dates=['arrive_date','discharge_date'])
notes_table = pd.read_csv('../data/notes_w_disposition_abdul_test.csv', parse_dates=['date'])
notes_table.rename(columns = {'discharge_id':'id'}, inplace = True)

# authors who wrote notes within 48 hours of arrival date are in index team for that patient
INDEX_DELTA = np.timedelta64(2, 'D')
# notes written within previous 90 days are considered when calculating collaborative experience
TEAMWORK_DELTA = np.timedelta64(90, 'D')

# match in admission datetime indexing on visit id from discharge table
notes_table = notes_table.merge(discharge_table, on='id', how='right')

# create new column for normalized date by ignoring time of day
notes_table['normalized_date'] = notes_table['arrive_date'].astype('datetime64[D]')
notes_table['normalized_note_date'] = notes_table['date'].astype('datetime64[D]')

notes_table.drop_duplicates(['normalized_note_date','dr','id'], keep="first", inplace=True)

notes_table.sort_values('arrive_date', inplace=True)
FIRST_DATE = notes_table['arrive_date'].iloc[0]

notes_table.head()

Unnamed: 0,id,dr,date,patient,arrive_date,discharge_date,disposition,age,normalized_date,normalized_note_date
2559,15,dr212,2020-03-16,patient16,2020-03-09,2020-03-16,0,65,2020-03-09,2020-03-16
2533,15,dr189,2020-03-13,patient16,2020-03-09,2020-03-16,0,65,2020-03-09,2020-03-13
2534,15,dr189,2020-03-12,patient16,2020-03-09,2020-03-16,0,65,2020-03-09,2020-03-12
2540,15,dr193,2020-03-15,patient16,2020-03-09,2020-03-16,0,65,2020-03-09,2020-03-15
2545,15,dr28,2020-03-11,patient16,2020-03-09,2020-03-16,0,65,2020-03-09,2020-03-11


"Self join" the notes table on the discharge id to create edge list, with some steps to remove duplicates

In [3]:
edges_table = notes_table.merge(notes_table[['dr','id','normalized_note_date']], how='left', on=['id','normalized_note_date'])
edges_table = edges_table[edges_table['dr_x'] < edges_table['dr_y']]
edges_table['edge'] = edges_table['dr_x'] + edges_table['dr_y']
edges_table.head()

Unnamed: 0,id,dr_x,date,patient,arrive_date,discharge_date,disposition,age,normalized_date,normalized_note_date,dr_y,edge
1,15,dr212,2020-03-16,patient16,2020-03-09,2020-03-16,0,65,2020-03-09,2020-03-16,dr298,dr212dr298
2,15,dr212,2020-03-16,patient16,2020-03-09,2020-03-16,0,65,2020-03-09,2020-03-16,dr285,dr212dr285
3,15,dr212,2020-03-16,patient16,2020-03-09,2020-03-16,0,65,2020-03-09,2020-03-16,dr28,dr212dr28
4,15,dr212,2020-03-16,patient16,2020-03-09,2020-03-16,0,65,2020-03-09,2020-03-16,dr81,dr212dr81
5,15,dr212,2020-03-16,patient16,2020-03-09,2020-03-16,0,65,2020-03-09,2020-03-16,dr228,dr212dr228


Define function to take in network graph, perform network calculations, and return formatted output

In [4]:
def get_output_for_row(g, visit_id):
    data = {}
    data['discharge_id'] = visit_id
    
    ''' Clustering coefficient of all nodes (in a dictionary) '''
    clustering_coefficient = nx.clustering(g, weight='weight')
    
    ''' Average clustering coefficient with divide-by-zero check '''
    clust_sum = sum(clustering_coefficient.values())
    clust_len = len(clustering_coefficient)
        
    data['avg_clust'] = clust_sum / clust_len if clust_len > 0 else 0 
    
    data['sum_clust'] = clust_sum
    data['team_size'] = g.number_of_nodes()
    data['team_edge_size'] = g.number_of_edges()
    
    experience = g.size(weight='weight') #Experience as sum of weights
    data['cumulative_experience'] = experience - data['team_edge_size']
    data['avg_cumulative_experience'] = data['cumulative_experience'] / data['team_size'] if data['team_size'] > 0 else 0
    
    return data

Get the index teams, then loop through teams by discharge id, add up edges, and create network graph for each team. Add the output to a list 

In [7]:
is_index_edge = (edges_table["date"] - edges_table["arrive_date"] < INDEX_DELTA) 
                 
    
is_after_delta = (edges_table["arrive_date"] > (FIRST_DATE + TEAMWORK_DELTA))

index_table = edges_table[(is_index_edge) & (is_after_delta)]

# experience_data_list= pd.DataFrame(columns = ['discharge_id',
#         'avg_clust',
#         'cumulative_experience',
#         'avg_cumulative_experience',
#         "team_edge_size",
#         "team_size",
#          "sum_clust"])
experience_data_list = []

index_table.head()

Unnamed: 0,id,dr_x,date,patient,arrive_date,discharge_date,disposition,age,normalized_date,normalized_note_date,dr_y,edge
64529,40,dr136,2020-06-09,patient41,2020-06-08,2020-06-15,0,65,2020-06-08,2020-06-09,dr25,dr136dr25
64530,40,dr136,2020-06-09,patient41,2020-06-08,2020-06-15,0,65,2020-06-08,2020-06-09,dr153,dr136dr153
64531,40,dr136,2020-06-09,patient41,2020-06-08,2020-06-15,0,65,2020-06-08,2020-06-09,dr47,dr136dr47
64532,40,dr136,2020-06-09,patient41,2020-06-08,2020-06-15,0,65,2020-06-08,2020-06-09,dr24,dr136dr24
64533,40,dr136,2020-06-09,patient41,2020-06-08,2020-06-15,0,65,2020-06-08,2020-06-09,dr287,dr136dr287


In [13]:
# measure performance
start_time = time.perf_counter()

for visit_id in tqdm(index_table.id.unique()):
    team = index_table[index_table['id'] == visit_id]
#     edge_list = pd.DataFrame(columns = ['source', 'target', 'weight'])
    edge_list = []
    for i,edge in team.iterrows():
        cutoff_lower = edge['normalized_date'] - TEAMWORK_DELTA
        is_in_window = (edges_table['normalized_date'] >= cutoff_lower) & (edges_table['normalized_date'] < edge['normalized_date'])
        weight = len(edges_table[(edges_table['edge'] == edge['edge']) & (is_in_window)])
#         Need the following line if we don't add 1 to weights total outside this inner loop
        if(weight < 1): continue 
        edge_list.append({'source':edge['dr_x'],'target':edge['dr_y'],'weight':weight})

#     TODO: discuss this line. Adding 1 to the weight closes the triangles. Otherwise we need to leave out 0 weights  
#     edge_list["weight"] = edge_list["weight"] + 1 
    edge_df = pd.DataFrame(edge_list, columns = ['source', 'target', 'weight'])
    g = nx.Graph()  # empty graph
    g = nx.from_pandas_edgelist(edge_df, source='source', target='target',edge_attr='weight')
    experience_data_list.append(get_output_for_row(g, visit_id))  

stop_time = time.perf_counter()
print(f"It took {stop_time - start_time} seconds or {(stop_time - start_time) / 60} minutes")

100%|██████████| 541/541 [01:42<00:00,  5.30it/s]

It took 102.14362149999943 seconds or 1.702393691666657 minutes





Create dataframe from experience output list, and join with discharge data on discharge id to get disposition and age for further analysis

In [14]:
experience_df = pd.DataFrame(experience_data_list, columns=utils.columns).drop_duplicates()

experience_master_df = experience_df.merge(discharge_table, left_on='discharge_id', right_on='id', copy=False)

print(experience_master_df.shape)

(541, 12)


In [15]:
def get_model(var):
    return sm.GLM.from_formula(f'disposition ~ {var} + age', family = sm.families.Binomial(), data=experience_master_df)

model = get_model('avg_clust')
result = model.fit()
result.summary() 

0,1,2,3
Dep. Variable:,disposition,No. Observations:,541.0
Model:,GLM,Df Residuals:,538.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-188.08
Date:,"Tue, 09 Mar 2021",Deviance:,376.17
Time:,14:10:17,Pearson chi2:,541.0
No. Iterations:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.5821,2.129,-1.213,0.225,-6.755,1.591
avg_clust,0.7183,0.780,0.920,0.357,-0.811,2.248
age,0.0063,0.029,0.216,0.829,-0.051,0.064


In [16]:
model = get_model('cumulative_experience')
result = model.fit()
result.summary() 

0,1,2,3
Dep. Variable:,disposition,No. Observations:,541.0
Model:,GLM,Df Residuals:,538.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-188.09
Date:,"Tue, 09 Mar 2021",Deviance:,376.18
Time:,14:10:30,Pearson chi2:,541.0
No. Iterations:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.7120,2.145,-1.264,0.206,-6.916,1.492
cumulative_experience,0.0106,0.012,0.886,0.376,-0.013,0.034
age,0.0066,0.029,0.224,0.822,-0.051,0.064


In [17]:
model = get_model('avg_cumulative_experience')
result = model.fit()
result.summary()   

0,1,2,3
Dep. Variable:,disposition,No. Observations:,541.0
Model:,GLM,Df Residuals:,538.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-188.21
Date:,"Tue, 09 Mar 2021",Deviance:,376.43
Time:,14:10:35,Pearson chi2:,541.0
No. Iterations:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.7103,2.149,-1.261,0.207,-6.922,1.501
avg_cumulative_experience,0.0583,0.081,0.719,0.472,-0.101,0.217
age,0.0060,0.029,0.207,0.836,-0.051,0.063


In [19]:
experience_df.head(20)

Unnamed: 0,discharge_id,avg_clust,cumulative_experience,avg_cumulative_experience,team_edge_size,team_size
0,40,0.0,3.0,1.5,1,2
1,208,0.333333,27.0,5.4,5,5
2,60,0.0,11.0,2.75,2,4
3,28,0.0,6.0,3.0,1,2
4,285,0.0,12.0,3.0,2,4
5,338,0.0,10.0,2.5,2,4
6,158,0.0,6.0,3.0,1,2
7,241,0.40169,18.0,4.5,4,4
8,105,0.0,25.0,5.0,4,5
9,457,0.0,6.0,3.0,1,2
