In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import networkx as nx
import datetime
import os
import sys

# # Import from the parent directory
sys.path.append(os.path.join(os.getcwd(), '..'))
from utils import utils

Read in discharges and notes data, merge on discharge_id, drop duplicates

In [2]:
discharge_table = pd.read_csv('../data/discharges_w_disposition_abdul_test.csv', parse_dates=['arrive_date','discharge_date'])
notes_table = pd.read_csv('../data/notes_w_disposition_abdul_test.csv', parse_dates=['date'])
notes_table.rename(columns = {'discharge_id':'id'}, inplace = True)

# authors who wrote notes within 48 hours of arrival date are in index team for that patient
INDEX_DELTA = np.timedelta64(2, 'D')
# notes written within previous 90 days are considered when calculating collaborative experience
TEAMWORK_DELTA = np.timedelta64(90, 'D')

# match in admission datetime indexing on visit id from discharge table
notes_table = notes_table.merge(discharge_table, on='id', how='right')

# create new column for normalized date by ignoring time of day
notes_table['normalized_date'] = notes_table['arrive_date'].astype('datetime64[D]')

notes_table.drop_duplicates(['normalized_date','dr','id'], keep="first", inplace=True)

notes_table.sort_values('arrive_date', inplace=True)
FIRST_DATE = notes_table['arrive_date'].iloc[0]

notes_table.head()

Unnamed: 0,id,dr,date,patient,arrive_date,discharge_date,disposition,age,normalized_date
6918,42,John Romero,2020-03-11,Celia Diaz,2020-03-10,2020-03-17,0,76,2020-03-10
7011,42,Susie Reed,2020-03-13,Celia Diaz,2020-03-10,2020-03-17,0,76,2020-03-10
7022,42,Sara Bailey,2020-03-17,Celia Diaz,2020-03-10,2020-03-17,0,76,2020-03-10
6930,42,Johnni Graham,2020-03-14,Celia Diaz,2020-03-10,2020-03-17,0,76,2020-03-10
6929,42,Lucas Vicente,2020-03-14,Celia Diaz,2020-03-10,2020-03-17,0,76,2020-03-10


"Self join" the notes table on the discharge id to create edge list, with some steps to remove duplicates

In [15]:
edges_table = notes_table.merge(notes_table[['dr','id','date']], how='left', on='id')
edges_table = edges_table[edges_table['dr_x'] < edges_table['dr_y']]
edges_table['edge'] = edges_table['dr_x'] + edges_table['dr_y']
edges_table.head()

Unnamed: 0,id,dr_x,date_x,patient,arrive_date,discharge_date,disposition,age,normalized_date,dr_y,date_y,edge
1,42,John Romero,2020-03-11,Celia Diaz,2020-03-10,2020-03-17,0,76,2020-03-10,Susie Reed,2020-03-13,John RomeroSusie Reed
2,42,John Romero,2020-03-11,Celia Diaz,2020-03-10,2020-03-17,0,76,2020-03-10,Sara Bailey,2020-03-17,John RomeroSara Bailey
3,42,John Romero,2020-03-11,Celia Diaz,2020-03-10,2020-03-17,0,76,2020-03-10,Johnni Graham,2020-03-14,John RomeroJohnni Graham
4,42,John Romero,2020-03-11,Celia Diaz,2020-03-10,2020-03-17,0,76,2020-03-10,Lucas Vicente,2020-03-14,John RomeroLucas Vicente
6,42,John Romero,2020-03-11,Celia Diaz,2020-03-10,2020-03-17,0,76,2020-03-10,Shaun Diaz,2020-03-16,John RomeroShaun Diaz


Define function to take in network graph, perform network calculations, and return formatted output

In [4]:
def get_output_for_row(g, visit_id):
    data = {}
    data['discharge_id'] = visit_id
    
    ''' Clustering coefficient of all nodes (in a dictionary) '''
    clustering_coefficient = nx.clustering(g, weight='weight')
    
    ''' Average clustering coefficient with divide-by-zero check '''
    clust_sum = sum(clustering_coefficient.values())
    clust_len = len(clustering_coefficient)
        
    data['avg_clust'] = clust_sum / clust_len if clust_len > 0 else 0 
    
    data['sum_clust'] = clust_sum
    data['team_size'] = g.number_of_nodes()
    data['team_edge_size'] = g.number_of_edges()
    
    experience = g.size(weight='weight') #Experience as sum of weights
    data['cumulative_experience'] = experience - data['team_edge_size']
    data['avg_cumulative_experience'] = data['cumulative_experience'] / data['team_size'] if data['team_size'] > 0 else 0
    
    return data

Get the index teams, then loop through teams by discharge id, add up edges, and create network graph for each team. Add the output to a list 

In [5]:
is_index_edge = ((edges_table["date_x"] - edges_table["arrive_date"] < INDEX_DELTA) 
                 & (edges_table["date_y"] - edges_table["arrive_date"] < INDEX_DELTA))
    
is_after_delta = (edges_table["arrive_date"] > (FIRST_DATE + TEAMWORK_DELTA))

index_table = edges_table[(is_index_edge) & (is_after_delta)]

experience_data_list= pd.DataFrame(columns = ['discharge_id',
        'avg_clust',
        'cumulative_experience',
        'avg_cumulative_experience',
        "team_edge_size",
        "team_size",
         "sum_clust"])

index_table.head()

Unnamed: 0,id,dr_x,date_x,patient,arrive_date,discharge_date,disposition,age,normalized_date,dr_y,date_y,edge
9088,34,Ana Santos,2020-06-16,Tracy Olson,2020-06-15,2020-06-22,0,71,2020-06-15,Shaun Diaz,2020-06-16,Ana SantosShaun Diaz
9099,34,Ana Santos,2020-06-16,Tracy Olson,2020-06-15,2020-06-22,0,71,2020-06-15,Jose Esteban,2020-06-16,Ana SantosJose Esteban
9100,34,Ana Santos,2020-06-16,Tracy Olson,2020-06-15,2020-06-22,0,71,2020-06-15,Shane Mcdonalid,2020-06-16,Ana SantosShane Mcdonalid
9101,34,Ana Santos,2020-06-16,Tracy Olson,2020-06-15,2020-06-22,0,71,2020-06-15,Johnni Graham,2020-06-16,Ana SantosJohnni Graham
9105,34,Ana Santos,2020-06-16,Tracy Olson,2020-06-15,2020-06-22,0,71,2020-06-15,Celia Santana,2020-06-16,Ana SantosCelia Santana


In [6]:
# measure performance
start_time = time.perf_counter()

for visit_id in index_table.id.unique():
    team = index_table[index_table['id'] == visit_id]
    edge_list = pd.DataFrame(columns = ['source', 'target', 'weight'])
    for i,edge in team.iterrows():
        cutoff_lower = edge['normalized_date'] - TEAMWORK_DELTA
        is_in_window = (edges_table['normalized_date'] >= cutoff_lower) & (edges_table['normalized_date'] < edge['normalized_date'])
        weight = len(edges_table[(edges_table['edge'] == edge['edge']) & (is_in_window)])
#         Need the following line if we don't add 1 to weights total outside this inner loop
        if(weight < 1): continue 
        edge_list = edge_list.append({'source':edge['dr_x'],'target':edge['dr_y'],'weight':weight}, ignore_index=True)

#     TODO: discuss this line. Adding 1 to the weight closes the triangles. Otherwise we need to leave out 0 weights  
#     edge_list["weight"] = edge_list["weight"] + 1 
    g = nx.Graph()  # empty graph
    g = nx.from_pandas_edgelist(edge_list, source='source', target='target',edge_attr='weight')
    experience_data_list = experience_data_list.append(get_output_for_row(g, visit_id), ignore_index=True)  

stop_time = time.perf_counter()
print(f"It took {stop_time - start_time} seconds or {(stop_time - start_time) / 60} minutes")

It took 4.223624700010987 seconds or 0.07039374500018311 minutes


Create dataframe from experience output list, and join with discharge data on discharge id to get disposition and age for further analysis

In [7]:
experience_df = pd.DataFrame(experience_data_list, columns=utils.columns).drop_duplicates()

experience_master_df = experience_df.merge(discharge_table, left_on='discharge_id', right_on='id', copy=False)

print(experience_master_df.shape)

(39, 12)


In [13]:
# experience_master_df.head()

edges_table.shape()

TypeError: 'tuple' object is not callable

In [8]:
def get_model(var):
    return sm.GLM.from_formula(f'disposition ~ {var} + age', family = sm.families.Binomial(), data=experience_master_df)

model = get_model('avg_clust')
result = model.fit()
result.summary() 

  t = np.exp(-z)
  endog_mu = self._clean(endog / mu)
  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu)) +
  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu)) +


0,1,2,3
Dep. Variable:,disposition,No. Observations:,39.0
Model:,GLM,Df Residuals:,36.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,
Date:,"Mon, 08 Mar 2021",Deviance:,
Time:,11:06:06,Pearson chi2:,26.1
No. Iterations:,100,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1527.1988,2.97e+07,-5.14e-05,1.000,-5.83e+07,5.83e+07
avg_clust,1516.4609,2.97e+07,5.1e-05,1.000,-5.83e+07,5.83e+07
age,0.1249,0.116,1.082,0.279,-0.101,0.351


In [9]:
model = get_model('cumulative_experience')
result = model.fit()
result.summary() 

0,1,2,3
Dep. Variable:,disposition,No. Observations:,39.0
Model:,GLM,Df Residuals:,36.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-12.565
Date:,"Mon, 08 Mar 2021",Deviance:,25.129
Time:,11:06:06,Pearson chi2:,39.2
No. Iterations:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-19.2288,11.855,-1.622,0.105,-42.464,4.006
cumulative_experience,0.0079,0.005,1.690,0.091,-0.001,0.017
age,0.2191,0.154,1.425,0.154,-0.082,0.520


In [10]:
model = get_model('avg_cumulative_experience')
result = model.fit()
result.summary()   

0,1,2,3
Dep. Variable:,disposition,No. Observations:,39.0
Model:,GLM,Df Residuals:,36.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-12.546
Date:,"Mon, 08 Mar 2021",Deviance:,25.093
Time:,11:06:07,Pearson chi2:,35.9
No. Iterations:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-19.9161,11.968,-1.664,0.096,-43.373,3.541
avg_cumulative_experience,0.0775,0.047,1.639,0.101,-0.015,0.170
age,0.2186,0.152,1.436,0.151,-0.080,0.517
