In [1]:
import pandas as pd
import time
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from multiprocessing import Pool
import networkx as nx
import datetime
import os
import sys

# Import from the parent directory
sys.path.append(os.path.join(os.getcwd(), '..'))
from teamwork import teamwork_v2 as tw
from utils import utils

In [2]:
# Load the discharge records and the clinician notes written during each visit.
discharge_table = pd.read_csv(
    '../data/discharges_w_disposition_new.csv',
    parse_dates=['arrive_date', 'discharge_date'],
)
notes_table = pd.read_csv(
    '../data/notes_w_disposition_new.csv',
    parse_dates=['date'],
).rename(columns={'discharge_id': 'id'})  # align the visit key with discharge_table

# Notes written within INDEX_DELTA of arrival define a visit's team;
# TEAMWORK_DELTA is the look-back window used when counting prior collaborations.
INDEX_DELTA = datetime.timedelta(days=2)
TEAMWORK_DELTA = datetime.timedelta(days=90)

# Attach the discharge columns (arrival/discharge dates, ...) to every note via the
# visit id. The right join keeps every discharge row, including visits without notes.
notes_table = notes_table.merge(discharge_table, on='id', how='right')

# Truncate the arrival timestamp to midnight. `.dt.normalize()` is used instead of
# `.astype('datetime64[D]')` because pandas >= 2.0 rejects casts to day-unit datetimes.
notes_table['normalized_date'] = pd.to_datetime(notes_table['arrive_date']).dt.normalize()

# Keep a single row per (arrival day, clinician, visit).
notes_table = notes_table.drop_duplicates(['normalized_date', 'dr', 'id'], keep="first")

notes_table.head()

Unnamed: 0,id,dr,date,patient,arrive_date,discharge_date,disposition,age,normalized_date
0,0,Adolfo Rubio,2020-07-06,Cindy Wheeler,2020-07-03,2020-07-06,0,74,2020-07-03
1,0,Bob Peterson,2020-07-05,Cindy Wheeler,2020-07-03,2020-07-06,0,74,2020-07-03
2,0,Duane Tucker,2020-07-05,Cindy Wheeler,2020-07-03,2020-07-06,0,74,2020-07-03
3,0,Nelson Hanson,2020-07-05,Cindy Wheeler,2020-07-03,2020-07-06,0,74,2020-07-03
4,0,Matthew Mason,2020-07-04,Cindy Wheeler,2020-07-03,2020-07-06,0,74,2020-07-03


In [3]:
# Self-join the notes on the visit id: every row pairs two note authors of one visit
# (suffix _x = left author, _y = right author).
author_cols = notes_table[['dr', 'id', 'date']]
all_pairs = notes_table.merge(author_cols, on='id', how='left')

# A strict ordering on the names drops self-pairs and keeps each unordered pair once.
is_ordered_pair = all_pairs['dr_x'] < all_pairs['dr_y']
edges_table = all_pairs[is_ordered_pair]

# Pair key used later to look up earlier collaborations of the same two clinicians.
# NOTE(review): the key is a plain concatenation without a delimiter - confirm that
# no two distinct pairs of names can produce the same string.
edges_table['edge'] = edges_table['dr_x'] + edges_table['dr_y']
edges_table.head()

Unnamed: 0,id,dr_x,date_x,patient,arrive_date,discharge_date,disposition,age,normalized_date,dr_y,date_y,edge
1,0,Adolfo Rubio,2020-07-06,Cindy Wheeler,2020-07-03,2020-07-06,0,74,2020-07-03,Bob Peterson,2020-07-05,Adolfo RubioBob Peterson
2,0,Adolfo Rubio,2020-07-06,Cindy Wheeler,2020-07-03,2020-07-06,0,74,2020-07-03,Duane Tucker,2020-07-05,Adolfo RubioDuane Tucker
3,0,Adolfo Rubio,2020-07-06,Cindy Wheeler,2020-07-03,2020-07-06,0,74,2020-07-03,Nelson Hanson,2020-07-05,Adolfo RubioNelson Hanson
4,0,Adolfo Rubio,2020-07-06,Cindy Wheeler,2020-07-03,2020-07-06,0,74,2020-07-03,Matthew Mason,2020-07-04,Adolfo RubioMatthew Mason
5,0,Adolfo Rubio,2020-07-06,Cindy Wheeler,2020-07-03,2020-07-06,0,74,2020-07-03,Nolan Stone,2020-07-04,Adolfo RubioNolan Stone


In [4]:
# Restrict to pairs whose two notes were both written less than INDEX_DELTA
# after the visit's arrival date.
offset_x = edges_table["date_x"] - edges_table["arrive_date"]
offset_y = edges_table["date_y"] - edges_table["arrive_date"]
index_table = edges_table[(offset_x < INDEX_DELTA) & (offset_y < INDEX_DELTA)]

def get_output_for_row(g, visit_id):
    """Summarise one visit's team graph as a flat dict of metrics.

    Args:
        g: Graph passed to ``nx.clustering``; must also provide
            ``number_of_nodes()``, ``number_of_edges()`` and ``size(weight=...)``.
        visit_id: Identifier stored unchanged under ``'discharge_id'``.

    Returns:
        dict with the keys ``discharge_id``, ``avg_clust``, ``sum_clust``,
        ``team_size``, ``team_edge_size``, ``cumulative_experience`` and
        ``avg_cumulative_experience``. Both averages are 0 when their
        denominator is 0.
    """
    # Weighted clustering coefficient of every node, keyed by node.
    per_node_clustering = nx.clustering(g, weight='weight')
    clustering_total = sum(per_node_clustering.values())
    clustered_nodes = len(per_node_clustering)

    node_count = g.number_of_nodes()
    edge_count = g.number_of_edges()

    # Total edge weight minus one unit per edge.
    surplus_weight = g.size(weight='weight') - edge_count

    # Guard both averages against division by zero on an empty graph.
    if clustered_nodes > 0:
        mean_clustering = clustering_total / clustered_nodes
    else:
        mean_clustering = 0

    if node_count > 0:
        surplus_per_member = surplus_weight / node_count
    else:
        surplus_per_member = 0

    return {
        'discharge_id': visit_id,
        'avg_clust': mean_clustering,
        'sum_clust': clustering_total,
        'team_size': node_count,
        'team_edge_size': edge_count,
        'cumulative_experience': surplus_weight,
        'avg_cumulative_experience': surplus_per_member,
    }

# Build one weighted collaboration graph per visit and collect its metrics.
#
# Rows are gathered in plain lists and each DataFrame is built once:
# DataFrame.append copied the whole frame on every call (quadratic in the number
# of rows) and was removed in pandas 2.0.
experience_rows = []
for visit_id in index_table.id.unique():
    team = index_table[index_table['id'] == visit_id]
    # NOTE(review): debugging output for visit 0 only; kept so the cell prints the same.
    if visit_id == 0: print(team)

    weighted_edges = []
    for i, edge in team.iterrows():
        # Count rows of edges_table for the same clinician pair whose visit arrived
        # in the TEAMWORK_DELTA window strictly before this visit's arrival day.
        cutoff_lower = edge['normalized_date'] - TEAMWORK_DELTA
        is_in_window = (
            (edges_table['normalized_date'] >= cutoff_lower)
            & (edges_table['normalized_date'] < edge['normalized_date'])
        )
        is_same_pair = edges_table['edge'] == edge['edge']
        thesum = int((is_same_pair & is_in_window).sum())
        if thesum < 1:
            # Pairs with no earlier collaboration contribute no edge to the graph.
            continue
        # The edge weight is the raw count of earlier collaborations.
        weighted_edges.append({'source': edge['dr_x'], 'target': edge['dr_y'], 'weight': thesum})

    # Explicit columns keep the frame well-formed even when no edge qualified.
    df = pd.DataFrame(weighted_edges, columns=['source', 'target', 'weight'])
    g = nx.from_pandas_edgelist(df, source='source', target='target', edge_attr='weight')
    experience_rows.append(get_output_for_row(g, visit_id))

experience_data_list = pd.DataFrame(
    experience_rows,
    columns=[
        'discharge_id',
        'avg_clust',
        'cumulative_experience',
        'avg_cumulative_experience',
        "team_edge_size",
        "team_size",
        "sum_clust",
    ],
)

print(experience_data_list.head())


    id           dr_x     date_x        patient arrive_date discharge_date  \
37   0  Matthew Mason 2020-07-04  Cindy Wheeler  2020-07-03     2020-07-06   

    disposition  age normalized_date         dr_y     date_y  \
37            0   74      2020-07-03  Nolan Stone 2020-07-04   

                        edge  
37  Matthew MasonNolan Stone  
   discharge_id  avg_clust  cumulative_experience  avg_cumulative_experience  \
0           0.0   0.000000                    0.0                        0.0   
1           1.0   0.000000                    1.0                        0.5   
2           3.0   0.000000                    0.0                        0.0   
3           4.0   0.631177                   10.0                        2.5   
4           6.0   0.000000                    0.0                        0.0   

   team_edge_size  team_size  sum_clust  
0             1.0        2.0   0.000000  
1             1.0        2.0   0.000000  
2             1.0        2.0   0.000000  
3  

In [5]:
# Select the columns listed in utils.columns from the per-visit metrics,
# then drop exact duplicate rows.
experience_selected = pd.DataFrame(experience_data_list, columns=utils.columns)
experience_df = experience_selected.drop_duplicates()

# Re-read the discharge records (no date parsing here) to supply the model covariates.
discharges_df = pd.read_csv('../data/discharges_w_disposition_new.csv')

# Inner join: only visits present in both tables reach the regressions.
experience_master_df = pd.merge(
    experience_df,
    discharges_df,
    left_on='discharge_id',
    right_on='id',
    copy=False,
)

print(experience_master_df.shape)

(262, 12)


In [6]:
def get_model(var):
    """Build (without fitting) a binomial GLM of ``disposition`` on ``var`` and ``age``.

    Args:
        var: Term inserted into the formula ``disposition ~ <var> + age``.

    Returns:
        The model created by ``sm.GLM.from_formula`` with a Binomial family,
        using the module-level ``experience_master_df`` as data.
    """
    formula = 'disposition ~ {} + age'.format(var)
    binomial_family = sm.families.Binomial()
    return sm.GLM.from_formula(formula, family=binomial_family, data=experience_master_df)

# Regress disposition on the team's average clustering coefficient, controlling for age.
# NOTE(review): the recorded summary reports a std err of 5.61e+04 for avg_clust -
# possibly (quasi-)separation in the data; confirm before interpreting the coefficient.
predictor = 'avg_clust'
model = get_model(predictor)
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,disposition,No. Observations:,262.0
Model:,GLM,Df Residuals:,259.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-70.132
Date:,"Thu, 04 Mar 2021",Deviance:,140.26
Time:,11:53:40,Pearson chi2:,198.0
No. Iterations:,24,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,3.2598,3.595,0.907,0.364,-3.786,10.305
avg_clust,-70.9729,5.61e+04,-0.001,0.999,-1.1e+05,1.1e+05
age,-0.0743,0.051,-1.464,0.143,-0.174,0.025


In [7]:
# Regress disposition on the team's cumulative experience, controlling for age.
# NOTE(review): the recorded summary reports a std err of 1.23e+04 for
# cumulative_experience - possibly (quasi-)separation; confirm before interpreting.
predictor = 'cumulative_experience'
model = get_model(predictor)
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,disposition,No. Observations:,262.0
Model:,GLM,Df Residuals:,259.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-59.326
Date:,"Thu, 04 Mar 2021",Deviance:,118.65
Time:,11:53:40,Pearson chi2:,129.0
No. Iterations:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,4.4798,3.730,1.201,0.230,-2.830,11.790
cumulative_experience,-22.1410,1.23e+04,-0.002,0.999,-2.41e+04,2.41e+04
age,-0.0843,0.053,-1.604,0.109,-0.187,0.019


In [8]:
# Regress disposition on the per-member average of cumulative experience, controlling for age.
# NOTE(review): the recorded summary reports a std err of 7.26e+04 for
# avg_cumulative_experience - possibly (quasi-)separation; confirm before interpreting.
predictor = 'avg_cumulative_experience'
model = get_model(predictor)
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,disposition,No. Observations:,262.0
Model:,GLM,Df Residuals:,259.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-59.326
Date:,"Thu, 04 Mar 2021",Deviance:,118.65
Time:,11:53:40,Pearson chi2:,129.0
No. Iterations:,26,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,4.4798,3.730,1.201,0.230,-2.830,11.790
avg_cumulative_experience,-90.1229,7.26e+04,-0.001,0.999,-1.42e+05,1.42e+05
age,-0.0843,0.053,-1.604,0.109,-0.187,0.019
