In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from itertools import combinations 
import time
import os
import sys
from tqdm import tqdm

In [2]:
start_time = time.perf_counter()

# authors who wrote notes within 48 hours of arrival date are in index team for that patient
INDEX_DELTA = np.timedelta64(2, 'D')
# notes written within previous 90 days are considered when calculating collaborative experience
TEAMWORK_DELTA = np.timedelta64(90, 'D')

discharge_table = pd.read_csv('../data/discharges_w_disposition_abdul_test.csv', parse_dates=['arrive_date','discharge_date'])

notes_table = (
    pd.read_csv('../data/notes_w_disposition_abdul_test.csv', parse_dates=['date'])
    .rename(columns={'discharge_id': 'id'})
    # match in admission datetime indexing on visit id from discharge table;
    # the right join keeps every discharge row, even one with no notes
    .merge(discharge_table, on='id', how='right')
    # normalized dates ignore the time of day. `.dt.normalize()` is used instead of
    # `astype('datetime64[D]')`, which newer pandas versions no longer accept.
    .assign(
        norm_arrive_date=lambda df: df['arrive_date'].dt.normalize(),
        norm_note_date=lambda df: df['date'].dt.normalize(),
    )
    # keep a single note per author, per visit, per calendar day
    .drop_duplicates(['norm_note_date', 'dr', 'id'], keep='first')
    .sort_values('arrive_date')
    # indicator column for whether the note author is in the index team
    # NOTE(review): there is no lower bound, so a note dated before arrival also
    # counts as index team -- confirm this is intended.
    .assign(is_in_team=lambda df: (df['date'] - df['arrive_date']) <= INDEX_DELTA)
)

# the table is sorted by arrival, so the first row holds the earliest arrival date
FIRST_DATE = notes_table['arrive_date'].iloc[0]

stop_time = time.perf_counter()
print(f"It took {stop_time - start_time} seconds or {(stop_time - start_time) / 60} minutes to read in data and modify table")

print(notes_table.head())

It took 0.14399330000014743 seconds or 0.0023998883333357906 minutes to read in data and modify table
      id     dr       date    patient arrive_date discharge_date  disposition  \
2559  15  dr212 2020-03-16  patient16  2020-03-09     2020-03-16            0   
2533  15  dr189 2020-03-13  patient16  2020-03-09     2020-03-16            0   
2534  15  dr189 2020-03-12  patient16  2020-03-09     2020-03-16            0   
2540  15  dr193 2020-03-15  patient16  2020-03-09     2020-03-16            0   
2545  15   dr28 2020-03-11  patient16  2020-03-09     2020-03-16            0   

      age norm_arrive_date norm_note_date  is_in_team  
2559   65       2020-03-09     2020-03-16       False  
2533   65       2020-03-09     2020-03-13       False  
2534   65       2020-03-09     2020-03-12       False  
2540   65       2020-03-09     2020-03-15       False  
2545   65       2020-03-09     2020-03-11        True  


In [3]:
start_time = time.perf_counter()

# Pair every note with every other note written for the same visit on the same
# calendar day: a self join on discharge id and normalized note date.
partner_columns = ['dr', 'id', 'is_in_team', 'norm_note_date']
all_pairs = notes_table.merge(
    notes_table[partner_columns],
    how='inner',
    on=['id', 'norm_note_date'],
)

# Keeping only rows where dr_x sorts strictly before dr_y drops self-pairs and
# keeps a single orientation of each pair.
is_ordered_pair = all_pairs['dr_x'] < all_pairs['dr_y']
edges_table = all_pairs[is_ordered_pair]

# Edge key is the two author names concatenated with no separator.
# NOTE(review): this is only unambiguous if names cannot run together
# (e.g. they all share a fixed "dr<N>" shape) -- confirm.
edges_table['edge'] = edges_table['dr_x'] + edges_table['dr_y']

# An edge is in the index team only when both of its authors are.
# (original note: might be able to remove this line, need to discuss)
edges_table['is_in_team'] = edges_table['is_in_team_x'] & edges_table['is_in_team_y']

# Flag visits arriving more than TEAMWORK_DELTA after the first arrival in the data;
# visits without a full look-back window are not counted as index team later on.
earliest_full_window = FIRST_DATE + TEAMWORK_DELTA
edges_table['is_after_delta'] = edges_table["arrive_date"] > earliest_full_window

stop_time = time.perf_counter()
print(f"It took {stop_time - start_time} seconds or {(stop_time - start_time) / 60} minutes to self join and get edges table")

It took 0.12910680000095454 seconds or 0.002151780000015909 minutes to self join and get edges table


In [4]:
def get_output_for_row(g, visit_id):
    """Summarise one visit's collaboration graph as a flat record.

    Parameters
    ----------
    g : graph object
        Passed to ``nx.clustering`` and queried with ``number_of_nodes()``,
        ``number_of_edges()`` and ``size(weight='weight')``.
    visit_id
        Identifier stored under ``'discharge_id'`` in the result.

    Returns
    -------
    dict
        Keys, in order: ``discharge_id``, ``avg_clust``, ``sum_clust``,
        ``team_size``, ``team_edge_size``, ``cumulative_experience``,
        ``avg_cumulative_experience``.
    """
    # Weighted clustering coefficient of every node, keyed by node.
    per_node_clustering = nx.clustering(g, weight='weight')
    clustering_total = sum(per_node_clustering.values())
    clustered_nodes = len(per_node_clustering)

    # Guard against dividing by zero when the graph has no nodes.
    if clustered_nodes > 0:
        clustering_mean = clustering_total / clustered_nodes
    else:
        clustering_mean = 0

    node_count = g.number_of_nodes()
    edge_count = g.number_of_edges()

    # Experience is the sum of edge weights minus the number of edges.
    total_weight = g.size(weight='weight')
    experience = total_weight - edge_count
    if node_count > 0:
        experience_per_member = experience / node_count
    else:
        experience_per_member = 0

    return {
        'discharge_id': visit_id,
        'avg_clust': clustering_mean,
        'sum_clust': clustering_total,
        'team_size': node_count,
        'team_edge_size': edge_count,
        'cumulative_experience': experience,
        'avg_cumulative_experience': experience_per_member,
    }

In [5]:
# map discharge id to list of edges
discharge_id_to_edges_dict = dict()
# map edge to list of dates when note authors collaborated
edge_to_date_dict = dict()

start_time = time.perf_counter()

# Iterate the needed columns in lockstep instead of using DataFrame.iterrows(),
# which constructs a Series for every row. Row order and values are unchanged.
edge_rows = zip(
    edges_table['id'],
    edges_table['dr_x'],
    edges_table['dr_y'],
    edges_table['edge'],
    edges_table['norm_arrive_date'],
    edges_table['norm_note_date'],
    edges_table['is_in_team'],
    edges_table['is_after_delta'],
)

for (visit_id, dr_x, dr_y, edge, arrive_day, note_day,
     in_team, after_delta) in tqdm(edge_rows, total=len(edges_table)):
    # only index-team edges from visits with a full look-back window define a team graph
    if in_team and after_delta:
        # store edge, individual note author names, and arrive date in list item
        edge_list_item = (edge, (dr_x, dr_y), arrive_day)
        discharge_id_to_edges_dict.setdefault(visit_id, []).append(edge_list_item)
    # every co-authored note day feeds the collaboration history, index team or not
    edge_to_date_dict.setdefault(edge, []).append(note_day)

stop_time = time.perf_counter()
print(f"It took {stop_time - start_time} seconds or {(stop_time - start_time) / 60} minutes to build dictionaries")

122390it [00:13, 9012.93it/s] 

It took 13.58689079999931 seconds or 0.2264481799999885 minutes to build dictionaries





In [6]:
experience_data_list = []

start_time = time.perf_counter()

for discharge_id, edge_items in tqdm(discharge_id_to_edges_dict.items()):
    # Build the visit's team graph directly; there is no need to go through an
    # intermediate DataFrame for each discharge.
    g = nx.Graph()
    for edge, (dr_x, dr_y), arrive_date in edge_items:
        # look-back window is [arrive_date - TEAMWORK_DELTA, arrive_date)
        window_start = arrive_date - TEAMWORK_DELTA
        # weight = number of recorded collaboration days for this pair inside the window
        weight = sum(1 for note_day in edge_to_date_dict[edge]
                     if window_start <= note_day < arrive_date)
        # pairs with no prior collaboration in the window are left out of the graph
        if weight < 1:
            continue
        # adding the same pair again just rewrites the same weight
        g.add_edge(dr_x, dr_y, weight=weight)
    # a discharge whose edges were all filtered out still yields a row (empty graph)
    experience_data_list.append(get_output_for_row(g, discharge_id))

stop_time = time.perf_counter()
print(f"It took {stop_time - start_time} seconds or {(stop_time - start_time) / 60} minutes to create graphs")

100%|██████████| 541/541 [00:00<00:00, 665.48it/s]

It took 0.8158968999996432 seconds or 0.01359828166666072 minutes to create graphs





In [7]:
# Columns kept in the final table, in display order.
# Any other key present in the records (e.g. 'sum_clust') is left out.
columns = [
    'discharge_id',
    'avg_clust',
    'cumulative_experience',
    'avg_cumulative_experience',
    'team_edge_size',
    'team_size',
]

# One row per record, then drop rows that are exact duplicates.
experience_df = pd.DataFrame(experience_data_list, columns=columns)
experience_df = experience_df.drop_duplicates()

# Preview the first 20 rows.
experience_df.head(20)

Unnamed: 0,discharge_id,avg_clust,cumulative_experience,avg_cumulative_experience,team_edge_size,team_size
0,40,0.0,3.0,1.5,1,2
1,208,0.333333,27.0,5.4,5,5
2,60,0.0,16.0,2.666667,3,6
3,28,0.0,6.0,3.0,1,2
4,285,0.0,12.0,3.0,2,4
5,338,0.0,10.0,2.5,2,4
6,158,0.0,6.0,3.0,1,2
7,241,0.40169,18.0,4.5,4,4
8,105,0.0,23.0,4.6,4,5
9,457,0.0,6.0,3.0,1,2
