In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from itertools import combinations 
import time
import os
import sys
from tqdm import tqdm

In [2]:
start_time = time.perf_counter()

discharges_test_file = '../data/discharges_test.csv'
notes_test_file = '../data/notes_test.csv'

# authors who wrote notes within 48 hours of arrival date are in index team for that patient
INDEX_DELTA = np.timedelta64(2, 'D')
# notes written within previous 90 days are considered when calculating collaborative experience
TEAMWORK_DELTA = np.timedelta64(90, 'D')

discharge_table = pd.read_csv(discharges_test_file, parse_dates=['arrive_date', 'discharge_date'])
notes_raw = pd.read_csv(notes_test_file, parse_dates=['date'])

# Build the working table as one chain so that re-running the cell never
# mutates a previously displayed frame in place.
notes_table = (
    notes_raw
    # the notes file calls the visit key 'discharge_id'; the discharge table calls it 'id'
    .rename(columns={'discharge_id': 'id'})
    # right join: every discharge row is kept and its arrival/discharge columns
    # are attached to each of its notes
    .merge(discharge_table, on='id', how='right')
    .assign(
        # Day-resolution copies of the timestamps. `.dt.normalize()` zeroes the
        # time of day; the previous `astype('datetime64[D]')` relies on a unit
        # that newer pandas releases no longer accept in `astype`.
        norm_arrive_date=lambda df: df['arrive_date'].dt.normalize(),
        norm_note_date=lambda df: df['date'].dt.normalize(),
        # indicator column for whether the note author is in the index team
        # NOTE(review): a note dated *before* arrival gives a negative
        # difference and therefore also counts as in-team -- confirm intended.
        is_in_team=lambda df: (df['date'] - df['arrive_date']) <= INDEX_DELTA,
    )
    # at most one row per (author, visit, calendar day)
    .drop_duplicates(['norm_note_date', 'dr', 'id'], keep='first')
    .sort_values('arrive_date')
)

# earliest arrival in the data set (the table is sorted by arrive_date above)
FIRST_DATE = notes_table['arrive_date'].iloc[0]

stop_time = time.perf_counter()
print(f"It took {stop_time - start_time} seconds or {(stop_time - start_time) / 60} minutes to read in data and modify table")

print(notes_table.head())

It took 0.04005100001813844 seconds or 0.000667516666968974 minutes to read in data and modify table
   id             dr       date   patient arrive_date discharge_date  \
0   0    Brad Palmer 2019-01-01  patient1  2019-01-01     2019-01-01   
1   1  Albert Romero 2019-01-24  patient2  2019-01-24     2019-01-24   
2   1   Margie Meyer 2019-01-24  patient2  2019-01-24     2019-01-24   
3   1   Evan Frazier 2019-01-24  patient2  2019-01-24     2019-01-24   
4   1  Myrtle George 2019-01-24  patient2  2019-01-24     2019-01-24   

   disposition  age norm_arrive_date norm_note_date  is_in_team  
0            1   75       2019-01-01     2019-01-01        True  
1            0   68       2019-01-24     2019-01-24        True  
2            0   68       2019-01-24     2019-01-24        True  
3            0   68       2019-01-24     2019-01-24        True  
4            0   68       2019-01-24     2019-01-24        True  


In [3]:
start_time = time.perf_counter()

# Self join on discharge id and normalized note date: each resulting row pairs
# two authors who wrote a note for the same visit on the same calendar day.
note_author_cols = notes_table[['dr', 'id', 'is_in_team', 'norm_note_date']]
edges_table = notes_table.merge(note_author_cols, how='inner', on=['id', 'norm_note_date'])

# Keep a single ordering per author pair; this also drops self-pairs.
# `.copy()` makes the filtered frame independent, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
edges_table = edges_table[edges_table['dr_x'] < edges_table['dr_y']].copy()

# Edge key is the two names concatenated.
# NOTE(review): there is no separator, so two different pairs could in
# principle concatenate to the same key. The same key format is used for the
# dictionary lookups when graph weights are computed, so change both together.
edges_table['edge'] = edges_table['dr_x'] + edges_table['dr_y']

# might be able to remove this line, need to discuss
edges_table['is_in_team'] = edges_table['is_in_team_x'] & edges_table['is_in_team_y']
# True when the visit arrives more than TEAMWORK_DELTA after the first arrival
# in the data, i.e. a full look-back window exists for it.
# (The flag is only stored here; this cell does not filter on it.)
edges_table['is_after_delta'] = edges_table["arrive_date"] > (FIRST_DATE + TEAMWORK_DELTA)

stop_time = time.perf_counter()
print(f"It took {stop_time - start_time} seconds or {(stop_time - start_time) / 60} minutes to self join and get edges table")

It took 0.03155489999335259 seconds or 0.0005259149998892099 minutes to self join and get edges table


In [4]:
start_time = time.perf_counter()

# Self join on discharge id only: each resulting row pairs two authors who
# wrote a note for the same visit, regardless of the day each note was written.
note_author_cols = notes_table[['dr', 'id', 'is_in_team', 'norm_note_date']]
team_table = notes_table.merge(note_author_cols, how='inner', on='id')

# Keep a single ordering per author pair; this also drops self-pairs.
# `.copy()` makes the filtered frame independent, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
team_table = team_table[team_table['dr_x'] < team_table['dr_y']].copy()

# Edge key is the two names concatenated (no separator); it must stay in the
# same format as the key built for the edges table because it is used to look
# up that table's note dates later on.
team_table['edge'] = team_table['dr_x'] + team_table['dr_y']

# might be able to remove this line, need to discuss
team_table['is_in_team'] = team_table['is_in_team_x'] & team_table['is_in_team_y']
# True when the visit arrives more than TEAMWORK_DELTA after the first arrival
# in the data, i.e. a full look-back window exists for it.
team_table['is_after_delta'] = team_table["arrive_date"] > (FIRST_DATE + TEAMWORK_DELTA)

# Only pairs where both authors are in the index team, on visits with a full
# look-back window, count as team edges.
team_table = team_table[team_table['is_in_team'] & team_table['is_after_delta']]

stop_time = time.perf_counter()
# Message previously said "edges table" (copied from the cell above).
print(f"It took {stop_time - start_time} seconds or {(stop_time - start_time) / 60} minutes to self join and get team table")

It took 0.02238260000012815 seconds or 0.0003730433333354692 minutes to self join and get edges table


In [5]:
def get_output_for_row(g, visit_id):
    """Summarise one visit's team graph as a flat record.

    Parameters
    ----------
    g : networkx graph
        Team graph whose edges carry a 'weight' attribute.
    visit_id
        Identifier stored under the 'discharge_id' key of the result.

    Returns
    -------
    dict
        Keys: 'discharge_id', 'avg_clust', 'sum_clust', 'team_size',
        'team_edge_size', 'cumulative_experience',
        'avg_cumulative_experience'.
    """
    # Weighted clustering coefficient of every node, keyed by node.
    per_node_clustering = nx.clustering(g, weight='weight')
    clustering_total = sum(per_node_clustering.values())
    clustered_nodes = len(per_node_clustering)

    node_count = g.number_of_nodes()
    edge_count = g.number_of_edges()

    # Sum of all edge weights, reduced by one unit per edge.
    experience_beyond_edges = g.size(weight='weight') - edge_count

    # Both averages fall back to 0 for an empty graph instead of dividing by zero.
    if clustered_nodes > 0:
        mean_clustering = clustering_total / clustered_nodes
    else:
        mean_clustering = 0

    if node_count > 0:
        mean_experience = experience_beyond_edges / node_count
    else:
        mean_experience = 0

    return {
        'discharge_id': visit_id,
        'avg_clust': mean_clustering,
        'sum_clust': clustering_total,
        'team_size': node_count,
        'team_edge_size': edge_count,
        'cumulative_experience': experience_beyond_edges,
        'avg_cumulative_experience': mean_experience,
    }

In [6]:
# discharge id -> list of (edge key, (author, author), normalized arrival date)
discharge_id_to_edges_dict = {}
# edge key -> list of normalized note dates on which the two authors both wrote
edge_to_date_dict = {}


def add_to_team_dict(edge_record):
    """Append one team edge to its visit's entry in `discharge_id_to_edges_dict`.

    `edge_record` is a row-like mapping providing 'dr_x', 'dr_y', 'edge',
    'norm_arrive_date' and 'id'. Returns None; the dictionary is updated in place.
    """
    author_pair = (edge_record['dr_x'], edge_record['dr_y'])
    team_edges = discharge_id_to_edges_dict.setdefault(edge_record['id'], [])
    team_edges.append((edge_record['edge'], author_pair, edge_record['norm_arrive_date']))


def add_edge_to_dict(edge_record):
    """Record one collaboration date for an edge in `edge_to_date_dict`.

    `edge_record` is a row-like mapping providing 'edge' and 'norm_note_date'.
    Returns None; the dictionary is updated in place.
    """
    edge_key = edge_record['edge']
    if edge_key not in edge_to_date_dict:
        edge_to_date_dict[edge_key] = []
    edge_to_date_dict[edge_key].append(edge_record['norm_note_date'])
    
# Registers `progress_apply` on pandas objects, with this label on the bar.
tqdm.pandas(desc="Dictionary Progress Bar!")

start_time = time.perf_counter()

# Each pass walks a table row by row purely for its side effect of filling one
# of the lookup dictionaries; the values returned by the apply are discarded.
# Order matters only for readability: edge dates first, then team edges.
for frame, collector in (
    (edges_table, add_edge_to_dict),
    (team_table, add_to_team_dict),
):
    frame.progress_apply(collector, axis=1)

stop_time = time.perf_counter()
print(f"It took {stop_time - start_time} seconds or {(stop_time - start_time) / 60} minutes to build dictionaries")

  from pandas import Panel
Dictionary Progress Bar!: 100%|██████████| 28/28 [00:00<00:00, 2515.54it/s]
Dictionary Progress Bar!: 100%|██████████| 21/21 [00:00<00:00, 4070.26it/s]

It took 0.032036000047810376 seconds or 0.000533933334130173 minutes to build dictionaries





In [7]:
# One summary record per discharge, in the iteration order of
# `discharge_id_to_edges_dict`.
experience_data_list = []

start_time = time.perf_counter()

for discharge_id, edge_items in tqdm(discharge_id_to_edges_dict.items()):
    # Edges are added straight to the graph; no intermediate DataFrame is
    # needed. A visit with no qualifying edge yields an empty graph.
    g = nx.Graph()
    for edge, (dr_x, dr_y), arrive_date in edge_items:
        # Start of the look-back window, computed once per edge rather than
        # once per note date.
        window_start = arrive_date - TEAMWORK_DELTA
        # Weight = number of recorded shared note days that fall inside
        # [arrive_date - TEAMWORK_DELTA, arrive_date). An edge with no recorded
        # dates simply gets weight 0 and is skipped below.
        weight = sum(
            1
            for note_day in edge_to_date_dict.get(edge, ())
            if window_start <= note_day < arrive_date
        )
        if weight < 1:
            continue
        # Re-adding an existing pair just overwrites its weight with the same
        # value, matching what building from an edge list did.
        g.add_edge(dr_x, dr_y, weight=weight)
    experience_data_list.append(get_output_for_row(g, discharge_id))

stop_time = time.perf_counter()
print(f"It took {stop_time - start_time} seconds or {(stop_time - start_time) / 60} minutes to create graphs")

100%|██████████| 1/1 [00:00<00:00, 182.37it/s]

It took 0.012031399994157255 seconds or 0.00020052333323595424 minutes to create graphs





In [8]:
# Columns (and their order) for the results frame. Record keys that are not
# listed here -- e.g. 'sum_clust' -- are left out of the frame.
columns = [
    'discharge_id',
    'avg_clust',
    'cumulative_experience',
    'avg_cumulative_experience',
    'team_edge_size',
    'team_size',
]

# One row per record collected in `experience_data_list`.
experience_df = pd.DataFrame(data=experience_data_list, columns=columns)
experience_df.head(20)

Unnamed: 0,discharge_id,avg_clust,cumulative_experience,avg_cumulative_experience,team_edge_size,team_size
0,6,0.677976,6.0,1.0,15,6


In [9]:
# Debug aid: print every stored team edge tuple, visit by visit.
for edge_items in discharge_id_to_edges_dict.values():
    for edge_item in edge_items:
        print(edge_item)

('Albert RomeroMargie Meyer', ('Albert Romero', 'Margie Meyer'), Timestamp('2019-04-15 00:00:00'))
('Albert RomeroEvan Frazier', ('Albert Romero', 'Evan Frazier'), Timestamp('2019-04-15 00:00:00'))
('Albert RomeroMyrtle George', ('Albert Romero', 'Myrtle George'), Timestamp('2019-04-15 00:00:00'))
('Albert RomeroVictoria Washington', ('Albert Romero', 'Victoria Washington'), Timestamp('2019-04-15 00:00:00'))
('Albert RomeroBrad Palmer', ('Albert Romero', 'Brad Palmer'), Timestamp('2019-04-15 00:00:00'))
('Albert RomeroNeil Mitchell', ('Albert Romero', 'Neil Mitchell'), Timestamp('2019-04-15 00:00:00'))
('Margie MeyerMyrtle George', ('Margie Meyer', 'Myrtle George'), Timestamp('2019-04-15 00:00:00'))
('Margie MeyerVictoria Washington', ('Margie Meyer', 'Victoria Washington'), Timestamp('2019-04-15 00:00:00'))
('Margie MeyerNeil Mitchell', ('Margie Meyer', 'Neil Mitchell'), Timestamp('2019-04-15 00:00:00'))
('Evan FrazierMargie Meyer', ('Evan Frazier', 'Margie Meyer'), Timestamp('2019-04

In [10]:
'''This code does not work but playing around with an easy API for the library'''

# Design sketch only -- everything below is intentionally commented out.
# TODO: `TeamWorkStudy` and `notes_csv_file` are not defined in the cells shown
# in this notebook; they would have to exist before this can be enabled.

# Proposed mapping from the library's generic field names (keys) to the column
# names used in this notebook's data (values).
# column_names = {
#     "visit_id": "discharge_id",
#     "admission_date": "arrive_date",
#     "note_author": "dr",
#     "note_date": "date"
# }

# Presumably day counts mirroring TEAMWORK_DELTA (90) and INDEX_DELTA (2) -- confirm.
# EXPERIENCE_WINDOW = 90
# TEAM_WINDOW = 2

# study = TeamWorkStudy(notes_csv_file, EXPERIENCE_WINDOW,TEAM_WINDOW, column_names=column_names)



'This code does not work but playing around with an easy API for the library'