In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from itertools import combinations 
import time
import os
import sys
from tqdm import tqdm
from math import comb

In [2]:
start_time = time.perf_counter()

# discharges_test_file = '../data/discharges_test.csv'
# notes_test_file = '../data/notes_test.csv'

discharges_test_file = '../data/discharges_w_disposition_abdul_test.csv'
notes_test_file = '../data/notes_w_disposition_abdul_test.csv'

# authors who wrote notes within 48 hours of arrival date are in index team for that patient
INDEX_DELTA = np.timedelta64(2, 'D')
# notes written within previous 90 days are considered when calculating collaborative experience
TEAMWORK_DELTA = np.timedelta64(90, 'D')

discharge_table = pd.read_csv(discharges_test_file, parse_dates=['arrive_date', 'discharge_date'])
notes_table = pd.read_csv(notes_test_file, parse_dates=['date'])
# the notes file names the visit key 'discharge_id'; rename it so it matches discharge_table's 'id'
notes_table = notes_table.rename(columns={'discharge_id': 'id'})

# bring the discharge-table columns (including arrive_date) onto every note, matching on visit id.
# how='right' keeps every discharge row, so a visit with no notes still appears (with missing note fields).
notes_table = notes_table.merge(discharge_table, on='id', how='right')

# create day-resolution copies of both timestamps by dropping the time of day.
# .dt.normalize() is used instead of astype('datetime64[D]'): pandas 2.x only supports
# s/ms/us/ns datetime units and rejects the day-unit cast.
notes_table['norm_arrive_date'] = notes_table['arrive_date'].dt.normalize()
notes_table['norm_note_date'] = notes_table['date'].dt.normalize()

# keep a single note per author, visit and calendar day
notes_table = notes_table.drop_duplicates(['norm_note_date', 'dr', 'id'], keep="first")

# after the ascending sort the first row carries the earliest arrive_date in the data
notes_table = notes_table.sort_values('arrive_date')
FIRST_DATE = notes_table['arrive_date'].iloc[0]

# add indicator column for whether the note author is in the index team
# NOTE(review): a note dated before arrival yields a negative difference and therefore
# also counts as in-team -- confirm this is intended.
notes_table['is_in_team'] = notes_table["date"] - notes_table["arrive_date"] <= INDEX_DELTA

stop_time = time.perf_counter()
print(f"It took {stop_time - start_time} seconds or {(stop_time - start_time) / 60} minutes to read in data and modify table")

print(notes_table.head())

It took 0.2346821998944506 seconds or 0.003911369998240844 minutes to read in data and modify table
      id     dr       date    patient arrive_date discharge_date  disposition  \
2559  15  dr212 2020-03-16  patient16  2020-03-09     2020-03-16            0   
2533  15  dr189 2020-03-13  patient16  2020-03-09     2020-03-16            0   
2534  15  dr189 2020-03-12  patient16  2020-03-09     2020-03-16            0   
2540  15  dr193 2020-03-15  patient16  2020-03-09     2020-03-16            0   
2545  15   dr28 2020-03-11  patient16  2020-03-09     2020-03-16            0   

      age norm_arrive_date norm_note_date  is_in_team  
2559   65       2020-03-09     2020-03-16       False  
2533   65       2020-03-09     2020-03-13       False  
2534   65       2020-03-09     2020-03-12       False  
2540   65       2020-03-09     2020-03-15       False  
2545   65       2020-03-09     2020-03-11        True  


In [3]:
start_time = time.perf_counter()

# Self join on (discharge id, normalized note date): every resulting row pairs two notes
# written for the same visit on the same calendar day, i.e. one collaboration edge.
edges_table = notes_table.merge(
    notes_table[['dr', 'id', 'is_in_team', 'norm_note_date']],
    how='inner',
    on=['id', 'norm_note_date'],
)

# Keeping only dr_x < dr_y drops self-pairs and the mirrored copy of each pair.
edges_table = edges_table.loc[edges_table['dr_x'] < edges_table['dr_y']].assign(
    # edge label: the two author names concatenated
    edge=lambda pairs: pairs['dr_x'] + pairs['dr_y'],
    # might be able to remove this column, need to discuss
    is_in_team=lambda pairs: pairs['is_in_team_x'] & pairs['is_in_team_y'],
    # True when the visit arrived more than TEAMWORK_DELTA after the first arrival in the data,
    # i.e. a full look-back window exists; otherwise the visit is not counted as an index team
    is_after_delta=lambda pairs: pairs["arrive_date"] > (FIRST_DATE + TEAMWORK_DELTA),
)

stop_time = time.perf_counter()
print(f"It took {stop_time - start_time} seconds or {(stop_time - start_time) / 60} minutes to self join and get edges table")

It took 0.3169557999353856 seconds or 0.00528259666558976 minutes to self join and get edges table


In [4]:
start_time = time.perf_counter()

# do self join on discharge id only (not on note date) to get table of team edges:
# every pair of notes for the same visit is a candidate pair of team members
team_table = notes_table.merge(notes_table[['dr','id','is_in_team','norm_note_date']], how='inner', on='id')
# remove edges with the same name twice or with authors in reverse order
team_table = team_table[team_table['dr_x'] < team_table['dr_y']]
team_table['edge'] = team_table['dr_x'] + team_table['dr_y']

# a pair is a team edge only when both of the paired notes are flagged as index-team notes
team_table['is_in_team'] = team_table['is_in_team_x'] & team_table['is_in_team_y']
# add column indicating whether there are 90 days prior to arrive date. if not, don't count as index team
team_table['is_after_delta'] = team_table["arrive_date"] > (FIRST_DATE + TEAMWORK_DELTA)

# keep only index-team pairs of visits that have a full look-back window
team_table = team_table[team_table['is_in_team'] & team_table['is_after_delta']]

stop_time = time.perf_counter()
# the message names the team table: this cell previously reused the edges-table wording from the cell above
print(f"It took {stop_time - start_time} seconds or {(stop_time - start_time) / 60} minutes to self join and get team table")

It took 1.128571099950932 seconds or 0.018809518332515533 minutes to self join and get edges table


In [5]:
def get_output_for_row(g, visit_id, team):
    """Summarise one visit's collaboration graph as a flat dict of metrics.

    Parameters
    ----------
    g : graph object
        Passed to ``nx.clustering(g, weight='weight')``; must also provide
        ``number_of_edges()`` and ``size(weight='weight')``.
    visit_id : hashable
        Stored unchanged under ``'discharge_id'``.
    team : sized collection
        Team members of the visit; only ``len(team)`` is used.

    Returns
    -------
    dict
        Keys, in insertion order: ``discharge_id``, ``avg_clust``,
        ``sum_clust``, ``team_size``, ``potential_edges``, ``team_edge_size``,
        ``cumulative_experience``, ``avg_cumulative_experience``.
    """
    # Weighted clustering coefficient of every node in g, keyed by node.
    clustering_coefficient = nx.clustering(g, weight='weight')
    clust_sum = sum(clustering_coefficient.values())
    clust_len = len(clustering_coefficient)

    team_size = len(team)
    # Number of distinct member pairs the team could form.
    potential_edges = comb(team_size, 2)
    team_edge_size = g.number_of_edges()
    # Sum of the edge weights minus the number of edges.
    cumulative_experience = g.size(weight='weight') - team_edge_size

    return {
        'discharge_id': visit_id,
        # A graph without nodes has no coefficients to average; report 0.
        'avg_clust': clust_sum / clust_len if clust_len > 0 else 0,
        'sum_clust': clust_sum,
        'team_size': team_size,
        'potential_edges': potential_edges,
        'team_edge_size': team_edge_size,
        'cumulative_experience': cumulative_experience,
        # Guard on the divisor itself: comb(1, 2) == 0, so a one-member team
        # would otherwise raise ZeroDivisionError.
        'avg_cumulative_experience': (
            cumulative_experience / potential_edges if potential_edges > 0 else 0
        ),
    }

In [6]:
# map discharge id to list of edge items: (edge key, (dr_x, dr_y), normalized arrive date)
discharge_id_to_edges_dict = dict()
# map edge key to list of dates when the two note authors collaborated
edge_to_date_dict = dict()
# map discharge id to set of team members
discharge_id_to_team_dict = dict()

# Edges are keyed by the (dr_x, dr_y) tuple rather than the concatenated 'edge' string:
# concatenation is ambiguous for arbitrary author names ('ab' + 'c' == 'a' + 'bc'), which
# would pool the collaboration dates of unrelated pairs under one key.
# Both helpers below must build the key the same way.

def add_to_team_dict(edge_record):
    """Register one team-edge row under its visit.

    Reads 'dr_x', 'dr_y', 'id' and 'norm_arrive_date' from ``edge_record``.
    Appends ``(edge key, (dr_x, dr_y), norm_arrive_date)`` to the visit's list in
    ``discharge_id_to_edges_dict`` and adds both authors to the visit's set in
    ``discharge_id_to_team_dict``. Returns None.
    """
    edge_tup = (edge_record['dr_x'], edge_record['dr_y'])
    # keep the three-slot layout (key, author pair, arrive date) that the graph-building loop unpacks
    edge_list_item = (edge_tup, edge_tup, edge_record['norm_arrive_date'])
    discharge_id_to_edges_dict.setdefault(edge_record['id'],[]).append(edge_list_item)
    discharge_id_to_team_dict.setdefault(edge_record['id'],set()).update(edge_tup)

def add_edge_to_dict(edge_record):
    """Record the note date of one collaboration row under its author pair.

    Reads 'dr_x', 'dr_y' and 'norm_note_date' from ``edge_record`` and appends the
    date to that pair's list in ``edge_to_date_dict``. Returns None.
    """
    edge_key = (edge_record['dr_x'], edge_record['dr_y'])
    edge_to_date_dict.setdefault(edge_key,[]).append(edge_record['norm_note_date'])
    
tqdm.pandas(desc="Dictionary Progress Bar!")

start_time = time.perf_counter()

# the helpers are called row by row purely for their side effects on the dictionaries above
# (swap progress_apply for apply to run without a progress bar)
edges_table.progress_apply(add_edge_to_dict, axis='columns')

team_table.progress_apply(add_to_team_dict, axis='columns')
    
stop_time = time.perf_counter()
print(f"It took {stop_time - start_time} seconds or {(stop_time - start_time) / 60} minutes to build dictionaries") 

  from pandas import Panel
Dictionary Progress Bar!: 100%|██████████| 122390/122390 [00:04<00:00, 27791.97it/s]
Dictionary Progress Bar!: 100%|██████████| 53951/53951 [00:02<00:00, 19479.36it/s]

It took 7.185759199899621 seconds or 0.11976265333166035 minutes to build dictionaries





In [7]:
experience_data_list = []

start_time = time.perf_counter()

for discharge_id, edge_items in tqdm(discharge_id_to_edges_dict.items()):
    # Build the visit's graph directly instead of going through a throwaway DataFrame.
    g = nx.Graph()
    # The same item can be listed several times for one visit (one team row per pair of notes);
    # an identical item always produces the same weight, so it only needs processing once.
    seen_items = set()
    for edge_item in edge_items:
        if edge_item in seen_items: continue
        seen_items.add(edge_item)
        edge, (dr_x, dr_y), arrive_date = edge_item
        note_days = edge_to_date_dict.get(edge)
        # pair never collaborated on any visit
        if note_days is None: continue
        # weight = number of collaboration days in [arrive_date - TEAMWORK_DELTA, arrive_date)
        window_start = arrive_date - TEAMWORK_DELTA
        weight = sum(1 for note_day in note_days if window_start <= note_day < arrive_date)
        # pairs without prior shared work in the window contribute no edge
        if weight < 1: continue
        g.add_edge(dr_x, dr_y, weight=weight)
    team = discharge_id_to_team_dict[discharge_id]
    experience_data_list.append(get_output_for_row(g, discharge_id, team))
        
stop_time = time.perf_counter()
print(f"It took {stop_time - start_time} seconds or {(stop_time - start_time) / 60} minutes to create graphs")     

100%|██████████| 541/541 [00:01<00:00, 384.79it/s]

It took 1.4093690000008792 seconds or 0.023489483333347987 minutes to create graphs





In [8]:
# Columns to keep, in display order. Keys present in the row dicts but not listed here
# (e.g. 'sum_clust') are left out of the frame.
columns = [
    'discharge_id',
    'avg_clust',
    'cumulative_experience',
    'avg_cumulative_experience',
    'team_edge_size',
    'team_size',
    'potential_edges',
]

# experience_df = pd.DataFrame(experience_data_list, columns=columns).drop_duplicates()
# One row per entry of experience_data_list.
experience_df = pd.DataFrame(data=experience_data_list, columns=columns)
experience_df.head(20)

Unnamed: 0,discharge_id,avg_clust,cumulative_experience,avg_cumulative_experience,team_edge_size,team_size,potential_edges
0,40,0.0,3.0,0.107143,1,8,28
1,208,0.333333,27.0,1.285714,5,7,21
2,60,0.0,20.0,0.444444,4,10,45
3,28,0.0,6.0,0.285714,1,7,21
4,158,0.0,6.0,0.4,1,6,15
5,285,0.0,12.0,0.428571,2,8,28
6,338,0.0,10.0,0.666667,2,6,15
7,241,0.40169,18.0,0.5,4,9,36
8,457,0.0,6.0,0.4,1,6,15
9,105,0.0,23.0,0.821429,4,8,28


In [9]:
# Design sketch only: nothing below executes. TeamWorkStudy and notes_csv_file are not defined
# in the visible cells, so the example call is kept commented out.
'''This code does not work but playing around with an easy API for the library'''

# Proposed mapping from the library's generic column names (keys) to the column names
# used by this data set (values).
# column_names = {
#     "visit_id": "discharge_id",
#     "admission_date": "arrive_date",
#     "note_author": "dr",
#     "note_date": "date"
# }

# Same numbers as TEAMWORK_DELTA (90 days) and INDEX_DELTA (2 days) defined earlier in this notebook.
# EXPERIENCE_WINDOW = 90
# TEAM_WINDOW = 2

# study = TeamWorkStudy(notes_csv_file, EXPERIENCE_WINDOW,TEAM_WINDOW, column_names=column_names)



'This code does not work but playing around with an easy API for the library'

In [10]:
from math import comb

# Scratch check of set semantics: adding an existing member is a no-op, and
# update() merges in every element of an iterable.
myset = {1, 2, 3}
myset.add(1)
myset.add(4)
myset.update((2, 5, 8))
print(myset)
print(len(myset))

# Number of unordered pairs that can be drawn from four items.
comb(4, 2)


{1, 2, 3, 4, 5, 8}
6


6

In [11]:
# Scratch check: a function that assigns a column mutates the caller's DataFrame.
test_df = pd.DataFrame(
    {
        'date': ['3/10/2000', '3/11/2000', '3/12/2000'],
        'value': [2, 3, 4],
        'another': [6, 7, 8],
    }
)


def add_column(df):
    """Set the 'another_column' column of ``df`` to [7, 8, 9] in place; returns None."""
    df['another_column'] = [7, 8, 9]


add_column(test_df)
print(test_df.columns.values)

['date' 'value' 'another' 'another_column']


In [12]:
# Scratch check of merging user-supplied column names over defaults.
cols = {'values': 'value', 'datetime': 'custdate'}
defcols = {'datetime': 'date', 'values': 'value', 'hello': [1, 2, 3], 'goodbye': {1, 2, 3}}

# Start from the defaults, then let the overrides win; key order follows defcols.
mycols = dict(defcols)
mycols.update(cols)


def take_params(**cols):
    """Print the value received for the ``hello`` keyword argument."""
    print(cols['hello'])


take_params(**mycols)

# Unpack the four values positionally, discarding the first and third.
_, v, _, gb = mycols.values()
print(gb)


def print_val(v):
    """Print ``v``; returns None."""
    print(v)


# Evaluates to a list of the (None) return values, which the cell displays.
[print_val(item) for item in [1, 4, 5]]


[1, 2, 3]
{1, 2, 3}
1
4
5


[None, None, None]

In [13]:
from multiprocessing import Pool, Process, connection, current_process, cpu_count
from random import randint
from time import sleep
from datetime import datetime


def run(i):
    """Sleep for a random whole number of seconds between 2 and 10, then print a log line.

    The line contains the current time, the current process name, ``i`` and the
    sleep duration. Returns None.
    """
    sleep_time = randint(2, 10)
    sleep(sleep_time)
    message = f"{datetime.now()} {current_process().name} exiting with arg {i} after sleeping for {sleep_time}"
    print(message)

def cpu_bound(numbers, r):
    """Return the total of ``sum(i * i for i in range(number))`` over every ``number`` in ``numbers``.

    ``r`` is only echoed in the start-up log line. A log line is printed before and
    after the computation, each with the current time and process name.
    """
    print(f"\n{datetime.now()} {current_process().name} running for r := {r}")
    thesum = 0
    for number in numbers:
        # sum of squares of 0 .. number - 1
        thesum += sum(i * i for i in range(number))
    print(f"{datetime.now()} {current_process().name} exiting with sum of {thesum}")
    return thesum

def get_numbers(r):
    """Return the ``r`` consecutive integers starting at 5,000,000 as a list."""
    first = 5_000_000
    return list(range(first, first + r))

def print_res(res):
    """Print ``res`` prefixed with 'in callback: '; returns None."""
    message = f'in callback: {res}'
    print(message)

if __name__ == '__main__':
    # One worker process per reported CPU; the pool is torn down when the with block exits.
    with Pool(processes=cpu_count()) as pool:
        print(f'cpu count: {cpu_count()}')
#         pool = [Process(target=cpu_bound,args=(get_numbers(r := randint(10,20)), r)) for _ in range(4)]
        print(f"{datetime.now()} {current_process().name} waiting")
        # Submit four jobs. Each draws its own r in [10, 20], passes get_numbers(r) and r to
        # cpu_bound, and hands the finished result to print_res.
        multiple_results = [
            pool.apply_async(
                cpu_bound,
                args=(get_numbers(r := randint(10,20)), r),
                callback=print_res,
            )
            for _ in range(4)
        ]
        # Wait for every job in submission order before leaving the with block.
        for res in multiple_results:
            res.get()

    print(f"{datetime.now()} {current_process().name} out of with block")


2021-05-04 00:08:24.345376 ForkPoolWorker-1 running for r := 18
2021-05-04 00:08:24.345576 ForkPoolWorker-2 running for r := 18
2021-05-04 00:08:24.345897 ForkPoolWorker-3 running for r := 13
2021-05-04 00:08:24.346157 ForkPoolWorker-4 running for r := 18



cpu count: 8
2021-05-04 00:08:24.341747 MainProcess waiting
2021-05-04 00:08:31.836306 ForkPoolWorker-3 exiting with sum of 541668454169537501716
in callback: 541668454169537501716
2021-05-04 00:08:34.234262 ForkPoolWorker-2 exiting with sum of 750003600008175006936
2021-05-04 00:08:34.343296 ForkPoolWorker-1 exiting with sum of 750003600008175006936
in callback: 750003600008175006936
in callback: 750003600008175006936
2021-05-04 00:08:34.471361 ForkPoolWorker-4 exiting with sum of 750003600008175006936
in callback: 750003600008175006936
2021-05-04 00:08:34.485519 MainProcess out of with block
