In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import networkx as nx
import datetime
import os
import sys
from collections import Counter
from time import perf_counter
from tqdm import tqdm_notebook as tqdm


# # Import from the parent directory
# sys.path.append(os.path.join(os.getcwd(), '..'))
# from utils import utils


# Start the stopwatch; elapsed time is reported after the per-visit loop.
t1_start = perf_counter()

# Load discharges and notes, renaming columns to the names used downstream:
# both tables share the visit key 'id', and the two date columns become
# 'admission_date' and 'note_date'.
discharge_table = pd.read_csv(
    '../data/discharges_w_disposition_large.csv',
    parse_dates=['arrive_date', 'discharge_date'],
).rename(columns={'arrive_date': 'admission_date'})
notes_table = pd.read_csv(
    '../data/notes_w_disposition_large.csv',
    parse_dates=['date'],
).rename(columns={'discharge_id': 'id', 'date': 'note_date'})

# TODO: add a section to take Abdul's inputs and rename them.

# Authors who wrote notes within 48 hours of the arrival date are in the index
# team for that patient.
INDEX_DELTA = np.timedelta64(2, 'D')
# Notes written within the previous 90 days are considered when calculating
# collaborative experience.
TEAMWORK_DELTA = np.timedelta64(90, 'D')
# Lower bound (in days before admission) of the experience window; negative,
# so notes up to 2 days after admission are still eligible.
end_window = np.timedelta64(-2, 'D')

# Attach the admission datetime to every note via the visit id. A right join
# keeps every discharge, including ones without any note.
notes_table = notes_table.merge(discharge_table, on='id', how='right')

# Ignore the time of day. `.dt.normalize()` floors to midnight and keeps the
# datetime64[ns] dtype; casting to 'datetime64[D]' is not a supported
# resolution in pandas >= 2.0.
notes_table['admission_date'] = pd.to_datetime(notes_table['admission_date']).dt.normalize()
notes_table['note_date'] = pd.to_datetime(notes_table['note_date']).dt.normalize()

# One row per (day, author, visit): repeated notes by the same author on the
# same day for the same visit collapse into one.
notes_table = notes_table.drop_duplicates(subset=['note_date', 'dr', 'id'], keep="first")

notes_table = notes_table.sort_values('admission_date')
# Earliest admission date in the data set.
FIRST_DATE = notes_table['admission_date'].iloc[0]



# Build the clinician-pair ("edge") table: self-join the notes on
# (visit id, note day) so each row holds two authors, dr_x and dr_y, who both
# wrote a note for the same visit on the same day.
edges_table = notes_table.merge(
    notes_table[['dr', 'id', 'note_date']], how='left', on=['id', 'note_date']
)

# Keep a single orientation of every pair (dr_x < dr_y); this also removes
# self-pairs. `.copy()` makes the result an independent frame so the column
# assignments below do not raise SettingWithCopyWarning.
edges_table = edges_table[edges_table['dr_x'] < edges_table['dr_y']].copy()

# Pair key. A separator is required: plain `dr_x + dr_y` sums numeric ids and
# is ambiguous for variable-length string ids, so distinct pairs could share a
# key and be counted together.
edges_table['edge'] = edges_table['dr_x'].astype(str) + '|' + edges_table['dr_y'].astype(str)

# Flag index-team edges: pairs whose shared note day is at most INDEX_DELTA
# after the admission date. (An earlier per-author variant of this flag was
# retired in favour of this single per-row flag.)
# NOTE(review): there is no lower bound, so note days before admission also
# count as index-team - confirm that is intended.
days_since_admission = edges_table["note_date"] - edges_table["admission_date"]
edges_table['IndexTeam'] = np.where(days_since_admission <= INDEX_DELTA, "yes", "no")

edges_table = edges_table.drop(columns=["discharge_date", "patient"])

# Index edges only. A non-inplace rename returns a new frame, which avoids the
# SettingWithCopyWarning raised by renaming a `.loc` slice in place.
index_edges = edges_table.loc[edges_table['IndexTeam'] == "yes"].rename(
    columns={'note_date': 'index_note_date'}
)

# For every index edge, attach every occurrence of the same clinician pair
# anywhere in edges_table. After the merge, id_x / admission_date describe the
# index visit, while id_y / note_date describe the other occurrence.
Experience_table = index_edges.merge(
    edges_table[['id', 'note_date', 'edge']], how='left', on=['edge']
)
Experience_table["days_before_admission"] = (
    Experience_table["admission_date"] - Experience_table["note_date"]
)

# An occurrence counts as shared experience when its note day falls between
# TEAMWORK_DELTA before the index admission and -end_window after it.
not_too_late = Experience_table['days_before_admission'] >= end_window
not_too_old = Experience_table['days_before_admission'] <= TEAMWORK_DELTA
Experience_table['Potential_ExperienceTeam'] = np.where(not_too_late, "yes", "no")
Experience_table['ExperienceTeam'] = np.where(not_too_old, "yes", "no")
Experience_table['Include'] = np.where(not_too_late & not_too_old, "yes", "no")

def get_output_for_row(g, visit_id):
    """Summarise one visit's collaboration graph as a flat dict of metrics.

    Args:
        g: networkx graph whose edges carry a numeric 'weight' attribute.
        visit_id: identifier stored under 'discharge_id' in the result.

    Returns:
        dict with the keys 'discharge_id', 'avg_clust', 'sum_clust',
        'team_size', 'team_edge_size', 'cumulative_experience' and
        'avg_cumulative_experience'.
    """
    node_count = g.number_of_nodes()
    edge_count = g.number_of_edges()

    # Weighted clustering coefficient of every node, keyed by node.
    per_node_clustering = nx.clustering(g, weight='weight')
    clustering_total = sum(per_node_clustering.values())
    if per_node_clustering:
        clustering_mean = clustering_total / len(per_node_clustering)
    else:
        # Empty graph: avoid dividing by zero.
        clustering_mean = 0

    # Experience is the sum of edge weights, less one per edge.
    total_weight = g.size(weight='weight')
    prior_experience = total_weight - edge_count
    mean_experience = prior_experience / node_count if node_count > 0 else 0

    return {
        'discharge_id': visit_id,
        'avg_clust': clustering_mean,
        'sum_clust': clustering_total,
        'team_size': node_count,
        'team_edge_size': edge_count,
        'cumulative_experience': prior_experience,
        'avg_cumulative_experience': mean_experience,
    }

# Per-visit team metrics: build one weighted collaboration graph per index
# visit and summarise it with get_output_for_row.

# Edge weight = number of rows in edges_table that carry the pair's key.
# Counted once up front instead of rescanning edges_table for every edge.
# NOTE(review): this counts every occurrence of the pair, not only the rows
# flagged Include == 'yes' - confirm that is the intended weight.
edge_counts = edges_table['edge'].value_counts()

# Included rows grouped by index visit, so each visit is a dictionary lookup
# rather than a full-table boolean filter.
included_rows = Experience_table[Experience_table['Include'] == 'yes']
teams_by_visit = {
    visit_key: team_rows for visit_key, team_rows in included_rows.groupby('id_x')
}

visit_ids = Experience_table['id_x'].unique()
denom = len(visit_ids)

# Collect plain dicts and build the frame once; DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every call.
metric_rows = []
for progress, visit_id in enumerate(tqdm(visit_ids), start=1):
    print(progress / denom)

    g = nx.Graph()
    team = teams_by_visit.get(visit_id)
    if team is not None:
        for source, target, edge_key in zip(team['dr_x'], team['dr_y'], team['edge']):
            weight = int(edge_counts.get(edge_key, 0))
            # Skip pairs with no recorded occurrence.
            if weight < 1:
                continue
            # A repeated pair simply overwrites the same edge with the same weight.
            g.add_edge(source, target, weight=weight)

    # Visits with no included rows still get a row, computed on an empty graph.
    metric_rows.append(get_output_for_row(g, visit_id))

experience_data_list = pd.DataFrame(
    metric_rows,
    columns=[
        'discharge_id',
        'avg_clust',
        'cumulative_experience',
        'avg_cumulative_experience',
        "team_edge_size",
        "team_size",
        "sum_clust",
    ],
)

# Stop the stopwatch / counter
t1_stop = perf_counter()
print("Elapsed time:", t1_stop, t1_start)

print("Elapsed time during the whole program in seconds:", t1_stop - t1_start)

# Attach the discharge attributes to the per-visit metrics.
experience_master_df = experience_data_list.merge(discharge_table, left_on='discharge_id', right_on='id', copy=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for visit_id in tqdm(Experience_table.id_x.unique()):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4502.0), HTML(value='')))

0.0002221235006663705
0.000444247001332741
0.0006663705019991116
0.000888494002665482
0.0011106175033318525
0.0013327410039982231
0.0015548645046645935
0.001776988005330964
0.0019991115059973343
0.002221235006663705
0.0024433585073300756
0.0026654820079964462
0.0028876055086628164
0.003109729009329187
0.0033318525099955577
0.003553976010661928
0.0037760995113282985
0.003998223011994669
0.004220346512661039
0.00444247001332741
0.0046645935139937806
0.004886717014660151
0.005108840515326522
0.0053309640159928924
0.005553087516659262
0.005775211017325633
0.0059973345179920035
0.006219458018658374
0.006441581519324745
0.006663705019991115
0.006885828520657486
0.007107952021323856
0.007330075521990226
0.007552199022656597
0.007774322523322968
0.007996446023989337
0.008218569524655708
0.008440693025322079
0.00866281652598845
0.00888494002665482
0.00910706352732119
0.009329187027987561
0.009551310528653932
0.009773434029320302
0.009995557529986673
0.010217681030653044
0.010439804531319414
0.0