# Teamwork study

Here is where we could put an overview of what we're doing in this notebook

## Import libraries

In [20]:
from datetime import datetime, date, timedelta
import pandas as pd
import networkx as nx
from itertools import combinations  
import numpy as np
import os
import sys

# This allows us to import from src and utils
for dir in ['src','utils']: sys.path.append(os.path.join(os.getcwd(), '..', dir))
 
from experience import ExperienceStudyRunner, CareDate, CareTeam
from utils import columns, get_careteam_data

## Read in EHR data and create study runner object

The study runner is a generator object

In [25]:
notes = pd.read_csv('../data/notes_w_disposition_large.csv', parse_dates=[2])

WINDOW = 90
STEP = 2

get_care_dates = ExperienceStudyRunner(notes, WINDOW, STEP)

## Gather data for each care team identified on each care date

The `get_careteam_data` utility function uses the `care_team` network graph
to calculate the cumulative experience and other metrics for the care team


In [26]:
from time import perf_counter

t1_start = perf_counter()

# data_list = [get_careteam_data(care_team) for care_date in get_care_dates for care_team in care_date]

output_path = '../data/experience_large.csv'
for care_date in get_care_dates:
    care_date_list = [get_careteam_data(care_team) for care_team in care_date]
    df = pd.DataFrame(data_list, columns=columns)
    df.to_csv(output_path, mode='a', header=not os.path.exists(output_path))
    
t1_stop = perf_counter()
print(f"Elapsed seconds: {t1_stop - t1_start}")

Elapsed seconds: 9105.095341599954


In [None]:
data_list = (get_careteam_data(care_team) for care_date in get_care_dates for care_team in care_date)


## Convert data into DataFrame for analysis

In [32]:
print(f"Elapsed seconds: {t1_stop - t1_start}")

experience_df = pd.read_csv('../data/experience.csv')
print(experience_df.shape)
experience_df.drop_duplicates(inplace=True)
print(experience_df.shape)
# experience_df = pd.DataFrame(data_list, columns=columns)
discharges_df = pd.read_csv('../data/discharges_w_disposition.csv')

merged_df = experience_df.merge(discharges_df, left_on='discharge_id', right_on='id', copy=False)

print(merged_df.shape)
print(merged_df.head())



Elapsed seconds: 9105.095341599954
(51216, 7)
(388, 7)
(388, 13)
   Unnamed: 0  discharge_id  avg_clust  cumulative_experience  \
0           0           454   0.561905                    5.0   
1           1           476   0.833333                    3.0   
2           2           451   0.527778                    7.0   
3           3           416   0.000000                    2.0   
4           4           169   0.833333                    2.0   

   avg_cumulative_experience  team_edge_size  team_size   id          patient  \
0                   0.714286              12          7  454    Barry Stanley   
1                   0.750000               5          4  476     Elmer Lawson   
2                   1.166667              10          6  451      Clara Rojas   
3                   0.500000               3          4  416   Taylor Morales   
4                   0.500000               5          4  169  Clifford Hansen   

  arrive_date discharge_date  disposition  age  
0  2020-

## Analysis can be performed on the resulting DataFrame to study, e.g. the correlation between cumulative care team experience and patient outcomes 

In [29]:
# %matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# import pandas as pd
import statsmodels.api as sm

l = ['avg_clust', 'cumulative_experience', 'avg_cumulative_experience']

# for var in l:

model = sm.GLM.from_formula(f'disposition ~ {l[0]} + age', family = sm.families.Binomial(), data=merged_df)
result = model.fit()
result.summary() 

0,1,2,3
Dep. Variable:,disposition,No. Observations:,388.0
Model:,GLM,Df Residuals:,385.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-110.87
Date:,"Mon, 25 Jan 2021",Deviance:,221.74
Time:,03:07:15,Pearson chi2:,388.0
No. Iterations:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-8.2942,3.073,-2.699,0.007,-14.317,-2.271
avg_clust,-0.0897,0.472,-0.190,0.849,-1.014,0.835
age,0.0815,0.041,1.966,0.049,0.000,0.163


In [30]:
model = sm.GLM.from_formula(f'disposition ~ {l[1]} + age', family = sm.families.Binomial(), data=merged_df)
result = model.fit()
result.summary() 

0,1,2,3
Dep. Variable:,disposition,No. Observations:,388.0
Model:,GLM,Df Residuals:,385.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-109.06
Date:,"Mon, 25 Jan 2021",Deviance:,218.12
Time:,03:08:08,Pearson chi2:,384.0
No. Iterations:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-8.1931,3.117,-2.628,0.009,-14.303,-2.083
cumulative_experience,-0.0447,0.025,-1.765,0.078,-0.094,0.005
age,0.0839,0.042,1.991,0.046,0.001,0.167


In [31]:
model = sm.GLM.from_formula(f'disposition ~ {l[2]} + age', family = sm.families.Binomial(), data=merged_df)
result = model.fit()
result.summary() 
    

0,1,2,3
Dep. Variable:,disposition,No. Observations:,388.0
Model:,GLM,Df Residuals:,385.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-109.69
Date:,"Mon, 25 Jan 2021",Deviance:,219.37
Time:,03:08:18,Pearson chi2:,383.0
No. Iterations:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-8.1604,3.104,-2.629,0.009,-14.244,-2.077
avg_cumulative_experience,-0.2820,0.188,-1.500,0.134,-0.650,0.086
age,0.0835,0.042,1.990,0.047,0.001,0.166
