In [1]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import random as rnd
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, StratifiedKFold,  cross_val_score
import heapq
import altair as alt
alt.data_transformers.enable("vegafusion")
from sklearn.metrics import roc_curve, auc, accuracy_score, f1_score, precision_score, recall_score


In [2]:
df_orig = pd.read_csv("comorbidity_odds_matrix.csv")
threshold = 0
df_orig.head(5)
adjacency_matrix_df = pd.read_csv('comorbidity_odds_matrix.csv', index_col=0)

# matrix -> edge list
edge_list = adjacency_matrix_df.stack().reset_index()
edge_list.columns = ['source', 'target', 'weight']
df_orig

Unnamed: 0.1,Unnamed: 0,AHRQ_AbdominalHernia_DT,AHRQ_AcquiredDeformities_DT,AHRQ_AdjustmentDO_DT,AHRQ_Anemia_DT,AHRQ_Asthma_DT,AHRQ_AttentionDeficitDO_DT,AHRQ_BacterialInfection_DT,AHRQ_BiliaryTractDs_DT,AHRQ_Burns_DT,...,NEPEC_MDD_DT,NEPEC_AFBPDX_DT,NEPEC_PTSD_DT,NEPEC_DXODep_DT,NEPEC_ANXunsp_DT,NEPEC_ANXgen_DT,dmdxDT,cancerdxDT,anomdxDT,genitaldxDT
0,AHRQ_AbdominalHernia_DT,0.000000,0.021786,-0.064825,0.090966,0.252191,-0.163508,-0.131310,0.710723,0.056411,...,-0.498740,0.193681,0.218394,0.566455,-0.190068,-0.187180,-0.258791,0.098747,0.005116,0.170254
1,AHRQ_AcquiredDeformities_DT,-0.029977,0.000000,-0.025956,-0.438555,0.024919,-0.068868,-0.481755,-0.470741,0.465003,...,-0.528881,0.041082,0.234353,0.495289,-0.248027,-0.272405,-0.256588,-0.311974,0.870268,0.021739
2,AHRQ_AdjustmentDO_DT,0.070200,0.138657,0.000000,-0.133794,0.100560,-0.017296,-0.108122,-0.075878,0.359342,...,-0.371717,-0.119203,-0.004907,1.008589,0.173898,-0.139772,-0.153220,-0.218298,-0.165416,0.120009
3,AHRQ_Anemia_DT,0.168454,-0.183813,-0.140857,0.000000,-0.020432,-0.237165,0.603858,0.341700,0.314066,...,-0.415647,0.179997,-0.056869,0.519196,-0.251697,-0.242676,0.102596,0.148492,-0.286744,0.164761
4,AHRQ_Asthma_DT,0.045368,-0.053641,-0.132397,-0.443185,0.000000,-0.007760,-0.494482,-0.269715,0.281069,...,-0.525055,0.246626,0.305148,0.590638,-0.219870,-0.301684,-0.277794,-0.425384,-0.308434,-0.051846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,NEPEC_ANXgen_DT,0.123031,0.080503,0.206096,-0.173856,0.131367,0.239518,-0.197005,-0.154001,-0.134128,...,0.391847,-0.018868,0.156169,0.944590,1.080603,0.000000,-0.118804,-0.117126,-0.074685,0.110762
91,dmdxDT,-0.054709,-0.148972,-0.111919,-0.145929,0.163565,-0.286049,-0.332102,0.014565,0.098773,...,-0.346833,0.259720,0.205407,0.539875,-0.194428,-0.234471,0.000000,-0.339655,-0.348517,-0.067273
92,cancerdxDT,0.445680,-0.010087,-0.056787,0.202946,0.019497,-0.083310,-0.048166,0.163758,0.004694,...,-0.439969,0.038738,0.138285,0.473643,-0.147065,-0.170342,-0.091718,0.000000,-0.052402,0.478656
93,anomdxDT,0.140942,1.543945,0.040799,-0.310538,0.022777,-0.137930,-0.462064,-0.189916,0.203408,...,-0.439871,0.004980,0.189222,0.483185,-0.175950,-0.183526,-0.195816,0.010378,0.000000,0.187030


In [3]:
df_filter = df_orig.iloc[:,1:] 
df_filter = df_filter.where(df_filter > threshold,0) 
df_clean = pd.concat([df_orig.iloc[:,0], df_filter], axis=1)
df_clean.head()

Unnamed: 0.1,Unnamed: 0,AHRQ_AbdominalHernia_DT,AHRQ_AcquiredDeformities_DT,AHRQ_AdjustmentDO_DT,AHRQ_Anemia_DT,AHRQ_Asthma_DT,AHRQ_AttentionDeficitDO_DT,AHRQ_BacterialInfection_DT,AHRQ_BiliaryTractDs_DT,AHRQ_Burns_DT,...,NEPEC_MDD_DT,NEPEC_AFBPDX_DT,NEPEC_PTSD_DT,NEPEC_DXODep_DT,NEPEC_ANXunsp_DT,NEPEC_ANXgen_DT,dmdxDT,cancerdxDT,anomdxDT,genitaldxDT
0,AHRQ_AbdominalHernia_DT,0.0,0.021786,0.0,0.090966,0.252191,0.0,0.0,0.710723,0.056411,...,0.0,0.193681,0.218394,0.566455,0.0,0.0,0.0,0.098747,0.005116,0.170254
1,AHRQ_AcquiredDeformities_DT,0.0,0.0,0.0,0.0,0.024919,0.0,0.0,0.0,0.465003,...,0.0,0.041082,0.234353,0.495289,0.0,0.0,0.0,0.0,0.870268,0.021739
2,AHRQ_AdjustmentDO_DT,0.0702,0.138657,0.0,0.0,0.10056,0.0,0.0,0.0,0.359342,...,0.0,0.0,0.0,1.008589,0.173898,0.0,0.0,0.0,0.0,0.120009
3,AHRQ_Anemia_DT,0.168454,0.0,0.0,0.0,0.0,0.0,0.603858,0.3417,0.314066,...,0.0,0.179997,0.0,0.519196,0.0,0.0,0.102596,0.148492,0.0,0.164761
4,AHRQ_Asthma_DT,0.045368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.281069,...,0.0,0.246626,0.305148,0.590638,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
df_clean.to_csv('cleaned_data_2.csv', index=False)

In [5]:
csv_file = pd.read_csv('cleaned_data.csv', index_col=0)

G = nx.DiGraph()

for i, row in csv_file.iterrows():
    for j, weight in row.items():
        if weight != 0 :
            G.add_edge(i,j,weight=weight)

rnd.seed()
m = G.edges()
print(len(m))

4837


In [6]:
def create_holdout_set(G,rate):
    G_missing = nx.DiGraph()  
    # G_missing.add_nodes_from(G.nodes()) 
    missing_links = []
    add_edge = []
    for u, v, data in G.edges(G,data=True):
        #print("u,v",u,v)
        if np.random.random() > rate:
            G_missing.add_edge(u, v, **data)
        else:
            missing_links.append((u,v))
            
    return missing_links, G_missing

In [7]:
holdout_links, G_new_orig = create_holdout_set(G,0.2)
test = G_new_orig.edges()
print(len(test))

3837


In [8]:
def create_training_set(G,rate):
    G_missing = nx.DiGraph()  
    # G_missing.add_nodes_from(G.nodes()) 
    missing_links = []
    add_edge = []
    for u, v, data in G.edges(G,data=True):
        #print("u,v",u,v)
        if np.random.random() > rate:
            G_missing.add_edge(u, v, **data)
        else:
            missing_links.append((u,v))
            
    return missing_links, G_missing
    

In [9]:
Y, Go = create_training_set(G_new_orig,0.2)

node_dict = {}
index = 0
for node in G.nodes():
  if node not in node_dict:
    node_dict[node] = index
    index += 1

In [10]:
def get_candidateEdges(G):
    nonedges= []
    for i in nx.nodes(G):
        for j in nx.nodes(G):
            if i!=j:
                if not G.has_edge(i,j):
                    nonedges.append((i,j))
                if not G.has_edge(j,i):
                    nonedges.append((j,i))
    return nonedges


In [11]:
def jaccard_predictor(G, i, j):
    n = G.order()
    numerator = len(set(G.successors(i)).intersection(G.predecessors(j))) # sum of common outgoing i edges and incoming j edges
    denominator = len(set(G.successors(i)).union(G.predecessors(j))) # sum outgoing i and incoming j edges
    
    if denominator == 0:
        return 0.0
    
    return (numerator / denominator) +  rnd.uniform(0,1/(10*n))

    

In [12]:
def degree_product(G, i, j):
    n = G.order()
    degree_prod = G.out_degree(i) * G.in_degree(j)
    
    score = degree_prod +  rnd.uniform(0,1/(10*n)) 
    
    return score

In [13]:
def dijkstra_predictor(G,i,j,flag):
    distances = {node: float('inf') for node in G.nodes()} 
    distances[i] = 0
    priority_queue = [(0, i)]
    tiebreaker = rnd.uniform(0,.99)
    predecessor = {}
    
    
    while priority_queue: # until empty
        current_distance, current_node = heapq.heappop(priority_queue) #smallest dist.
        if current_distance > distances[current_node]:
            continue # skip if smaller distance already
        for neighbor in G.neighbors(current_node):
            weight = G[current_node][neighbor].get('weight', 1) 
            distance = current_distance + weight
            if distance < distances[neighbor]: 
                distances[neighbor] = distance
                predecessor[neighbor] = current_node
                heapq.heappush(priority_queue, (distance, neighbor))
    shortest_path = []
    current_node = j
    while current_node in predecessor:
        shortest_path.insert(0, current_node) # j at the front
        current_node = predecessor[current_node]
    shortest_path.insert(0, i)

    if flag == 1:
        print(shortest_path)
    score = distances[j] + tiebreaker
    
    return -score
    

In [14]:
X = get_candidateEdges(Go) 
test = []
for i,j in X: 
    if (i,j) in Y:
        test.append("i")

print(len(test))

1570


In [15]:
X_test = get_candidateEdges(G_new_orig) 
test = []
for i,j in X_test: 
    if (i,j) in holdout_links:
        test.append("i")

print(len(test))

2000


In [16]:
def apply_predictors(G,Y):
    flag = 0
    X = get_candidateEdges(G)
    number_edges = len(X)
    dt = np.dtype([('i', 'U50'), ('j', 'U50'), ('tau', float), ('dp', float), ('jc', float), ('dij', float)])
    dat = np.zeros(number_edges, dtype=dt)

    for k,x in enumerate(X):
        i = x[0]
        j = x[1]
        tau = (i,j) in Y
        dp = degree_product(G,i,j)
        jc = jaccard_predictor(G, i,j)
        dij = dijkstra_predictor(G,i,j,flag)
        dat[k]['i'] = i
        dat[k]['j'] = j
        dat[k]['tau'] = tau
        dat[k]['dp'] = dp
        dat[k]['jc'] = jc
        dat[k]['dij'] = dij
        
        
    return dat

In [17]:
dat = apply_predictors(Go,Y)
num_row = np.shape(dat)[0]
sdat = dat[dat[:6].argsort()[::-1][:num_row]]

for row in dat[:10]:  # Loop through the first 10 
    print(f" Nodes: ({row['i']}, {row['j']}) : missing? {row['tau']} |  degree product: {row['dp']} | jaccard: {row['jc']} | dijkstra: {row['dij']}")
    print("----------------------------------------------------------------------------------------------------------------------")

 Nodes: (AHRQ_AcquiredDeformities_DT, AHRQ_AbdominalHernia_DT) : missing? 0.0 |  degree product: 1015.0006049703816 | jaccard: 0.23103850989982067 | dijkstra: -0.5055038192711269
----------------------------------------------------------------------------------------------------------------------
 Nodes: (AHRQ_DementiaAndOthDO_DT, AHRQ_AbdominalHernia_DT) : missing? 0.0 |  degree product: 875.0000552499716 | jaccard: 0.1326348743910208 | dijkstra: -0.47689653119828224
----------------------------------------------------------------------------------------------------------------------
 Nodes: (AHRQ_DisOfArteries_DT, AHRQ_AbdominalHernia_DT) : missing? 1.0 |  degree product: 1400.0003719473784 | jaccard: 0.22993959764721483 | dijkstra: -0.10256847010905872
----------------------------------------------------------------------------------------------------------------------
 Nodes: (AHRQ_DisOfTheHeart_DT, AHRQ_AbdominalHernia_DT) : missing? 0.0 |  degree product: 1365.0010403729789 | jac

In [18]:
df = pd.DataFrame(dat)
df = df.drop_duplicates(subset=['i','j'])
df_dij = df[['i','j','tau','dij']].copy()
df_dp = df[['i','j','tau','dp']].copy()
df_jc = df[['i','j','tau','jc']].copy()

df_jc_sorted = df_jc.sort_values(by='jc',ascending=False).reset_index(drop=True)
df_dp_sorted = df_dp.sort_values('dp', ascending=False).reset_index(drop=True)
df_dij_sorted = df_dij.sort_values('dij', ascending=False).reset_index(drop=True)

In [19]:
# df contains the information for the random forest
df

Unnamed: 0,i,j,tau,dp,jc,dij
0,AHRQ_AcquiredDeformities_DT,AHRQ_AbdominalHernia_DT,0.0,1015.000605,0.231039,-0.505504
1,AHRQ_DementiaAndOthDO_DT,AHRQ_AbdominalHernia_DT,0.0,875.000055,0.132635,-0.476897
2,AHRQ_DisOfArteries_DT,AHRQ_AbdominalHernia_DT,1.0,1400.000372,0.229940,-0.102568
3,AHRQ_DisOfTheHeart_DT,AHRQ_AbdominalHernia_DT,0.0,1365.001040,0.254966,-0.130765
4,AHRQ_DOOfLipidMetabolism_DT,AHRQ_AbdominalHernia_DT,0.0,735.000071,0.244830,-0.576676
...,...,...,...,...,...,...
11443,NEPEC_MDD_DT,NEPEC_ANXgen_DT,0.0,144.000819,0.026289,-0.229713
11444,NEPEC_ANXgen_DT,AHRQ_ImmunityDO_DT,0.0,93.000203,0.031354,-0.959736
11445,AHRQ_ImmunityDO_DT,NEPEC_ANXgen_DT,0.0,92.000331,0.039296,-0.289378
11592,NEPEC_MDD_DT,AHRQ_ImmunityDO_DT,0.0,108.000613,0.027355,-0.853136


In [20]:
pre_dp_scores = df['dp']
dp_scores = (pre_dp_scores - np.min(pre_dp_scores)) / (np.max(pre_dp_scores) - np.min(pre_dp_scores))
pre_jc_scores = df['jc']
jc_scores = (pre_jc_scores - np.min(pre_jc_scores)) / (np.max(pre_jc_scores) - np.min(pre_jc_scores))
pre_dij_scores = df['dij']
dij_scores = (np.max(pre_dij_scores) - pre_dij_scores) / (np.max(pre_dij_scores) - np.min(pre_dij_scores))
y_true = df['tau']
nodei = df['i']
nodej = df['j']
features = np.vstack([dp_scores, jc_scores, dij_scores]).T
print(features)

[[3.63088560e-01 4.77957843e-01 3.03478458e-01]
 [3.10357666e-01 2.74385079e-01 2.85307434e-01]
 [5.08097863e-01 4.75684466e-01 4.75385060e-02]
 ...
 [1.54425687e-02 8.12905801e-02 1.66197857e-01]
 [2.14690391e-02 5.65863674e-02 5.24290509e-01]
 [1.54428300e-02 2.45831736e-05 4.34348954e-01]]


# Hyperparameter Tuning

In [21]:
model = RandomForestClassifier()
#model.fit(features,y_true)

In [22]:
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [5, 10, 15, 20]
}

grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(features, y_true)
best_model = grid_search.best_estimator_
print(best_model)


RandomForestClassifier(max_depth=5, n_estimators=150)


In [23]:
y_pred = best_model.predict(features)
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8701939435182034


# Test on training set

In [24]:
fpr1, tpr1, thresholds1 = roc_curve(y_true, y_pred)
roc_auc1 = auc(fpr1, tpr1)
print("AUC:", roc_auc1)

AUC: 0.5150902762752922


# Holdout Dataset

In [25]:
dat_ho = apply_predictors(G_new_orig,holdout_links)
num_row = np.shape(dat_ho)[0]
sdat_ho = dat_ho[dat_ho[:6].argsort()[::-1][:num_row]]

for row in dat_ho[:10]:  # Loop through the first 10 
    print(f" Nodes: ({row['i']}, {row['j']}) : missing? {row['tau']} |  degree product: {row['dp']} | jaccard: {row['jc']} | dijkstra: {row['dij']}")
    print("----------------------------------------------------------------------------------------------------------------------")
# for k in range(20):
#     print(f'{int(sdat[k,0])},{int(sdat[k,1])} : {sdat[k,2]} : {sdat[k,3]} {sdat[k,4]} {sdat[k,5]} {sdat[k,6]}')

 Nodes: (AHRQ_AcquiredDeformities_DT, AHRQ_AbdominalHernia_DT) : missing? 0.0 |  degree product: 1551.0001616046509 | jaccard: 0.3338957242009196 | dijkstra: -0.8412763040900387
----------------------------------------------------------------------------------------------------------------------
 Nodes: (AHRQ_Anemia_DT, AHRQ_AbdominalHernia_DT) : missing? 1.0 |  degree product: 2397.0009884280444 | jaccard: 0.2566467484981379 | dijkstra: -0.0650032755272311
----------------------------------------------------------------------------------------------------------------------
 Nodes: (AHRQ_DementiaAndOthDO_DT, AHRQ_AbdominalHernia_DT) : missing? 0.0 |  degree product: 1504.0007775855133 | jaccard: 0.19720963748104806 | dijkstra: -0.365494850185053
----------------------------------------------------------------------------------------------------------------------
 Nodes: (AHRQ_DisOfTheHeart_DT, AHRQ_AbdominalHernia_DT) : missing? 1.0 |  degree product: 2256.0001985775575 | jaccard: 0.35

In [26]:
df_ho = pd.DataFrame(dat_ho)
df_ho = df_ho.sort_values(by='jc',ascending=False).reset_index(drop=True)
df_ho

Unnamed: 0,i,j,tau,dp,jc,dij
0,AHRQ_BacterialInfection_DT,AHRQ_COPDAndBronchiectasis_DT,1.0,3009.000952,0.594623,-0.249306
1,AHRQ_Anemia_DT,AHRQ_DisOfArteries_DT,1.0,2601.000985,0.594492,-0.719432
2,AHRQ_BacterialInfection_DT,AHRQ_COPDAndBronchiectasis_DT,1.0,3009.000998,0.594212,-0.377416
3,AHRQ_Anemia_DT,AHRQ_DisOfArteries_DT,1.0,2601.000790,0.594117,-0.219486
4,AHRQ_OpenWounds_DT,AHRQ_Fractures_DT,1.0,3264.000735,0.589991,-0.922963
...,...,...,...,...,...,...
10181,AHRQ_CrushingInjuryOrInternalIn0,NEPEC_MDD_DT,0.0,195.000085,0.000036,-1.135179
10182,AHRQ_SprainsAndStrains_DT,AHRQ_NutritionalDeficiencies_DT,0.0,144.000845,0.000023,-0.059360
10183,AHRQ_NoninfectiousGastroenterit0,AHRQ_NutritionalDeficiencies_DT,0.0,160.000853,0.000014,-0.065223
10184,AHRQ_OthEndocrineDO_DT,NEPEC_DXALC_DT,0.0,128.000243,0.000013,-0.756459


In [27]:
pre_dpho_scores = df_ho['dp']
dpho_scores = (pre_dpho_scores - np.min(pre_dpho_scores)) / (np.max(pre_dpho_scores) - np.min(pre_dpho_scores))
pre_jcho_scores = df_ho['jc']
jcho_scores = (pre_jcho_scores - np.min(pre_jcho_scores)) / (np.max(pre_jcho_scores) - np.min(pre_jcho_scores))
pre_dijho_scores = df_ho['dij']
dijho_scores = (np.max(pre_dijho_scores) - pre_dijho_scores) / (np.max(pre_dijho_scores) - np.min(pre_dijho_scores))
y_true_ho = df_ho['tau']
i_ho = df_ho['i']
j_ho = df_ho['j']
features_ho = np.vstack([dpho_scores, jcho_scores, dijho_scores]).T
print(features_ho)
print(len(i_ho))

[[6.65075100e-01 1.00000000e+00 1.69624651e-01]
 [5.72305686e-01 9.99778376e-01 5.14630575e-01]
 [6.65075110e-01 9.99307575e-01 2.63639178e-01]
 ...
 [1.72807621e-02 1.47670015e-05 3.45336591e-02]
 [1.00045903e-02 1.25012301e-05 5.41803190e-01]
 [1.63711443e-02 0.00000000e+00 3.74601396e-01]]
10186


# Test the model on holdout set

In [28]:
def tabulate_TPR_FPR(df):
    

    TP = df['tau'].sum()  #TP
    TN = len(df) - TP  #TN

    TPR = df['tau'].cumsum() / TP  
    FPR = (df.index + 1 - df['tau'].cumsum()) / TN
    return df.assign(TPR=TPR, FPR=FPR)  
        

In [29]:
y_pred_ho = best_model.predict(features_ho)
accuracy_ho = accuracy_score(y_true_ho, y_pred_ho)
print("Accuracy:", accuracy_ho)

Accuracy: 0.8055173767916749


In [30]:
fpr, tpr, thresholds = roc_curve(y_true_ho, y_pred_ho)
roc_auc = auc(fpr, tpr)
print("AUC:", roc_auc)

AUC: 0.5064502809675054


In [31]:
first = pd.DataFrame(fpr)
first = first.assign(TPR = tpr, thresholds = threshold, FPR = fpr)
first
testchart = alt.Chart(first).mark_line().encode(
    x='FPR',
    y='TPR'
).properties(
    width=500,
    height=300,
    title='ROC Curve for Meta-Learner'
)
testchart


In [32]:
precision = precision_score(y_true_ho, y_pred_ho)
recall = recall_score(y_true_ho, y_pred_ho)

print("Precision:", precision)
print("Recall:", recall)

Precision: 0.7567567567567568
Recall: 0.014


In [33]:
top_n = len(holdout_links)
pred_probs = best_model.predict_proba(features_ho)


In [43]:
# new_rows = pd.DataFrame({"Node 1": [link[0] for link in pred_probs], "Node 2": [link[1] for link in pred_probs]})
# predictions_df = pd.concat([predictions_df, new_rows], ignore_index=True)
predictions_df = pd.DataFrame(pred_probs)
predictions_df = predictions_df.assign(i=df_ho['i'], j = df_ho['j'], tau=df_ho['tau'])
predictions_df = predictions_df.sort_values(by=1,ascending=False).reset_index(drop=True)
top_n_predictions = predictions_df.head(top_n)
top_n_predictions = top_n_predictions.drop_duplicates( 
  subset = ['i', 'j'], 
  keep = 'last').reset_index(drop = True) 
top_n_predictions.head()

Unnamed: 0,0,1,i,j,tau
0,0.452501,0.547499,AHRQ_OpenWounds_DT,NEPEC_DXODep_DT,1.0
1,0.455984,0.544016,AHRQ_Anemia_DT,NEPEC_DXODep_DT,1.0
2,0.461406,0.538594,AHRQ_BacterialInfection_DT,AHRQ_OthInjuriesExternalCauses_D,1.0
3,0.470861,0.529139,AHRQ_SpinalCordInjury_DT,NEPEC_DXODep_DT,1.0
4,0.478185,0.521815,AHRQ_DOOfTeethAndJaw_DT,AHRQ_DementiaAndOthDO_DT,1.0


In [35]:
for index, row in top_n_predictions.iterrows():
  G_new_orig.add_edge(row['i'], row['j'])

betweenness_centrality = nx.betweenness_centrality(G_new_orig)
closeness_centrality = nx.closeness_centrality(G_new_orig)
highest_betweenness_nodes = sorted(betweenness_centrality, key=betweenness_centrality.get, reverse=True)[:10]
highest_closeness_nodes = sorted(closeness_centrality, key=closeness_centrality.get, reverse=True)[:10]
print("Top 10 nodes with highest betweenness centrality:", highest_betweenness_nodes)
print("------------------------------------------------------------------------")
print("Top 10 nodes with highest closeness centrality:", highest_closeness_nodes)


Top 10 nodes with highest betweenness centrality: ['AHRQ_OthInjuriesExternalCauses_D', 'AHRQ_OthGIDO_DT', 'AHRQ_PersonalityDO_DT', 'AHRQ_DOOfTeethAndJaw_DT', 'AHRQ_OthInfections_DT', 'AHRQ_Osteoporosis_DT', 'AHRQ_SystemicLupus_DT', 'NEPEC_PTSD_DT', 'AHRQ_RespiratoryInfections_DT', 'AHRQ_SuicideAndSelfInjury_DT']
------------------------------------------------------------------------
Top 10 nodes with highest closeness centrality: ['AHRQ_OthInjuriesExternalCauses_D', 'NEPEC_DXODep_DT', 'AHRQ_DementiaAndOthDO_DT', 'AHRQ_Burns_DT', 'AHRQ_SprainsAndStrains_DT', 'AHRQ_RespiratoryInfections_DT', 'NEPEC_PTSD_DT', 'AHRQ_SuperficialInjury_DT', 'AHRQ_DOOfTeethAndJaw_DT', 'AHRQ_Fractures_DT']


In [36]:
df_new = tabulate_TPR_FPR(predictions_df)
TPR_new = df_new['TPR']
FPR_new = df_new['FPR']
auc_new = auc(FPR_new, TPR_new)
print(auc_new)

0.8515155753725874


In [37]:
chart = alt.Chart(df_new).mark_line().encode(
    x='FPR',
    y='TPR'
).properties(
    width=500,
    height=300,
    title='ROC Curve for Meta-Learner'
)

# chart