In [2]:
import nltk
import spacy
import pandas as pd
from tqdm import tqdm
import os
import nltk
import numpy as np


# Checking for sentence classification

In [50]:
folder = "../LitCovid_Treatment_json/"

dirs = os.listdir(folder)

for file in tqdm(dirs, desc = "dirs"):
    try: 
        if file not in ["exceptionlist.txt", "label2id.pkl", "predict.tf_record"]:
            file_path = os.path.join(folder, file)
            temp = pd.read_json(file_path)
            sections = [temp.sentences.values[i]["Section"] for i in range(temp.shape[0])]
            if any(s != "METHODS" for s in sections):
                print(file)
                print(sections)
    except:
        print("Error lol")
        print(file)

dirs: 100%|████████████████████████████████████████████████████████████████████████| 6459/6459 [01:44<00:00, 61.84it/s]

Error lol
exceptionlist.txt
Error lol
label2id.pkl
Error lol
predict.tf_record





# Get data

In [3]:
folder = "../LitCovid_Treatment_json/"
dirs = os.listdir(folder)
pmids = []
entities = []

for file in tqdm(dirs, desc = "dirs"):
    if file not in ["exceptionlist.txt", "label2id.pkl", "predict.tf_record"]:
        
        #get filepath and read in data
        file_path = os.path.join(folder, file)
        temp = pd.read_json(file_path)
        pmid = file[:-5]
        
        for i in range(temp.shape[0]):
            
            #get entities sentence by sentence
            ents_dict = temp.sentences.values[i].get("entities")
            if len(ents_dict.keys()) > 0:
                entities += [(ents_dict[k]["text"], ents_dict[k]["class"], ents_dict[k]["negation"], pmid) for k in ents_dict.keys()]



dirs: 100%|████████████████████████████████████████████████████████████████████████| 6459/6459 [02:53<00:00, 37.26it/s]


In [4]:
ent_df = pd.DataFrame(entities, columns=["text", "type", "negation", "pmid"])
ent_df.to_csv("LitCovid_treatment_ents.csv")


# Building weighted graph

In [3]:
from collections import Counter
import itertools as it

# get data and group
ent_df = pd.read_csv("LitCovid_treatment_ents.csv")
g = ent_df.groupby(["pmid", "type"])

#initialize variables
groups = list(g.groups)
types = ["Participant", "Outcome", "Intervention"]
edge_weight = Counter()
nodes = []

for p in tqdm(ent_df.pmid.values):
    temp = []
    for tup in list(it.product([p], types)):
        if tup in groups:
            #get pmid entities
            grp = g.get_group(tup)
            
            #format appropriately
            temp_nodes = ["{} ({})".format(t, tup[1][0]) for t in grp.text.values]
            temp.append(temp_nodes)
            nodes += temp_nodes
        else:
            temp.append([])
            
    #get edges
    products = [list(it.product(temp[0], temp[1])), list(it.product(temp[0], temp[2])), list(it.product(temp[1], temp[2]))]
    
    #add edges to adj list
    for prod in products:
        if prod != []:
            edge_weight.update(prod)

100%|██████████████████████████████████████████████████████████████████████████| 125062/125062 [49:55<00:00, 41.75it/s]


In [None]:
import networkx as nx
#same intervention/population -> different outcomes, etc.
#categorize segments by intervention/population
#independence amongst categories
#what are treatments with same outcome
#what are possible outcomes
#build graph from adj list
G = nx.Graph()
for item in edge_weight.items():
    G.add_edge(item[0][0], item[0][1], weight=item[1])

#only show edge if enough hits
cutoff = sorted(list(edge_weight.values()))[-50]
elarge = [(u, v) for (u, v, d) in G.edges(data=True) if d["weight"] >= cutoff]
# esmall = [(u, v) for (u, v, d) in G.edges(data=True) if d["weight"] <= 0.5]

pos = nx.spring_layout(G)  # positions for all nodes

#draw nodes
nx.draw_networkx_nodes(G, pos, node_size=700)

#draw edges
nx.draw_networkx_edges(G, pos, edgelist=elarge, width=6)
# nx.draw_networkx_edges(
#     G, pos, edgelist=esmall, width=6, alpha=0.5, edge_color="b", style="dashed"
# )

#draw labels
nx.draw_networkx_labels(G, pos, font_size=20, font_family="sans-serif")

plt.axis("off")
plt.show()

# Frequencies of PICO element text

In [126]:
from collections import Counter
p_count = Counter(ent_df[ent_df.type == "Participant"].text.values)
i_count = Counter(ent_df[ent_df.type == "Intervention"].text.values)
o_count = Counter(ent_df[ent_df.type == "Outcome"].text.values)

In [129]:
p_count.most_common(100)

[('COVID-19', 3371),
 ('age', 637),
 ('SARS-CoV-2', 398),
 ('SARS-CoV-2 infection', 366),
 ('diabetes', 327),
 ('severe COVID-19', 317),
 ('COVID-19 infection', 267),
 ('hypertension', 239),
 ('children', 208),
 ('hospitalized', 196),
 ('male', 186),
 ('critically ill', 183),
 ('cancer', 169),
 ('COVID-19 pneumonia', 167),
 ('severe', 166),
 ('aged', 161),
 ('comorbidities', 160),
 ('men', 159),
 ('coronavirus disease 2019 ( COVID-19 )', 157),
 ('women', 149),
 ('elderly', 132),
 ('males', 131),
 ('obesity', 130),
 ('.', 117),
 ('adults', 113),
 ('older', 104),
 ('pregnant women', 94),
 ('ICU', 87),
 ('median age', 85),
 ('healthy', 81),
 ('pneumonia', 78),
 ('Covid-19', 78),
 ('adult', 75),
 ('fever', 73),
 ('female', 71),
 ('ARDS', 70),
 ('laboratory-confirmed COVID-19', 69),
 ('females', 68),
 (',', 67),
 ('diabetes mellitus', 65),
 ('hospitalized COVID-19', 61),
 ('mean age', 61),
 ('coronavirus', 55),
 ('mechanical ventilation', 55),
 ('older age', 53),
 ('coronavirus disease 2019