# RTR Connectivity Graph

- Index pruned trends to pruned keywords
- Build graph

## Next steps:

- Make Network With Pruned Trends

Create new structure with: 
- keywords from given abstract
- trending words from given abstract

TODO: include pubmed link to original abstract

In [4]:
# SETUP
%run rtr.ipynb
import operator

Create data structure to hold trend and associated keywords

In [5]:
from collections import Counter
class Trend:
    def __init__(self, trend):
        self.trend = trend
        #using a set doesn't allow duplicates
        #self.keywords = set()
        self.keywords = Counter()
    def add_keyword(self, keyword):
        #self.keywords.add(keyword)
        self.keywords.update({keyword:1})
    def as_dict(self):
        return {self.trend:self.keywords}

In [6]:
def pair_trends_keywords(df, window_records, n_trends, n_keywords, from_sql = False):
    df_pruned = df.iloc[:n_trends,:]
    trends_list = []
    for entry in df_pruned.key:
        trend_str = ' '.join(entry)
        trend = Trend(trend_str)
        trends_list.append(trend)
    # get filtered keywords
    top_keywords = get_top_keywords(n_keywords)
    top_keywords_text = [keyword[0] for keyword in top_keywords]
    #print(top_keywords_text)
    # TRIPLE LOOP - sure to be a bottleneck
    for trend in trends_list:
        for window in window_records:
            # direct method only --
            # update to include sql compatibility
            # when sql is working
            for abstract_record in window:
                if from_sql:
                    abstract = abstract_record[0]
                else:
                    abstract = abstract_record['Abstract'][0]
                if trend.trend in abstract:
                    if from_sql:
                        keywords = str.split(abstract_record[2], ',')
                    else:
                        keywords = abstract_record['Keywords']
                    for keyword in keywords:
                        if keyword in top_keywords_text:
                            #print("matching trend", trend.trend, "and keyword", keyword)
                            trend.add_keyword(keyword)
                #else:
                    #print("not in abstract")
    return trends_list

In [26]:
import pickle
def unpickle_data():
    with open("df.txt", "rb") as fp:   # Unpickling
        df = pickle.load(fp)
    with open("window_records.txt", "rb") as fp:   # Unpickling
        window_records = pickle.load(fp)
    return df, window_records
df, window_records = unpickle_data()

In [27]:
n_trends = 1000
n_keywords = 1000
trends_list = []
for sub_df in df:
    trends_list.append(pair_trends_keywords(sub_df, window_records, n_trends, n_keywords, from_sql = True))

creating engine:
-  postgresql://ericbarnhill:carter0109@localhost/rtr_db
-  ['rtr_abstracts', 'rtr_keywords']
creating engine:
-  postgresql://ericbarnhill:carter0109@localhost/rtr_db
-  ['rtr_abstracts', 'rtr_keywords']
creating engine:
-  postgresql://ericbarnhill:carter0109@localhost/rtr_db
-  ['rtr_abstracts', 'rtr_keywords']


In [28]:
def print_trend_dict(trends_list, N=50):
    n = 0
    for trend in trends_list:
        print(trend.as_dict())
        n += 1
        if n > N:
            return
for trends_sublist in trends_list:
    print_trend_dict(trends_sublist)

{'alone combination': Counter()}
{'ligament acl': Counter()}
{'disease characterized': Counter({'tomography': 6, 'carcinoma': 4, 'pneumonia': 2})}
{'tomography used': Counter()}
{'tested whether': Counter({'microscopy': 4, 'carcinoma': 2})}
{'patients male': Counter()}
{'breast imaging': Counter({'imaging': 16, 'tomography': 6, 'diagnosis': 5, 'radiology': 4})}
{'patients suspected': Counter({'carcinoma': 4})}
{'widely available': Counter({'tomography': 14, 'radiography': 4, 'imaging': 4, 'gene expression regulation': 4})}
{'acoustic radiation': Counter({'diagnosis': 5, 'imaging': 3, 'ultrasonography': 2, 'carcinoma': 2})}
{'mitral regurgitation': Counter({'echocardiography': 11, 'hypertension': 8, 'osteoporosis': 2, 'diagnosis': 2})}
{'body surface': Counter({'tomography': 7, 'imaging': 4, 'carcinoma': 2})}
{'investigate potential': Counter()}
{'much higher': Counter({'tomography': 8, 'imaging': 8, 'carcinoma': 4, 'rats': 2})}
{'vessel disease': Counter({'osteoporosis': 8, 'tomography

## Building the graph

In [13]:
import networkx as nx

In [29]:
def populate_graph(trends_list):
    G = nx.Graph()
    for trend in trends_list:
        node1 = trend.trend
        for key, item in trend.keywords.items():
            node2 = key
            #print(node1, "|", node2, "|", item)
            if G.has_edge(node1, node2):
                print("increasing weight by", item)
                G[node1][node2]['weight'] += item
            else:
                # new edge. add with weight=1
                G.add_node(node1, is_key = False)
                G.add_node(node2, is_key = True)
                G.add_edge(node1, node2, weight=item)
    for u, v, d in G.edges(data=True):
        weight = d['weight']
        if weight > 50:
            print(u,"," , v, ",", "weight: ", d['weight'])
    return G
G = []
for trends_sublist in trends_list:
    G.append(populate_graph(trends_sublist))

tomography , photon emission , weight:  243
tomography , single photon , weight:  103
tomography , coronary artery , weight:  111
imaging , coherence tomography , weight:  56
imaging , optical coherence , weight:  58
imaging , healthy volunteers , weight:  67
imaging , soft tissue , weight:  96
echocardiography , left ventricular , weight:  117
echocardiography , doppler echocardiography , weight:  57
imaging , coherence tomography , weight:  56
imaging , optical coherence , weight:  58
imaging , soft tissue , weight:  96
imaging , determine whether , weight:  53
tomography , coronary artery , weight:  111
tomography , artery disease , weight:  80
echocardiography , left ventricular , weight:  117
echocardiography , heart failure , weight:  67
cartilage , articular cartilage , weight:  95
tomography , artery disease , weight:  80
tomography , imaging technique , weight:  53
tomography , coronary artery , weight:  111
imaging , coherence tomography , weight:  56
imaging , optical cohere

## Find ten most heavily weighted edges of the graph

In [30]:
def top_N_trends(G, keyword, n=10):
    node_edges = G.edges(keyword.lower())
    edges_dict = {}
    for edge in node_edges:
        key = edge[1]
        value = G[edge[0]][edge[1]]['weight']
        edges_dict.update({key:value})
    index = 1
    trends = []
    for key, value in sorted(edges_dict.items(), key=operator.itemgetter(1), reverse=True):
        trends.append(' '.join([key, str(value)]))
        index += 1
        if index > n:
            break
    return trends
for G_sub in G:
    trends = top_N_trends(G_sub, 'imaging')
    print(trends)
    print('---')

['soft tissue 96', 'healthy volunteers 67', 'optical coherence 58', 'coherence tomography 56', 'results demonstrate 48', 'mr image 46', 'important role 45', 'treatment planning 45', 'image acquisition 44', 'blood flow 43']
---
['soft tissue 96', 'optical coherence 58', 'coherence tomography 56', 'determine whether 53', 'navigation system 45', 'image analysis 42', 'ex vivo 37', 'coronary artery 36', 'tomography scan 36', 'see text 36']
---
['imaging technique 65', 'optical coherence 58', 'data set 57', 'coherence tomography 56', 'electron microscopy 52', 'blood flow 43', 'ex vivo 37', 'coronary artery 36', 'mean absolute 36', 'tomography scan 36']
---


## Find which keywords made the graph

In [31]:
def get_hot_keywords(G):
    N = 20
    keywords =  {}
    for node in G.nodes(data=True):
        if node[1]['is_key']:
            total_wt = 0
            for edge in G.edges(node[0], data=True):
                total_wt += edge[2]['weight']
            keywords.update({node[0]: total_wt})
    n = 0
    for item in sorted(keywords.items(), key=operator.itemgetter(1), reverse=True):
        print(item)
        n += 1
        if n > N:
            break
for G_sub in G:
    get_hot_keywords(G_sub)
    print('---')

('imaging', 4254)
('tomography', 3857)
('rats', 1960)
('ultrasonography', 1364)
('echocardiography', 1149)
('carcinoma', 880)
('mice', 856)
('diagnosis', 521)
('microscopy', 430)
('cartilage', 323)
('radiography', 268)
('hypertension', 218)
('osteoporosis', 210)
('aneurysm', 191)
('dna', 175)
('aged', 142)
('cholesterol', 118)
('magnetic resonance imaging', 108)
('electrocardiography', 103)
('evoked potentials', 103)
('biopsy', 97)
---
('imaging', 3826)
('tomography', 3605)
('rats', 1637)
('ultrasonography', 1175)
('carcinoma', 973)
('echocardiography', 958)
('mice', 821)
('diagnosis', 554)
('cartilage', 499)
('microscopy', 357)
('radiography', 274)
('aneurysm', 246)
('heart failure', 222)
('hypertension', 200)
('dna', 174)
('magnetic resonance imaging', 156)
('osteoporosis', 146)
('aged', 140)
('electrocardiography', 127)
('cholesterol', 124)
('gene expression regulation', 96)
---
('imaging', 3542)
('tomography', 3441)
('rats', 1841)
('ultrasonography', 1057)
('carcinoma', 933)
('mice

In [None]:
# pickle graph
def pickle_graph():
    try: 
        with open("/home/ericbarnhill/Documents/code/insight/rtr/G.txt", "wb") as fp:
            pickle.dump(G, fp)
    except:
        print('error')
pickle_graph()

In [None]:
os.path.exists("/home/ericbarnhill/Documents/code/insight/rtr/")

In [12]:
keywords = get_top_keywords(1000)

creating engine:
-  postgresql://ericbarnhill:carter0109@localhost/rtr_db
-  ['rtr_abstracts', 'rtr_keywords']


In [None]:
print(keywords)