# RTR Connectivity Graph

- Index pruned trends to pruned keywords
- Build graph

## Next steps:

- Make Network With Pruned Trends

Create new structure with: 
- keywords from given abstract
- trending words from given abstract

TODO: include pubmed link to original abstract

In [4]:
# SETUP
%run rtr.ipynb
import operator

Create data structure to hold trend and associated keywords

In [5]:
from collections import Counter
class Trend:
    def __init__(self, trend):
        self.trend = trend
        #using a set doesn't allow duplicates
        #self.keywords = set()
        self.keywords = Counter()
    def add_keyword(self, keyword):
        #self.keywords.add(keyword)
        self.keywords.update({keyword:1})
    def as_dict(self):
        return {self.trend:self.keywords}

In [6]:
def pair_trends_keywords(df, window_records, n_trends, n_keywords, from_sql = False):
    df_pruned = df.iloc[:n_trends,:]
    trends_list = []
    for entry in df_pruned.key:
        trend_str = ' '.join(entry)
        trend = Trend(trend_str)
        trends_list.append(trend)
    # get filtered keywords
    top_keywords = get_top_keywords(n_keywords)
    top_keywords_text = [keyword[0] for keyword in top_keywords]
    #print(top_keywords_text)
    # TRIPLE LOOP - sure to be a bottleneck
    for trend in trends_list:
        for window in window_records:
            # direct method only --
            # update to include sql compatibility
            # when sql is working
            for abstract_record in window:
                if from_sql:
                    abstract = abstract_record[0]
                else:
                    abstract = abstract_record['Abstract'][0]
                if trend.trend in abstract:
                    if from_sql:
                        keywords = str.split(abstract_record[2], ',')
                    else:
                        keywords = abstract_record['Keywords']
                    for keyword in keywords:
                        if keyword in top_keywords_text:
                            #print("matching trend", trend.trend, "and keyword", keyword)
                            trend.add_keyword(keyword)
                #else:
                    #print("not in abstract")
    return trends_list

In [24]:
import pickle
def unpickle_data():
    with open("df.txt", "rb") as fp:   # Unpickling
        df = pickle.load(fp)
    with open("window_records.txt", "rb") as fp:   # Unpickling
        window_records = pickle.load(fp)
    return df, window_records
df, window_records = unpickle_data()

In [25]:
n_trends = 1000
n_keywords = 1000
trends_list = []
for sub_df in df:
    trends_list.append(pair_trends_keywords(sub_df, window_records, n_trends, n_keywords, from_sql = True))

creating engine:
-  postgresql://ericbarnhill:carter0109@localhost/rtr_db
-  ['rtr_abstracts', 'rtr_keywords']


KeyboardInterrupt: 

In [12]:
def print_trend_dict(trends_list, N=50):
    n = 0
    for trend in trends_list:
        print(trend.as_dict())
        n += 1
        if n > N:
            return
for trends_sublist in trends_list:
    print_trend_dict(trends_sublist)

{'value sensitivity': Counter()}
{'half patient': Counter()}
{'patient also': Counter()}
{'vessel diameter': Counter({'imaging': 6})}
{'normal eye': Counter({'imaging': 24})}
{'powerful tool': Counter({'imaging': 18, 'tomography': 6, 'microscopy': 6, 'rats': 4})}
{'mean diameter': Counter({'rats': 6, 'microscopy': 4, 'carcinoma': 2})}
{'secondary outcome': Counter({'imaging': 6, 'tomography': 4})}
{'r correlation': Counter({'imaging': 47, 'tomography': 15, 'carcinoma': 10, 'rats': 10, 'echocardiography': 8, 'radiotherapy': 7, 'hypertension': 6, 'mice': 6, 'cartilage': 4, 'radiography': 4, 'aged': 2, 'ultrasonography': 1})}
{'functional status': Counter({'imaging': 8, 'rats': 3, 'ultrasonography': 2, 'echocardiography': 2})}
{'intraocular pressure': Counter({'imaging': 8, 'hypertension': 6, 'rats': 2, 'dna': 1})}
{'patient due': Counter()}
{'determined whether': Counter({'evoked potentials': 6, 'tomography': 4, 'cartilage': 3})}
{'included retrospective': Counter({'imaging': 9, 'dna': 6

## Building the graph

In [13]:
import networkx as nx

In [None]:
def populate_graph(trends_list):
    G = nx.Graph()
    for trend in trends_list:
        node1 = trend.trend
        for key, item in trend.keywords.items():
            node2 = key
            #print(node1, "|", node2, "|", item)
            if G.has_edge(node1, node2):
                print("increasing weight by", item)
                G[node1][node2]['weight'] += item
            else:
                # new edge. add with weight=1
                G.add_node(node1, is_key = False)
                G.add_node(node2, is_key = True)
                G.add_edge(node1, node2, weight=item)
    for u, v, d in G.edges(data=True):
        weight = d['weight']
        if weight > 50:
            print(u,"," , v, ",", "weight: ", d['weight'])
    return G
G = []
for trends_sublist in trends_list:
    G.append(populate_graph(trends_sublist))

## Find ten most heavily weighted edges of the graph

In [22]:
def top_N_trends(G, keyword, n=10):
    node_edges = G.edges(keyword.lower())
    edges_dict = {}
    for edge in node_edges:
        key = edge[1]
        value = G[edge[0]][edge[1]]['weight']
        edges_dict.update({key:value})
    index = 1
    trends = []
    for key, value in sorted(edges_dict.items(), key=operator.itemgetter(1), reverse=True):
        trends.append(' '.join([key, str(value)]))
        index += 1
        if index > n:
            break
    return trends
for G_sub in G:
    trends = top_N_trends(G_sub, 'imaging')
    print(trends)
    print('---')

['resonance imaging 275', 'soft tissue 168', 'mean age 143', 'ass correlation 133', 'beam computed 112', 'operating characteristic 85', 'optical coherence 76', 'determine whether 74', 'tomography scan 73', 'coronary artery 73']
---
['soft tissue 168', 'mean age 143', 'ass correlation 133', 'beam computed 112', 'consecutive patient 87', 'optical coherence 76', 'coherence tomography 73', 'coronary artery 73', 'tomography scan 73', 'cone beam 70']
---
['n patient 423', 'consecutive patient 87', 'optical coherence 76', 'coherence tomography 73', 'surgical planning 65', 'study included 56', 'face area 54', 'analysis show 53', 'tomography angiography 52', 'quantitative analysis 51']
---


## Find which keywords made the graph

In [21]:
def get_hot_keywords(G):
    N = 20
    keywords =  {}
    for node in G.nodes(data=True):
        if node[1]['is_key']:
            total_wt = 0
            for edge in G.edges(node[0], data=True):
                total_wt += edge[2]['weight']
            keywords.update({node[0]: total_wt})
    n = 0
    for item in sorted(keywords.items(), key=operator.itemgetter(1), reverse=True):
        print(item)
        n += 1
        if n > N:
            break
for G_sub in G:
    get_hot_keywords(G_sub)
    print('---')

('imaging', 6255)
('tomography', 2584)
('rats', 2420)
('ultrasonography', 1884)
('carcinoma', 1805)
('echocardiography', 1106)
('mice', 906)
('cartilage', 685)
('radiography', 627)
('microscopy', 412)
('dna', 331)
('aneurysm', 322)
('aged', 249)
('evoked potentials', 238)
('hypertension', 234)
('osteoporosis', 183)
('cholesterol', 159)
('electrocardiography', 134)
('radiotherapy', 100)
('magnetic resonance imaging', 100)
('biopsy', 97)
---
('imaging', 5816)
('tomography', 2791)
('rats', 2653)
('ultrasonography', 1798)
('carcinoma', 1502)
('echocardiography', 1289)
('mice', 986)
('cartilage', 654)
('radiography', 581)
('microscopy', 427)
('aneurysm', 331)
('dna', 317)
('aged', 297)
('hypertension', 250)
('electrocardiography', 232)
('evoked potentials', 196)
('cholesterol', 196)
('osteoporosis', 171)
('magnetic resonance imaging', 144)
('biopsy', 99)
('radiotherapy', 89)
---
('imaging', 5009)
('rats', 2649)
('tomography', 2570)
('carcinoma', 1690)
('ultrasonography', 1588)
('mice', 965)

In [None]:
# pickle graph
def pickle_graph():
    try: 
        with open("/home/ericbarnhill/Documents/code/insight/rtr/G.txt", "wb") as fp:
            pickle.dump(G, fp)
    except:
        print('error')
pickle_graph()

In [None]:
os.path.exists("/home/ericbarnhill/Documents/code/insight/rtr/")

In [12]:
keywords = get_top_keywords(1000)

creating engine:
-  postgresql://ericbarnhill:carter0109@localhost/rtr_db
-  ['rtr_abstracts', 'rtr_keywords']


In [None]:
print(keywords)