# RTR Connectivity Graph

- Index pruned trends to pruned keywords
- Build graph

## Next steps:

- Make Network With Pruned Trends

Create new structure with: 
- keywords from given abstract
- trending words from given abstract

TODO: include the PubMed link to the original abstract

In [8]:
# SETUP
%run rtr.ipynb
import operator

Create a data structure to hold each trend and its associated keywords

In [1]:
from collections import Counter
class Trend:
    """A trend phrase paired with a tally of the keywords recorded for it."""

    def __init__(self, trend):
        # The trend phrase this object tracks.
        self.trend = trend
        # A Counter rather than a set, so repeated sightings of the same
        # keyword accumulate instead of collapsing into a single entry.
        self.keywords = Counter()

    def add_keyword(self, keyword):
        """Record one more occurrence of ``keyword`` for this trend."""
        self.keywords[keyword] += 1

    def as_dict(self):
        """Return ``{trend: keywords}``; the Counter is shared, not copied."""
        return {self.trend: self.keywords}

In [11]:
def pair_trends_keywords(df, window_records, n_trends, n_keywords, from_sql = False):
    """Count, per trend, the top keywords of every abstract that mentions it.

    Args:
        df: Table of trends, sliced with ``.iloc`` and read through its
            ``key`` column. Each entry of that column is an iterable of
            words, joined with single spaces to form the trend phrase.
        window_records: Iterable of windows, each an iterable of abstract
            records.
        n_trends: Number of leading rows of ``df`` to turn into trends.
        n_keywords: Forwarded to ``get_top_keywords`` (defined outside this
            cell) to select which keywords may be counted. Each returned
            item is expected to carry the keyword text at index 0.
        from_sql: Selects the record layout. When true, ``record[0]`` is the
            abstract text and ``record[2]`` a comma-separated keyword string.
            When false, ``record['Abstract'][0]`` is the text and
            ``record['Keywords']`` an iterable of keywords.

    Returns:
        A list of ``Trend`` objects, one per selected row and in row order.
        Each trend's counter gains one count for every top keyword occurrence
        in every abstract containing the trend phrase as a substring.
    """
    trends_list = [Trend(' '.join(entry)) for entry in df.iloc[:n_trends, :].key]
    # A set makes each membership test O(1); the original list lookup was
    # O(n_keywords) and sat inside the innermost loop.
    top_keywords_text = {keyword[0] for keyword in get_top_keywords(n_keywords)}
    # Records form the outer loop so each record's keywords are filtered at
    # most once. Every trend still sees the records in the same order, so the
    # resulting counts (and counter insertion order) are unchanged.
    for window in window_records:
        for abstract_record in window:
            if from_sql:
                abstract = abstract_record[0]
            else:
                abstract = abstract_record['Abstract'][0]
            # Filled in lazily, only once some trend matches this abstract.
            matched_keywords = None
            for trend in trends_list:
                if trend.trend not in abstract:
                    continue
                if matched_keywords is None:
                    if from_sql:
                        # NOTE(review): the top-keyword list printed later in
                        # this notebook contains entries with embedded commas
                        # (e.g. 'tomography, x-ray computed'); if the stored
                        # strings use the same form, splitting on ',' breaks
                        # them into fragments -- confirm the storage format.
                        keywords = str.split(abstract_record[2], ',')
                    else:
                        keywords = abstract_record['Keywords']
                    matched_keywords = [
                        keyword for keyword in keywords
                        if keyword in top_keywords_text
                    ]
                for keyword in matched_keywords:
                    trend.add_keyword(keyword)
    return trends_list

In [4]:
import pickle
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: pickle.load can execute arbitrary code while deserializing, so
    # this must only ever be pointed at files produced by a trusted source.
    with open(path, "rb") as fp:
        return pickle.load(fp)


def unpickle_data(df_path="df.txt", window_records_path="window_records.txt"):
    """Load the two pickled notebook inputs from disk.

    Args:
        df_path: Pickle file holding the trends table. The default is the
            relative path used originally, resolved against the current
            working directory.
        window_records_path: Pickle file holding the window records; same
            default convention as ``df_path``.

    Returns:
        A ``(df, window_records)`` tuple of the unpickled objects, loaded in
        that order.

    Raises:
        OSError: If either file cannot be opened.
        pickle.UnpicklingError: If either file is not a valid pickle (other
            exception types may surface from unpickling as well).
    """
    return _load_pickle(df_path), _load_pickle(window_records_path)
# Load both pickled inputs into the notebook namespace for the cells below.
df, window_records = unpickle_data()

In [12]:
# Upper bounds on how many trends and how many keywords take part in pairing.
n_trends = 1000
n_keywords = 1000
# from_sql=True selects the positional (index-based) record layout.
trends_list = pair_trends_keywords(df, window_records, n_trends, n_keywords, from_sql = True)

creating engine:
-  postgresql://ericbarnhill:carter0109@localhost/rtr_db
-  ['rtr_abstracts', 'rtr_keywords']


In [13]:
def print_trend_dict(trends_list, N=50):
    """Print the ``as_dict()`` form of at most ``N`` leading trends, one per line.

    Args:
        trends_list: Iterable of objects exposing an ``as_dict()`` method.
        N: Maximum number of entries to print. Zero or a negative value
            prints nothing.
    """
    for printed, trend in enumerate(trends_list):
        # Check the limit before printing: the earlier version compared after
        # incrementing its counter and therefore emitted N + 1 entries.
        if printed >= N:
            return
        print(trend.as_dict())
# Preview the leading trends together with their keyword counts.
print_trend_dict(trends_list)

{'used method': Counter({'imaging': 10, 'tomography': 6})}
{'record patient': Counter()}
{'report case': Counter()}
{'mean diameter': Counter({'rats': 6, 'microscopy': 4, 'carcinoma': 2})}
{'targeted therapy': Counter({'magnetic resonance imaging': 2, 'rats': 1})}
{'stenosis occlusion': Counter()}
{'fallot tof': Counter()}
{'tetralogy fallot tof': Counter()}
{'conducted prospective': Counter()}
{'success rate': Counter({'imaging': 38, 'carcinoma': 21, 'ultrasonography': 16, 'aneurysm': 7, 'radiography': 6})}
{'de novo': Counter({'microscopy': 12, 'tomography': 11, 'imaging': 7, 'mice': 4, 'mutation': 4, 'ultrasonography': 3})}
{'vessel diameter': Counter({'imaging': 6})}
{'also compared': Counter({'imaging': 10, 'ultrasonography': 6, 'tomography': 6, 'rats': 4, 'cartilage': 4})}
{'randomly divided': Counter({'rats': 67, 'imaging': 21, 'radiography': 6, 'diet': 3, 'microscopy': 3, 'hypertension': 2, 'mice': 1})}
{'reclassification improvement': Counter({'hypertension': 6, 'tomography': 

## Building the graph

In [14]:
import networkx as nx

In [15]:
# Link every trend to each keyword counted for it; the edge weight is the
# co-occurrence count taken from the trend's Counter.
G = nx.Graph()
for trend in trends_list:
    trend_node = trend.trend
    for keyword_node, count in trend.keywords.items():
        if not G.has_edge(trend_node, keyword_node):
            # First time this pair is seen: register both endpoints and start
            # the edge at the observed count (not at 1).
            # NOTE(review): a string that occurs both as a trend and as a
            # keyword maps to a single node, and its `is_key` flag presumably
            # ends up as whichever add_node call ran last -- confirm intended.
            G.add_node(trend_node, is_key=False)
            G.add_node(keyword_node, is_key=True)
            G.add_edge(trend_node, keyword_node, weight=count)
            continue
        # The pair is already connected, so accumulate onto the existing edge.
        print("increasing weight by", count)
        G[trend_node][keyword_node]['weight'] += count

# Report only the heavier links (weight strictly above 50).
for source, target, attrs in G.edges(data=True):
    if attrs['weight'] > 50:
        print(source, ",", target, ",", "weight: ", attrs['weight'])

imaging , cone beam , weight:  70
imaging , tomography angiography , weight:  52
imaging , confidence interval , weight:  97
imaging , magnetic resonance imaging , weight:  275
imaging , coronary artery , weight:  73
imaging , beam computed , weight:  112
imaging , cone beam computed , weight:  52
imaging , beam computed tomography , weight:  106
imaging , resonance imaging , weight:  275
imaging , lung cancer , weight:  52
imaging , coherence tomography , weight:  73
imaging , ass correlation , weight:  133
imaging , using h , weight:  62
imaging , optical coherence tomography , weight:  73
imaging , optical coherence , weight:  76
imaging , soft tissue , weight:  168
imaging , tomography scan , weight:  73
imaging , mean age , weight:  143
tomography , left ventricular , weight:  55
tomography , coronary artery , weight:  153
tomography , coronary angiography , weight:  68
tomography , mean age , weight:  53
tomography , artery disease , weight:  126
carcinoma , confidence interval ,

## Find the ten most heavily weighted edges for a given keyword

In [14]:
def top_N_trends(G, keyword, n=10):
    """Return the ``n`` heaviest edges touching ``keyword``.

    Args:
        G: Graph whose ``edges(node)`` yields ``(node, neighbor)`` pairs and
            whose ``G[u][v]['weight']`` holds the weight of that edge.
        keyword: Node to look up; it is lower-cased before the lookup.
        n: Maximum number of results. Zero or a negative value yields an
            empty list (previously one entry was still returned).

    Returns:
        A list of ``"<neighbor> <weight>"`` strings ordered by descending
        weight; ties keep the order in which the graph reported the edges.
    """
    node = keyword.lower()
    # Keyed by neighbor, so a neighbor reported twice keeps its last weight.
    weights = {v: G[u][v]['weight'] for u, v in G.edges(node)}
    ranked = sorted(weights.items(), key=operator.itemgetter(1), reverse=True)
    # max() guards against a negative n turning the slice into "drop the tail".
    return [' '.join([name, str(weight)]) for name, weight in ranked[:max(n, 0)]]
# Example query: the strongest connections of a single keyword node.
trends = top_N_trends(G, 'magnetic resonance imaging')
print(trends)

NameError: name 'G' is not defined

## Find which keywords made it into the graph

In [None]:
# Rank the keyword nodes (those flagged `is_key`) by the summed weight of the
# edges attached to them, then show the heaviest ones.
N = 20
keywords = {
    name: sum(edge_attrs['weight'] for _, _, edge_attrs in G.edges(name, data=True))
    for name, node_attrs in G.nodes(data=True)
    if node_attrs['is_key']
}
# Slice to exactly N rows; the earlier counter-based loop compared after
# incrementing and therefore printed N + 1 rows.
for item in sorted(keywords.items(), key=operator.itemgetter(1), reverse=True)[:N]:
    print(item)

In [None]:
# pickle graph
def pickle_graph(graph=None, path="/home/ericbarnhill/Documents/code/insight/rtr/G.txt"):
    """Best-effort dump of a graph to disk with ``pickle``.

    Args:
        graph: Object to serialize. When omitted, the notebook-level ``G`` is
            used, matching the original zero-argument call.
        path: Destination file, opened in binary write mode. Defaults to the
            location that was previously hard-coded.

    Failures are reported on stdout and never raised, so calling this from a
    notebook cell cannot abort the run.
    """
    try:
        if graph is None:
            # Resolved inside the try so that a missing G is reported like
            # any other failure instead of escaping as a NameError.
            graph = G
        with open(path, "wb") as fp:
            pickle.dump(graph, fp)
    except Exception as err:
        # Exception rather than a bare except: KeyboardInterrupt/SystemExit
        # still propagate, and the cause is shown instead of a bare 'error'.
        print('error:', err)
# Persist the current graph; failures are printed rather than raised.
pickle_graph()

In [None]:
# Check that the directory the graph pickle is written to exists.
# NOTE(review): `os` is not imported in any visible cell -- presumably it is
# brought in by `%run rtr.ipynb`; confirm.
os.path.exists("/home/ericbarnhill/Documents/code/insight/rtr/")

In [12]:
# Fetch the top 1000 keywords. NOTE: this rebinds `keywords`, the same name
# the keyword-weight cell uses for its dict.
keywords = get_top_keywords(1000)

creating engine:
-  postgresql://ericbarnhill:carter0109@localhost/rtr_db
-  ['rtr_abstracts', 'rtr_keywords']


In [13]:
# Dump the fetched keyword list for inspection.
print(keywords)

[('humans', 1319), ('male', 880), ('female', 874), ('middle aged', 509), ('adult', 455), ('aged', 420), ('magnetic resonance imaging', 404), ('animals', 320), ('young adult', 242), ('retrospective studies', 203), ('brain', 192), ('tomography, x-ray computed', 167), ('aged, 80 and over', 162), ('treatment outcome', 160), ('reproducibility of results', 142), ('adolescent', 141), ('image processing, computer-assisted', 139), ('brain mapping', 126), ('time factors', 124), ('follow-up studies', 118), ('mice', 110), ('child', 106), ('prospective studies', 101), ('risk factors', 100), ('microscopy, fluorescence', 100), ('microscopy, electron, scanning', 98), ('sensitivity and specificity', 94), ('imaging, three-dimensional', 91), ('algorithms', 85), ('microscopy, electron, transmission', 77), ('echocardiography', 75), ('prognosis', 73), ('microscopy, confocal', 71), ('image interpretation, computer-assisted', 69), ('positron-emission tomography', 68), ('predictive value of tests', 68), ('seve