# RTR Connectivity Graph

- Index pruned trends to pruned keywords
- Build graph

## Next steps:

- Make Network With Pruned Trends

Create new structure with: 
- keywords from given abstract
- trending words from given abstract

TODO: include the PubMed link to the original abstract

In [76]:
# SETUP
# `os` is used below but was not imported anywhere in this notebook; import it
# explicitly so the cell does not rely on names left over from an earlier run.
import os
import operator
import pickle

os.chdir("/home/ericbarnhill/Documents/code/insight/rtr/")
#%run rtr.ipynb
PATH = "/home/ericbarnhill/Documents/code/insight/rtr/12_mo_nodedupe/"
# Relative file names opened later in the notebook resolve against this directory.
os.chdir(PATH)

In [50]:
def unpickle_data():
    """Load the pickled trends, records and dataframe.

    Reads ``trends.pickle``, ``records.pickle`` and ``df.pickle`` (in that
    order) relative to the current working directory.

    Returns:
        tuple: ``(trends, records, df)`` exactly as stored in the three files.
    """
    loaded = []
    for filename in ("trends.pickle", "records.pickle", "df.pickle"):
        with open(filename, "rb") as handle:
            loaded.append(pickle.load(handle))
    trends, records, df = loaded
    return trends, records, df

Create data structure to hold trend and associated keywords

In [51]:
from collections import Counter
class Trend:
    """A trend phrase together with a tally of the keywords paired with it."""

    def __init__(self, trend):
        self.trend = trend
        # A Counter rather than a set: a set would collapse repeated keywords,
        # whereas the tally of repeats is what is wanted here.
        self.keywords = Counter()

    def add_keyword(self, keyword):
        """Increase the tally for ``keyword`` by one."""
        self.keywords[keyword] += 1

    def as_dict(self):
        """Return ``{trend: keywords}``; the value is the live Counter, not a copy."""
        return {self.trend: self.keywords}

In [52]:
def pair_trends_keywords(df, window_records, n_trends, n_keywords, from_sql=False):
    """Tally, for each leading trend, the top keywords of abstracts that mention it.

    Args:
        df: DataFrame with a ``key`` column whose entries are sequences of
            words; the first ``n_trends`` rows are joined with spaces into
            trend phrases.
        window_records: sequence of windows, each a sequence of abstract
            records. It is iterated once per trend.
        n_trends: number of leading rows of ``df`` to use.
        n_keywords: forwarded to ``get_top_keywords``.
        from_sql: when true, each record is read positionally (abstract text
            at index 0, comma-separated keyword string at index 2); otherwise
            it is read as ``record['Abstract'][0]`` and ``record['Keywords']``.

    Returns:
        list: one ``Trend`` per selected row, in row order, whose counters
        hold how often each top keyword was attached to an abstract
        containing the trend phrase.
    """
    trends_list = [Trend(' '.join(entry)) for entry in df.iloc[:n_trends, :].key]

    # Only the first element of each top-keyword entry is used. Membership is
    # tested in the innermost loop, so keep the texts in a set rather than
    # scanning a list on every test.
    top_keywords = get_top_keywords(n_keywords)
    top_keywords_text = frozenset(keyword[0] for keyword in top_keywords)

    # Trend x window x record: still a triple loop, and the likely bottleneck.
    for trend in trends_list:
        for window in window_records:
            for abstract_record in window:
                if from_sql:
                    abstract = abstract_record[0]
                else:
                    abstract = abstract_record['Abstract'][0]
                # Plain substring match of the trend phrase in the abstract.
                if trend.trend not in abstract:
                    continue
                if from_sql:
                    keywords = str.split(abstract_record[2], ',')
                else:
                    keywords = abstract_record['Keywords']
                for keyword in keywords:
                    if keyword in top_keywords_text:
                        trend.add_keyword(keyword)
    return trends_list

In [53]:
def print_trend_dict(trends_list, N=50):
    """Print the ``as_dict()`` form of at most the first ``N`` trends.

    Args:
        trends_list: iterable of objects exposing ``as_dict()``.
        N: maximum number of entries to print. Earlier versions printed
            ``N + 1`` entries because the counter was checked only after
            printing; the limit is now exact, and ``N <= 0`` prints nothing.
    """
    for count, trend in enumerate(trends_list):
        if count >= N:
            return
        print(trend.as_dict())

## Building the graph

In [54]:
import networkx as nx
import operator

In [55]:
def populate_graph(trends_list):
    """Build an undirected, weighted trend-keyword graph.

    Each trend phrase is joined to each keyword in its counter by an edge
    whose ``weight`` is the keyword's count. Trend nodes are tagged
    ``is_key=False`` and keyword nodes ``is_key=True``. A trend whose counter
    is empty contributes no node.

    Args:
        trends_list: iterable of objects with a ``trend`` string and a
            ``keywords`` mapping of keyword -> count.

    Returns:
        networkx.Graph: the populated graph.
    """
    G = nx.Graph()
    for trend in trends_list:
        node1 = trend.trend
        for node2, count in trend.keywords.items():
            if G.has_edge(node1, node2):
                # Edge already present: accumulate onto its weight.
                G[node1][node2]['weight'] += count
            else:
                # New edge: tag both endpoints, then start the weight at this count.
                G.add_node(node1, is_key=False)
                G.add_node(node2, is_key=True)
                G.add_edge(node1, node2, weight=count)
    return G

## Find the n (default ten) most heavily weighted edges for a given keyword

In [56]:
def top_N_trends(G, keyword, n=10):
    """Return the ``n`` heaviest neighbours of a keyword node as strings.

    Args:
        G: graph exposing ``G.edges(node)`` and ``G[u][v]['weight']``.
        keyword: node name; it is lower-cased before the lookup.
        n: maximum number of results. Values ``<= 0`` give an empty list
            (the earlier hand-rolled counter still returned one entry).

    Returns:
        list[str]: ``"<neighbour> <weight>"`` strings, heaviest first. Ties
        keep the order in which the edges were reported.
    """
    edges_dict = {}
    for edge in G.edges(keyword.lower()):
        edges_dict[edge[1]] = G[edge[0]][edge[1]]['weight']
    ranked = sorted(edges_dict.items(), key=operator.itemgetter(1), reverse=True)
    return [' '.join([name, str(weight)]) for name, weight in ranked[:max(n, 0)]]

## Find which keywords carry the most total edge weight in the graph

In [57]:
def get_hot_keywords(G, N=20):
    """Rank keyword nodes by the summed weight of their edges.

    Args:
        G: graph whose nodes carry an ``is_key`` attribute and whose edges
            carry a ``weight`` attribute.
        N: maximum number of keywords to return.

    Returns:
        list[tuple]: ``(keyword, total_weight)`` pairs, heaviest first, at
        most ``N`` of them.
    """
    totals = {}
    for name, attrs in G.nodes(data=True):
        if not attrs['is_key']:
            continue
        totals[name] = sum(edge[2]['weight'] for edge in G.edges(name, data=True))
    ranked = sorted(totals.items(), key=operator.itemgetter(1), reverse=True)
    # Keep the items whose 1-based position does not exceed N.
    return [item for position, item in enumerate(ranked, start=1) if position <= N]

In [58]:
def convert_trends(trends):
    """Flatten a trends mapping to ``{"word1 word2 ...": vals}``.

    Args:
        trends: mapping whose keys are sequences of words and whose values
            are mappings containing a ``'vals'`` entry.

    Returns:
        dict: space-joined key -> the corresponding ``'vals'`` entry.
    """
    return {' '.join(words): payload['vals'] for words, payload in trends.items()}

# pickle graph
def pickle_graph(graph=None, path="/home/ericbarnhill/Documents/code/insight/rtr/G.txt"):
    """Pickle a graph to disk, reporting (not raising) any failure.

    Args:
        graph: object to pickle. When omitted, the module-level ``G`` is
            used, which keeps the earlier zero-argument call working.
        path: destination file; defaults to the path this notebook has
            always written to.

    Failures are printed together with their cause rather than swallowed
    behind a bare ``except``. KeyboardInterrupt and SystemExit are no longer
    intercepted.
    """
    try:
        if graph is None:
            # Resolved before the file is opened, so a missing global does
            # not leave an empty file behind.
            graph = G
        with open(path, "wb") as fp:
            pickle.dump(graph, fp)
    except Exception as err:
        print('error', err)
        
def unpickle_graph(path="/home/ericbarnhill/Documents/code/insight/rtr/G.txt"):
    """Load the pickled graph, or return ``None`` if it cannot be read.

    Args:
        path: file to read; defaults to the path this notebook has always used.

    Returns:
        The unpickled object, or ``None`` after printing the failure.
        Previously a failed load reached ``return G`` with ``G`` unassigned
        and raised UnboundLocalError.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only point this at pickles produced by a trusted source.
    try:
        with open(path, "rb") as fp:
            return pickle.load(fp)
    except Exception as err:
        print('error', err)
        return None
        
def unpickle_bow_trends(path="/home/ericbarnhill/Documents/code/insight/rtr/bow_converted.txt"):
    """Load the pickled bag-of-words trends, or return ``None`` on failure.

    Args:
        path: file to read; defaults to the path this notebook has always used.

    Returns:
        The unpickled object, or ``None`` after printing the failure.
        Previously a failed load reached ``return bow_trends`` with the name
        unassigned and raised UnboundLocalError.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only point this at pickles produced by a trusted source.
    try:
        with open(path, "rb") as fp:
            return pickle.load(fp)
    except Exception as err:
        print('error', err)
        return None

In [59]:
def graph_figure(G):
    """Draw the nodes and edges of ``G[2]``, zoomed in on the centre of the layout.

    Args:
        G: container indexed with ``2`` to obtain the graph that is drawn.
            NOTE(review): presumably a sequence whose third item is a
            networkx graph -- confirm against the caller.
    """
    graph = G[2]
    # Compute the layout once and share it. The earlier code called
    # spring_layout separately for nodes and for edges; each call produces
    # its own positions, so the edges were not drawn between the nodes.
    pos = nx.spring_layout(graph)
    nx.draw_networkx_nodes(graph, pos, node_size=10)
    nx.draw_networkx_edges(graph, pos, alpha=0.4)
    plt.xlim((-0.1, 0.1))
    plt.ylim((-0.1, 0.1))
    plt.show()

In [60]:
import operator
def centrality_measures(G):
    """Compute degree, betweenness and eigenvector centrality for ``G``.

    Returns:
        tuple: ``(dc, bc, ec)``; each is a list of ``(node, score)`` pairs
        sorted by score, highest first.
    """
    def _ranked(scores):
        # Highest score first.
        return sorted(scores.items(), key=operator.itemgetter(1), reverse=True)

    degree = nx.degree_centrality(G)
    betweenness = nx.betweenness_centrality(G)
    eigenvector = nx.eigenvector_centrality_numpy(G)
    return _ranked(degree), _ranked(betweenness), _ranked(eigenvector)
#    for key, value in sorted(bc.items(), key=operator.itemgetter(1), reverse=True):
#        print(key, value)
#        n += 1
#        if n > 10:
#            break

In [61]:
def export_to_app(G, trends_converted):
    """Pickle the graph and the converted trends into the app directory.

    Writes ``G`` to ``G.pickle`` first, then ``trends_converted`` to
    ``trends_converted.pickle``, both under the hard-coded ``insight_app``
    directory.
    """
    GRAPH_PATH = "/home/ericbarnhill/Documents/code/insight_app/G.pickle"
    TRENDS_PATH = "/home/ericbarnhill/Documents/code/insight_app/trends_converted.pickle"
    for destination, payload in ((GRAPH_PATH, G), (TRENDS_PATH, trends_converted)):
        with open(destination, "wb") as handle:
            pickle.dump(payload, handle)
    

In [66]:
def run_nb(set_up=False):
    """Run the notebook end to end: load, pair, build the graph, report, export.

    Args:
        set_up: when true, ``setup()`` is called first. That function is not
            defined in this notebook, so it must come from elsewhere.

    Returns:
        tuple: ``(G, trends_converted)`` -- the populated graph and the
        flattened trends mapping that were also exported to the app.
    """
    if set_up:
        setup()
    keyword_limit = 50000
    top_k = 5
    trends, records, df = unpickle_data()
    trends_list = pair_trends_keywords(
        df, records, len(trends), keyword_limit, from_sql=True
    )

    print("Top 20 trends:")
    print_trend_dict(trends_list, 20)

    G = populate_graph(trends_list)
    print("MRI trends:")
    print(top_N_trends(G, 'magnetic resonance imaging'))
    print("Hottest keywords:")
    print(get_hot_keywords(G))

    dc, bc, ec = centrality_measures(G)
    reports = (
        ("Top degree centrality:", dc),
        ("Top betweenness centrality:", bc),
        ("Top eigencentrality:", ec),
    )
    for label, ranking in reports:
        print(label, list(ranking)[:top_k])

    trends_converted = convert_trends(trends)
    export_to_app(G, trends_converted)
    return G, trends_converted

In [77]:
# Run the pipeline without the optional setup step and keep its two results
# as notebook globals for the cells below.
G, trends_converted = run_nb(False)

creating engine:
-  postgresql://ericbarnhill:carter0109@localhost/rtr_db
-  ['rtr_abstracts', 'rtr_keywords']
Top 20 trends:
{'cardiac computed tomography': Counter({'tomography': 36, 'image interpretation': 6, 'phantoms': 6})}
{'regression analysis': Counter({'tomography': 56, 'image interpretation': 28, 'imaging': 27, 'phantoms': 21, 'ventricular function': 18, 'models': 12, 'insemination': 9, 'heart septal defects': 9, 'chorionic gonadotropin': 9, 'infertility': 9, 'stress': 6, 'heart failure': 6, 'phobia': 6, 'ultrasonography': 5, 'endothelium': 4, 'depressive disorder': 3, 'antineoplastic agents': 3, 'spectroscopy': 1})}
{'medline embase': Counter()}
{'postoperative day': Counter({'tomography': 32, 'imaging': 12, 'anti-infective agents': 9, 'lens': 9, 'anesthesia': 6, 'fractures': 6, 'anastomosis': 6, 'thoracic surgery': 3, 'liver failure': 3, 'ultrasonography': 2})}
{'medical records': Counter({'tomography': 32, 'image interpretation': 9, 'chemotherapy': 9, 'ultrasonography': 6,

In [None]:
# Build {name: 1-based rank} lookup tables for the top L entries of each
# centrality ranking, alongside a hand-entered reference ranking (web_list).
dc, bc, ec = centrality_measures(G)
L = 10
web_list = {
    'imaging': 1,
    'tomography': 2,
    'rats': 3,
    'ultrasonography': 4,
    'carcinoma': 5,
    'mice': 6,
    'echocardiography': 7,
    'diagnosis': 8,
    'microscopy': 9,
    'cartilage': 10,
}
dc_list, bc_list, ec_list = {}, {}, {}
for n in range(L):
    rank = n + 1
    dc_list[list(dc)[n][0]] = rank
    bc_list[list(bc)[n][0]] = rank
    ec_list[list(ec)[n][0]] = rank
    


In [None]:
def pairwise_scatter_plots():
    """Chart how each name's rank differs across the four ranking tables.

    Reads the module-level ``web_list``, ``dc_list``, ``ec_list`` and
    ``bc_list`` (each a ``{name: rank}`` dict), melts each into long form
    with columns ``variable`` (name) and ``value`` (rank), tags the rows with
    ``listnum`` 1-4 in that order, and draws one line per name.

    Fixes over the earlier draft: it referenced an undefined ``web_df``,
    charted the unrelated global ``df``, and discarded the chart.

    Returns:
        altair.Chart: the line chart, so a notebook cell that ends with the
        call displays it.
    """
    import pandas as pd
    import altair as alt

    frames = []
    for listnum, ranking in enumerate((web_list, dc_list, ec_list, bc_list), start=1):
        # A dict of scalars needs an explicit one-row index before melting.
        frame = pd.DataFrame(ranking, index=[0]).melt()
        frame['listnum'] = listnum
        frames.append(frame)
    combined = pd.concat(frames, ignore_index=True)

    chart = alt.Chart(combined, width=400).mark_line().encode(
        x='listnum:O',
        y='value:O',
        color='variable',
    )
    return chart

In [79]:
# Reload the three pickled inputs as notebook globals for the ad-hoc
# inspection cells below.
trends, records, df = unpickle_data()

In [None]:
# Show the value stored under the key at position 100 (iteration order) of trends.
trends[list(trends)[100]]

In [74]:
# Show the current working directory.
os.getcwd()

'/home/ericbarnhill/Documents/code/insight/rtr'

In [86]:
# Length of the value in the first row, first column of df.
len(df.iloc[0,0])

3

In [92]:
# Count how many rows have a 'key' equal to the first row's first-column value.
t1 = df.iloc[0,0]
sum(df['key']==t1)

1

In [None]:
# Build df_filt from a deep copy of df by dropping
#   (a) every term that contains a one-character element, and
#   (b) every entry whose set of elements is a subset of another term's set.
# NOTE(review): `np` is used here but no module-level numpy import is visible
# in this notebook.
df_filt = df.copy(deep=True)
# NOTE(review): is_subset is allocated but never written or read in this cell.
is_subset = np.zeros(df.shape[0])
# Both loops walk the original df, not df_filt, so rows that have already
# been dropped from df_filt still take part as `term` and as `entry`.
for i in range(df.shape[0]):
    single_let = False
    term = df.iloc[i,0]
    for element in term:
        if len(element) == 1:
            # No break: the message and the drop repeat for each
            # one-character element of the same term.
            print("dropping ",term," as it contains a single letter term")
            df_filt.drop(df_filt[df_filt['key'] == term].index, inplace=True)
            single_let = True
    if not single_let:
        term_set = set(term)    
        # Pairwise comparison against every other row of df.
        for j in range(df.shape[0]):
            entry = df.iloc[j,0]
            entry_set = set(entry)
            if i != j:
                # NOTE(review): entries with equal element sets are subsets
                # of each other, so each one drops the other -- confirm
                # that this is intended.
                if entry_set.issubset(term_set):
                    df_filt.drop(df_filt[df_filt['key'] == entry].index, inplace=True)
            
print("df length", df.shape)
print("df filt length", df_filt.shape)

In [169]:
# NOTE(review): unfinished scratch cell. The subscript inside the loop
# (`[t for t in ]`) has no iterable, so the cell is not valid Python as
# written; t1, t2 and is_subset are set up but not used by the loop.
df_filt = df.copy(deep=True)
is_subset = np.zeros(df.shape[0])
t1 = 'cardiac'
t2 = {t1}
for n in range(10):
    df_filt = df_filt[t for t in ]
print("Subset sum: ", np.sum(is_subset))
print("df length", df.shape)
print("df filt length", df_filt.shape)

dropping
Subset sum:  1.0
df length (11727, 3)
df filt length (11726, 3)


In [118]:
# Two small sets for trying out subset checks: b is contained in a.
a = {'cardiac', 'troponin'}
b = {'cardiac'}

True

In [158]:
# Preview the first ten rows of df.
df.head(10)

Unnamed: 0,key,score,total_mentions
8805,"(cardiac, computed, tomography)",[1.933281138305476],584.0
6379,"(regression, analysis)",[1.7439089503648177],10470.0
3689,"(medline, embase)",[1.6294859359844853],720.0
3274,"(postoperative, day)",[1.3618046482029307],785.0
3128,"(medical, records)",[1.3263085272642037],3728.0
2733,"(tomography, scans)",[1.3050110594157611],3480.0
11533,"(pathological, findings)",[1.264084417524167],821.0
8683,"(literature, search)",[1.2418180923726951],1037.0
5008,"(receiver, operating)",[1.229333882901239],8134.0
11256,"(imaging, method)",[1.184765666440886],1360.0


In [146]:
# Indexing df with 6 raised KeyError: 6 when this cell was run (see the
# output below).
# NOTE(review): if row 6 was intended, positional access (df.iloc[6]) is
# presumably what is wanted -- confirm.
df[6]

KeyError: 6