# RTR Connectivity Graph

- Index pruned trends to pruned keywords
- Build graph

## Next steps:

- Make Network With Pruned Trends

Create a new structure with:
- keywords from given abstract
- trending words from given abstract

TODO: Include a PubMed link to the original abstract.

In [18]:
# SETUP
# Execute rtr.ipynb in this namespace (IPython %run magic) so the names it defines are available here.
# NOTE(review): `os` is used below but is not imported in this cell — presumably it comes from rtr.ipynb; confirm.
%run rtr.ipynb
import operator
import pickle
# Working directory for this notebook; unpickle_data() opens its pickle files by relative name.
PATH = "/home/ericbarnhill/Documents/code/insight/rtr/12_mo_nodedupe/"
os.chdir(PATH)

In [19]:
def unpickle_data():
    """Load the three pickled artefacts from the current working directory.

    Reads ``trends.pickle``, ``records.pickle`` and ``df.pickle`` (in that
    order) and returns their contents as a ``(trends, records, df)`` tuple.

    Note: ``pickle.load`` can execute arbitrary code, so only point this at
    pickle files you produced yourself.
    """
    loaded = []
    for filename in ("trends.pickle", "records.pickle", "df.pickle"):
        with open(filename, "rb") as handle:
            loaded.append(pickle.load(handle))
    trends, records, df = loaded
    return trends, records, df

Create a data structure to hold each trend and its associated keywords.

In [20]:
from collections import Counter
class Trend:
    """A trend string together with a tally of the keywords paired with it."""

    def __init__(self, trend):
        self.trend = trend
        # A Counter rather than a set: repeated keywords are tallied, not collapsed.
        self.keywords = Counter()

    def add_keyword(self, keyword):
        """Record one more occurrence of ``keyword`` for this trend."""
        self.keywords[keyword] += 1

    def as_dict(self):
        """Return ``{trend: keywords}``; the Counter is the live object, not a copy."""
        mapping = {}
        mapping[self.trend] = self.keywords
        return mapping

In [21]:
def pair_trends_keywords(df, window_records, n_trends, n_keywords, from_sql = False):
    """Tally, per trend, the top keywords of every abstract that mentions the trend.

    Parameters
    ----------
    df : DataFrame whose ``key`` column holds the trend terms; each entry is
        joined with spaces to form the trend string. Only the first
        ``n_trends`` rows are used.
    window_records : iterable of windows, each an iterable of abstract records.
    n_trends : number of leading rows of ``df`` to turn into trends.
    n_keywords : passed to ``get_top_keywords`` to obtain the keyword whitelist.
    from_sql : when True a record is indexed positionally (``record[0]`` is the
        abstract text, ``record[2]`` a comma-separated keyword string);
        otherwise it is indexed by ``'Abstract'`` / ``'Keywords'``.

    Returns
    -------
    list of Trend, one per selected row, with keyword counts filled in.
    """
    trends_list = [Trend(' '.join(entry)) for entry in df.iloc[:n_trends, :].key]

    # get_top_keywords is not defined in this notebook's visible cells —
    # presumably it is provided by rtr.ipynb; the first item of each returned
    # entry is used as the keyword text.
    top_keywords = get_top_keywords(n_keywords)
    # A set makes the membership test in the innermost loop O(1) instead of a
    # linear scan over up to n_keywords entries.
    # NOTE(review): assumes the keyword texts are hashable (strings) — confirm.
    top_keywords_text = {keyword[0] for keyword in top_keywords}

    # Triple loop (trend x window x record): still the dominant cost.
    for trend in trends_list:
        for window in window_records:
            for abstract_record in window:
                if from_sql:
                    abstract = abstract_record[0]
                else:
                    abstract = abstract_record['Abstract'][0]
                # Plain substring match of the trend phrase in the abstract.
                if trend.trend not in abstract:
                    continue
                if from_sql:
                    keywords = str.split(abstract_record[2], ',')
                else:
                    keywords = abstract_record['Keywords']
                for keyword in keywords:
                    if keyword in top_keywords_text:
                        trend.add_keyword(keyword)
    return trends_list

In [22]:
def print_trend_dict(trends_list, N=50):
    """Print ``as_dict()`` for at most the first ``N`` entries of ``trends_list``.

    The limit is checked before printing, so exactly ``N`` entries are shown
    (the earlier counter-based version printed ``N + 1``).
    """
    for position, trend in enumerate(trends_list):
        if position >= N:
            return
        print(trend.as_dict())

## Building the graph

In [23]:
import networkx as nx
import operator

In [24]:
def populate_graph(trends_list):
    """Build an undirected weighted graph linking each trend to its keywords.

    For every ``Trend`` in ``trends_list`` an edge is created between the trend
    string and each keyword in ``trend.keywords``, weighted by the keyword's
    count. Trend nodes carry ``is_key=False`` and keyword nodes ``is_key=True``.
    Trends with no keywords add no nodes.

    Returns the populated ``networkx.Graph``.
    """
    G = nx.Graph()
    for trend in trends_list:
        trend_node = trend.trend
        for keyword_node, count in trend.keywords.items():
            if G.has_edge(trend_node, keyword_node):
                # Edge already present: accumulate the count.
                G[trend_node][keyword_node]['weight'] += count
            else:
                # New edge: tag both endpoints, then add it weighted by the count.
                # NOTE(review): a string that occurs both as a trend and as a
                # keyword gets its is_key flag overwritten by whichever
                # add_node runs last — confirm that is acceptable.
                G.add_node(trend_node, is_key = False)
                G.add_node(keyword_node, is_key = True)
                G.add_edge(trend_node, keyword_node, weight=count)
    return G

## Find the N most heavily weighted edges for a given keyword (default: ten)

In [25]:
def top_N_trends(G, keyword, n=10):
    """Return the ``n`` heaviest neighbours of ``keyword`` as ``"<name> <weight>"`` strings.

    ``keyword`` is lower-cased before lookup. Neighbours are ordered by edge
    weight, heaviest first (ties keep the graph's edge order). ``n <= 0``
    yields an empty list.
    """
    node = keyword.lower()
    weights = {}
    for source, neighbor in G.edges(node):
        weights[neighbor] = G[source][neighbor]['weight']
    ranked = sorted(weights.items(), key=operator.itemgetter(1), reverse=True)
    # Slicing replaces the manual counter, which always emitted at least one
    # entry even when n was 0.
    return [' '.join([name, str(weight)]) for name, weight in ranked[:max(n, 0)]]

## Find which keywords made it into the graph (ranked by total edge weight)

In [26]:
def get_hot_keywords(G, N=20):
    """Return up to ``N`` ``(keyword, total_weight)`` pairs, heaviest first.

    Only nodes whose ``is_key`` attribute is truthy are considered; a node's
    total is the sum of the ``weight`` attribute over its incident edges.
    """
    totals = {}
    for name, attrs in G.nodes(data=True):
        if not attrs['is_key']:
            continue
        totals[name] = sum(
            edge_attrs['weight'] for _, _, edge_attrs in G.edges(name, data=True)
        )
    ranked = sorted(totals.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[:max(N, 0)]

In [27]:
def convert_trends(trends):
    """Flatten ``trends`` to ``{"word1 word2 ...": vals}``.

    Each key of ``trends`` is joined with single spaces, and each value is
    replaced by its ``'vals'`` entry.
    """
    return {' '.join(term): record['vals'] for term, record in trends.items()}

# pickle graph
def pickle_graph():
    """Pickle the module-level graph ``G`` to the hard-coded ``G.txt`` path.

    Best-effort: any failure is reported on stdout and the function returns
    normally. Catching ``Exception`` (not a bare ``except``) leaves
    KeyboardInterrupt/SystemExit alone, and the cause is printed.
    """
    try:
        with open("/home/ericbarnhill/Documents/code/insight/rtr/G.txt", "wb") as fp:
            pickle.dump(G, fp)
    except Exception as exc:
        print('error', exc)
        
def unpickle_graph():
    """Load and return the graph pickled at the hard-coded ``G.txt`` path.

    Returns ``None`` (after printing the cause) when the file cannot be read
    or unpickled; previously the failure path hit ``return G`` with ``G``
    unbound and raised UnboundLocalError.

    Note: ``pickle.load`` can execute arbitrary code; only load trusted files.
    """
    G = None
    try:
        with open("/home/ericbarnhill/Documents/code/insight/rtr/G.txt", "rb") as fp:
            G = pickle.load(fp)
    except Exception as exc:
        print('error', exc)
    return G
        
def unpickle_bow_trends():
    """Load and return the object pickled at the hard-coded ``bow_converted.txt`` path.

    Returns ``None`` (after printing the cause) when the file cannot be read
    or unpickled; previously the failure path hit ``return bow_trends`` with
    the name unbound and raised UnboundLocalError.

    Note: ``pickle.load`` can execute arbitrary code; only load trusted files.
    """
    bow_trends = None
    try:
        with open("/home/ericbarnhill/Documents/code/insight/rtr/bow_converted.txt", "rb") as fp:
            bow_trends = pickle.load(fp)
    except Exception as exc:
        print('error', exc)
    return bow_trends

In [28]:
def graph_figure(G):
    """Draw the nodes and edges of ``G[2]`` with a spring layout and show the figure.

    The layout is computed once and shared by both draw calls. Computing it
    separately for nodes and for edges (as before) produces two independent
    random layouts, so edges do not connect the drawn nodes.
    """
    # NOTE(review): G is indexed with [2], so it is presumably a sequence whose
    # third item is the graph to draw — confirm against the caller.
    graph = G[2]
    # NOTE(review): `plt` is not imported in the visible cells — presumably
    # provided by rtr.ipynb.
    pos = nx.spring_layout(graph)
    nx.draw_networkx_nodes(graph, pos, node_size=10)
    nx.draw_networkx_edges(graph, pos, alpha=0.4)
    # Zoom in on the centre of the layout.
    plt.xlim((-0.1, 0.1))
    plt.ylim((-0.1, 0.1))
    plt.show()

In [29]:
import operator
def centrality_measures(G):
    """Compute degree, betweenness and eigenvector centrality for ``G``.

    Returns ``(dc, bc, ec)``; each is a list of ``(node, score)`` pairs sorted
    by score, highest first.
    """
    def _ranked(scores):
        return sorted(scores.items(), key=operator.itemgetter(1), reverse=True)

    degree = nx.degree_centrality(G)
    betweenness = nx.betweenness_centrality(G)
    eigenvector = nx.eigenvector_centrality_numpy(G)
    return _ranked(degree), _ranked(betweenness), _ranked(eigenvector)
#    for key, value in sorted(bc.items(), key=operator.itemgetter(1), reverse=True):
#        print(key, value)
#        n += 1
#        if n > 10:
#            break

In [30]:
def export_to_app(G, trends_converted):
    """Pickle the graph and the converted trends into the app directory.

    Writes ``G`` to ``G.pickle`` and ``trends_converted`` to
    ``trends_converted.pickle`` (hard-coded absolute paths), in that order.
    """
    targets = (
        ("/home/ericbarnhill/Documents/code/insight_app/G.pickle", G),
        ("/home/ericbarnhill/Documents/code/insight_app/trends_converted.pickle", trends_converted),
    )
    for destination, payload in targets:
        with open(destination, "wb") as handle:
            pickle.dump(payload, handle)
    

In [31]:
def run_nb(set_up=False):
    """Run the whole pipeline: load data, pair trends with keywords, build and export the graph.

    When ``set_up`` is true, ``setup()`` is called first (it is not defined in
    the visible cells — presumably provided by rtr.ipynb). Summary information
    is printed along the way.

    Returns ``(G, trends_converted)``.
    """
    if set_up:
        setup()
    n_keywords = 10000
    top_k = 5

    trends, records, df = unpickle_data()
    # Every trend is used: n_trends is the size of the trends mapping.
    trends_list = pair_trends_keywords(
        df, records, len(trends), n_keywords, from_sql=True
    )
    print("Top 20 trends:")
    print_trend_dict(trends_list, 20)

    G = populate_graph(trends_list)
    print("MRI trends:")
    print(top_N_trends(G, 'magnetic resonance imaging'))
    print("Hottest keywords:")
    print(get_hot_keywords(G))

    dc, bc, ec = centrality_measures(G)
    for label, ranking in (
        ("Top degree centrality:", dc),
        ("Top betweenness centrality:", bc),
        ("Top eigencentrality:", ec),
    ):
        print(label, list(ranking)[:top_k])

    trends_converted = convert_trends(trends)
    export_to_app(G, trends_converted)
    return G, trends_converted

In [32]:
# Run the full pipeline without calling setup(); keep the graph and converted trends for later cells.
G, trends_converted = run_nb(False)

creating engine:
-  postgresql://ericbarnhill:[REDACTED]@localhost/rtr_db
-  ['rtr_abstracts', 'rtr_keywords']
Top 20 trends:
{'eyes normal': Counter()}
{'acute pancreatitis': Counter({'tomography': 66, 'carcinoma': 9, 'ultrasonography': 9, 'pancreatitis': 8, 'rats': 6, 'intubation': 5, 'embolization': 4, 'injections': 2, 'radiography': 1, 'anti-inflammatory agents': 1})}
{'performance proposed': Counter()}
{'chronic total': Counter({'ultrasonography': 47, 'tomography': 7, 'spectroscopy': 3, 'radiography': 3})}
{'iqr median': Counter()}
{'infarction stroke': Counter()}
{'inclusion exclusion': Counter()}
{'systemic therapy': Counter({'tomography': 9, 'anti-inflammatory agents': 6, 'microscopy': 6})}
{'mesial temporal lobe': Counter({'embolization': 12, 'image processing': 9, 'epilepsy': 9, 'dna': 6, 'ultrasonography': 4})}
{'unique identifier': Counter({'ventricular dysfunction': 8, 'tomography': 8, 'transplantation': 6, 'stroke': 6, 'ultrasonography': 3, 'spectroscopy': 3, 'muscle': 2}

In [33]:
# Recompute the three centrality rankings (each a list of (node, score), highest first).
dc, bc, ec = centrality_measures(G)
# Number of top-ranked nodes to keep per measure.
L = 10
# Literal reference ranking: term -> rank (1..10).
# NOTE(review): the name suggests a web-derived list — provenance is not shown here; confirm.
web_list = {'imaging':1, 'tomography':2, 'rats':3, 'ultrasonography':4, \
            'carcinoma':5, 'mice':6, 'echocardiography':7, 'diagnosis':8, \
           'microscopy':9, 'cartilage':10}
dc_list = {}
bc_list = {}
ec_list = {}
# Map the name of each of the top-L nodes to its 1-based rank under each measure.
for n in range(L):
    dc_list.update({list(dc)[n][0]:n+1})
    bc_list.update({list(bc)[n][0]:n+1})
    ec_list.update({list(ec)[n][0]:n+1})
    


In [34]:
def pairwise_scatter_plots():
    """Build a line chart comparing term ranks across the four ranking lists.

    Reads the module-level dicts ``web_list``, ``dc_list``, ``ec_list`` and
    ``bc_list`` (term -> rank) and stacks them into one long-form frame with
    columns ``variable`` (term), ``value`` (rank) and ``listnum`` (1 = web,
    2 = degree, 3 = eigenvector, 4 = betweenness).

    Returns the Altair chart. The earlier version referenced an undefined
    ``web_df``, charted the unrelated global ``df``, and discarded the chart.
    """
    import altair as alt
    import pandas as pd

    ranking_sources = (web_list, dc_list, ec_list, bc_list)
    frames = [
        pd.DataFrame({
            'variable': list(ranking.keys()),
            'value': list(ranking.values()),
            'listnum': listnum,
        })
        for listnum, ranking in enumerate(ranking_sources, start=1)
    ]
    ranks_df = pd.concat(frames, ignore_index=True)
    chart = alt.Chart(ranks_df, width=400).mark_line().encode(
        x = 'listnum:O',
        y = 'value:O',
        color = 'variable'
    )
    # Returned so that a notebook cell ending in this call renders the chart.
    return chart

In [35]:
# Reload the pickled trends, records and DataFrame into module-level names for the cells below.
trends, records, df = unpickle_data()

In [36]:
# Inspect the record stored under the key at position 100 of `trends`.
trends[list(trends)[100]]

{'intercept': 1.4685314685314674,
 'slope': -0.08518753973299421,
 'resid': array([1.54159289]),
 'trend_score': array([-0.05525943]),
 'vals': array([26., 39., 58., 46., 48., 17., 30., 14., 29., 23., 22., 11.]),
 'total_mentions': 363.0}

In [37]:
# Show the current working directory (relative pickle paths resolve against it).
os.getcwd()

'/home/ericbarnhill/Documents/code/insight/rtr/12_mo'

In [38]:
# Length of the value in the first row, first column of df.
len(df.iloc[0,0])

2

In [39]:
# Count how many rows of df have a 'key' equal to the first row's first-column value.
t1 = df.iloc[0,0]
sum(df['key']==t1)

1

In [43]:
def filt_df(df):
    """Return the rows of ``df`` with a positive score whose key survives pruning.

    A key is pruned when either
      * it contains a single-character element, or
      * its set of elements is a subset of the element set of a different row
        whose own key has no single-character element.
    Every row of ``df`` (whatever its score) takes part in the subset check.

    ``df`` itself is not modified. The first column is used as the term for
    each row and the result is filtered on the ``key`` column, as before.

    The keys to prune are collected first and the frame is filtered once at
    the end, instead of calling ``DataFrame.drop`` inside a nested loop.
    """
    # NOTE(review): keys are assumed hashable (e.g. tuples of words) so they
    # can be collected in a set — confirm against how df is built.
    keys = list(df.iloc[:, 0])
    key_sets = [set(key) for key in keys]
    drop_keys = set()

    for i, term in enumerate(keys):
        if i % 1000 == 0:
            print("Term ",i)
        if any(len(element) == 1 for element in term):
            print("dropping ",term," as it contains a single letter term")
            drop_keys.add(term)
            # A single-letter term is not used to prune other entries.
            continue
        term_set = key_sets[i]
        for j, entry_set in enumerate(key_sets):
            if i != j and entry_set.issubset(term_set):
                drop_keys.add(keys[j])

    df_filt = df[df.score > 0]
    keep = df_filt['key'].map(lambda key: key not in drop_keys).astype(bool)
    df_filt = df_filt[keep].copy()
    print("df length", df.shape)
    print("df filt length", df_filt.shape)
    return df_filt

In [44]:
# Same pipeline as run_nb(), but run on the filtered trends frame.
df_filt = filt_df(df)
N_KEYWORDS = 10000
L = 10
# Despite the name, this counts scores strictly above 0.1 (not 0).
num_above_zero = sum(df_filt.score > 0.1)
print("number of positive trends:", num_above_zero)
# Use the first three quarters of that count as the number of trends to pair.
trends_list = pair_trends_keywords(df_filt, records,
                                   round(num_above_zero*3/4), N_KEYWORDS, from_sql = True)
print("Top 20 trends:")
print_trend_dict(trends_list, 20)
G = populate_graph(trends_list)
print("MRI trends:")
mri_trends = top_N_trends(G, 'magnetic resonance imaging')
print(mri_trends)
print("Hottest keywords:")
hot_keywords = get_hot_keywords(G)
print(hot_keywords)
dc, bc, ec = centrality_measures(G)
print("Top degree centrality:", list(dc)[:L])
print("Top betweenness centrality:", list(bc)[:L])
print("Top eigencentrality:", list(ec)[:L])
# Note: the exported trends come from the unfiltered `trends` mapping, not df_filt.
trends_converted = convert_trends(trends)
export_to_app(G, trends_converted)

Term  0
Term  1000
Term  2000
Term  3000
Term  4000
Term  5000
Term  6000
Term  7000
Term  8000
Term  9000
df length (9023, 3)
df filt length (997, 3)
number of positive trends: 93
creating engine:
-  postgresql://ericbarnhill:[REDACTED]@localhost/rtr_db
-  ['rtr_abstracts', 'rtr_keywords']
Top 20 trends:
{'acute pancreatitis': Counter({'tomography': 66, 'carcinoma': 9, 'ultrasonography': 9, 'pancreatitis': 8, 'rats': 6, 'intubation': 5, 'embolization': 4, 'injections': 2, 'radiography': 1, 'anti-inflammatory agents': 1})}
{'performance proposed': Counter()}
{'infarction stroke': Counter()}
{'systemic therapy': Counter({'tomography': 9, 'anti-inflammatory agents': 6, 'microscopy': 6})}
{'images demonstrated': Counter({'tomography': 26, 'microscopy': 3, 'lasers': 3, 'carcinoma': 2})}
{'url http unique identifier': Counter()}
{'congenital anomalies': Counter({'chromosomes': 12, 'hernia': 9, 'tomography': 6, 'statistics': 6, 'image processing': 5, 'ultrasonography': 4, 'echocardiography':

In [None]:
# Re-run the trend/keyword pairing on the filtered frame (same call as in the pipeline cell above).
trends_list = pair_trends_keywords(df_filt, records,
                                   round(num_above_zero*3/4), N_KEYWORDS, from_sql = True)

In [None]:
# Re-convert the trends mapping and re-export it together with the current graph G.
trends_converted = convert_trends(trends)
export_to_app(G, trends_converted)

In [None]:
# Smallest score remaining in the filtered frame.
# NOTE(review): `np` is not imported in the visible cells — presumably provided by rtr.ipynb.
np.min(df_filt.score)

In [None]:
# Filtered frame ordered by score, highest first.
df_sort = df_filt.sort_values(by='score', ascending=False)

In [None]:
# Number of filtered rows whose score exceeds 0.1.
sum(df_filt.score > 0.1)