# Imaging Edge Notebook 3: Convert Trends To Graph

ImagingEdge detects trends in the radiological research literature before they become mainstream publications, patents and products.

*Part 3 (this notebook) creates a graph combining search terms and trending terms, and deploys this graph to the web app.*

Other parts:

Part 1: Scrape PubMed

Part 2: Convert PubMed abstracts to Bag of Words

Part 4: Graph "learns" from unstructured sources

### Created by Eric Barnhill for Insight Health Data Science
#### 2018 No License

Documentation follows the [Google Python Style Guide](http://google.github.io/styleguide/pyguide.html)

Create new structure with: 
- keywords from given abstract
- trending words from given abstract

TODO: include pubmed link to original abstract

In [114]:
# SETUP
import os
# Absolute project root on the author's machine; everything below is resolved from it.
ROOT_PATH = "/home/ericbarnhill/Documents/code/insight/rtr/" 
os.chdir(ROOT_PATH)
# IPython magic: runs rtr.ipynb inside this notebook's namespace.
# NOTE(review): names used later in this notebook but never defined in it
# (e.g. get_top_keywords, plt) presumably come from that notebook -- confirm.
%run rtr.ipynb
import operator
import pickle
# Year whose data directory is processed by this run.
YEAR = 2015
DATA_PATH = ROOT_PATH + str(YEAR) + "/"
# Relative file names opened later in this notebook resolve against DATA_PATH.
os.chdir(DATA_PATH)

Python kernel:
/home/ericbarnhill/anaconda3/envs/ecb/bin/python
Logfile path:  /home/ericbarnhill/Documents/code/insight/rtr/scrape.log


In [115]:
def unpickle_data():
    """Load the pickled trends, records and dataframe.

    Reads ``trends.pickle``, ``records.pickle`` and ``df.pickle`` (in that
    order) relative to the current working directory.

    Returns:
        tuple: ``(trends, records, df)`` exactly as unpickled.
    """
    loaded = []
    for filename in ("trends.pickle", "records.pickle", "df.pickle"):
        with open(filename, "rb") as handle:
            loaded.append(pickle.load(handle))
    trends, records, df = loaded
    return trends, records, df

Create data structure to hold trend and associated keywords

In [116]:
from collections import Counter
class Trend:
    """A trending term plus a tally of the keywords recorded against it.

    Attributes:
        trend: The value passed to the constructor (the trend itself).
        keywords: ``Counter`` mapping keyword -> number of times it was added.
            A Counter is used instead of a set so that repeated keywords are
            counted rather than collapsed.
    """

    def __init__(self, trend):
        self.trend = trend
        self.keywords = Counter()

    def add_keyword(self, keyword):
        """Increase the tally for ``keyword`` by one."""
        self.keywords[keyword] += 1

    def as_dict(self):
        """Return a one-entry dict mapping the trend to its keyword Counter."""
        summary = {}
        summary[self.trend] = self.keywords
        return summary

In [117]:
def _record_abstract(abstract_record, from_sql):
    """Return the abstract text of one record for the given record layout."""
    if from_sql:
        return abstract_record[0]
    return abstract_record['Abstract'][0]


def _record_keywords(abstract_record, from_sql):
    """Return the keywords of one record for the given record layout."""
    if from_sql:
        return str.split(abstract_record[2], ',')
    return abstract_record['Keywords']


def pair_trends_keywords(df, window_records, n_trends, n_keywords, from_sql = False):
    """Tally, for each leading trend, the top keywords of abstracts mentioning it.

    The first ``n_trends`` rows of ``df`` become ``Trend`` objects (each
    ``key`` entry is joined with single spaces). Every abstract whose text
    contains a trend string contributes its keywords to that trend, but only
    keywords that also appear among ``get_top_keywords(n_keywords)``.

    Args:
        df: DataFrame with a ``key`` column whose entries are iterables of
            strings. Rows are used in their existing order (presumably
            already sorted by trend score -- confirm with the caller).
        window_records: Re-iterable collection of windows, each an iterable
            of abstract records. It is traversed once per trend.
        n_trends: Number of leading rows of ``df`` to turn into trends.
        n_keywords: Forwarded to ``get_top_keywords``; the first element of
            each returned item is taken as the keyword text.
        from_sql: If True, a record is indexed positionally (abstract text at
            index 0, comma-separated keyword string at index 2). Otherwise a
            record is a mapping with ``'Abstract'`` (its first element is the
            text) and ``'Keywords'`` (an iterable of keywords).

    Returns:
        list: One ``Trend`` per selected row, in row order, with keyword
        counts filled in.
    """
    df_pruned = df.iloc[:n_trends, :]
    trends_list = [Trend(' '.join(entry)) for entry in df_pruned.key]
    # Built once as a set: the membership test below runs inside a triple
    # loop, where scanning a list of up to n_keywords entries dominated.
    # Assumes keyword texts are hashable (strings) -- TODO confirm.
    top_keywords_text = {keyword[0] for keyword in get_top_keywords(n_keywords)}
    for trend in trends_list:
        for window in window_records:
            for abstract_record in window:
                if trend.trend not in _record_abstract(abstract_record, from_sql):
                    continue
                # Keywords are only extracted for matching abstracts, as before.
                for keyword in _record_keywords(abstract_record, from_sql):
                    if keyword in top_keywords_text:
                        trend.add_keyword(keyword)
    return trends_list

In [118]:
def print_trend_dict(trends_list, N=50):
    """Print the dict form of at most the first ``N`` trends, one per line.

    Args:
        trends_list: Iterable of objects exposing ``as_dict()``.
        N: Maximum number of entries to print. Zero or a negative value
            prints nothing.
    """
    for position, trend in enumerate(trends_list):
        # Check before printing so exactly N entries are shown (the previous
        # check ran after the print and let an extra entry through).
        if position >= N:
            break
        print(trend.as_dict())

## Building the graph

In [119]:
import networkx as nx
import operator

In [120]:
def populate_graph(trends_list):
    """Build an undirected weighted graph linking each trend to its keywords.

    Every (trend, keyword) pair with a recorded count becomes an edge whose
    ``weight`` is that count. Trend nodes carry ``is_key=False`` and keyword
    nodes ``is_key=True``. Trends with no keywords add nothing to the graph.

    Args:
        trends_list: Iterable of objects with a ``trend`` attribute (node
            name) and a ``keywords`` mapping of keyword -> count.

    Returns:
        networkx.Graph: The populated graph.
    """
    G = nx.Graph()
    for trend in trends_list:
        node1 = trend.trend
        for node2, count in trend.keywords.items():
            if G.has_edge(node1, node2):
                # Pair already present (e.g. the same trend string occurs
                # twice in trends_list): accumulate the counts.
                G[node1][node2]['weight'] += count
            else:
                # New edge: its weight starts at the recorded count.
                # NOTE(review): a name used both as a trend and as a keyword
                # keeps whichever is_key flag was written last.
                G.add_node(node1, is_key = False)
                G.add_node(node2, is_key = True)
                G.add_edge(node1, node2, weight=count)
    return G

## Find the N most heavily weighted edges for a given keyword (ten by default)

In [121]:
def top_N_trends(G, keyword, n=10):
    """Return the neighbours of a keyword with the heaviest edges.

    Args:
        G: Graph whose edges carry a ``'weight'`` attribute.
        keyword: Node name to look up; it is lower-cased before the lookup.
        n: Maximum number of results. Zero or negative yields an empty list.

    Returns:
        list of str: Up to ``n`` strings of the form ``"<neighbour> <weight>"``,
        ordered by descending weight. Empty if the keyword is not a node.
    """
    node = keyword.lower()
    # networkx treats a value that is not a node as a container of nodes, so
    # a missing keyword string would be walked character by character.
    # Answer "no trends" explicitly instead.
    if node not in G:
        return []
    weights = {}
    for _, neighbour in G.edges(node):
        weights[neighbour] = G[node][neighbour]['weight']
    ranked = sorted(weights.items(), key=operator.itemgetter(1), reverse=True)
    return [' '.join([name, str(weight)]) for name, weight in ranked[:max(n, 0)]]

## Find the keywords with the greatest total edge weight in the graph

In [122]:
def get_hot_keywords(G, N=20):
    """Rank keyword nodes by the summed weight of their edges.

    Args:
        G: Graph whose nodes all carry an ``'is_key'`` attribute and whose
            edges carry a ``'weight'`` attribute.
        N: Maximum number of keywords to return.

    Returns:
        list of tuple: Up to ``N`` ``(keyword, total_weight)`` pairs, heaviest
        first.
    """
    totals = {
        name: sum(data['weight'] for _, _, data in G.edges(name, data=True))
        for name, attrs in G.nodes(data=True)
        if attrs['is_key']
    }
    ranked = sorted(totals.items(), key=operator.itemgetter(1), reverse=True)
    return [item for position, item in enumerate(ranked) if position < N]

In [123]:
def convert_trend(trend):
    """Return a trend as a single space-separated string.

    Args:
        trend: Either a string (returned unchanged) or an iterable of
            strings such as a tuple of words.

    Returns:
        str: The words joined by single spaces. A one-element sequence yields
        that element, so every converted trend is a plain string regardless
        of how many words it has.
    """
    # A string is already in its final form; joining it would insert a
    # space between every character.
    if isinstance(trend, str):
        return trend
    return ' '.join(trend)

def convert_trends(trends):
    """Re-key a trends mapping by converted trend, keeping only the values.

    Args:
        trends: Mapping of trend -> mapping that has a ``'vals'`` entry.

    Returns:
        dict: ``convert_trend(key)`` -> ``item['vals']`` for every entry, in
        the input's iteration order.
    """
    return {convert_trend(term): info['vals'] for term, info in trends.items()}

In [124]:
def graph_figure(G):
    """Draw the graph's nodes and edges with a spring layout and show it.

    Args:
        G: Either a networkx graph, or an indexable container whose element
            at index 2 is the graph (the form the original code expected).
    """
    graph = G if isinstance(G, nx.Graph) else G[2]
    # Compute the layout once: spring_layout starts from random positions,
    # so separate calls for nodes and edges would not line up.
    positions = nx.spring_layout(graph)
    nx.draw_networkx_nodes(graph, positions, node_size=10)
    nx.draw_networkx_edges(graph, positions, alpha=0.4)
    # NOTE(review): plt is not imported in this notebook; presumably it is
    # provided by the `%run rtr.ipynb` setup -- confirm.
    plt.xlim((-0.1, 0.1))
    plt.ylim((-0.1, 0.1))
    plt.show()

In [125]:
import operator
def centrality_measures(G):
    """Compute three centrality rankings for the graph.

    Args:
        G: networkx graph.

    Returns:
        tuple: ``(dc, bc, ec)`` -- degree, betweenness and eigenvector
        centrality, each a list of ``(node, score)`` pairs sorted by
        descending score.
    """
    by_score = operator.itemgetter(1)
    measures = (
        nx.degree_centrality,
        nx.betweenness_centrality,
        nx.eigenvector_centrality_numpy,
    )
    raw_scores = [measure(G) for measure in measures]
    dc, bc, ec = (
        sorted(scores.items(), key=by_score, reverse=True)
        for scores in raw_scores
    )
    return dc, bc, ec

In [126]:
def filt_df(df):
    """Return a filtered copy of the trends dataframe.

    Starting from the rows with a strictly positive ``score``, removes:
      * rows whose term contains a single-character element, and
      * rows whose set of elements is a subset of another term's set.

    Progress and the before/after shapes are printed. ``df`` itself is not
    modified; all drops are applied to the copy.

    Args:
        df: DataFrame with ``score`` and ``key`` columns. The term is read
            from column position 0 and matched against the ``key`` column, so
            this assumes ``key`` is the first column -- TODO confirm. Terms
            are iterated element by element (presumably tuples of words).

    Returns:
        The filtered DataFrame.
    """
    df_filt = df.copy(deep=True)
    df_filt = df_filt[df_filt.score > 0]
    # Both loops walk the unfiltered df, so rows already removed from df_filt
    # (non-positive score or dropped earlier) are still visited; the work is
    # quadratic in the number of rows of df.
    for i in range(df.shape[0]):
        if i % 1000 == 0:
            print("Term ",i)
        single_let = False
        term = df.iloc[i,0]
        for element in term:
            if len(element) == 1:
                print("dropping ",term," as it contains a single letter term")
                df_filt.drop(df_filt[df_filt['key'] == term].index, inplace=True)
                single_let = True
        if not single_let:
            term_set = set(term)    
            for j in range(df.shape[0]):
                entry = df.iloc[j,0]
                entry_set = set(entry)
                if i != j:
                    # NOTE(review): issubset is also true for equal sets, so
                    # two different rows with the same elements remove each
                    # other -- confirm this is intended.
                    if entry_set.issubset(term_set):
                        df_filt.drop(df_filt[df_filt['key'] == entry].index, inplace=True)            
    print("df length", df.shape)
    print("df filt length", df_filt.shape)
    return df_filt

In [127]:
def export_to_app(G, trends_converted, app_dir="/home/ericbarnhill/Documents/code/insight_app"):
    """Pickle the graph and the converted trends into the web app directory.

    Args:
        G: Graph (or any picklable object) written to ``G.pickle``.
        trends_converted: Object written to ``trends_converted.pickle``.
        app_dir: Directory that receives both files. Defaults to the location
            that was previously hard-coded, so existing calls are unchanged.
    """
    exports = (
        ("G.pickle", G),
        ("trends_converted.pickle", trends_converted),
    )
    for filename, payload in exports:
        with open(os.path.join(app_dir, filename), "wb") as handle:
            pickle.dump(payload, handle)
    

In [128]:
def make_presentation_graphic(): 
    """Build rank tables comparing a fixed keyword list with graph centrality.

    Each table maps a name to its 1-based rank. The centrality tables hold
    the top ``L`` nodes of each measure, or fewer if the graph is smaller.

    Returns:
        tuple: ``(web_list, dc_list, bc_list, ec_list)`` -- the hard-coded
        reference ranking, then the degree, betweenness and eigenvector
        centrality rankings.
    """
    # todo: clean up
    # NOTE(review): G is not a parameter; it is read from the enclosing
    # (notebook/global) scope and must exist before this is called.
    dc, bc, ec = centrality_measures(G)
    L = 10
    web_list = {'imaging':1, 'tomography':2, 'rats':3, 'ultrasonography':4, \
                'carcinoma':5, 'mice':6, 'echocardiography':7, 'diagnosis':8, \
               'microscopy':9, 'cartilage':10}

    def _top_ranks(ranking):
        # Slicing instead of indexing avoids an IndexError when the graph
        # has fewer than L nodes.
        return {entry[0]: rank
                for rank, entry in enumerate(list(ranking)[:L], start=1)}

    dc_list = _top_ranks(dc)
    bc_list = _top_ranks(bc)
    ec_list = _top_ranks(ec)
    # Previously these tables were built and then discarded.
    return web_list, dc_list, bc_list, ec_list



In [129]:
def pairwise_scatter_plots():
    """Build an Altair line chart comparing keyword ranks across lists.

    NOTE(review): this function reads ``web_df``, ``dc_list``, ``ec_list``,
    ``bc_list`` and ``df`` from the enclosing (notebook/global) scope; none
    of them is defined in this notebook's visible cells, so they must exist
    in the session before the call -- confirm where they come from.

    Returns:
        The Altair chart object, so a notebook cell can display it.
    """
    import pandas as pd
    import numpy as np
    # Tag each ranking with a list number (1-4) used as the chart's x axis.
    web_df['listnum'] = np.tile(1, (web_df.shape[0], 1))
    dc_df = pd.DataFrame(dc_list, index=[1]).melt()
    dc_df['listnum'] = np.tile(2, (web_df.shape[0], 1))
    ec_df = pd.DataFrame(ec_list, index=[2]).melt()
    ec_df['listnum'] = np.tile(3, (web_df.shape[0], 1))
    bc_df = pd.DataFrame(bc_list, index=[3]).melt()
    bc_df['listnum'] = np.tile(4, (web_df.shape[0], 1))
    import altair as alt
    # NOTE(review): the chart is built from `df`, not from the frames
    # prepared above; presumably they were meant to be concatenated into
    # it -- confirm.
    chart = alt.Chart(df, width=400).mark_line().encode(
        x = 'listnum:O',
        y = 'value:O', 
        color = 'variable'
    )
    # A bare `chart` expression only renders as the last line of a notebook
    # cell; inside a function the chart has to be returned.
    return chart

In [140]:
def build_graph(): 
    """Run the pipeline: load, filter, pair, build the graph, report, export.

    Steps, in order:
      1. Unpickle trends, records and the dataframe, then filter the
         dataframe with ``filt_df``.
      2. Count rows scoring above 0.01 and pair three quarters of that many
         trends with keywords (records are read in the SQL layout).
      3. Build the graph and print the leading trends, the MRI trends, the
         hottest keywords and the top entries of each centrality ranking.
      4. Pickle the graph and converted trends into the working directory
         and hand them to ``export_to_app``.

    Returns:
        The filtered dataframe produced by ``filt_df``.
    """
    N_KEYWORDS = 10000
    L = 10

    trends, records, df = unpickle_data()
    df_filt = filt_df(df)

    num_above_zero = sum(df_filt.score > 0.01)
    print("number of positive trends:", num_above_zero)
    n_trends = round(num_above_zero*3/4)
    trends_list = pair_trends_keywords(df_filt, records, n_trends,
                                       N_KEYWORDS, from_sql = True)
    print("Top 20 trends:")
    print_trend_dict(trends_list, 20)

    G = populate_graph(trends_list)
    print("MRI trends:")
    print(top_N_trends(G, 'magnetic resonance imaging'))
    print("Hottest keywords:")
    print(get_hot_keywords(G))

    dc, bc, ec = centrality_measures(G)
    reports = (
        ("Top degree centrality:", dc),
        ("Top betweenness centrality:", bc),
        ("Top eigencentrality:", ec),
    )
    for label, ranking in reports:
        print(label, list(ranking)[:L])

    trends_converted = convert_trends(trends)
    # Local copies first, then the copies for the web app.
    outputs = (('G.pickle', G), ('trends_converted.pickle', trends_converted))
    for filename, payload in outputs:
        with open(filename, "wb") as handle:
            pickle.dump(payload, handle)
    export_to_app(G, trends_converted)
    return df_filt

In [141]:
# Run the whole pipeline and keep the filtered dataframe it returns.
df_filt = build_graph()

Term  0
Term  1000
Term  2000
df length (2180, 3)
df filt length (805, 3)
number of positive trends: 663
Top 20 trends:
{'carotid artery ica': Counter()}
{'median interquartile iqr': Counter()}
{'chronic obstructive pulmonary': Counter({'tomography': 123, 'aorta': 15, 'image processing': 11, 'ventricular dysfunction': 9, 'endothelium': 6, 'receptor': 6, 'statistics': 3, 'echocardiography': 3, 'fractures': 1})}
{'prospective randomized controlled': Counter({'range of motion': 12, 'ultrasonography': 9, 'tomography': 3})}
{'radiofrequency ablation rfa': Counter()}
{'chronic total occlusion': Counter({'ultrasonography': 47, 'tomography': 7, 'spectroscopy': 3, 'radiography': 3})}
{'squamous cell carcinoma': Counter({'tomography': 93, 'carcinoma': 31, 'radiotherapy': 12, 'muscle': 9, 'statistics': 6, 'biopsy': 6, 'remission': 6, 'spectrum analysis': 6, 'printing': 6, 'injections': 6, 'radiography': 6, 'image processing': 5, 'neoplasms': 3, 'colon': 3})}
{'obstructive pulmonary disease': Coun

In [134]:
# Preview the first rows of the filtered dataframe.
# NOTE(review): the AttributeError recorded under this cell looks like stale
# output -- this cell's execution count (134) is lower than that of the
# build_graph definition cell (140), which does return df_filt.
df_filt.head()

AttributeError: 'NoneType' object has no attribute 'head'

In [139]:
# Show the current working directory (where relative pickle paths resolve).
print(os.getcwd())

/home/ericbarnhill/Documents/code/insight/rtr/2015
