# ImagingEdge Notebook 3: Convert Trends To Graph

ImagingEdge detects trends in the radiological research literature before they become mainstream publications, patents and products.

*Part 3 (this notebook) creates a graph combining search terms and trending terms, and deploys this graph to the web app.*

Other parts:

Part 1: Scrape PubMed

Part 2: Convert PubMed abstracts to Bag of Words

Part 4: Graph "learns" from unstructured sources

Part 5: Validation test suite

### Created by Eric Barnhill for Insight Health Data Science
#### 2018 No License

Documentation follows the [Google Python Style Guide](http://google.github.io/styleguide/pyguide.html)

Create new structure with: 
- keywords from given abstract
- trending words from given abstract

TODO: include pubmed link to original abstract

In [180]:
# SETUP
%run imedge_2_trends.ipynb
import networkx as nx
from collections import Counter

Setting Up...
Python kernel:
/home/ericbarnhill/anaconda3/envs/ecb/bin/python


In [181]:
def unpickle_part_2_data(year):
    """Load the three artifacts that Part 2 pickled for a given year.

    Args:
        year: Year whose sub-directory of IMEDGE_PATH holds the pickles.

    Returns:
        Tuple (trends, records, df) read from 'trends.pickle',
        'records.pickle' and 'df.pickle', in that order.
    """
    year_dir = os.path.join(IMEDGE_PATH, str(year))

    def _load(filename):
        # All three pickles live side by side in the year directory.
        with open(os.path.join(year_dir, filename), 'rb') as handle:
            return pickle.load(handle)

    trends, records, df = (
        _load(filename)
        for filename in ('trends.pickle', 'records.pickle', 'df.pickle')
    )
    return trends, records, df

Create data structure to hold trend and associated keywords

In [182]:
def get_hot_search_terms(G, cent_meas = 'eigenvector_centrality', N=20):
    """Pick the most central keyword nodes of the graph.

    Nodes are visited from most to least central. Nodes whose 'is_key'
    attribute is set are collected; any other node met on the way is only
    logged, since a trending term ranking this high is unexpected.

    Args:
        G: networkx graph whose nodes carry an 'is_key' attribute.
        cent_meas: Name of the centrality measure, passed through to
            centrality_measures. An unrecognized name makes that helper
            return None, which fails here with a TypeError.
        N: Maximum number of search terms to return.

    Returns:
        List of at most N node names, most central first. Empty if N <= 0.
    """
    hot_search_terms = []
    if N <= 0:
        # Nothing requested; the old loop still returned one term here.
        return hot_search_terms
    central_nodes = centrality_measures(G, cent_meas)
    for item in central_nodes:
        # G.nodes[...] is the supported accessor; the G.node alias was
        # removed in NetworkX 2.4.
        if G.nodes[item]['is_key']:
            # expected outcome
            hot_search_terms.append(item)
            if len(hot_search_terms) >= N:
                break
        else:
            # unexpected outcome - log
            logging.info("Top node is trending term: %s", item)
    return hot_search_terms

In [183]:
class Trend:
    """A trending term together with a tally of the keywords paired with it."""

    def __init__(self, trend):
        # A Counter rather than a set, so repeated keywords are tallied
        # instead of being collapsed into one entry.
        self.keywords = Counter()
        self.trend = trend

    def add_keyword(self, keyword):
        """Record one more occurrence of keyword for this trend."""
        self.keywords[keyword] += 1

    def as_dict(self):
        """Return a one-entry dict mapping the trend to its keyword Counter."""
        as_mapping = {}
        as_mapping[self.trend] = self.keywords
        return as_mapping

In [184]:
def pair_trends_keywords(df, window_records, n_trends, n_keywords, from_sql = False):
    """Tally, for each leading trend, the top keywords of abstracts mentioning it.

    A Trend is created for each of the first n_trends rows of df. Every
    abstract whose text contains the trend string contributes its keywords
    to that Trend, restricted to the keywords returned by get_top_keywords.

    Args:
        df: Data frame with a 'key' column of word sequences; each entry is
            joined with spaces to form the trend string. Only the first
            n_trends rows are used (presumably df is ordered by score --
            verify against the caller).
        window_records: Iterable of windows, each an iterable of abstract
            records.
        n_trends: Number of leading rows of df to turn into trends.
        n_keywords: Passed to get_top_keywords to cap the keywords considered.
        from_sql: If True, a record is indexed by position (abstract text at
            index 0, comma-separated keyword string at index 2). Otherwise it
            is indexed by name ('Abstract', whose first element is the text,
            and 'Keywords').

    Returns:
        List of Trend objects, one per selected row, in row order.
    """
    df_pruned = df.iloc[:n_trends,:]
    trends_list = [Trend(' '.join(entry)) for entry in df_pruned.key]
    # A set makes the membership test inside the triple loop constant-time
    # instead of a scan over up to n_keywords entries (assumes keywords are
    # hashable, which add_keyword already requires for matched keywords).
    top_keywords_text = {keyword[0] for keyword in get_top_keywords(n_keywords)}
    for trend in trends_list:
        for window in window_records:
            for abstract_record in window:
                if from_sql:
                    abstract = abstract_record[0]
                else:
                    abstract = abstract_record['Abstract'][0]
                if trend.trend not in abstract:
                    continue
                if from_sql:
                    keywords = str.split(abstract_record[2], ',')
                else:
                    keywords = abstract_record['Keywords']
                for keyword in keywords:
                    if keyword in top_keywords_text:
                        # Spaces added: the old message ran the words together.
                        logging.debug("matching trend %s and keyword %s",
                                      trend.trend, keyword)
                        trend.add_keyword(keyword)
    return trends_list

In [185]:
def print_trend_dict(trends_list, N=50):
    """Print the dict form of at most the first N trends (debugging aid).

    Args:
        trends_list: Iterable of objects exposing as_dict().
        N: Maximum number of entries to print.
    """
    for count, trend in enumerate(trends_list):
        # Stop before printing entry N+1; the previous counter check ran
        # after printing and so let one extra entry through.
        if count >= N:
            return
        print(trend.as_dict())

## Building the graph

In [186]:
def populate_graph(trends_list):
    """Build an undirected weighted graph linking trends to their keywords.

    Each (trend, keyword) pair becomes an edge whose 'weight' is the
    keyword's count for that trend. Trend nodes are tagged is_key=False and
    keyword nodes is_key=True. A trend with no keywords adds nothing.

    Args:
        trends_list: Iterable of objects with a 'trend' name and a 'keywords'
            mapping of keyword -> count.

    Returns:
        networkx.Graph holding the nodes and weighted edges.
    """
    G = nx.Graph()
    for trend in trends_list:
        node1 = trend.trend
        for node2, count in trend.keywords.items():
            if G.has_edge(node1, node2):
                # Edge already present (e.g. the same trend string appeared
                # twice): accumulate instead of overwriting.
                G[node1][node2]['weight'] += count
            else:
                # New edge, weighted by the keyword's count for this trend.
                # NOTE(review): a name used both as a trend and as a keyword
                # ends up with whichever is_key value was written last.
                G.add_node(node1, is_key = False)
                G.add_node(node2, is_key = True)
                G.add_edge(node1, node2, weight=count)
    return G

## Find top trending terms for a search term

In [187]:
def top_N_trends(G, search_term, n=10):
    """List the neighbours of a search term, heaviest edge first.

    Args:
        G: Graph (or any mapping of node -> {neighbour: {'weight': w}}).
        search_term: Term to look up; it is lower-cased before the lookup.
        n: Maximum number of neighbours to report.

    Returns:
        List of at most n strings of the form '<neighbour> <weight>', ordered
        by descending weight. Empty when the term is not in the graph or
        when n <= 0.
    """
    term = search_term.lower()
    # Guard the lookup: handing G.edges() a string that is not a node makes
    # networkx treat it as a container of nodes, i.e. iterate its characters.
    if term not in G:
        return []
    weighted = [(nbr, attrs['weight']) for nbr, attrs in G[term].items()]
    # sorted() is stable, so equal weights keep adjacency order.
    weighted.sort(key=lambda pair: pair[1], reverse=True)
    return [' '.join([nbr, str(weight)]) for nbr, weight in weighted[:max(n, 0)]]

In [188]:
def convert_trend(trend):
    """Turn a trend key into its space-separated string form.

    Args:
        trend: Either a string, or a sequence of word strings.

    Returns:
        The string itself when given a string, otherwise the words joined by
        single spaces. A one-word sequence yields that word, matching the
        unconditional ' '.join used when naming graph nodes.
    """
    if isinstance(trend, str):
        # Joining a string would insert spaces between its characters.
        return trend
    return ' '.join(trend)

def convert_trends(trends):
    """Re-key the trends mapping by converted trend, keeping each entry's 'vals'.

    Args:
        trends: Mapping of trend key -> mapping that has a 'vals' entry.

    Returns:
        Dict of convert_trend(key) -> the corresponding 'vals' value.
    """
    return {
        convert_trend(ngram): info['vals']
        for ngram, info in trends.items()
    }

In [189]:
def graph_figure(G):
    """Draw the nodes and edges of G[2] with a spring layout and show the plot.

    Args:
        G: Object whose element at index 2 is the graph to draw.
            NOTE(review): presumably a sequence holding the graph in third
            position -- confirm against the caller.
    """
    graph = G[2]
    # Compute the layout once and share it. spring_layout starts from random
    # positions, so two separate calls place nodes and edges inconsistently.
    layout = nx.spring_layout(graph)
    nx.draw_networkx_nodes(graph, layout, node_size=10)
    nx.draw_networkx_edges(graph, layout, alpha=0.4)
    # Zoom in on the region around the origin.
    plt.xlim((-0.1, 0.1))
    plt.ylim((-0.1, 0.1))
    plt.show()

In [190]:
def centrality_measures(G, cent_meas):
    """Rank the nodes of G by the requested centrality measure.

    Args:
        G: networkx graph.
        cent_meas: One of 'degree_centrality', 'betweenness_centrality' or
            'eigenvector_centrality'.

    Returns:
        List of nodes ordered from highest to lowest centrality, or None
        (after printing an error) when cent_meas is not recognized.
    """
    if cent_meas == 'degree_centrality':
        compute = nx.degree_centrality
    elif cent_meas == 'betweenness_centrality':
        compute = nx.betweenness_centrality
    elif cent_meas == 'eigenvector_centrality':
        compute = nx.eigenvector_centrality_numpy
    else:
        print("ImagingEdge ERROR: centrality measure not recognized.")
        return
    scores = compute(G)
    # Sorting the mapping yields its keys (nodes), ordered by score.
    return sorted(scores, key=scores.get, reverse=True)

In [191]:
def filt_df(df):
    """Filters and de-dupes trending terms.

    Works on a deep copy of df restricted to rows with score > 0, then walks
    every row of the original df and removes from the copy:
      * terms containing a single-character element, and
      * for terms without one, every other row whose elements form a subset
        of the term's elements.

    Args:
        df: unfiltered data frame with 'key' and 'score' columns. The term is
            read from column position 0 (presumably the 'key' column --
            TODO confirm).

    Returns:
        filtered data frame (df itself is not modified).
    """ 
    df_filt = df.copy(deep=True)
    # Keep only positively scored rows in the working copy.
    df_filt = df_filt[df_filt.score > 0]
    # NOTE(review): the loops run over df, not df_filt, so rows with
    # score <= 0 still take part in the subset comparison below.
    for i in range(df.shape[0]):
        if i % 1000 == 0:
            # Progress marker every 1000 terms.
            logging.debug("Term " + str(i))
        single_let = False
        term = df.iloc[i,0]
        for element in term:
            # DROP TRENDS WITH SINGLE LETTER TERMS
            if len(element) == 1:
                logging.debug("dropping " + str(term) + " as it contains a single letter term")
                df_filt.drop(df_filt[df_filt['key'] == term].index, inplace=True)
                single_let = True
        if not single_let:
            # DEDUPE: drop every other row whose elements are all contained
            # in this term.
            # NOTE(review): two rows with the same element set are subsets of
            # each other, so it looks like both get dropped -- confirm this
            # is intended.
            # NOTE(review): this inner scan makes the function quadratic in
            # the number of rows.
            term_set = set(term)    
            for j in range(df.shape[0]):
                entry = df.iloc[j,0]
                entry_set = set(entry)
                if i != j:
                    if entry_set.issubset(term_set):
                        df_filt.drop(df_filt[df_filt['key'] == entry].index, inplace=True)            
    logging.info("df length: " + str(df.shape))
    logging.info("filtered df length: " + str(df_filt.shape))
    return df_filt

In [192]:
def pairwise_scatter_plots():
    """Build an Altair line chart from module-level ranking data.

    Takes no arguments; reads web_df, dc_list, ec_list, bc_list and df, none
    of which are defined in this function (presumably notebook globals --
    verify they exist before calling). Adds a 'listnum' column to web_df as a
    side effect.

    NOTE(review): dc_df, ec_df and bc_df are built but never passed to the
    chart, which is constructed from df instead.
    NOTE(review): the final bare `chart` expression is not returned, so the
    function returns None.
    """
    # Tag each source list with a number: 1 = web_df, 2 = dc, 3 = ec, 4 = bc.
    web_df['listnum'] = np.tile(1, (web_df.shape[0], 1))
    dc_df = pd.DataFrame(dc_list, index=[1]).melt()
    dc_df['listnum'] = np.tile(2, (web_df.shape[0], 1))
    ec_df = pd.DataFrame(ec_list, index=[2]).melt()
    ec_df['listnum'] = np.tile(3, (web_df.shape[0], 1))
    bc_df = pd.DataFrame(bc_list, index=[3]).melt()
    bc_df['listnum'] = np.tile(4, (web_df.shape[0], 1))
    import altair as alt
    # One line per 'variable', list number on x and 'value' on y, both
    # encoded as ordinal (':O').
    chart = alt.Chart(df, width=400).mark_line().encode(
        x = 'listnum:O',
        y = 'value:O', 
        color = 'variable'
    )
    chart

In [193]:
def pickle_part_3_data(G, trends_converted, hot_search_terms, path):
    """Pickle the three Part 3 artifacts into the directory at path.

    Args:
        G: Graph, written to 'G.pickle'.
        trends_converted: Written to 'trends_converted.pickle'.
        hot_search_terms: Written to 'hot_search_terms.pickle'.
        path: Existing directory that receives the three files.
    """
    artifacts = (
        ('G.pickle', G),
        ('trends_converted.pickle', trends_converted),
        ('hot_search_terms.pickle', hot_search_terms),
    )
    for filename, payload in artifacts:
        with open(os.path.join(path, filename), 'wb') as sink:
            pickle.dump(payload, sink)

In [194]:
def export_to_app(G, trends_converted, hot_search_terms):
    """Pickle the graph, converted trends and hot search terms into APP_PATH.

    Writes 'G.pickle', 'trends_converted.pickle' and
    'hot_search_terms.pickle' inside the APP_PATH directory.

    Args:
        G: Graph to export.
        trends_converted: Converted trends mapping to export.
        hot_search_terms: List of search terms to export.
    """
    # The file names used to start with '/', which makes os.path.join throw
    # away APP_PATH and target the filesystem root. Reusing the Part 3
    # pickling helper writes the same three files under APP_PATH.
    pickle_part_3_data(G, trends_converted, hot_search_terms, APP_PATH)

In [195]:
def build_graph(year=2017, export=False):
    """Part 3 of ImagingEdge: Build Labeled Graph.

    Loads the Part 2 pickles for the year, filters the trend table, pairs the
    positive trends with keywords, builds the graph, and pickles the graph,
    converted trends and hot search terms into the year's directory.

    Args:
        year: Year being evaluated for trends. Default is 2017, but can be
            used for historic periods for validation.
            (TODO: allow custom date ranges)
        export: export results to app. Method can be called here, or in
            part 4 after the graph has learned from unstructured sources.

    Returns:
        The filtered trend data frame produced by filt_df.
    """
    # Cap on searched keywords, for computational reasons.
    keyword_cap = 10000
    # Number of search terms displayed in the app.
    app_term_count = 20
    # Trends close to zero are noise; scores must exceed this to count.
    min_trend_score = 0.01
    year_dir = os.path.join(IMEDGE_PATH, str(year))

    # Logging setup.
    reset_logging()
    logfile = os.path.join(IMEDGE_PATH, 'imedge_3_graph.log')
    print("Logfile path: ", logfile)
    logging.basicConfig(filename=logfile,level=logging.INFO)

    # Load and filter the Part 2 results.
    trends, records, df = unpickle_part_2_data(year)
    df_filt = filt_df(df)
    n_positive = sum(df_filt.score > min_trend_score)
    logging.info("number of positive trends:" + str(n_positive))

    # Build the graph and the structures derived from it.
    paired = pair_trends_keywords(df_filt, records, n_positive, keyword_cap,
                                  from_sql = True)
    G = populate_graph(paired)
    trends_converted = convert_trends(trends)
    hot_search_terms = get_hot_search_terms(G, N=app_term_count)

    # Persist, and optionally push to the web app.
    pickle_part_3_data(G, trends_converted, hot_search_terms, year_dir)
    if export:
        export_to_app(G, trends_converted, hot_search_terms)
    return df_filt

In [196]:
#df_filt = build_graph(year=2017)

Logfile path:  /home/ericbarnhill/Documents/code/insight/imedge/imedge_3_graph.log


In [197]:
#df_filt.head()

Unnamed: 0,key,score,total_mentions
214,"(provide, information)",[0.7688005207229552],441.0
2047,"(near, infrared)",[0.7015220347594769],259.0
1708,"(cell, line)",[0.4902761235578917],451.0
1946,"(brain, networks)",[0.4058959668338355],577.0
2282,"(imaging, techniques)",[0.37814831534738],1306.0
