# Imaging Edge Notebook 4: Mine Unstructured Sources

ImagingEdge detects trends in the radiological research literature before they become mainstream publications, patents and products.

*Part 4 (this notebook): In this part of the app, the graph "learns" by adding unstructured sources.*

Other parts:

Part 1: Scrape PubMed

Part 2: Convert PubMed abstracts to Bag of Words

Part 3: Build graph connecting search terms and trends

### Created by Eric Barnhill for Insight Health Data Science
#### 2018 No License

Documentation follows the [Google Python Style Guide](http://google.github.io/styleguide/pyguide.html)

## Table of Contents

### Part 1: Scraping Arxiv
### Part 2: Scraping Twitter
### Part 3: Scraping Custom URLs

In [35]:
from bs4 import BeautifulSoup
from urllib import request
import datetime
from dateutil.relativedelta import relativedelta
import pickle
import os
import twitterscraper
import lxml
import logging
import contextlib
import time
# Year being processed: it selects the data directory and the first window date.
YEAR = 2015
PATH = "/home/ericbarnhill/Documents/code/insight/rtr/{}/".format(YEAR)
# Work from inside the per-year data directory.
os.chdir(PATH)
# copypasta from previous notebook -- todo: strategy to set these variables
# Rolling-window configuration: N_WINDOWS windows of N_MONTHS month(s) each,
# the first one starting on 1 January of YEAR.
N_WINDOWS = 12
N_MONTHS = 1
START_DATE = datetime.date(year=YEAR, month=1, day=1)

In [36]:
def scrape_arxiv(start_date, n_windows, n_months, search_term):
    """Pull abstracts within rolling date windows from the ArXiv.

    Window ``k`` starts ``k`` months after ``start_date`` and spans
    ``n_months`` months, so windows overlap whenever ``n_months > 1``.

    Args:
        start_date: start date of the first window (e.g. a ``datetime.date``)
        n_windows: number of rolling windows
        n_months: number of months in each window
        search_term: radiological search term used to probe ArXiv abstracts;
            it is URL-encoded before being placed in the query string

    Returns:
        List of arxiv abstract sets (lists of strings), one per rolling window
    """
    # Function-scope import so the block does not depend on a new file-level
    # import.
    from urllib.parse import quote_plus

    # Search terms may contain spaces or other reserved characters; without
    # encoding the resulting request URL is invalid.
    encoded_term = quote_plus(search_term)
    window_records = []
    for window in range(n_windows):
        start = start_date + window*relativedelta(months=+1)
        end = start + relativedelta(months=+n_months)
        start_string = start.strftime("%Y-%m-%d")
        end_string = end.strftime("%Y-%m-%d")
        # ArXiv API does not appear to handle combined abstract & date searches.
        # Consequently ArXiv is scraped through the 'front door'
        # Note that this has a hard limit of 200 per rolling window, however
        # this seems to be sufficient in preliminary testing
        arxiv_request = (
            'https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term='
            + encoded_term
            + '&terms-0-field=abstract&classification-physics_archives=all&'
            + 'date-year=&date-filter_by=date_range&'
            + 'date-from_date=' + start_string
            + '&date-to_date=' + end_string
            + '&size=200'
        )
        with request.urlopen(arxiv_request) as response:
            page = response.read()
        page_text = BeautifulSoup(page, 'html.parser').get_text()
        # Each abstract is taken as the text between a 'More' marker and the
        # following 'Less' marker (presumably the expand/collapse labels on
        # the results page). The chunk before the first 'More' is preparatory
        # text and is skipped.
        chunks = page_text.split('More')
        abstracts = [chunk.split('Less')[0] for chunk in chunks[1:]]
        window_records.append(abstracts)
    return window_records


In [37]:
def scrape_twitter(start_date, n_windows, n_months, search_term, trending_term):
    """Count Twitter mentions within rolling date windows.

    To handle Twitter, the methodology changes a bit. The dates are searched
    for tweets containing the search AND trending terms together, and only
    the number of tweets found per window is kept.

    Args:
        start_date: start date of the first window
        n_windows: number of rolling windows
        n_months: number of months in each window
        search_term: radiological search term
        trending_term: trending term that must co-occur with ``search_term``

    Returns:
        List of tweet counts (ints), one per rolling window; each query is
        limited to 200 tweets.
    """
    # Both the query and the logger are the same for every window.
    query = search_term + ' AND ' + trending_term
    logger = logging.getLogger('twitterscraper')
    window_records = []
    # Silence the scraper's logging while querying, and restore the logger's
    # previous state even if a query raises.
    was_disabled = logger.disabled
    logger.disabled = True
    try:
        for n in range(n_windows):
            start = start_date + n*relativedelta(months=+1)
            end = start + relativedelta(months=+n_months)
            twitter_response = twitterscraper.query_tweets(
                query, limit=200,
                begindate=start,
                enddate=end, poolsize=20, lang='')
            window_records.append(len(twitter_response))
    finally:
        logger.disabled = was_disabled
    return window_records

In [38]:
def load_graph_and_trends(path):
    """Load the pickled graph and trends data stored under ``path``.

    Args:
        path: directory containing ``G.pickle`` and
            ``trends_converted.pickle``

    Returns:
        Tuple ``(G, trends)`` holding the two unpickled objects
    """
    # NOTE: pickle must only be used on trusted files.
    loaded = []
    for suffix in ('/G.pickle', '/trends_converted.pickle'):
        with open(path+suffix, 'rb') as handle:
            loaded.append(pickle.load(handle))
    G, trends = loaded
    return G, trends

def save_graph_and_trends(path, G, trends):
    """Pickle the graph and trends data into ``path``.

    The objects are written to ``G_x.pickle`` and
    ``trends_converted_x.pickle``, i.e. to different file names than the
    ones ``G.pickle`` / ``trends_converted.pickle`` used as input.

    Args:
        path: directory in which the two pickle files are written
        G: graph object to pickle
        trends: trends data to pickle
    """
    with open(path+'/G_x.pickle', 'wb') as fp:
        # pickle.dump requires the target file object as second argument.
        pickle.dump(G, fp)
    with open(path+'/trends_converted_x.pickle', 'wb') as fp:
        pickle.dump(trends, fp)

In [39]:
def develop_graph(path):
    """Enrich the stored graph and trends data with ArXiv mentions.

    For every node flagged ``is_key``, the ArXiv is scraped using the node
    as search term. Each time a neighbor of that node occurs (as a substring)
    in a returned abstract, the weight of the node-neighbor edge and the
    neighbor's trend count for that time window are both incremented by one.
    The results are then passed to ``save_graph_and_trends``.

    Args:
        path: directory the graph and trends pickles are loaded from and
            saved to

    Returns:
        Tuple ``(G, trends)`` with the updated graph and trends data
    """
    G, trends = load_graph_and_trends(path)
    for node, attrs in G.nodes(data=True):
        # Only key nodes are used as ArXiv search terms.
        if not attrs['is_key']:
            continue
        node_neighbors = list(G.neighbors(node))
        print("Scraping ArXiv for:", node)
        print("number of neighbors", len(node_neighbors))
        start = time.time()
        windows = scrape_arxiv(START_DATE, N_WINDOWS, N_MONTHS, node)
        for window_index, abstract_set in enumerate(windows):
            for abstract in abstract_set:
                for neighbor in node_neighbors:
                    if neighbor not in abstract:
                        continue
                    # One more co-occurrence: strengthen the edge ...
                    edge = G[node][neighbor]
                    edge['weight'] = edge['weight'] + 1
                    # ... and record a mention in this window's trend data.
                    counts = trends[neighbor]
                    counts[window_index] = counts[window_index] + 1
                    # A follow-up Twitter search for (node, neighbor) pairs
                    # via scrape_twitter is currently disabled to reduce the
                    # computational scope.
        print("Elapsed time ", time.time()-start)
    save_graph_and_trends(path, G, trends)
    return G, trends

In [None]:
# Run the enrichment on the graph stored under PATH; this issues live ArXiv
# requests for every key node.
develop_graph(PATH)