## Querying data from the arXiv API

The Atom parsing code in this notebook is mostly taken from the sample code in the arXiv API [documentation](https://arxiv.org/help/api/examples/python_arXiv_parsing_example.txt).  

In [110]:
import urllib.request
import feedparser
from itertools import combinations
from pyspark.sql import Row
from pyspark.sql.types import *

In [111]:
def query_arxiv(search_query, start, max_results):
    """Fetch one page of results from the arXiv API and parse the Atom feed.

    Parameters
    ----------
    search_query : str
        arXiv query expression. It is inserted into the request URL verbatim,
        so it must already be URL-encoded (e.g. ``'au:%22daphne+koller%22'``).
    start : int
        Index of the first result to return (sent as the ``start`` parameter).
    max_results : int
        Maximum number of entries to request (sent as ``max_results``).

    Returns
    -------
    The object returned by ``feedparser.parse`` for the response body.

    Raises
    ------
    urllib.error.URLError
        If the HTTP request cannot be completed.
    """
    base_url = 'http://export.arxiv.org/api/query?'
    query = 'search_query=%s&start=%i&max_results=%i' % (search_query,
                                                         start,
                                                         max_results)

    # Use the response as a context manager so the underlying connection is
    # closed as soon as the body has been read, even if reading fails.
    with urllib.request.urlopen(base_url + query) as response:
        payload = response.read()

    # Parse the raw Atom bytes; feedparser handles decoding itself.
    return feedparser.parse(payload)

In [112]:
def read_authors(feed):
    """Collect the distinct author names in a feed, numbered in first-seen order.

    Parameters
    ----------
    feed
        Parsed feed exposing ``entries``; each entry may expose ``authors``,
        an iterable of objects with a ``name`` attribute.

    Returns
    -------
    list of (int, str)
        ``(author_id, name)`` tuples. Ids start at 0 and equal the position
        of the tuple in the list.

    Notes
    -----
    Names are compared exactly (case-sensitive); two spellings of the same
    person are treated as different authors.
    """
    author_list = []
    seen_names = set()  # O(1) membership test instead of re-scanning the list
    for entry in feed.entries:
        # Entries without author metadata contribute nothing instead of raising.
        for author in getattr(entry, 'authors', None) or ():
            name = author.name
            if name not in seen_names:
                seen_names.add(name)
                author_list.append((len(author_list), name))
    return author_list

In [113]:
def read_collabs(feed, author_list=None):
    """List every pairwise co-authorship found in a feed.

    Parameters
    ----------
    feed
        Parsed feed exposing ``entries``; each entry must expose ``id`` and
        ``title`` and may expose ``authors`` (objects with a ``name``).
    author_list : list of (int, str), optional
        ``(author_id, name)`` tuples used to translate names into ids. When
        omitted, ids are derived from ``feed`` itself by numbering distinct
        names in first-seen order, so the function no longer depends on a
        notebook-global ``author_list`` having been defined beforehand.

    Returns
    -------
    list of (int, int, str, str)
        One ``(author1_id, author2_id, arxiv_id, title)`` tuple per unordered
        pair of authors of each entry, in entry order.

    Raises
    ------
    ValueError
        If an author of the feed is missing from the supplied ``author_list``.
    """
    # Build the name -> id lookup once instead of scanning a list per pair.
    ids_by_name = {}
    if author_list is None:
        for entry in feed.entries:
            for author in getattr(entry, 'authors', None) or ():
                # setdefault keeps the id assigned at first sight.
                ids_by_name.setdefault(author.name, len(ids_by_name))
    else:
        for record in author_list:
            # First occurrence wins if a name is listed more than once.
            ids_by_name.setdefault(record[1], record[0])

    collab_list = []
    for entry in feed.entries:
        title = entry.title
        # The entry id is a URL; the arXiv identifier follows '/abs/'.
        arxiv_id = entry.id.split('/abs/')[-1]
        names = [author.name for author in getattr(entry, 'authors', None) or ()]
        for a1, a2 in combinations(names, 2):
            try:
                src, dest = ids_by_name[a1], ids_by_name[a2]
            except KeyError as missing:
                raise ValueError('author %s is not in author_list' % missing)
            collab_list.append((src, dest, arxiv_id, title))
    return collab_list

In [114]:
def find_id(a, author_list):
    """Return the id paired with author name ``a`` in ``author_list``.

    Parameters
    ----------
    a : str
        Author name to look up (exact, case-sensitive match).
    author_list : list of (int, str)
        ``(author_id, name)`` tuples.

    Returns
    -------
    The id stored alongside the first tuple whose name equals ``a``. The
    stored id is returned rather than the tuple's position, so the result
    stays correct even when ids do not coincide with list positions.

    Raises
    ------
    ValueError
        If no tuple in ``author_list`` carries the name ``a``.
    """
    for record in author_list:
        if record[1] == a:
            return record[0]
    raise ValueError('%r is not in author_list' % (a,))

In [115]:
def create_df(l, schema):
    """Turn a local list of row tuples into a Spark DataFrame.

    ``l`` is first distributed as an RDD and then converted using ``schema``.
    The names ``sc`` and ``spark`` are not defined in the visible cells; they
    are looked up as globals at call time.
    """
    return spark.createDataFrame(sc.parallelize(l), schema)

### Querying the arXiv API

In [116]:
# Query parameters
# search_query is passed to query_arxiv, which inserts it into the request URL
# verbatim, so it is written here already URL-encoded (%22 is a double quote).
search_query = 'au:%22daphne+koller%22'
start = 0
max_results = 5

# Querying the Arxiv API
feed = query_arxiv(search_query, start, max_results) 
print('Feed last updated: %s' % feed.feed.updated)
print('Total results for this query: %s' % feed.feed.opensearch_totalresults)
# NOTE: despite the "Max results" label, the value printed below is the number
# of entries actually present in this feed (len(feed.entries)).
print('Max results for this query: %s' % len(feed.entries))

Feed last updated: 2019-02-10T00:00:00-05:00
Total results for this query: 43
Max results for this query: 5


In [117]:
# Create a data frame for unique authors <AuthorID, AuthorName>
# author_list holds the (id, name) tuples produced by read_authors.
author_list = read_authors(feed)
# Both columns are declared nullable (third StructField argument is True).
schema = StructType([
        StructField("id", IntegerType(), True),
        StructField("name", StringType(), True)
    ])
author_df = create_df(author_list, schema)
# collect() returns every row to the notebook as the cell's output.
author_df.collect()

[Row(id=0, name='M. Pawan Kumar'),
 Row(id=1, name='Daphne Koller'),
 Row(id=2, name='Marc Teyssier'),
 Row(id=3, name='Urszula Chajewska'),
 Row(id=4, name='Nir Friedman'),
 Row(id=5, name='Ron Parr')]

In [118]:
# Create a data frame for collaborations <Author1ID, Author2ID, PaperArxivID, PaperTitle>
collab_list = read_collabs(feed)
# Rebinds `schema` (previously the author schema) to the collaboration layout;
# all four columns are declared nullable.
schema = StructType([
        StructField("src", IntegerType(), True),
        StructField("dest", IntegerType(), True),
        StructField("arxiv", StringType(), True),
        StructField("title", StringType(), True)
    ])
collab_df = create_df(collab_list, schema)
# collect() returns every row to the notebook as the cell's output.
collab_df.collect()

[Row(src=0, dest=1, arxiv='1205.2633v1', title='MAP Estimation of Semi-Metric MRFs via Hierarchical Graph Cuts'),
 Row(src=2, dest=1, arxiv='1207.1429v1', title='Ordering-Based Search: A Simple and Effective Algorithm for Learning\n  Bayesian Networks'),
 Row(src=3, dest=1, arxiv='1301.3840v1', title='Utilities as Random Variables: Density Estimation and Structure\n  Discovery'),
 Row(src=4, dest=1, arxiv='1301.3856v1', title='Being Bayesian about Network Structure'),
 Row(src=1, dest=5, arxiv='1301.3869v1', title='Policy Iteration for Factored MDPs')]

### Save Author and Collab DataFrames to Disk

In [119]:
# Write both frames as Parquet under Data/, using 'overwrite' mode so existing
# output at the same path is replaced.
# NOTE(review): the raw URL-encoded query string (including ':' and '%22')
# becomes part of the path — confirm the target filesystem accepts it.
author_df.write.mode('overwrite').parquet("Data/authors-%s.parquet" % search_query)
collab_df.write.mode('overwrite').parquet("Data/collab-%s.parquet" % search_query)

In [120]:
# Evaluate the session's SparkContext so it is shown as the cell's output.
spark.sparkContext