## Querying data from the arXiv API

The Atom-parsing code in this notebook is mostly taken from the sample code in the arXiv API [documentation](https://arxiv.org/help/api/examples/python_arXiv_parsing_example.txt).  

In [2]:
import urllib.request
import feedparser
from itertools import combinations
from pyspark.sql import Row
from pyspark.sql.types import *

In [17]:
def query_arxiv(search_query, start, max_results = -1):
    """Fetch one page of results from the arXiv export API and parse it.

    Args:
        search_query: arXiv query expression, e.g. ``'cat:stat.ML'``. It is
            inserted into the URL verbatim, so it must already be URL-encoded.
        start: Index of the first result to return (sent as the ``start``
            query parameter).
        max_results: Maximum number of entries to request. The default ``-1``
            leaves the parameter out of the URL, so the server-side default
            applies.

    Returns:
        The object returned by ``feedparser.parse`` for the response body.

    Raises:
        urllib.error.URLError: If the HTTP request fails.
    """
    base_url = 'http://export.arxiv.org/api/query?'
    query = 'search_query=%s&start=%i' % (search_query, start)
    if max_results != -1:
        query += '&max_results=%i' % max_results

    # Perform the GET request; the context manager closes the connection
    # even if read() raises, instead of leaking the open response.
    with urllib.request.urlopen(base_url + query) as response:
        payload = response.read()

    # parse the response using feedparser
    return feedparser.parse(payload)

In [4]:
def read_authors(feed):
    """Collect the distinct author names in a parsed feed and number them.

    Args:
        feed: Parsed feed whose ``entries`` each expose an ``authors``
            sequence of objects with a ``name`` attribute.

    Returns:
        A list of ``(id, name)`` tuples. Ids start at 0 and follow the order
        in which each name is first seen, so an id equals the tuple's
        position in the list. Names are compared case-sensitively.
    """
    author_list = []
    # Set of names already emitted: constant-time membership test instead of
    # rescanning author_list for every author of every entry.
    seen_names = set()
    for entry in feed.entries:
        for author in entry.authors:
            name = author.name
            # maybe consider case insensitive comparing
            if name not in seen_names:
                seen_names.add(name)
                author_list.append((len(author_list), name))
    return author_list

In [5]:
def read_collabs(feed, known_authors=None):
    """List every co-author pair in a parsed feed, one record per pair per paper.

    Args:
        feed: Parsed feed whose ``entries`` expose ``title``, ``id`` and an
            ``authors`` sequence of objects with a ``name`` attribute.
        known_authors: Sequence of ``(id, name)`` records used to translate
            names to numbers. When omitted, the notebook-level ``author_list``
            is used, which keeps existing ``read_collabs(feed)`` calls working.

    Returns:
        A list of ``(src, dest, arxiv_id, title)`` tuples. ``src`` and ``dest``
        are the positions of the two authors' names in ``known_authors``
        (first occurrence); ``arxiv_id`` is the part of ``entry.id`` after
        ``'/abs/'``.

    Raises:
        ValueError: If an author of some entry is not in ``known_authors``.
    """
    if known_authors is None:
        # Backward-compatible default: fall back to the notebook global.
        known_authors = author_list

    # Build the name -> position map once; setdefault keeps the first
    # occurrence, matching what list.index() would return.
    position_of = {}
    for position, record in enumerate(known_authors):
        position_of.setdefault(record[1], position)

    collab_list = []
    for entry in feed.entries:
        title = entry.title
        arxiv_id = entry.id.split('/abs/')[-1]
        names = [author.name for author in entry.authors]
        for a1, a2 in combinations(names, 2):
            try:
                src = position_of[a1]
                dest = position_of[a2]
            except KeyError as missing:
                # Keep the exception type a failed list.index() lookup raises.
                raise ValueError(
                    "author %r is not in the author list" % (missing.args[0],)
                ) from None
            collab_list.append((src, dest, arxiv_id, title))
    return collab_list

In [6]:
def find_id(a, author_list):
    """Return the position of the first record in ``author_list`` named ``a``.

    Each record is indexed with ``[1]`` to obtain its name. The result is the
    record's position in the list, not the value stored at ``record[0]``.
    Raises ``ValueError`` when no record carries that name.
    """
    names = []
    for record in author_list:
        names.append(record[1])
    return names.index(a)

In [7]:
def create_df(l, schema):
    """Build a Spark DataFrame with the given schema from a local list of rows.

    The list is first distributed with ``sc.parallelize`` and the resulting
    RDD is handed to ``spark.createDataFrame``. Both ``sc`` and ``spark`` are
    taken from the enclosing namespace; they are not defined in this function.
    """
    return spark.createDataFrame(sc.parallelize(l), schema)

### Querying the arXiv API

In [20]:
# Query parameters
# Alternative example query (one author; the quotes are already URL-encoded as %22):
# search_query = 'au:%22daphne+koller%22'
search_query = 'cat:stat.ML'
start = 0
max_results = 2000

# Querying the Arxiv API
feed = query_arxiv(search_query, start, max_results) 
# feed.feed carries feed-level metadata; feed.entries holds the returned papers.
print('Feed last updated: %s' % feed.feed.updated)
# Total number of matches reported by the feed for this query
print('Total results for this query: %s' % feed.feed.opensearch_totalresults)
# Number of entries actually present in this response
print('Max results for this query: %s' % len(feed.entries))

&max_results=2000
Feed last updated: 2019-02-10T00:00:00-05:00
Total results for this query: 22672
Max results for this query: 2000


In [23]:
# Create a data frame for unique authors <AuthorID, AuthorName>
# author_list is also read later when the collaboration records are built.
author_list = read_authors(feed)
# Both columns are declared nullable (third StructField argument is True).
schema = StructType([
        StructField("id", IntegerType(), True),
        StructField("name", StringType(), True)
    ])
author_df = create_df(author_list, schema)
# Preview only the rows with id below 10 instead of collecting the whole frame
author_df.filter(author_df.id  < 10).collect()
# author_df.collect()

[Row(id=0, name='Marten Wegkamp'),
 Row(id=1, name='Bharath K. Sriperumbudur'),
 Row(id=2, name='Gert R. G. Lanckriet'),
 Row(id=3, name='Patrick Erik Bradley'),
 Row(id=4, name='Roberto C. Alamino'),
 Row(id=5, name='Nestor Caticha'),
 Row(id=6, name='Yen-Jen Oyang'),
 Row(id=7, name='Darby Tien-Hao Chang'),
 Row(id=8, name='Yu-Yen Ou'),
 Row(id=9, name='Hao-Geng Hung')]

In [25]:
# Spot check: look up one author by exact (case-sensitive) name match
author_df.filter(author_df.name == "Larry Wasserman").collect()

[Row(id=90, name='Larry Wasserman')]

In [26]:
# Create a data frame for collaborations <Author1ID, Author2ID, PaperArxivID, PaperTitle>
# One record per pair of co-authors per paper; requires author_list to exist already.
collab_list = read_collabs(feed)
# NOTE: this rebinds the name `schema` that the author cell also uses.
schema = StructType([
        StructField("src", IntegerType(), True),
        StructField("dest", IntegerType(), True),
        StructField("arxiv", StringType(), True),
        StructField("title", StringType(), True)
    ])
collab_df = create_df(collab_list, schema)
# collab_df.collect()

### Save Author and Collab DataFrames to Disk

In [27]:
# Write both frames as parquet under Data/, replacing any previous output.
# search_query is interpolated into the path verbatim, so its characters
# (e.g. the ':' in 'cat:stat.ML') become part of the directory name.
# NOTE(review): confirm the target filesystem accepts such names.
author_df.write.mode('overwrite').parquet("Data/authors-%s.parquet" % search_query)
collab_df.write.mode('overwrite').parquet("Data/collab-%s.parquet" % search_query)

In [120]:
# Display the SparkContext behind the `spark` session.
# `spark` is not created in this notebook; presumably the kernel provides it.
spark.sparkContext