In [1]:
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd

In [3]:
base_url = "https://papers.nips.cc"
proc_url = base_url + "/book/advances-in-neural-information-processing-systems-31-2018"

In [23]:
def get_soup(url):
    page = requests.get(url, timeout=20)
    soup = BeautifulSoup(page.content, "html.parser")
    return soup


def get_contents(a):
    return "".join([str(c) for c in a.contents])


def get_pdf_link(page):
    try:
        return [a for a in paper_page.findAll('a') if a.string == '[PDF]'][0]['href']
    except:
        return None

    
def get_abstract_text(page):
    try:
        return get_contents(paper_page.find("p", {"class": "abstract"}))
    except:
        return None


def get_supplemental_link(page):
    try:
        return [a for a in paper_page.findAll('a') if a.string == '[Supplemental]'][0]['href']
    except:
        return None


def get_event_type(page):
    try:
        return [a for a in page.findAll('h3') if 'Conference Event Type' in a.string][0].string.split(':')[-1]
    except:
        return None


def get_paper_details(paper_page):
    res = {'pdf_link': get_pdf_link(paper_page),
           'abstract_txt': get_abstract_text(paper_page),
           'supplemental_link': get_supplemental_link(paper_page),
           'event_type': get_event_type(paper_page)}
    return res

## Download and parse the proceedings paper listing page

In [4]:
proc_content = get_soup(proc_url)
bullets = proc_content.find_all('li')

## Extract high-level metadata for each paper
- Title
- Authors
- Link to paper page

In [22]:
papers = []
for b in bullets:
    author_divs = b.findAll("a", {"class": "author"})
    paperlinktitle = b.findNext('a')
    res = {'paper_link': paperlinktitle['href'],
           'paper_title': get_contents(paperlinktitle),
           'paper_authors': []}
    for a in author_divs:
        res['paper_authors'].append({'author_name': get_contents(a),
                                    'author_link': a['href']})
    papers.append(res)
papers[-3:]

[{'paper_link': '/paper/8293-transfer-of-deep-reactive-policies-for-mdp-planning',
  'paper_title': 'Transfer of Deep Reactive Policies for MDP Planning',
  'paper_authors': [{'author_name': 'Aniket (Nick) Bajpai',
    'author_link': '/author/aniket-nick-bajpai-12405'},
   {'author_name': 'Sankalp Garg',
    'author_link': '/author/sankalp-garg-12406'},
   {'author_name': 'None', 'author_link': '/author/none-12407'}]},
 {'paper_link': '/paper/8294-the-price-of-fair-pca-one-extra-dimension',
  'paper_title': 'The Price of Fair PCA: One Extra dimension',
  'paper_authors': [{'author_name': 'Samira Samadi',
    'author_link': '/author/samira-samadi-12408'},
   {'author_name': 'Uthaipon Tantipongpipat',
    'author_link': '/author/uthaipon-tantipongpipat-12054'},
   {'author_name': 'Jamie H. Morgenstern',
    'author_link': '/author/jamie-h-morgenstern-8075'},
   {'author_name': 'Mohit Singh', 'author_link': '/author/mohit-singh-12409'},
   {'author_name': 'Santosh Vempala',
    'author_li

In [25]:
print(len(papers))

1010


## Get full metadata for each paper
- PDF link
- Supplemental Link
- Abstract text
- event type

In [26]:
for i, paper in enumerate(papers):
    url = paper['paper_link']
    print(i, paper['paper_title'])
    paper_url = base_url + url
    paper_page = get_soup(paper_url)
    paper.update(get_paper_details(paper_page))

0 Books
1 Efficient Algorithms for Non-convex Isotonic Regression through Submodular Optimization
2 Structure-Aware Convolutional Neural Networks
3 Kalman Normalization: Normalizing Internal Representations Across Network Layers
4 HOGWILD!-Gibbs can be PanAccurate
5 Text-Adaptive Generative Adversarial Networks: Manipulating Images with Natural Language
6 IntroVAE: Introspective Variational Autoencoders for Photographic Image Synthesis
7 Doubly Robust Bayesian Inference for Non-Stationary Streaming Data with <var>\beta</var>-Divergences
8 Adapted Deep Embeddings: A Synthesis of Methods for k-Shot Inductive Transfer Learning
9 Generalized Inverse Optimization through Online Learning
10 An Off-policy Policy Gradient Theorem Using Emphatic Weightings
11 Supervised autoencoders: Improving generalization performance with unsupervised regularizers
12 Visual Object Networks: Image Generation with Disentangled 3D Representations
13 Understanding Weight Normalized Deep Neural Networks with Rect

122 FRAGE: Frequency-Agnostic Word Representation
123 Generative Neural Machine Translation
124 Found Graph Data and Planted Vertex Covers
125 Joint Active Feature Acquisition and Classification with Variable-Size Set Encoding
126 Regularization Learning Networks: Deep Learning for Tabular Datasets
127 Multitask Boosting for Survival Analysis with Competing Risks
128 Geometry Based Data Generation
129 SLAYER: Spike Layer Error Reassignment in Time
130 On Oracle-Efficient PAC RL with Rich Observations
131 Gradient Descent for Spiking Neural Networks
132 Generalizing Tree Probability Estimation via Bayesian Networks
133 Where Do You Think You're Going?: Inferring Beliefs about Dynamics from Behavior
134 Designing by Training: Acceleration Neural Network for Fast High-Dimensional Convolution
135 Understanding the Role of Adaptivity in Machine Teaching: The Case of Version Space Learners
136 A loss framework for calibrated anomaly detection
137 PacGAN: The power of two samples in generativ

246 Causal Discovery from Discrete Data using Hidden Compact Representation
247 Natasha 2: Faster Non-Convex Optimization Than SGD
248 Minimax Statistical Learning with Wasserstein distances
249 Provable Variational Inference for Constrained Log-Submodular Models
250 Learning Hierarchical Semantic Image Manipulation through Structured Representations
251 Processing of missing data by neural networks
252 Safe Active Learning for Time-Series Modeling with Gaussian Processes
253 Optimal Algorithms for Non-Smooth Distributed Optimization in Networks
254 Computing Higher Order Derivatives of Matrix and Tensor Expressions
255 Paraphrasing Complex Network: Network Compression via Factor Transfer
256 Analytic solution and stationary phase approximation for the Bayesian lasso and elastic net
257 Demystifying excessively volatile human learning: A Bayesian persistent prior and a neural approximation
258 Empirical Risk Minimization Under Fairness Constraints
259 Unsupervised Learning of Shape and

365 Learning from discriminative feature feedback
366 RetGK: Graph Kernels based on Return Probabilities of Random Walks
367 Deep Generative Markov State Models
368 Early Stopping for Nonparametric Testing
369 Solving Non-smooth Constrained Programs with Lower Complexity than <var>\mathcal{O}(1/\varepsilon)</var>: A Primal-Dual Homotopy Smoothing Approach
370 Heterogeneous Bitwidth Binarization in Convolutional Neural Networks
371 Unsupervised Learning of Object Landmarks through Conditional Image Generation
372 Probabilistic Neural Programmed Networks for Scene Generation
373 The streaming rollout of deep networks - towards fully model-parallel execution
374 KONG: Kernels for ordered-neighborhood graphs
375 GumBolt: Extending Gumbel trick to Boltzmann priors
376 Neural Networks Trained to Solve Differential Equations Learn General Representations
377 Beauty-in-averageness and its contextual modulations: A Bayesian statistical account
378 Distributed Weight Consolidation: A Brain Segme

483 Assessing Generative Models via Precision and Recall
484 Multiple-Step Greedy Policies in Approximate and Online Reinforcement Learning
485 A Convex Duality Framework for GANs
486 Horizon-Independent Minimax Linear Regression
487 Exploiting Numerical Sparsity for Efficient Learning : Faster Eigenvector Computation and Regression
488 Experimental Design for Cost-Aware Learning of Causal Graphs
489 Task-Driven Convolutional Recurrent Models of the Visual System
490 Meta-Reinforcement Learning of Structured Exploration Strategies
491 Sample Efficient Stochastic Gradient Iterative Hard Thresholding Method for Stochastic Sparse Linear Regression with Limited Attribute Observation
492 Semi-supervised Deep Kernel Learning: Regression with Unlabeled Data by Minimizing Predictive Variance
493 Generalizing to Unseen Domains via Adversarial Data Augmentation
494 Hyperbolic Neural Networks
495 Breaking the Curse of Horizon: Infinite-Horizon Off-Policy Estimation
496 Learning Task Specification

604 Proximal SCOPE for Distributed Sparse Learning
605 On Coresets for Logistic Regression
606 Neural Ordinary Differential Equations
607 Unsupervised Learning of Artistic Styles with Archetypal Style Analysis
608 Approximating Real-Time Recurrent Learning with Random Kronecker Factors
609 Contamination Attacks and Mitigation in Multi-Party Machine Learning
610 An Improved Analysis of Alternating Minimization for Structured Multi-Response Regression
611 Incorporating Context into Language Encoding Models for fMRI
612 CatBoost: unbiased boosting with categorical features
613 Query K-means Clustering and the Double Dixie Cup Problem
614 Training Neural Networks Using Features Replay
615 Modeling Dynamic Missingness of Implicit Feedback for Recommendation
616 Representation Learning of Compositional Data
617 Model-based targeted dimensionality reduction for neuronal population data
618 On gradient regularizers for MMD GANs
619 Heterogeneous Multi-output Gaussian Process Prediction
620 Lar

726 A Reduction for Efficient LDA Topic Reconstruction
727 Cluster Variational Approximations for Structure Learning of Continuous-Time Bayesian Networks from Incomplete Data
728 RenderNet: A deep convolutional network for differentiable rendering from 3D shapes
729 Robust Hypothesis Testing Using Wasserstein Uncertainty Sets
730 Robust Detection of Adversarial Attacks by Modeling the Intrinsic Properties of Deep Neural Networks
731 Monte-Carlo Tree Search for Constrained POMDPs
732 Learning to Repair Software Vulnerabilities with Generative Adversarial Networks
733 Layer-Wise Coordination between Encoder and Decoder for Neural Machine Translation
734 Dirichlet belief networks for topic structure learning
735 Stochastic Expectation Maximization with Variance Reduction
736 Submodular Maximization via Gradient Ascent: The Case of Deep Submodular   Functions
737 The challenge of realistic music generation: modelling raw audio at scale
738 Spectral Signatures in Backdoor Attacks
739 Reward

847 Deep Predictive Coding Network with Local Recurrent Processing for Object Recognition
848 PAC-Bayes bounds for stable algorithms with instance-dependent priors
849 Beyond Grids: Learning Graph Representations for Visual Recognition
850 The Limit Points of (Optimistic) Gradient Descent in Min-Max Optimization
851 Coordinate Descent with Bandit Sampling
852 Deep Dynamical Modeling and Control of Unsteady Fluid Flows
853 Confounding-Robust Policy Improvement
854 The Importance of Sampling inMeta-Reinforcement Learning
855 Representer Point Selection for Explaining Deep Neural Networks
856 The Effect of Network Width on the Performance of  Large-batch Training
857 SNIPER: Efficient Multi-Scale Training
858 The Sample Complexity of Semi-Supervised Learning with Nonparametric Mixture Models
859 Hardware Conditioned Policies for Multi-Robot Transfer Learning
860 Co-regularized Alignment for Unsupervised Domain Adaptation
861 Statistical and Computational Trade-Offs in Kernel K-Means
862 A

970 Fast deep reinforcement learning using online adjustments from the past
971 Improved Network Robustness with Adversary Critic
972 Regret Bounds for Online Portfolio Selection with a Cardinality Constraint
973 Sketching Method for Large Scale Combinatorial Inference
974 Connecting Optimization and Regularization Paths
975 Fully Neural Network Based Speech Recognition on Mobile and Embedded Devices
976 Understanding Regularized Spectral Clustering via Graph Conductance
977 Data-Driven Clustering via Parameterized Lloyd's Families
978 Learning Beam Search Policies via Imitation Learning
979 Benefits of over-parameterization with EM
980 Thermostat-assisted continuously-tempered Hamiltonian Monte Carlo for Bayesian learning
981 Robust Subspace Approximation in a Stream
982 Mean Field for the Stochastic Blockmodel: Optimization Landscape and Convergence Issues
983 Analysis of Krylov Subspace Solutions of  Regularized Non-Convex Quadratic Problems
984 Autoconj: Recognizing and Exploiting 

In [30]:
papers_df = pd.DataFrame(papers)

In [39]:
papers_df.to_clipboard()

In [37]:
papers_df['event_type'].value_counts()

 Poster    979
 Oral       30
Name: event_type, dtype: int64

In [36]:
papers_df['paper_authors'].apply(lambda x: len(x)).value_counts()

3     278
4     219
2     184
5     155
6      74
7      36
1      35
8      16
10      5
9       3
12      2
0       2
11      1
Name: paper_authors, dtype: int64

## Save Metadata to json file

In [34]:
with open('data/NeurIPS2018/metadata/NeurIPS_Proceedings_Metadata.json','w') as f:
    json.dump(papers, f)