In [1]:
import numpy as np
import pandas as pd

# Load Data

## Author loading

In [2]:
# Load the raw author table.
authors = pd.read_csv("./kaggle_archive/authors.csv")

# Remember the raw row count, then discard every row that has a missing value.
rows_before = len(authors)
authors = authors.dropna()

# Report how many rows the missing-data filter removed.
print(f"deleted entries through missing data {rows_before - len(authors)}")

deleted entries through missing data 1185


In [3]:
# Show the author table as it looks after rows with missing values were dropped.
authors

Unnamed: 0,paper_index,name,institution,is_chinese,name_in_database
1,12020,Sangnie Bhardwaj,google,0,Sangnie Bhardwaj
2,12020,Ian Fischer,google,0,Ian Fischer
3,12020,Johannes Ballé,google,0,Johannes Ballé
4,12020,Troy Chinen,google,0,Troy Chinen
5,22020,Jean-Baptiste Alayrac,massachusetts institute of technology,0,Jean-Baptiste Alayrac
...,...,...,...,...,...
36823,86932019,Joshua Wang,google,1,Joshua Wang
36824,23022019,Ruho Kondo,"toyota central r&d labs., inc.",0,Ruho Kondo
36825,23022019,Keisuke Kawano,"toyota central r&d labs., inc",0,Keisuke Kawano
36826,23022019,Satoshi Koide,toyota central r&d labs.,0,Satoshi Koide


## Paper loading

In [4]:
# Load the raw paper table.
papers = pd.read_csv("./kaggle_archive/papers.csv")

# Remember the raw row count, then discard every row that has a missing value.
rows_before = len(papers)
papers = papers.dropna()

# Report how many rows the missing-data filter removed.
print(f"deleted entries through missing data {rows_before - len(papers)}")

deleted entries through missing data 3342


In [5]:
# Show the paper table as it looks after rows with missing values were dropped.
papers

Unnamed: 0.1,Unnamed: 0,title,abstract,year,paper_index,full_text
0,0,A graph similarity for deep learning,Graph neural networks (GNNs) have been success...,2020,2020,A Graph Similarity for Deep Learning\nSeongmin...
1,1,An Unsupervised Information-Theoretic Perceptu...,Tractable models of human perception have prov...,2020,12020,An Unsupervised Information-Theoretic\nPercept...
2,2,Self-Supervised MultiModal Versatile Networks,Videos are a rich source of multi-modal superv...,2020,22020,Self-Supervised MultiModal Versatile Networks\...
3,3,"Benchmarking Deep Inverse Models over time, an...",We consider the task of solving generic invers...,2020,32020,"Benchmarking Deep Inverse Models over time, an..."
4,4,Off-Policy Evaluation and Learning for Externa...,We consider the evaluation and training of a n...,2020,42020,Off-Policy Evaluation and Learning\nfor Extern...
...,...,...,...,...,...,...
11275,11275,Discrete Object Generation with Reversible Ind...,The success of generative modeling in continuo...,2019,54522019,Discrete Object Generation\n\nwith Reversible ...
11276,11276,Adaptively Aligned Image Captioning via Adapti...,Recent neural models for image captioning usua...,2019,47992019,Adaptively Aligned Image Captioning via\n\nAda...
11277,11277,Fully Dynamic Consistent Facility Location,We consider classic clustering problems in ful...,2019,18272019,Fully Dynamic Consistent Facility Location\n\n...
11278,11278,Efficient Rematerialization for Deep Networks,"When training complex neural networks, memory ...",2019,86932019,Efﬁcient Rematerialization for Deep Networks\n...


## Institution loading

In [6]:
# Load the raw institution table.
institutions = pd.read_csv("./kaggle_archive/institutions.csv")

# Remember the raw row count, then discard every row that has a missing value.
rows_before = len(institutions)
institutions = institutions.dropna()

# Report how many rows the missing-data filter removed.
print(f"deleted entries through missing data {rows_before - len(institutions)}")

deleted entries through missing data 0


In [7]:
# Show the institution table as it looks after the missing-value filter.
institutions

Unnamed: 0.1,Unnamed: 0,institutions,city,state,country
0,0,stanford university,Stanford,CA,US
1,1,mit,Cambridge,MA,US
2,2,carnegie mellon university,Pittsburgh,PA,US
3,3,uc berkeley,Berkeley,CA,US
4,4,google,Mountain View,CA,US
...,...,...,...,...,...
95,95,the university of texas at austin,Austin,TX,US
96,96,university of virginia,Charlottesville,VA,US
97,97,facebook,Menlo Park,CA,US
98,98,rutgers university,Newark and Camden,NJ,US


# Preprocess Data

After dropping every row that contains a NaN value, the dataframes now hold only papers and authors with complete information.

## Papers

In [8]:
# Remove two columns that are not needed from here on:
#   - "full_text":  not required, and it makes the dataframe large
#   - "Unnamed: 0": only duplicates the row index
# NOTE(review): the earlier dropna() already discarded papers whose full_text
# was missing, although that column is removed here - confirm this is intended.
papers = papers.drop(columns=["full_text", "Unnamed: 0"])

In [9]:
# Show the paper table after the full_text and 'Unnamed: 0' columns were removed.
papers

Unnamed: 0,title,abstract,year,paper_index
0,A graph similarity for deep learning,Graph neural networks (GNNs) have been success...,2020,2020
1,An Unsupervised Information-Theoretic Perceptu...,Tractable models of human perception have prov...,2020,12020
2,Self-Supervised MultiModal Versatile Networks,Videos are a rich source of multi-modal superv...,2020,22020
3,"Benchmarking Deep Inverse Models over time, an...",We consider the task of solving generic invers...,2020,32020
4,Off-Policy Evaluation and Learning for Externa...,We consider the evaluation and training of a n...,2020,42020
...,...,...,...,...
11275,Discrete Object Generation with Reversible Ind...,The success of generative modeling in continuo...,2019,54522019
11276,Adaptively Aligned Image Captioning via Adapti...,Recent neural models for image captioning usua...,2019,47992019
11277,Fully Dynamic Consistent Facility Location,We consider classic clustering problems in ful...,2019,18272019
11278,Efficient Rematerialization for Deep Networks,"When training complex neural networks, memory ...",2019,86932019


## Authors

In [10]:
# Discard the author columns we do not use: the "is_chinese" flag is not
# relevant for us, and "name" is replaced by "name_in_database".
authors = authors.drop(columns=["is_chinese", "name"])

# Paper indices that are still present in BOTH tables after the NaN removal
# (np.intersect1d de-duplicates its inputs and returns the sorted common values).
unique_papers = np.intersect1d(
    papers["paper_index"].to_numpy(),
    authors["paper_index"].to_numpy(),
)

# Keep only the papers that belong to that intersection.
papers = papers.loc[papers["paper_index"].isin(unique_papers)]

# Keep only the authors of those papers. For convenience, put the name column
# in front of the institution column and rename "name_in_database" to "name".
authors = (
    authors
    .loc[
        authors["paper_index"].isin(unique_papers),
        ["paper_index", "name_in_database", "institution"],
    ]
    .rename(columns={"name_in_database": "name"})
)

In [11]:
# Show the author table after column cleanup and the paper_index intersection filter.
authors

Unnamed: 0,paper_index,name,institution
1,12020,Sangnie Bhardwaj,google
2,12020,Ian Fischer,google
3,12020,Johannes Ballé,google
4,12020,Troy Chinen,google
5,22020,Jean-Baptiste Alayrac,massachusetts institute of technology
...,...,...,...
36823,86932019,Joshua Wang,google
36824,23022019,Ruho Kondo,"toyota central r&d labs., inc."
36825,23022019,Keisuke Kawano,"toyota central r&d labs., inc"
36826,23022019,Satoshi Koide,toyota central r&d labs.


In [12]:
# Show the paper table restricted to paper indices that also appear in the author table.
papers

Unnamed: 0,title,abstract,year,paper_index
1,An Unsupervised Information-Theoretic Perceptu...,Tractable models of human perception have prov...,2020,12020
2,Self-Supervised MultiModal Versatile Networks,Videos are a rich source of multi-modal superv...,2020,22020
3,"Benchmarking Deep Inverse Models over time, an...",We consider the task of solving generic invers...,2020,32020
4,Off-Policy Evaluation and Learning for Externa...,We consider the evaluation and training of a n...,2020,42020
5,Neural Methods for Point-wise Dependency Estim...,"Since its inception, the neural estimation of ...",2020,52020
...,...,...,...,...
11275,Discrete Object Generation with Reversible Ind...,The success of generative modeling in continuo...,2019,54522019
11276,Adaptively Aligned Image Captioning via Adapti...,Recent neural models for image captioning usua...,2019,47992019
11277,Fully Dynamic Consistent Facility Location,We consider classic clustering problems in ful...,2019,18272019
11278,Efficient Rematerialization for Deep Networks,"When training complex neural networks, memory ...",2019,86932019


## Save data into HDF5

In [13]:
# Store all three tables in one HDF5 file, one key per table.
# NOTE: to_hdf is called with its default file mode, so an already existing
# file is opened and written into rather than recreated from scratch.
hdf_path = "nips_dataset_cleared.h5"
tables = {"authors": authors, "papers": papers, "institutions": institutions}
for hdf_key, frame in tables.items():
    frame.to_hdf(hdf_path, key=hdf_key, format="table")