In [122]:
import numpy as np
import pandas as pd
import csv
import json

# Load Data

## Author loading

In [145]:
# Load the author table and discard every row that has at least one missing value.
authors = pd.read_csv("./kaggle_archive/authors.csv")
rows_before_cleaning = len(authors)
authors = authors.dropna()

# Report how many rows the missing-value filter removed.
print(f"deleted entries through missing data {rows_before_cleaning - len(authors)}")

deleted entries through missing data 1185


In [36]:
# Show the `authors` frame (pandas elides the middle rows in the rendered output).
authors

Unnamed: 0,paper_index,name,institution,is_chinese,name_in_database
1,12020,Sangnie Bhardwaj,google,0,Sangnie Bhardwaj
2,12020,Ian Fischer,google,0,Ian Fischer
3,12020,Johannes Ballé,google,0,Johannes Ballé
4,12020,Troy Chinen,google,0,Troy Chinen
5,22020,Jean-Baptiste Alayrac,massachusetts institute of technology,0,Jean-Baptiste Alayrac
...,...,...,...,...,...
36823,86932019,Joshua Wang,google,1,Joshua Wang
36824,23022019,Ruho Kondo,"toyota central r&d labs., inc.",0,Ruho Kondo
36825,23022019,Keisuke Kawano,"toyota central r&d labs., inc",0,Keisuke Kawano
36826,23022019,Satoshi Koide,toyota central r&d labs.,0,Satoshi Koide


## Paper loading

In [81]:
# Load the paper table and discard every row that has at least one missing value.
paper = pd.read_csv("./kaggle_archive/papers.csv")
rows_before_cleaning = len(paper)
paper = paper.dropna()

# Report how many rows the missing-value filter removed.
print(f"deleted entries through missing data {rows_before_cleaning - len(paper)}")

deleted entries through missing data 3342


In [18]:
# Show the `paper` frame (pandas elides the middle rows in the rendered output).
paper

Unnamed: 0.1,Unnamed: 0,title,abstract,year,paper_index,full_text
0,0,A graph similarity for deep learning,Graph neural networks (GNNs) have been success...,2020,2020,A Graph Similarity for Deep Learning\nSeongmin...
1,1,An Unsupervised Information-Theoretic Perceptu...,Tractable models of human perception have prov...,2020,12020,An Unsupervised Information-Theoretic\nPercept...
2,2,Self-Supervised MultiModal Versatile Networks,Videos are a rich source of multi-modal superv...,2020,22020,Self-Supervised MultiModal Versatile Networks\...
3,3,"Benchmarking Deep Inverse Models over time, an...",We consider the task of solving generic invers...,2020,32020,"Benchmarking Deep Inverse Models over time, an..."
4,4,Off-Policy Evaluation and Learning for Externa...,We consider the evaluation and training of a n...,2020,42020,Off-Policy Evaluation and Learning\nfor Extern...
...,...,...,...,...,...,...
11275,11275,Discrete Object Generation with Reversible Ind...,The success of generative modeling in continuo...,2019,54522019,Discrete Object Generation\n\nwith Reversible ...
11276,11276,Adaptively Aligned Image Captioning via Adapti...,Recent neural models for image captioning usua...,2019,47992019,Adaptively Aligned Image Captioning via\n\nAda...
11277,11277,Fully Dynamic Consistent Facility Location,We consider classic clustering problems in ful...,2019,18272019,Fully Dynamic Consistent Facility Location\n\n...
11278,11278,Efficient Rematerialization for Deep Networks,"When training complex neural networks, memory ...",2019,86932019,Efﬁcient Rematerialization for Deep Networks\n...


## Institution loading

In [21]:
# Load the institution table and discard every row that has at least one missing value.
institutions = pd.read_csv("./kaggle_archive/institutions.csv")
rows_before_cleaning = len(institutions)
institutions = institutions.dropna()

# Report how many rows the missing-value filter removed.
print(f"deleted entries through missing data {rows_before_cleaning - len(institutions)}")

deleted entries through missing data 0


In [22]:
# Show the `institutions` frame (pandas elides the middle rows in the rendered output).
institutions

Unnamed: 0.1,Unnamed: 0,institutions,city,state,country
0,0,stanford university,Stanford,CA,US
1,1,mit,Cambridge,MA,US
2,2,carnegie mellon university,Pittsburgh,PA,US
3,3,uc berkeley,Berkeley,CA,US
4,4,google,Mountain View,CA,US
...,...,...,...,...,...
95,95,the university of texas at austin,Austin,TX,US
96,96,university of virginia,Charlottesville,VA,US
97,97,facebook,Menlo Park,CA,US
98,98,rutgers university,Newark and Camden,NJ,US


# Preprocess Data

## Papers

In [82]:
# The full paper text is not needed here and makes the dataframe very large,
# so remove that column.
# `errors='ignore'` keeps this cell re-runnable: on a second execution the
# column is already gone and a plain drop would raise a KeyError.
paper = paper.drop(columns=['full_text'], errors='ignore')

## Authors

In [146]:
## paper ids that are referenced by at least one row of the author dataframe
unique_paper_authors = authors["paper_index"].unique()

## paper ids that are present in the paper dataframe
### this set is smaller than the raw file because of the NaN removal
unique_paper_paper = paper["paper_index"].unique()

## one vectorised membership test instead of a Python-level `x in array` scan
## per id; the mask is reused for both the kept and the dropped ids
has_paper_info = np.isin(unique_paper_authors, unique_paper_paper)

## ids present in both dataframes, in the order they first occur in `authors`
unique_papers = unique_paper_authors[has_paper_info].astype('int')
print(f"All together, there are {unique_papers.shape[0]} unique papers")

## ids that only occur in the author dataframe (no paper information available)
drop_paper_indices = unique_paper_authors[~has_paper_info].astype('int')

## remove every author row that points to a paper without information
authors = authors[~authors.paper_index.isin(drop_paper_indices)]

## control: every remaining author row must reference a known paper
assert authors["paper_index"].isin(unique_papers).all(), \
    "author rows referencing papers without paper information remain"

All together, there are 7918 unique papers


In [128]:
# First step towards collecting all papers per author:
# determine the distinct author names and how many of them there are.
unique_authors = authors["name_in_database"].unique()
number_uniq_authors = len(unique_authors)
print(f"number of unique authors: {number_uniq_authors}")

number of unique authors: 13105


In [129]:
# Map every author name to the list of paper ids that author appears on.
#
# A single group-by pass replaces one full-table scan per author. The ids are
# read straight from the column, so they are no longer routed through
# `DataFrame.where`, which NaN-fills the non-matching rows and thereby turns
# integer ids into floats.
# `sort=False` keeps the keys in order of first appearance, i.e. the same order
# as `authors["name_in_database"].unique()`.
author_dict = {
    name: paper_ids.tolist()
    for name, paper_ids in authors.groupby("name_in_database", sort=False)["paper_index"]
}

In [130]:
# Persist the author -> paper ids mapping as CSV and as JSON.

# CSV: one row per author, written as [name, list of paper ids].
# `newline=""` is what the csv module expects for files it writes to, and the
# explicit encoding makes the output independent of the platform default
# (author names contain non-ASCII characters).
with open("author_dict.csv", "w", newline="", encoding="utf-8") as f_csv:
    w = csv.writer(f_csv)
    for key, val in author_dict.items():
        w.writerow([key, val])

# JSON: dump straight into the file. The serialised text is deliberately NOT
# bound to the name `json`, which would shadow the module and break this cell
# (and every later `json.*` call) on the next execution.
with open("author_dict.json", "w") as f:
    json.dump(author_dict, f)

In [138]:
# Square author-by-author matrix that will hold the number of common
# publications for every pair of authors (rows and columns are author names).
## should be symmetrical TODO: more efficient way to save!
# Allocate the zeros as integers right away instead of creating a float64
# matrix of the same size that is immediately cast to int.
helper_matrix_conncetions = np.zeros((number_uniq_authors, number_uniq_authors), dtype='int')
connection_matrix = pd.DataFrame(helper_matrix_conncetions, columns=unique_authors,
                                 index=unique_authors, dtype='int')
connection_matrix.head()

Unnamed: 0,Sangnie Bhardwaj,Ian Fischer,Johannes Ballé,Troy Chinen,Jean-Baptiste Alayrac,Adrià Recasens,Relja Arandjelovic,Jason Ramapuram,Jeffrey De Fauw,Sander Dieleman,...,Seth Lloyd,Ari Seff,Wenda Zhou,Farhan Damani,Abigail Doyle,Lun Huang,Yaxian Xia,Niklas Oskar Hjuler,Erik Vee,Ruho Kondo
Sangnie Bhardwaj,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ian Fischer,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Johannes Ballé,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Troy Chinen,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Jean-Baptiste Alayrac,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [188]:
# Count the common publications for every pair of authors.
#
# The author rows are grouped by paper once, instead of boolean-filtering the
# whole author table for every (author, paper) combination.
# NOTE: the counts are added onto whatever `connection_matrix` already holds,
# so re-create the zero matrix before executing this cell a second time.
for _, names_on_paper in authors.groupby("paper_index", sort=False)["name_in_database"]:
    names_on_paper = names_on_paper.tolist()
    for current_author in names_on_paper:
        for co_author in names_on_paper:
            # an author is not connected to themselves
            if co_author == current_author:
                continue
            # Single label-based write (row = co-author, column = current author).
            # The former `connection_matrix[col][row] += 1` is chained assignment,
            # which does not update the frame under pandas Copy-on-Write.
            connection_matrix.at[co_author, current_author] += 1

In [191]:
# Show the filled co-authorship matrix (pandas elides the middle rows and columns in the rendered output).
connection_matrix

Unnamed: 0,Sangnie Bhardwaj,Ian Fischer,Johannes Ballé,Troy Chinen,Jean-Baptiste Alayrac,Adrià Recasens,Relja Arandjelovic,Jason Ramapuram,Jeffrey De Fauw,Sander Dieleman,...,Seth Lloyd,Ari Seff,Wenda Zhou,Farhan Damani,Abigail Doyle,Lun Huang,Yaxian Xia,Niklas Oskar Hjuler,Erik Vee,Ruho Kondo
Sangnie Bhardwaj,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ian Fischer,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Johannes Ballé,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Troy Chinen,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Jean-Baptiste Alayrac,0,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Lun Huang,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Yaxian Xia,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Niklas Oskar Hjuler,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Erik Vee,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [193]:
# Persist the co-authorship matrix in two formats.
csv_target = 'connection_matrix.csv'
json_target = 'connection_matrix.json'

## CSV export
connection_matrix.to_csv(csv_target)

## JSON export
## `to_json` writes to the given path and returns None in that case, so
## `result` carries no JSON text; call `to_json()` without a path to get a string.
result = connection_matrix.to_json(json_target)