In [1]:
#%matplotlib inline

import pylab

import xml.etree.ElementTree as ET
import json
import os
import pandas as pd
import re
import numpy as np

import matplotlib

import matplotlib.pyplot as plt
import seaborn as sns

# Apply the 'seaborn-white' matplotlib style sheet to every figure drawn below.
plt.style.use('seaborn-white')

# Turn on matplotlib's interactive mode.
pylab.ion()

<h1> Read INSPIRE and HEPNAMES </h1>

Read INSPIREjson and HEPNAMESjson. In both cases the 'item' entry is read as an integer. Convert it to unicode, since this is the format used in the other parts of the table.

In [2]:
# Load the INSPIRE paper records. `df` is kept as the frame exactly as read;
# `data` is an independent copy that the following cells modify.
df=pd.read_json('INSPIREjson')
data=df.copy()

In [3]:
# Put the rows in index order, then store the paper IDs as unicode strings
# instead of the integers they were read as.
data.sort_index(inplace=True)
data['item'] = data['item'].astype(unicode)

In [5]:
# Load the HEPNAMES author records. `df_names` is kept as read;
# `data_names` is an independent copy that the following cell modifies.
df_names=pd.read_json('HEPNAMESjson')
data_names=df_names.copy()

In [6]:
# Put the rows in index order, then store the author IDs as unicode strings
# instead of the integers they were read as.
data_names.sort_index(inplace=True)
data_names['item'] = data_names['item'].astype(unicode)

Show one example row from each table.

In [45]:
# Preview the first paper record.
data.head(1)

Unnamed: 0,authors,cat,item,refs,title,year
0,[],0,1,"[1256272, 1256272, 51394]","Isoclinic N planes in Euclidean 2N space, Clif...",2001


In [46]:
# Preview the first author-name record.
data_names.head(1)

Unnamed: 0,author,item
0,"Zyskin, Maxim V.",981872


<h1> Build citation graph </h1>

Build the citation graph, to be used to calculate the PageRank

In [None]:
# Paper IDs (forced to unicode) and, in the same row order, each paper's reference list.
list_item = data['item'].tolist()
list_item_U = [unicode(paper_id) for paper_id in list_item]
list_cit = data['refs'].tolist()

# Adjacency mapping: paper ID -> list of the papers it cites.
cit_graph = {paper_id: refs for paper_id, refs in zip(list_item_U, list_cit)}

# Persist the graph so the PageRank can be computed outside this notebook.
with open('cit_graph.json', 'w') as f:
    json.dump(cit_graph, f)

<h1> Manipulations </h1>

In [9]:
# Restrict the analysis to papers whose category is hep-ph or hep-th.
hep_categories = ['hep-ph', 'hep-th']
is_hep_paper = data['cat'].isin(hep_categories)
data_HEP = data[is_hep_paper]

List of unique HEP authors

In [10]:
# Collect every author ID that appears on at least one HEP paper.
temp = data_HEP['authors'].tolist()
unique_authors = set()
for paper_authors in temp:
    unique_authors.update(paper_authors)

In [41]:
# Report how many distinct author IDs appear on the hep-ph/hep-th papers.
print 'Number of unique authors: ', len(unique_authors)

Number of unique authors:  25788


List of unique young HEP authors, i.e. authors with no HEP paper dated before 2006

In [12]:
# An author counts as "young" when none of their HEP papers is dated before this year.
young_cutoff_year = 2006
data_HEP_old = data_HEP[data_HEP['year'] < young_cutoff_year]

# Every author ID appearing on a pre-cutoff HEP paper.
temp = data_HEP_old['authors'].tolist()
unique_authors_old = set()
for paper_authors in temp:
    unique_authors_old.update(paper_authors)

# Young authors, listed in the iteration order of `unique_authors`; the set
# version is kept alongside for fast membership tests.
unique_authors_young = [author for author in unique_authors
                        if author not in unique_authors_old]
s_young = set(unique_authors_young)

In [42]:
# Report how many of the HEP authors have no paper dated before 2006.
print 'Number of unique young authors: ', len(unique_authors_young)

Number of unique young authors:  10983


Define 'data_HEP_young' to keep only papers in data_HEP written by at least one young author.

In [14]:
# One author list per HEP paper, in row order.
lista_HEP_auth = data_HEP['authors'].tolist()

# True for papers that have at least one young author.
mask = [not s_young.isdisjoint(paper_authors) for paper_authors in lista_HEP_auth]

# Work on a copy so that data_HEP itself is left untouched.
data_HEP_young = data_HEP.copy()[mask]

Calculate the number of citations received by every paper in 'data_HEP_young'.

In [15]:
# IDs of the papers in data_HEP_young, in row order (used below to look up
# per-paper citation counts and PageRanks).
lista_HEP_young_papers=data_HEP_young['item'].tolist()
# IDs of all papers in the full data set, in row order.
lista_papers=data['item'].tolist()

In [16]:
# Flatten the per-paper reference lists of the full data set into one long
# list: each entry is one citation pointing at the referenced paper.
reflist = []
for paper_refs in data['refs'].tolist():
    reflist.extend(paper_refs)

In [17]:
# Report the length of the flattened reference list (repeated references are counted each time).
print 'Total number of references: ' , len(reflist)

Total number of references:  23111089


In [18]:
from collections import Counter

In [19]:
# Number of times each paper ID occurs in the flattened reference list,
# i.e. how many citations it receives; unseen IDs count as 0.
cites_count=Counter(reflist)

Construct the number of citations per paper and the normalized (to the number of authors) citations per paper.

In [20]:
# Citations received by each paper in data_HEP_young, in row order.
cites = [cites_count[paper_id] for paper_id in lista_HEP_young_papers]
data_HEP_young = data_HEP_young.assign(cites=pd.Series(cites).values)

# Divide by number of authors per paper
num_authors_HEP_young_papers = data_HEP_young['authors'].apply(len).tolist()
cites_N = [n_cites * 1.0 / n_authors
           for n_cites, n_authors in zip(cites, num_authors_HEP_young_papers)]
data_HEP_young = data_HEP_young.assign(cites_N=pd.Series(cites_N).values)

Papers per author

In [21]:
# One entry per (paper, author) pair over the papers in data_HEP_young ...
authorship_young = []
for paper_authors in data_HEP_young['authors'].tolist():
    authorship_young.extend(paper_authors)

# ... so counting the entries gives the number of such papers per author.
authorship_count_young = Counter(authorship_young)

In [22]:
# Number of data_HEP_young papers for every young author, in list order.
num_papers = [authorship_count_young[author_id] for author_id in unique_authors_young]

# One row per young author: ID first, paper count second.
author_ids = pd.Series(unique_authors_young)
data_authors = author_ids.to_frame(name='authorID')
data_authors = data_authors.assign(n_papers=pd.Series(num_papers).values)

Append name to author ID

In [23]:
# Author ID -> full name as recorded in HEPNAMES.
map_names = pd.Series(data_names['author'].values, index=data_names['item']).to_dict()
data_authors['name'] = data_authors['authorID'].map(map_names)

# The last name is the text before the first comma of the name.
# IDs with no HEPNAMES entry map to NaN (a float, which has no .split);
# such missing values are passed through unchanged instead of raising.
lastnames = [name.split(',')[0] if hasattr(name, 'split') else name
             for name in data_authors['name'].tolist()]
data_authors = data_authors.assign(lastname=pd.Series(lastnames).values)

<h1> PageRank </h1>

Read the PageRank of every paper in 'data_HEP_young', calculated using the PageRankCentrality function in Mathematica. The author PageRank is the sum of the PageRanks of the author's papers, each divided by the number of authors of the paper and by the total PageRank of all papers in 'data_HEP_young'. It is also multiplied by an arbitrary constant C=10^5.

In [26]:
def _pagerank_lookup(table, score_column):
    """Return a dict mapping unicode paper ID -> value of `score_column`.

    `table` is a results frame with a 'PaperID' column and the given score column.
    """
    paper_ids = table['PaperID'].apply(unicode)
    return pd.Series(table[score_column].values, index=paper_ids).to_dict()


# Per-paper PageRank results computed outside the notebook, one file per variant.
Cg85 = pd.read_csv("result85.csv")
Cg50 = pd.read_csv("result50.csv")
pagerank_papers85 = _pagerank_lookup(Cg85, 'pagerank85')
pagerank_papers50 = _pagerank_lookup(Cg50, 'pagerank50')

  interactivity=interactivity, compiler=compiler, result=result)


In [31]:
def _author_share_pagerank(scores_by_paper, paper_ids, authors_per_paper):
    """Look up and normalise the PageRank of the given papers.

    Returns a tuple (raw, total, shared):
      raw    -- PageRank of each paper, in the order of `paper_ids`
      total  -- sum of `raw`
      shared -- raw * 1e5 / (number of authors * total), one value per paper
    """
    raw = [scores_by_paper[str(paper_id)] for paper_id in paper_ids]
    total = np.sum(raw)
    shared = [score * 1.0e5 / (n_authors * total)
              for score, n_authors in zip(raw, authors_per_paper)]
    return raw, total, shared


#Divide by number of authors per paper
num_authors_HEP_young_papers = data_HEP_young['authors'].apply(len).tolist()
#85
pagerank85, tot_pagerunk85_young, pagerank85_N = _author_share_pagerank(
    pagerank_papers85, lista_HEP_young_papers, num_authors_HEP_young_papers)
data_HEP_young = data_HEP_young.assign(pagerank85_N=pd.Series(pagerank85_N).values)
#50
pagerank50, tot_pagerunk50_young, pagerank50_N = _author_share_pagerank(
    pagerank_papers50, lista_HEP_young_papers, num_authors_HEP_young_papers)
data_HEP_young = data_HEP_young.assign(pagerank50_N=pd.Series(pagerank50_N).values)

To extract total citations and H-index: unroll authors and citations of papers

In [33]:
#To extract total citations and H-index: unroll authors and citations of papers
# Keep only the per-paper columns that are aggregated per author later on.
data_HY_cites = data_HEP_young[['authors', 'cites', 'cites_N', 'pagerank85_N', 'pagerank50_N', 'year']]
# Spread each author list over columns, then stack them into one long Series
# indexed by (author position, paper row label).
temp = data_HY_cites.authors.apply(pd.Series).unstack()
# Drop the author-position level so the Series is indexed by paper row label,
# and join it back: every paper row is repeated once per author slot.
data_HY_cites = data_HY_cites.join(pd.DataFrame(temp.reset_index(level=0, drop=True)))
data_HY_cites.rename(columns={0: 'author'}, inplace=True)
# Papers with fewer authors than the longest list leave empty slots; discard them.
data_HY_cites.dropna(subset=['author'], inplace=True)
# `axis` is passed by keyword: the positional form drop('authors', 1) is
# rejected by pandas >= 2.0, while the keyword form works on old and new versions.
data_HY_cites = data_HY_cites.drop('authors', axis=1)

<h1> h-Index </h1>

Calculate the h-Index of every young author A as the maximum N for which the following holds: A has N papers each of which is cited at least N times.

In [34]:
def hIndex(citations):
    """Return the h-index for a collection of per-paper citation counts.

    The h-index is the largest N such that N of the papers have at least N
    citations each. An empty collection gives 0.

    `citations` may be any iterable of numbers; it is not modified.
    """
    # sorted() builds a new list, so the caller's data keeps its order.
    ranked = sorted(citations, reverse=True)
    if not ranked:
        return 0
    # With counts in descending order, the paper at 1-based rank r supports an
    # h-index of min(r, its count); the h-index is the best of these.
    return max(min(rank, count) for rank, count in enumerate(ranked, 1))

In [35]:
#H-index
# Citation counts of each author's papers, gathered into one list per author.
cites_per_author = data_HY_cites.groupby('author')['cites'].apply(list)
h_by_author = cites_per_author.apply(hIndex)
# Re-order to match the rows of data_authors before attaching the column.
H_series = h_by_author[data_authors['authorID']]
data_authors = data_authors.assign(H_index=pd.Series(H_series.tolist()).values)

<h1> Put everything together </h1>

In [36]:
# All totals below are sums over an author's papers, looked up in the row
# order of data_authors.
author_ids = data_authors['authorID']
per_author = data_HY_cites.groupby('author')

#total  citations
cites_series = per_author['cites'].sum()[author_ids]
data_authors = data_authors.assign(cites=pd.Series(cites_series.tolist()).values)
#total  citations normalized
cites_N_series = per_author['cites_N'].sum()[author_ids]
data_authors = data_authors.assign(cites_N=pd.Series(cites_N_series.tolist()).values)
#pagerank85
pagerank85_series = per_author['pagerank85_N'].sum()[author_ids]
data_authors = data_authors.assign(pagerank85=pd.Series(pagerank85_series.tolist()).values)
#pagerank50
pagerank50_series = per_author['pagerank50_N'].sum()[author_ids]
data_authors = data_authors.assign(pagerank50=pd.Series(pagerank50_series.tolist()).values)

Example

In [44]:
# Preview the first five rows of the per-author results table.
data_authors.head(5)

Unnamed: 0,authorID,n_papers,name,lastname,H_index,cites,cites_N,pagerank85,pagerank50
0,988539,1,"Sinnis, Constantine",Sinnis,1,3,0.056604,0.027979,0.034396
1,999759,2,"Martinez, Domingo Louis",Martinez,1,4,4.0,3.223412,3.829613
2,1498218,2,"Caputo, Andrea",Caputo,1,4,1.083333,0.888997,1.081584
3,1498216,2,"Faroughy, Darius",Faroughy,2,27,8.833333,1.018512,1.170332
4,1498217,2,"Tezuka, Masaki",Tezuka,1,46,5.111111,0.815815,0.915048


<h1> Save results </h1>

Save the results to a csv file.

In [None]:
# Write the per-author table as UTF-8 CSV to the file 'results_authors'
# (the file name has no extension).
data_authors.to_csv('results_authors',encoding='utf-8')