## Import NBER graph and add department data

In [24]:
import pandas as pd
import networkx as nx
import numpy as np
import nltk.metrics as nm
import editdistance


In [25]:
# Load the NBER graph from a GraphML file.
# The graph contains coauthorship edges annotated with JEL codes
# (later cells read them from the 'jelcode' edge attribute).
path = '../save/nber.graphml'
G = nx.read_graphml(path)

In [26]:
# Department affiliations scraped from REPEC; exact duplicate rows are dropped on load.
repecCsvPath = '../../../work_erica/EconIdeaNetwork/save/REPEC_Paper_Info.csv'
department = pd.read_csv(repecCsvPath, delimiter=',').drop_duplicates()

In [27]:
def _cleanAuthorName(rawName):
    """Turn a REPEC 'Last, First Middle' name into the 'FirstLast' matching key.

    The key is the second whitespace-separated token followed by the first one,
    after commas are removed (e.g. 'Cameron, A. Colin' -> 'A.Cameron').

    Returns None for missing names or names with no tokens, and the single
    token itself when there is only one, instead of raising.
    """
    if pd.isnull(rawName):
        return None
    # split() without an argument collapses repeated whitespace, so a double
    # space after the comma cannot produce an empty "first name" token.
    tokens = rawName.replace(',', '').split()
    if not tokens:
        return None
    if len(tokens) == 1:
        return tokens[0]
    return tokens[1] + tokens[0]

department['authorClean'] = department.Author.apply(_cleanAuthorName)

In [28]:
# HACK: several rows can share one cleaned author name; collapse them to a
# single row per name by taking the first entry of each group.
firstPerAuthor = department.groupby('authorClean').first()
department = firstPerAuthor.reset_index()

In [13]:
# Manually check for matches using Levenshtein (edit) distance.
# By default a name matches when it differs by at most one edit (distance < 2).

authorDF = []  # not used by the cells shown here; kept in case another cell reads it

def getAuthorMatches(author, maxDistance=1):
    """Return the NBER author names within ``maxDistance`` edits of ``author``.

    Parameters
    ----------
    author : str
        Cleaned REPEC author name to look up.
    maxDistance : int, optional
        Largest edit distance still counted as a match. The default of 1
        reproduces the original ``distance < 2`` filter.

    Returns
    -------
    pandas.DataFrame
        The matching entries of ``ndf.author`` (their index is preserved) in an
        ``author`` column, plus a ``repecAuthor`` column holding ``author``.

    Notes
    -----
    Reads the notebook-level ``ndf`` frame, so the cell that builds ``ndf``
    must have been run before this function is called.
    """
    distances = ndf.author.apply(lambda nberName: editdistance.eval(nberName, author))
    matches = pd.DataFrame(ndf.author[distances <= maxDistance])
    matches['repecAuthor'] = author

    return matches

In [14]:
# Get list of Author Matches

%%time
deptMatches = department.authorClean.apply(getAuthorMatches)
deptMatches = pd.concat(list(deptMatches))

CPU times: user 1min 36s, sys: 1.25 s, total: 1min 37s
Wall time: 1min 45s


In [109]:
# Fix author errors in NBER by merging nodes that are spelling variants of the
# same REPEC author. Assumes G is a simple undirected graph — TODO confirm.

def _foldEdgeAttrs(keptAttrs, extraAttrs):
    """Merge extraAttrs into keptAttrs in place, unioning comma-separated 'jelcode' strings."""
    for key, value in extraAttrs.items():
        if key == 'jelcode' and key in keptAttrs:
            codes = []
            for code in keptAttrs[key].split(',') + value.split(','):
                if code and code not in codes:
                    codes.append(code)
            keptAttrs[key] = ','.join(codes)
        else:
            # Any other attribute: keep the value already on the surviving edge.
            keptAttrs.setdefault(key, value)

matchCounts = deptMatches.groupby('repecAuthor').size()
nberAuthorErrors = matchCounts[matchCounts > 1].index

for currAuthor in nberAuthorErrors:

    prevAuthorNames = deptMatches[deptMatches.repecAuthor == currAuthor].author.values

    # A name can match several REPEC authors and may already have been merged
    # away in an earlier iteration, so only work with nodes still present in G.
    presentNames = []
    for prevName in prevAuthorNames:
        if prevName in G and prevName not in presentNames:
            presentNames.append(prevName)
    if len(presentNames) < 2:
        continue

    # Keep the first variant (as before) and fold the others into it. The kept
    # node is never removed, so its own edges and node attributes survive.
    keptName = presentNames[0]

    for prevName in presentNames[1:]:

        # Copy the edge list first: the graph is modified inside the loop.
        for edge in list(G.edges(prevName, data=True)):
            target, attrs = edge[1], edge[2]

            if target in presentNames:
                # Edge between two variants of the same author (or a
                # self-loop): after merging it would be a self-loop, so drop it.
                continue

            if not G.has_edge(keptName, target):
                G.add_edge(keptName, target)
            # Carry the edge data (notably 'jelcode') over to the kept node;
            # later cells index e[2]['jelcode'] on every edge.
            _foldEdgeAttrs(G[keptName][target], attrs)

        G.remove_node(prevName)

In [20]:
# Attach the department record of each matched REPEC author; the left join
# keeps every row of deptMatches.
authDept = deptMatches.merge(department, how='left',
                             left_on='repecAuthor', right_on='authorClean')

In [21]:
# Preview only the first rows instead of rendering the whole frame
authDept.head(10)

Unnamed: 0,author,repecAuthor,authorClean,Institution,Author
0,ACameron,A.Cameron,A.Cameron,"University of California, Davis CA","Cameron, A. Colin"
1,AGallant,A.Gallant,A.Gallant,"Pennsylvania State University, State College PA","Gallant, A. Ronald"
2,AakashMohpal,AakashMohpal,AakashMohpal,"University of Michigan, Ann Arbor MI","Mohpal, Aakash"
3,AaronEdlin,AaronEdlin,AaronEdlin,"University of California, Berkeley CA","Edlin, Aaron"
4,AaronTornell,AaronTornell,AaronTornell,"University of California, Los Angeles CA","Tornell, Aaron"
5,AbhishekGupta,AbhishekGupta,AbhishekGupta,"Johns Hopkins University, Baltimore MD","Gupta, Abhishek"
6,AdamGuren,AdamGuren,AdamGuren,"Boston University, Boston MA","Guren, Adam"
7,AdamOsman,AdamOsman,AdamOsman,"Yale University, New Haven CT","Osman, Adam"
8,AhmedMobarak,AhmedMobarak,AhmedMobarak,"Yale University, New Haven CT","Mobarak, Ahmed Mushfiq"
9,AimeeChin,AimeeChin,AimeeChin,"University of Houston, Houston TX","Chin, Aimee"


In [None]:
# Add department affiliations to G as an 'institution' node attribute
for author, institution in zip(authDept['author'], authDept['Institution']):
    # Skip matches without an affiliation and names that are not nodes of G;
    # add_node on an existing node only updates that node's attributes.
    if pd.notnull(institution) and author in G:
        G.add_node(author, institution=institution)

In [None]:
# Basic full-graph summary stats:
# degree histogram of the whole graph G, as returned by nx.degree_histogram
degree_dist = nx.degree_histogram(G)


## Examine JEL subgraphs

In [3]:
def getJELSubgraph(G, jelcode):
    """Build a new graph from the edges of G whose 'jelcode' attribute contains jelcode.

    The test is Python's ``in`` on the edge's 'jelcode' value, and every
    selected edge is copied together with its attribute dict. Only nodes that
    are endpoints of a selected edge appear in the result.
    """
    matchingEdges = []
    for edge in G.edges(data=True):
        # edge is (u, v, attribute_dict)
        if jelcode in edge[2]['jelcode']:
            matchingEdges.append(edge)

    subgraph = nx.Graph()
    subgraph.add_edges_from(matchingEdges)
    return subgraph

In [4]:
def getConnEst(G, jelcode):
    """Return the mean node connectivity over all node pairs of a JEL subgraph.

    Parameters
    ----------
    G : graph
        Full coauthorship graph, passed on to getJELSubgraph.
    jelcode : str
        JEL code selecting the subgraph.

    Returns
    -------
    float
        Mean of every pairwise connectivity value reported by
        nx.all_pairs_node_connectivity, or NaN when the subgraph has no node
        pairs.

    Notes
    -----
    The previous expression, ``.values()[0].values()``, averaged only the row
    of the first node and does not run on Python 3, where dict views cannot be
    indexed.
    """
    pairConnectivity = nx.all_pairs_node_connectivity(getJELSubgraph(G, jelcode))
    # Result is a dict of dicts (source -> target -> connectivity); flatten it.
    connectivities = [k for targets in pairConnectivity.values()
                      for k in targets.values()]
    if not connectivities:
        return np.nan
    return np.mean(connectivities)

In [9]:
# Collect the distinct JEL codes: every edge carries a comma-separated
# 'jelcode' string, so join them all and split the result once.
e = G.edges(data=True)
edgeJelStrings = [attrs['jelcode'] for u, v, attrs in e]
jels = set(','.join(edgeJelStrings).split(','))

# One-column frame listing the graph's nodes (author names)
n = G.nodes()
ndf = pd.DataFrame(data=n, columns=['author'])

## Examine institutional subgraphs 

In [116]:
# Build, and save as GraphML, the subgraph of G induced by the matched NBER
# authors of every institution

instGraphs = {}
# A missing institution (NaN) cannot name a subgraph or a file, so drop it.
insts = department.Institution.dropna().unique()

for currInst in insts:
    instAuthors = list(authDept[authDept.Institution == currInst].author)
    instGraphs[currInst] = G.subgraph(instAuthors)

    # File name: commas removed, spaces -> underscores; '/' is replaced as well
    # because it would otherwise be read as a directory separator.
    fileStem = currInst.replace(',','').replace(' ','_').replace('/', '-')
    nx.write_graphml(instGraphs[currInst], '../save/' + fileStem + '.graphml')

In [188]:
# Summarize Institutional Graphs

def _avgDegree(graph):
    """Mean node degree of graph, or NaN for a graph without nodes."""
    # dict(...) accepts both a plain dict and an iterable of (node, degree) pairs.
    degrees = list(dict(graph.degree()).values())
    return np.mean(degrees) if degrees else np.nan

def _componentSizes(graph):
    """Sizes (node counts) of the connected components of graph."""
    return [len(component) for component in nx.connected_components(graph)]

instDf = pd.DataFrame()
instDf['Institution'] = insts
instDf['Graph'] = instDf.Institution.apply(lambda x: instGraphs[x])
# TODO: Avg Degree, Avg Centrality, Number of Components, Number of Nodes,
# Longest Shortest Path, Total # of Papers, Avg # of Papers, % of Papers within Dept
# Rank

instDf['AvgDegree'] = instDf.Graph.apply(_avgDegree)
instDf['nNodes'] = instDf.Graph.apply(lambda x: len(x.nodes()))

# Drop Empty Departments. The explicit copy makes instDf an independent frame,
# so the column assignments below do not write into a slice of the old one.
instDf = instDf[instDf.nNodes != 0].copy()

# Compute the connected components once per graph and derive all three stats.
componentSizes = instDf.Graph.apply(_componentSizes)
instDf['nCC'] = componentSizes.apply(len)
instDf['AvgSizeCC'] = componentSizes.apply(np.mean)
instDf['MaxSizeCC'] = componentSizes.apply(np.max)


In [192]:
# Load NBER paper metadata from a pipe-delimited CSV
# (the displayed columns include title, authors and jel).
NBERPapers = pd.read_csv('../save/NBER_Paper_Info.csv', delimiter='|')

Unnamed: 0.1,Unnamed: 0,title,authors,jel
0,0,""". . . and six hundred thousand men were dead.""",Herschel I. Grossman,D7
1,1,"""Aggregation Bias"" DOES Explain the PPP Puzzle","Jean Imbs, Haroon Mumtaz, Morten O. Ravn, Hlne...","C2, F1, F3"
2,2,"""Basket"" Cases: International Joint Ventures A...","Mihir A. Desai, James R. Hines, Jr.","F2, H8"
3,3,"""Beauty Is the Promise of Happiness""?","Daniel S. Hamermesh, Jason Abrevaya","C2, I3, J1"
4,4,"""Conditional scholarships"" for HIV/AIDS Health...","Till Brnighausen, David E. Bloom","I1, I2, J2"
5,5,"""Convergence in the Age of Mass Migration""","Alan M. Taylor, Jeffrey G. Williamson","F2, N1"
6,6,"""Currency Manipulation"" and World Trade","Robert W. Staiger, Alan O. Sykes","F0, F1, F3, K3"
7,7,"""Do the Right Thing:"" The Effects of Moral Sua...","Ernesto Dal B, Pedro Dal B","C9, H4"
8,8,"""Excess Volatility"" and the German Stock Marke...","J. Bradford De Long, Marco Becht",N2
9,9,"""Fifty-four Forty or Fight!""",Herschel I. Grossman,D7


## Netflix feature matrix

In [35]:
# Lookup table: JEL code -> column index
jelLookup = {jel: i for i, jel in enumerate(jels)}

# Authors x JELs count matrix. Each edge incident to an author adds one to
# the columns of the JEL codes listed on that edge.
authors = nx.nodes(G)
authorCodes = np.zeros((len(authors), len(jels)))
for a, author in enumerate(authors):
    for p in G.edges(author, data=True):
        # p is (author, coauthor, attribute_dict)
        paperJels = p[2]['jelcode'].split(',')
        jelInds = [jelLookup[jel] for jel in paperJels]
        authorCodes[a, jelInds] += 1

In [36]:
# Pageranks: one score per node of G, keyed by node
ranks = nx.pagerank(G)

# add to graph as the 'rank' node attribute
nx.set_node_attributes(G, 'rank', ranks)

# in case we want to look at the top authors: (author, rank) pairs sorted by
# rank in ascending order, so the highest-ranked authors are at the end
sortedRanks = sorted(ranks.items(), key=lambda x: x[1])

In [37]:
# Collect the PageRank scores, in node order, into an (n_authors, 1) column vector
authors = G.nodes()
nodeRanks = np.array([ranks.get(author) for author in authors], dtype=float)
nodeRanks = nodeRanks.reshape((len(nodeRanks), 1))

array([[  7.71754785e-05],
       [  5.07534963e-05],
       [  3.67663721e-04],
       ..., 
       [  4.27016639e-05],
       [  3.48634514e-05],
       [  3.64916202e-05]])

In [39]:
# Feature frame: the PageRank score followed by one count column per JEL code,
# plus the author name of each row
data = np.hstack((nodeRanks, authorCodes))
df = pd.DataFrame(data, columns=['rank'] + list(jels))
df['author'] = authors

In [40]:
# Preview only the first rows instead of rendering the whole frame
df.head(10)

Unnamed: 0,rank,O5,O4,O3,O2,O1,O0,G3,G2,G1,...,D9,D6,D7,D4,D5,D2,D3,D0,D1,author
0,0.000077,0,0,0,0,0,0,2,1,2,...,0,0,0,0,0,0,0,2,0,MarcusOpp
1,0.000051,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,DanielGreenwald
2,0.000368,0,0,0,0,0,0,1,4,1,...,0,0,0,0,0,0,0,0,0,EugeneWhite
3,0.000086,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,SusanCollins
4,0.000092,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,GiuliaSestieri
5,0.000049,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,LisaKlein
6,0.000100,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,MarcusHagedorn
7,0.000056,0,0,0,0,0,0,0,2,2,...,0,0,0,0,0,0,0,0,0,HongpingTan
8,0.000130,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,MichaelElsby
9,0.000056,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,EdouardSchaal
