In [3]:
import pandas as pd
import networkx as nx
import numpy as np
import nltk.metrics as nm
import editdistance


In [89]:
# Load NBER Graph from File
# The path is relative, so it is resolved against the kernel's working directory.

path = '../save/nber.graphml'
# G is the graph the later cells read; the author-merge cell further down
# modifies it in place.
G = nx.read_graphml(path)

In [5]:
# We can generate subgraphs this way
def getJELSubgraph(G, jelcode):
    """Build a new undirected graph from the edges of G tagged with jelcode.

    Parameters
    ----------
    G : graph whose edges may carry a 'jelcode' string attribute.
    jelcode : text searched for with `in`, i.e. a substring test against the
        edge's whole 'jelcode' string (so 'C' also selects an edge tagged 'C2').

    Returns
    -------
    nx.Graph containing the selected edges together with their attribute
    dicts. Nodes with no selected edge are not part of the result.
    """
    SG = nx.Graph()
    # .get(): an edge without a 'jelcode' attribute (for instance one created
    # by a bare G.add_edge(u, v)) is skipped instead of raising KeyError.
    sge = [(u, v, attrs) for u, v, attrs in G.edges(data=True)
           if jelcode in attrs.get('jelcode', '')]
    SG.add_edges_from(sge)
    return SG

In [6]:
# Returns the Mean all-pairs node connectivity of a JEL Code
def getConnEst(G, jelcode):
    """Mean pairwise node connectivity of the subgraph selected by jelcode.

    Parameters
    ----------
    G : graph passed through to getJELSubgraph.
    jelcode : JEL code text passed through to getJELSubgraph.

    Returns
    -------
    float : mean of the connectivity values over every (source, target) pair
        reported by nx.all_pairs_node_connectivity, or NaN when the subgraph
        yields no pairs at all.
    """
    pairConn = nx.all_pairs_node_connectivity(getJELSubgraph(G, jelcode))
    # The result is a dict of dicts {source: {target: k}}. Flatten every row;
    # the previous `.values()[0].values()` averaged only the first source's
    # row and relied on Python 2 list-returning dict.values().
    values = [k for targets in pairConn.values() for k in targets.values()]
    if not values:
        # No pairs to average: report NaN explicitly instead of failing.
        return np.nan
    return np.mean(values)

In [7]:
# Get List of JEL codes
e = G.edges(data=True)
# Each edge's 'jelcode' attribute is split on ','. Codes are collected per
# edge so that:
#   * an edge without the attribute contributes nothing (no KeyError),
#   * surrounding whitespace does not create duplicate entries,
#   * empty fragments (including the '' an empty graph used to produce) are dropped.
jels = set(
    code.strip()
    for x in e
    for code in x[2].get('jelcode', '').split(',')
    if code.strip()
)

# Get List of Nodes
n = G.nodes()
# list(): build the frame from a plain list whether G.nodes() hands back a
# list or a view object.
ndf = pd.DataFrame(list(n), columns=['author'])

In [23]:
## Merge in Departments
# Read the RePEc paper-info CSV and discard fully identical rows in one
# chained expression.
department = (
    pd.read_csv('../../../work_erica/EconIdeaNetwork/save/REPEC_Paper_Info.csv', delimiter=',')
    .drop_duplicates()
)


In [24]:
def _authorKey(rawName):
    """Build the matching key for one RePEc author string.

    Commas are removed, the text is split on single spaces, and the second
    token is placed directly in front of the first with no separator.
    """
    # presumably Author is written "Last, First", making the key "FirstLast"
    # -- TODO confirm against the CSV.
    tokens = rawName.replace(',','').split(' ')
    return tokens[1] + tokens[0]


# The key is what later gets compared against the graph's node names.
department['authorClean'] = department.Author.apply(_authorKey)

In [107]:
# HACK: Drop duplicates by taking first institution
# Collapse to one row per authorClean. Note that GroupBy.first() picks the
# first non-null value column by column, so a resulting row can combine
# fields from different source rows of the same author.
firstPerAuthor = department.groupby('authorClean').first()
# Turn the group key back into an ordinary column.
department = firstPerAuthor.reset_index()

In [27]:
# Manually check for matches using Levenshtein distance (at most 1 edit by default)

# NOTE(review): authorDF is not referenced by any cell visible here -- confirm
# before removing.
authorDF = []

def getAuthorMatches(author, maxDistance=1):
    """Find graph node names within a small edit distance of a RePEc author key.

    Parameters
    ----------
    author : cleaned RePEc author string to look up.
    maxDistance : largest edit distance still accepted as a match. The default
        of 1 reproduces the previously hard-coded `distance < 2` test.

    Returns
    -------
    DataFrame with two columns: 'author' (the entries of ndf.author that are
    close enough, keeping their ndf index labels) and 'repecAuthor' (the
    `author` argument repeated on every row). Empty when nothing matches.

    Reads the module-level frame `ndf`.
    """
    distances = ndf.author.apply(lambda x: editdistance.eval(x, author))

    # .copy(): work on an independent frame so that adding a column below
    # never writes into a slice of ndf.
    matches = ndf.loc[distances <= maxDistance, ['author']].copy()
    matches['repecAuthor'] = author

    return matches

In [108]:
%%time
# Get list of Author Matches
# One match frame per RePEc author key, stacked into a single frame.
deptMatches = pd.concat(
    [getAuthorMatches(repecName) for repecName in department.authorClean]
)

CPU times: user 1min 27s, sys: 468 ms, total: 1min 28s
Wall time: 1min 29s


In [109]:
# Number of NBER node names matched to each RePEc author. More than one match
# is treated as one person whose name is spelled in several ways in NBER.
s = deptMatches.groupby('repecAuthor').size()
nberAuthorErrors = s[s > 1].index

# Fix Author Errors in NBER by merging
# Nodes with wrong names
#
# The first matched spelling is kept as the surviving node; every other
# spelling has its edges moved onto that node and is then removed. G is
# modified in place.
# NOTE(review): fuzzy matches can also pair two genuinely different people
# whose names differ by one character -- confirm the match list before
# trusting the merged graph.
# NOTE(review): assumes G is an undirected simple graph; on a directed graph
# G.edges(node) would only yield out-edges -- confirm against the GraphML file.

for currAuthor in nberAuthorErrors:

    prevAuthorNames = deptMatches[deptMatches.repecAuthor == currAuthor].author.values

    # Keep only spellings that are still nodes of G (a name may already have
    # been merged away by an earlier iteration or an earlier run of this
    # cell), preserving order and dropping repeats.
    presentNames = []
    for name in prevAuthorNames:
        if name in G and name not in presentNames:
            presentNames.append(name)

    # Nothing to merge unless at least two spellings are present.
    if len(presentNames) < 2:
        continue

    # The surviving node is never removed, so its own attributes and edges
    # stay intact even if no edge ends up being moved onto it.
    keptName = presentNames[0]
    aliasNames = set(presentNames[1:])

    for aliasName in presentNames[1:]:

        # list(...): take a snapshot, the graph is changed inside the loop.
        for _, neighbor, edgeData in list(G.edges(aliasName, data=True)):

            # An edge between two spellings of the same person would turn
            # into a self-loop or re-create a node that is being removed.
            if neighbor == keptName or neighbor in aliasNames:
                continue

            # Move the edge together with its attributes (e.g. 'jelcode').
            # An edge the surviving node already has is left as it is.
            if not G.has_edge(keptName, neighbor):
                G.add_edges_from([(keptName, neighbor, dict(edgeData))])

        G.remove_node(aliasName)

In [110]:
# Merge Department Affiliations onto Author Matches
# Left join: every match row is kept and receives the department columns of
# the row whose authorClean equals its repecAuthor.
authDept = deptMatches.merge(
    department,
    how='left',
    left_on='repecAuthor',
    right_on='authorClean',
)

In [116]:
# Get list of NBER authors corresponding to Institution

# Maps institution name -> subgraph of G induced by that institution's
# matched authors.
instGraphs = {}
# dropna(): a missing institution is not a string, so building a file name
# from it below would fail; such rows are left out.
insts = department.Institution.dropna().unique()

for currInst in insts:
    instAuthors = list(authDept[authDept.Institution == currInst].author)
    instGraphs[currInst] = G.subgraph(instAuthors)

    # File name: commas removed, spaces turned into underscores. '/' is also
    # replaced because it would otherwise be read as a directory separator.
    fileStem = currInst.replace(',','').replace(' ','_').replace('/','_')
    nx.write_graphml(instGraphs[currInst], '../save/' + fileStem + '.graphml')

In [188]:
# Summarize Institutional Graphs

instDf = pd.DataFrame()
instDf['Institution'] = insts
instDf['Graph'] = instDf.Institution.apply(lambda x: instGraphs[x])
# Avg Degree, Avg Centrality, Number of Components, Number of Nodes,
# Longest Shortest Path, Total # of Papers, Avg # of Papers, % of Papers within Dept
# Rank

# Drop Empty Departments
# This happens before any statistic is computed so that no mean or max is
# taken over an empty graph. .copy() gives an independent frame, so the
# column assignments below do not write into a slice of the unfiltered frame.
nodeCounts = instDf.Graph.apply(lambda x: x.number_of_nodes())
instDf = instDf[nodeCounts != 0].copy()

# dict(...) accepts both a plain {node: degree} dict and a degree view, and
# list(...) makes the values usable by np.mean on Python 2 and 3.
instDf['AvgDegree'] = [np.mean(list(dict(nx.degree(g)).values()))
                       for g in instDf.Graph]
# Assigning the Series aligns on the index, keeping only the surviving rows.
instDf['nNodes'] = nodeCounts

# Connected-component sizes are computed once per graph and reused for the
# three component statistics.
ccSizes = [[len(c) for c in nx.connected_components(g)] for g in instDf.Graph]
instDf['nCC'] = [len(sizes) for sizes in ccSizes]
instDf['AvgSizeCC'] = [np.mean(sizes) for sizes in ccSizes]
instDf['MaxSizeCC'] = [np.max(sizes) for sizes in ccSizes]



In [192]:
# Load the NBER paper table, parsed with '|' as the field delimiter.
# NOTE(review): the output recorded below lists 'Unnamed: 0.1' and
# 'Unnamed: 0' columns, which looks like a saved index being read back as
# data -- confirm whether index_col should be set.
NBERPapers = pd.read_csv('../save/NBER_Paper_Info.csv', delimiter='|')

Unnamed: 0.1,Unnamed: 0,title,authors,jel
0,0,""". . . and six hundred thousand men were dead.""",Herschel I. Grossman,D7
1,1,"""Aggregation Bias"" DOES Explain the PPP Puzzle","Jean Imbs, Haroon Mumtaz, Morten O. Ravn, Hlne...","C2, F1, F3"
2,2,"""Basket"" Cases: International Joint Ventures A...","Mihir A. Desai, James R. Hines, Jr.","F2, H8"
3,3,"""Beauty Is the Promise of Happiness""?","Daniel S. Hamermesh, Jason Abrevaya","C2, I3, J1"
4,4,"""Conditional scholarships"" for HIV/AIDS Health...","Till Brnighausen, David E. Bloom","I1, I2, J2"
5,5,"""Convergence in the Age of Mass Migration""","Alan M. Taylor, Jeffrey G. Williamson","F2, N1"
6,6,"""Currency Manipulation"" and World Trade","Robert W. Staiger, Alan O. Sykes","F0, F1, F3, K3"
7,7,"""Do the Right Thing:"" The Effects of Moral Sua...","Ernesto Dal B, Pedro Dal B","C9, H4"
8,8,"""Excess Volatility"" and the German Stock Marke...","J. Bradford De Long, Marco Becht",N2
9,9,"""Fifty-four Forty or Fight!""",Herschel I. Grossman,D7
