In [None]:
# -------------------------
# DATA SCHEMA 
# -------------------------
#
# .........................
# Node Types & Properties
# .........................
# - Paper (title*, paperId, referenceCount, citationCount, publicationDate)
# - Author (name*, id, url, paperCount, citationCount, hIndex)
# - PubVenue (name*, e.g. str: Journal of Vision)
# - PubYear (year*, e.g. dateYr: 2009)
# - Field (name*, e.g. str: Computer Science)
# - Institution (name*, e.g. str: The University of Melbourne)
#
#
# .........................
# Relationships related to Papers
# .........................
# 'venue' ... title PUBLISHED_IN venue
# 'year' ... title PUB_YEAR year
# 'fieldsOfStudy' ... title IS_ABOUT field-of-study  (note: field needs to be expanded)
# 'authors' ... author.name CO_AUTHORED title  (note: field needs to be expanded)


---


In [2]:
import json
from etl_papers_authors import *

# Load JSON data
input_file = 'storage/MEGA_paper_details.json'

with open(input_file, 'r') as file:
    data = json.load(file)

print(f"Paper table has {len(data)} papers")

Paper table has 9212 papers


In [7]:
data[0]

{'paperId': '01e77cd46ab75bab8f4b176455f0daa592e5f979',
 'title': 'Please Scroll down for Article Visual Cognition Modelling Search for People in 900 Scenes: a Combined Source Model of Eye Guidance',
 'abstract': 'This article may be used for research, teaching and private study purposes. Any substantial or systematic reproduction, redistribution , reselling , loan or sub-licensing, systematic supply or distribution in any form to anyone is expressly forbidden. The publisher does not give any warranty express or implied or make any representation that the contents will be complete or accurate or up to date. The accuracy of any instructions, formulae and drug doses should be independently verified with primary sources. The publisher shall not be liable for any loss, actions, claims, proceedings, demand or costs or damages whatsoever or howsoever caused arising directly or indirectly in connection with or arising out of the use of this material.',
 'venue': nan,
 'year': nan,
 'reference

In [60]:
# Transform data

papers_df = transform_papers(data)
print(papers_df.shape)

authors_df, co_authorships_df = transform_authors(data)
authors_df = authors_df.rename(columns={"name": "author"})
print(authors_df.shape)

fields_df, paper_fields_df = transform_fields_of_study(data)
fields_df = fields_df.rename(columns={"name": "field"})
print(fields_df.shape)

venues_df, paper_venues_df = transform_venues(data)
venues_df = venues_df.rename(columns={"name": "venue"})
print(venues_df.shape)

years_df, paper_years_df = transform_years(data)
print(years_df.shape)


(8040, 8)
(23710, 2)
(19, 1)
(1885, 1)
(20, 1)


In [61]:
# Save to intermediate files
output_dir = 'transformed/'

papers_df.to_csv(f"{output_dir}papers.csv", index=False)

authors_df.to_csv(f"{output_dir}authors.csv", index=False)
co_authorships_df.to_csv(f"{output_dir}co_authorships.csv", index=False)

fields_df.to_csv(f"{output_dir}fields_of_study.csv", index=False)
paper_fields_df.to_csv(f"{output_dir}paper_fields.csv", index=False)

venues_df.to_csv(f"{output_dir}venues.csv", index=False)
paper_venues_df.to_csv(f"{output_dir}paper_venues.csv", index=False)

years_df.to_csv(f"{output_dir}years.csv", index=False)
paper_years_df.to_csv(f"{output_dir}paper_years.csv", index=False)

print("Transformation complete. Data saved as CSV files.")

Transformation complete. Data saved as CSV files.


## ISSUE ENCOUNTERED:

Deciding which authors to prioritize for additional information in a large dataset is a nuanced task. Here are some approaches you can consider to prioritize author nodes in a scalable and unbiased way, while keeping relevance at the forefront:

Define "Relevance" Dynamically -- Instead of hardcoding assumptions about what makes an author "important," let your dataset suggest criteria by:

* **Number of Papers**: Authors with a high paperCount (number of papers in your dataset) are likely more central to your dataset.  
* **Citation Count**: Authors with papers that have high aggregate citationCount could be impactful in their field.  
* **Co-Authorship Networks**: Authors with many connections (e.g., high degree in a co-authorship graph) may indicate collaboration hubs.  
* **Fields of Study**: Focus on authors whose papers dominate in specific fields of interest, like Computer Science or Medicine.  


### 2. Aggregate Author-Level Metrics
Start by aggregating key metrics to get an overview of each author’s contributions:


In [5]:
# 2. Aggregate Author-Level Metrics

authors_metrics = co_authorships_df.merge(papers_df, on="paperId")
author_stats = authors_metrics.groupby("authorId").agg({
    # "name": "first",
    "paperId": "count",  # Paper count
    "citationCount": "sum",  # Total citations
    "referenceCount": "sum"  # Total references
}).rename(columns={"paperId": "paperCount"}).reset_index()

# Sort by a composite metric (e.g., citationCount or paperCount)
author_stats = author_stats.sort_values(by="citationCount", ascending=False)

# Preview the top authors
author_stats.head(10)


Unnamed: 0,authorId,paperCount,citationCount,referenceCount
4413,1838674,2,41915.0,347.0
14139,3142556,1,41763.0,265.0
13102,2657155,1,41763.0,265.0
8641,2157222093,1,41763.0,265.0
2006,144828948,1,41763.0,265.0
15795,39863668,1,41763.0,265.0
15824,39978391,1,41763.0,265.0
12946,2574060,1,41763.0,265.0
4028,1761978,1,41763.0,265.0
1881,144638781,3,11503.0,745.0


### 3. Develop Prioritization Criteria
Use your aggregated metrics to design selection criteria:

**High Citation Impact:**  Focus on the top N authors with the highest total citationCount.

**Publication Volume:**  Select authors with the highest paperCount.

**Field-Specific Focus:**  Prioritize authors contributing to a particular fieldsOfStudy.


### 4. Explore Centrality in the Co-Authorship Network
Use graph analysis to identify influential authors in the collaboration network. Metrics like degree centrality, betweenness centrality, or PageRank can help:

In [34]:
import networkx as nx

# Build a co-authorship graph
G = nx.Graph()
for _, row in co_authorships_df.iterrows():
    paper_id = row["paperId"]
    author_id = row["authorId"]
    if (paper_id is not None) & (author_id is not None):
        G.add_node(author_id, type="author")
        G.add_edge(author_id, paper_id)

# Compute centrality metrics
centrality = nx.degree_centrality(G)
sorted_centrality = sorted(centrality.items(), key=lambda x: x[1], reverse=True)

# Filter top authors by centrality
# top_authors = [node for node, centrality_score in sorted_centrality if node is not None: if G.nodes[node]["type"] == "author")]

top_authors=[]
for node, centrality_score in sorted_centrality: 
    if len(G.nodes[node]) > 0: 
        if G.nodes[node]["type"] == "author":
            top_authors.append(node)

top_authors


['3177797',
 '1717172',
 '1696991',
 '153386875',
 '144897958',
 '7326223',
 '1895768',
 '2846354',
 '3064665',
 '3376641',
 '3194727',
 '3369864',
 '4331863',
 '40089171',
 '1405907659',
 '2839864',
 '3124688',
 '1743773',
 '3388419',
 '145654220',
 '1789744',
 '2870903',
 '143852685',
 '1731199',
 '2506727',
 '1884159',
 '145953515',
 '31647771',
 '1527103472',
 '2108305595',
 '145165599',
 '1917767',
 '3258874',
 '37066739',
 '1904707',
 '143865718',
 '143868587',
 '144078005',
 '2356016',
 '2997408',
 '2848854',
 '34040188',
 '38995848',
 '47849651',
 '1980700',
 '3140440',
 '2200541',
 '1744045',
 '1727853',
 '2693875',
 '2246414',
 '48981841',
 '34735743',
 '2444586',
 '145937014',
 '4769631',
 '3326347',
 '1680975',
 '2418491',
 '1944405',
 '144826390',
 '1776651',
 '143805211',
 '1729760',
 '1865091',
 '143694777',
 '144248893',
 '1867276',
 '34646933',
 '5181971',
 '1924112',
 '145470864',
 '31441082',
 '2237786979',
 '48354614',
 '1681157',
 '46760255',
 '144397175',
 '262407

In [None]:
# Guiding Principles
# Scalability: Avoid pulling data for authors unlikely to affect your analysis.
# Flexibility: Design criteria that can adapt as your understanding of relevance evolves.
# Inclusivity: Avoid overly narrow assumptions (e.g., only focusing on authors with high citation counts might miss emerging or interdisciplinary contributors).
