## ADM Assignment 5 
### Network Scholar: Graph-Based Analysis and Visualization of Academic Collaborations

### Libraries 


In [46]:
import pandas as pd
import numpy as np
import ijson
import os
import time
from decimal import Decimal
import networkx as nx
import ast

### A script to convert the data from JSON to a PD dataframe 
##### Reference: https://www.kaggle.com/code/devintheai/citation-network-eda

In [2]:
papers = []

start = time.process_time()

with open('citation.json', "rb") as f:
    for i, element in enumerate(ijson.items(f, "item")):
        paper = {}
        paper['id'] = element['id']
        paper['title'] = element['title']

        author = element.get('authors')
        if author:
            Author = element['authors']
            author_name = []
            author_id = []

            for j in Author:
                if 'name' in j and 'id' in j :
                    author_name.append(str(j['name']))  
                    author_id.append(str(j['id']))
                else:
                    author_name.append(str(np.nan))  
                    author_id.append(str(np.nan))

            paper['author_name'] = ';'.join(author_name)
            paper['author_id'] = ';'.join(author_id)
        
        year = element.get('year')
        if year:
            paper['year']= year
        else:
            paper['year']= np.nan
        
        n_citation= element.get('n_citation')
        if n_citation:
             paper['n_citation']= element['n_citation']
        else:
            paper['n_citation']= 0
        
        doc_type= element.get('doc_type') 
        if doc_type:
            paper['doc_type']= element['doc_type']
        else:
            paper['doc_type']= np.nan   

        references= element.get('references')
        if references:
            paper['reference_count']= len(references)
            paper['references'] = [int(r) for r in references]
        else:
            paper['reference_count']= np.nan 
            paper['references']= np.nan
        
        doi = element.get('doi')
        if doi:
            paper['doi']= f"https://doi.org/{element['doi']}"
        else:
            paper['doi']= np.nan               
        
        
        papers.append(paper)
        
        if i%48000 ==0:
            print(f"{i}:{round((time.process_time() - start),2)}s ",end="")


0:0.0s 48000:1.59s 96000:3.16s 144000:4.94s 192000:6.62s 240000:8.39s 288000:10.78s 336000:13.7s 384000:15.86s 432000:18.47s 480000:21.17s 528000:23.53s 576000:26.22s 624000:28.45s 672000:30.86s 720000:33.44s 768000:35.95s 816000:38.34s 864000:40.73s 912000:42.86s 960000:45.27s 1008000:47.84s 1056000:50.72s 1104000:53.19s 1152000:56.3s 1200000:59.25s 1248000:61.72s 1296000:64.61s 1344000:67.36s 1392000:70.47s 1440000:73.03s 1488000:75.11s 1536000:77.73s 1584000:80.81s 1632000:82.73s 1680000:85.94s 1728000:87.69s 1776000:91.17s 1824000:93.8s 1872000:96.66s 1920000:99.33s 1968000:101.5s 2016000:104.59s 2064000:107.2s 2112000:110.06s 2160000:113.11s 2208000:116.03s 2256000:118.22s 2304000:121.39s 2352000:124.3s 2400000:127.22s 2448000:129.3s 2496000:132.09s 2544000:134.97s 2592000:137.61s 2640000:139.69s 2688000:143.25s 2736000:147.3s 2784000:151.0s 2832000:153.36s 2880000:156.73s 2928000:158.81s 2976000:162.34s 3024000:164.02s 3072000:167.31s 3120000:170.33s 3168000:171.61s 3216000:173.6

### Data pre-processing

In [3]:
df = pd.DataFrame(papers)
df.head()

Unnamed: 0,id,title,author_name,author_id,year,n_citation,doc_type,reference_count,references,doi
0,1091,Preliminary Design of a Network Protocol Learn...,Makoto Satoh;Ryo Muramatsu;Mizue Kayama;Kazuno...,2312688602;2482909946;2128134587;2101782692;21...,2013.0,1,Conference,2.0,"[2005687710, 2018037215]",https://doi.org/10.1007/978-3-642-39476-8_19
1,1388,Further Results on Independence in Direct-Prod...,Pranava K. Jha,2718958994,2000.0,1,Journal,,,
2,1674,A methodology for the physically accurate visu...,G. Beale;G. Earl,2103626414;2117665592,2011.0,1,Conference,15.0,"[1535888970, 1992876689, 1993710814, 203565334...",https://doi.org/10.2312/VAST/VAST11/137-144
3,1688,"Comparison of GARCH, Neural Network and Suppor...",Altaf Hossain;Faisal Zaman;M. Nasser;M. Mufakh...,2300589394;2308774408;2126056503;2425818370,2009.0,6,Conference,3.0,"[1560724230, 1986968751, 2156909104]",https://doi.org/10.1007/978-3-642-11164-8_97
4,5411,COMPARING GNG3D AND QUADRIC ERROR METRICS METH...,Rafael Álvarez;Leandro Tortosa;José-Francisco ...,2125293936;2101693188;2159120860;2146570697,2009.0,0,Conference,,,


In [4]:
print('There are', len(df),'papers in the dataset')

There are 4894081 papers in the dataset


#### Filtering the data to find the top 10000 papers

In [5]:
df = df.sort_values(by='n_citation', ascending=False)
df = df.head(10000).copy()
df.head()

Unnamed: 0,id,title,author_name,author_id,year,n_citation,doc_type,reference_count,references,doi
4696136,2041404167,The Mathematical Theory of Communication,C. E. Shannon,2247248564,1949.0,48327,Book,,,
4630907,1639032689,"Genetic algorithms in search, optimization, an...",David E. Goldberg,2102678951,1989.0,44175,Book,,,
4092588,2912565176,Fuzzy sets,Lotfi A. Zadeh,2252586558,1996.0,42437,,,,
2937610,2151103935,Distinctive Image Features from Scale-Invarian...,David G. Lowe,2104328312,2004.0,35541,Journal,33.0,"[19720318, 1541642243, 1560959218, 1676552347,...",https://doi.org/10.1023/B:VISI.0000029664.9961...
4088311,2911964244,Random Forests,Leo Breiman,2569376642,2001.0,34741,,11.0,"[1507255258, 1580948147, 1605688901, 197584664...",https://doi.org/10.1023/A:1010933404324


In [6]:
df.describe()

Unnamed: 0,id,year,n_citation,reference_count
count,10000.0,10000.0,10000.0,8795.0
mean,2039274000.0,2000.128,1451.9958,21.359068
std,310312700.0,10.066367,1954.416052,32.146003
min,852874.0,1899.0,596.0,1.0
25%,2013153000.0,1996.0,716.0,7.0
50%,2106909000.0,2002.0,924.0,14.0
75%,2144150000.0,2007.0,1421.0,25.0
max,3003663000.0,2019.0,48327.0,1287.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 4696136 to 2109151
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               10000 non-null  int64  
 1   title            10000 non-null  object 
 2   author_name      10000 non-null  object 
 3   author_id        10000 non-null  object 
 4   year             10000 non-null  float64
 5   n_citation       10000 non-null  int64  
 6   doc_type         9650 non-null   object 
 7   reference_count  8795 non-null   float64
 8   references       8795 non-null   object 
 9   doi              8451 non-null   object 
dtypes: float64(2), int64(2), object(6)
memory usage: 859.4+ KB


In [8]:
# Convert the semicolon-separated strings to lists of IDs
#df['author_id'] = df['author_id'].str.split(';')

# Convert the semicolon-separated strings to lists of names
#df['author_name'] = df['author_name'].str.split(';')

# convert the year, n_citation, and reference_count columns to int
df['year'] = df['year'].astype('Int64')
df['n_citation'] = df['n_citation'].astype('Int64')
df['reference_count'] = df['reference_count'].astype('Int64')

In [9]:
# saving the dataframe to a csv file
df.to_csv('citation.csv', index=False)


In [20]:
df = pd.read_csv('citation.csv')
df.head()

Unnamed: 0,id,title,author_name,author_id,year,n_citation,doc_type,reference_count,references,doi
0,2041404167,The Mathematical Theory of Communication,['C. E. Shannon'],['2247248564'],1949,48327,Book,,,
1,1639032689,"Genetic algorithms in search, optimization, an...",['David E. Goldberg'],['2102678951'],1989,44175,Book,,,
2,2912565176,Fuzzy sets,['Lotfi A. Zadeh'],['2252586558'],1996,42437,,,,
3,2151103935,Distinctive Image Features from Scale-Invarian...,['David G. Lowe'],['2104328312'],2004,35541,Journal,33.0,"[19720318, 1541642243, 1560959218, 1676552347,...",https://doi.org/10.1023/B:VISI.0000029664.9961...
4,2911964244,Random Forests,['Leo Breiman'],['2569376642'],2001,34741,,11.0,"[1507255258, 1580948147, 1605688901, 197584664...",https://doi.org/10.1023/A:1010933404324


#### Citation Graph

In [27]:
# create the citation graph
citation_graph = nx.DiGraph()

##### **Adding Nodes:** Each node in the graph will be identified by the paper ID and will have an attribute called title that has the paper's title 

In [29]:
# iterate through the dataframe rows
for index, row in df.iterrows():
    # getting the paper id and title
    paper_id = row['id']
    title = row['title']
    # adding the node to the graph
    citation_graph.add_node(paper_id, title=title)

In [30]:
citation_graph.number_of_nodes()

10000

In [34]:
# first 10 nodes data in the citation graph
c = 0
for node, data in citation_graph.nodes(data=True):
    c += 1
    print(f"Node {node}: {data}")
    if c > 10:
        break

Node 2041404167: {'title': 'The Mathematical Theory of Communication'}
Node 1639032689: {'title': 'Genetic algorithms in search, optimization, and machine learning'}
Node 2912565176: {'title': 'Fuzzy sets'}
Node 2151103935: {'title': 'Distinctive Image Features from Scale-Invariant Keypoints'}
Node 2911964244: {'title': 'Random Forests'}
Node 1973948212: {'title': 'Applied Logistic Regression'}
Node 2153635508: {'title': 'LIBSVM: A library for support vector machines'}
Node 2156909104: {'title': 'The Nature of Statistical Learning Theory'}
Node 1791587663: {'title': 'Perceived usefulness, perceived ease of use, and user acceptance of information technology'}
Node 2119821739: {'title': 'Support-Vector Networks'}
Node 1995875735: {'title': 'A mathematical theory of communication'}


##### **Adding the Edges:** If paper A has cited paper B, an edge from node A to B is added.

In [36]:
# Iterate through the DataFrame rows
for index, row in df.iterrows():
    # Getting the paper id and references
    paper_id = row['id']
    references = row['references']

    # Check if references is not null and is a list
    if references and isinstance(references, list):
        for reference in references:
            citation_graph.add_edge(paper_id, reference)
    # references is not null but not a list (single value)
    elif references:  
        citation_graph.add_edge(paper_id, references)


##### Checking the correctness of the edges

In [50]:
# Node of interest
node_to_check = 2911964244

# Get the outgoing edges of the node
outgoing_edges = citation_graph.out_edges(node_to_check)

# Get the incoming edges of the node
incoming_edges = citation_graph.in_edges(node_to_check)

# Print the results
print(f"Outgoing edges of node {node_to_check}: {list(outgoing_edges)}")
print(f"Incoming edges of node {node_to_check}: {list(incoming_edges)}")

# references of the paper
references_list = df[df['id'] == node_to_check]['references'].values[0]
print(f"References of node {node_to_check}: {references_list}")



Outgoing edges of node 2911964244: [(2911964244, '[1507255258, 1580948147, 1605688901, 1975846642, 2077200366, 2099968818, 2112076978, 2113242816, 2120240539, 2152761983, 2912934387]')]
Incoming edges of node 2911964244: []
References of node 2911964244: [1507255258, 1580948147, 1605688901, 1975846642, 2077200366, 2099968818, 2112076978, 2113242816, 2120240539, 2152761983, 2912934387]


The **references_list** and the **ougoing_edges** match which ensures the correctness of the graph.