## ADM Assignment 5 
### Network Scholar: Graph-Based Analysis and Visualization of Academic Collaborations

### Libraries 


In [1]:
import pandas as pd
import numpy as np
import ijson
import time
import networkx as nx


### A script to convert the data from JSON to a PD dataframe 
##### Reference: https://www.kaggle.com/code/devintheai/citation-network-eda

In [2]:
papers = []

start = time.process_time()

with open('citation.json', "rb") as f:
    for i, element in enumerate(ijson.items(f, "item")):
        paper = {}
        paper['id'] = element['id']
        paper['title'] = element['title']

        author = element.get('authors')
        if author:
            Author = element['authors']
            author_name = []
            author_id = []

            for j in Author:
                if 'name' in j and 'id' in j :
                    author_name.append(str(j['name']))  
                    author_id.append(str(j['id']))
                else:
                    author_name.append(str(np.nan))  
                    author_id.append(str(np.nan))

            paper['author_name'] = ', '.join(author_name)
            paper['author_id'] = ', '.join(author_id)
        
        year = element.get('year')
        if year:
            paper['year']= year
        else:
            paper['year']= np.nan
        
        n_citation= element.get('n_citation')
        if n_citation:
             paper['n_citation']= element['n_citation']
        else:
            paper['n_citation']= 0
        
        doc_type= element.get('doc_type') 
        if doc_type:
            paper['doc_type']= element['doc_type']
        else:
            paper['doc_type']= np.nan   

        references= element.get('references')
        if references:
            paper['reference_count']= len(references)
            paper['references'] = [int(r) for r in references]
        else:
            paper['reference_count']= np.nan 
            paper['references']= np.nan
        
        doi = element.get('doi')
        if doi:
            paper['doi']= f"https://doi.org/{element['doi']}"
        else:
            paper['doi']= np.nan               
        
        
        papers.append(paper)
        
        if i%48000 == 0:
            print(f"{i}:{round((time.process_time() - start),2)}s ",end="")


0:0.0s 48000:1.67s 96000:3.2s 144000:4.47s 192000:5.47s 240000:6.25s 288000:7.28s 336000:8.7s 384000:9.95s 432000:11.22s 480000:13.06s 528000:14.7s 576000:15.81s 624000:16.75s 672000:17.81s 720000:19.36s 768000:20.88s 816000:22.06s 864000:23.55s 912000:24.58s 960000:25.86s 1008000:27.27s 1056000:28.59s 1104000:30.02s 1152000:31.72s 1200000:33.02s 1248000:34.39s 1296000:35.83s 1344000:37.08s 1392000:38.36s 1440000:39.91s 1488000:41.19s 1536000:42.23s 1584000:43.62s 1632000:44.64s 1680000:46.3s 1728000:47.27s 1776000:48.52s 1824000:52.0s 1872000:54.33s 1920000:56.53s 1968000:58.77s 2016000:61.28s 2064000:63.5s 2112000:66.08s 2160000:69.16s 2208000:71.2s 2256000:74.2s 2304000:77.09s 2352000:78.55s 2400000:79.89s 2448000:80.95s 2496000:82.84s 2544000:84.14s 2592000:85.23s 2640000:86.27s 2688000:87.75s 2736000:89.5s 2784000:91.2s 2832000:92.28s 2880000:93.62s 2928000:96.09s 2976000:98.19s 3024000:99.47s 3072000:101.92s 3120000:103.47s 3168000:105.52s 3216000:106.28s 3264000:108.41s 3312000:

### 1. Data pre-processing

In [3]:
df = pd.DataFrame(papers)
df.head()

Unnamed: 0,id,title,author_name,author_id,year,n_citation,doc_type,reference_count,references,doi
0,1091,Preliminary Design of a Network Protocol Learn...,"Makoto Satoh, Ryo Muramatsu, Mizue Kayama, Kaz...","2312688602, 2482909946, 2128134587, 2101782692...",2013.0,1,Conference,2.0,"[2005687710, 2018037215]",https://doi.org/10.1007/978-3-642-39476-8_19
1,1388,Further Results on Independence in Direct-Prod...,Pranava K. Jha,2718958994,2000.0,1,Journal,,,
2,1674,A methodology for the physically accurate visu...,"G. Beale, G. Earl","2103626414, 2117665592",2011.0,1,Conference,15.0,"[1535888970, 1992876689, 1993710814, 203565334...",https://doi.org/10.2312/VAST/VAST11/137-144
3,1688,"Comparison of GARCH, Neural Network and Suppor...","Altaf Hossain, Faisal Zaman, M. Nasser, M. Muf...","2300589394, 2308774408, 2126056503, 2425818370",2009.0,6,Conference,3.0,"[1560724230, 1986968751, 2156909104]",https://doi.org/10.1007/978-3-642-11164-8_97
4,5411,COMPARING GNG3D AND QUADRIC ERROR METRICS METH...,"Rafael Álvarez, Leandro Tortosa, José-Francisc...","2125293936, 2101693188, 2159120860, 2146570697",2009.0,0,Conference,,,


In [4]:
print('There are', len(df),'papers in the dataset')

There are 4894081 papers in the dataset


#### Filtering the data to find the top 10000 papers

In [5]:
df = df.sort_values(by='n_citation', ascending=False)
df = df.head(10000).copy()
df.head()

Unnamed: 0,id,title,author_name,author_id,year,n_citation,doc_type,reference_count,references,doi
4696136,2041404167,The Mathematical Theory of Communication,C. E. Shannon,2247248564,1949.0,48327,Book,,,
4630907,1639032689,"Genetic algorithms in search, optimization, an...",David E. Goldberg,2102678951,1989.0,44175,Book,,,
4092588,2912565176,Fuzzy sets,Lotfi A. Zadeh,2252586558,1996.0,42437,,,,
2937610,2151103935,Distinctive Image Features from Scale-Invarian...,David G. Lowe,2104328312,2004.0,35541,Journal,33.0,"[19720318, 1541642243, 1560959218, 1676552347,...",https://doi.org/10.1023/B:VISI.0000029664.9961...
4088311,2911964244,Random Forests,Leo Breiman,2569376642,2001.0,34741,,11.0,"[1507255258, 1580948147, 1605688901, 197584664...",https://doi.org/10.1023/A:1010933404324


In [6]:
df.describe()

Unnamed: 0,id,year,n_citation,reference_count
count,10000.0,10000.0,10000.0,8795.0
mean,2039274000.0,2000.128,1451.9958,21.359068
std,310312700.0,10.066367,1954.416052,32.146003
min,852874.0,1899.0,596.0,1.0
25%,2013153000.0,1996.0,716.0,7.0
50%,2106909000.0,2002.0,924.0,14.0
75%,2144150000.0,2007.0,1421.0,25.0
max,3003663000.0,2019.0,48327.0,1287.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 4696136 to 2109151
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               10000 non-null  int64  
 1   title            10000 non-null  object 
 2   author_name      10000 non-null  object 
 3   author_id        10000 non-null  object 
 4   year             10000 non-null  float64
 5   n_citation       10000 non-null  int64  
 6   doc_type         9650 non-null   object 
 7   reference_count  8795 non-null   float64
 8   references       8795 non-null   object 
 9   doi              8451 non-null   object 
dtypes: float64(2), int64(2), object(6)
memory usage: 859.4+ KB


In [42]:
# convert the year, n_citation, and reference_count columns to int
df['year'] = df['year'].astype('Int64')
df['n_citation'] = df['n_citation'].astype('Int64')
df['reference_count'] = df['reference_count'].astype('Int64')

In [10]:
# saving the dataframe to a csv file
df.to_csv('citation.csv', index=False)


In [14]:
df = pd.read_csv('citation.csv')
df.head(15)

Unnamed: 0,id,title,author_name,author_id,year,n_citation,doc_type,reference_count,references,doi
0,2041404167,The Mathematical Theory of Communication,C. E. Shannon,2247248564,1949,48327,Book,,,
1,1639032689,"Genetic algorithms in search, optimization, an...",David E. Goldberg,2102678951,1989,44175,Book,,,
2,2912565176,Fuzzy sets,Lotfi A. Zadeh,2252586558,1996,42437,,,,
3,2151103935,Distinctive Image Features from Scale-Invarian...,David G. Lowe,2104328312,2004,35541,Journal,33.0,"[19720318, 1541642243, 1560959218, 1676552347,...",https://doi.org/10.1023/B:VISI.0000029664.9961...
4,2911964244,Random Forests,Leo Breiman,2569376642,2001,34741,,11.0,"[1507255258, 1580948147, 1605688901, 197584664...",https://doi.org/10.1023/A:1010933404324
5,1973948212,Applied Logistic Regression,"David W. Hosmer, Stanley Lemeshow","2102544963, 1990110535",1989,32053,Book,,,
6,2153635508,LIBSVM: A library for support vector machines,"Chih-Chung Chang, Chih-Jen Lin","2895256545, 2168176072",2011,31047,Journal,34.0,"[1510526001, 1543810117, 1556115774, 156879334...",https://doi.org/10.1145/1961189.1961199
7,2156909104,The Nature of Statistical Learning Theory,Vladimir N. Vapnik,2022407533,1995,28886,Book,,,
8,1791587663,"Perceived usefulness, perceived ease of use, a...",Fred D. Davis,2427477170,1989,25855,Journal,22.0,"[157213131, 1520103841, 1778357938, 1972888601...",https://doi.org/10.2307/249008
9,2119821739,Support-Vector Networks,"Corinna Cortes, Vladimir Vapnik","2134830209, 2022407533",1995,22276,Journal,3.0,"[2087347434, 2154579312, 2168228682]",https://doi.org/10.1023/A:1022627411411


#### Citation Graph

In [3]:
# create the citation graph
citation_graph = nx.DiGraph()

##### **Adding Nodes:** Each node in the graph will be identified by the paper ID and will have an attribute called title that has the paper's title 

In [4]:
# iterate through the dataframe rows
for index, row in df.iterrows():
    # getting the paper id and title
    paper_id = row['id']
    title = row['title']
    # adding the node to the graph
    citation_graph.add_node(paper_id, title=title)

In [5]:
citation_graph.number_of_nodes()

10000

In [6]:
# first 10 nodes data in the citation graph
c = 0
for node, data in citation_graph.nodes(data=True):
    c += 1
    print(f"Node {node}: {data}")
    if c > 10:
        break

Node 2041404167: {'title': 'The Mathematical Theory of Communication'}
Node 1639032689: {'title': 'Genetic algorithms in search, optimization, and machine learning'}
Node 2912565176: {'title': 'Fuzzy sets'}
Node 2151103935: {'title': 'Distinctive Image Features from Scale-Invariant Keypoints'}
Node 2911964244: {'title': 'Random Forests'}
Node 1973948212: {'title': 'Applied Logistic Regression'}
Node 2153635508: {'title': 'LIBSVM: A library for support vector machines'}
Node 2156909104: {'title': 'The Nature of Statistical Learning Theory'}
Node 1791587663: {'title': 'Perceived usefulness, perceived ease of use, and user acceptance of information technology'}
Node 2119821739: {'title': 'Support-Vector Networks'}
Node 1995875735: {'title': 'A mathematical theory of communication'}


##### **Adding the Edges:** If paper A has cited paper B, an edge from node A to B is added.

In [7]:
# Iterate through the DataFrame rows
for index, row in df.iterrows():
    # Getting the paper id and references
    paper_id = row['id']
    references = row['references']

    # Check if references is not null and is a list
    if references and isinstance(references, list):
        for reference in references:
            citation_graph.add_edge(paper_id, reference)
    # references is not null but not a list (single value)
    elif references:  
        citation_graph.add_edge(paper_id, references)


##### Checking the correctness of the edges

In [8]:
# Node of interest
node_to_check = 2911964244

# Get the outgoing edges of the node
outgoing_edges = citation_graph.out_edges(node_to_check)

# Get the incoming edges of the node
incoming_edges = citation_graph.in_edges(node_to_check)

# Print the results
print(f"Outgoing edges of node {node_to_check}: {list(outgoing_edges)}")
print(f"Incoming edges of node {node_to_check}: {list(incoming_edges)}")

# references of the paper
references_list = df[df['id'] == node_to_check]['references'].values[0]
print(f"References of node {node_to_check}: {references_list}")



Outgoing edges of node 2911964244: [(2911964244, '[1507255258, 1580948147, 1605688901, 1975846642, 2077200366, 2099968818, 2112076978, 2113242816, 2120240539, 2152761983, 2912934387]')]
Incoming edges of node 2911964244: []
References of node 2911964244: [1507255258, 1580948147, 1605688901, 1975846642, 2077200366, 2099968818, 2112076978, 2113242816, 2120240539, 2152761983, 2912934387]


The **references_list** and the **ougoing_edges** match which ensures the correctness of the graph.

#### Collaboration graph 

##### create a dictionary of authors and their IDs

In [9]:
# A dictionary to store authors and their IDs
authors_dict = dict()

# Iterate through the DataFrame rows
for index, row in df.iterrows():
    # Getting the author names and IDs
    author_names = row['author_name'].split(', ')
    author_ids = row['author_id'].split(', ')

    # Iterate through the authors and author IDs
    for author_name, author_id in zip(author_names, author_ids):
        # Update the authors_dict
        if author_id not in authors_dict:
            authors_dict[author_id] = []

        # Check for duplicates before appending to the list
        if author_name not in authors_dict[author_id]:
            authors_dict[author_id].append(author_name)

# removing the different versions of the names of the authors 
authors_dict = {key: values[0] for key, values in authors_dict.items()}


In [10]:
collab_graph = nx.Graph()

##### **Add the nodes to the graph:** the nodes are the authors' IDs and each node has the author's name as an attribute

In [11]:
# adding the nodes to the graph
for key, value in authors_dict.items():
    collab_graph.add_node(key, name=value)

In [12]:
# first 10 nodes data in the citation graph
c = 0
for node, data in collab_graph.nodes(data=True):
    c += 1
    print(f"Node {node}: {data}")
    if c > 10:
        break

Node 2247248564: {'name': 'C. E. Shannon'}
Node 2102678951: {'name': 'David E. Goldberg'}
Node 2252586558: {'name': 'Lotfi A. Zadeh'}
Node 2104328312: {'name': 'David G. Lowe'}
Node 2569376642: {'name': 'Leo Breiman'}
Node 2102544963: {'name': 'David W. Hosmer'}
Node 1990110535: {'name': 'Stanley Lemeshow'}
Node 2895256545: {'name': 'Chih-Chung Chang'}
Node 2168176072: {'name': 'Chih-Jen Lin'}
Node 2022407533: {'name': 'Vladimir N. Vapnik'}
Node 2427477170: {'name': 'Fred D. Davis'}


##### Generating a matrix to track the collaborative efforts of authors, where the frequency will serve as the weight for the graph.


In [15]:
matrix = pd.DataFrame(columns=authors_dict.keys(), index=authors_dict.keys())
matrix.iloc[:, :] = 0
matrix.head()

Unnamed: 0,2247248564,2102678951,2252586558,2104328312,2569376642,2102544963,1990110535,2895256545,2168176072,2022407533,...,176466469,2344230363,469273356,2477509334,2257320792,2119171673,2973489965,1974600515,299775663,282527274
2247248564,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2102678951,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2252586558,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2104328312,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2569376642,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
for index, row in matrix.iterrows():
    # Get the author ID
    author_id = index
    
    # Get the author papers
    for index_df, row_df in df.iterrows():
        # Get the author IDs of the paper
        IDs = row_df['author_id'].split(', ')
        
        # Check if the author ID is in the IDs list
        if author_id in IDs:
            # Iterate through the IDs
            for ID in IDs:
                # Check if the author ID is not the same as the ID in the loop
                if ID != author_id:
                    # Add 1 to the corresponding cell
                    matrix.loc[author_id, ID] += 1

In [22]:
matrix.to_csv('collaboration.csv', index=False)

##### Creating the edges and assigning the weights based on the matrix 

In [23]:
# adding the edges to the graph
for index in matrix.index:
    for column in matrix.columns:
        # check if the weight is more than zero and the edge does nto exist
        if matrix.loc[index, column] > 0 and not collab_graph.has_edge(index, column):
            collab_graph.add_edge(index, column, weight=matrix.loc[index, column])

In [24]:
# Save to GraphML
nx.write_graphml(collab_graph, "collab_graph.graphml")

In [9]:
# Load from GraphML
collab_graph = nx.read_graphml("collab_graph.graphml")

In [10]:
collab_graph.number_of_nodes()

21235

In [16]:
# example of a node in the collaboration graph
node_to_check = '2150011549'
collab_graph.edges(node_to_check, data=True)

EdgeDataView([('2150011549', '2102678951', {'weight': 3}), ('2150011549', '2120996014', {'weight': 2}), ('2150011549', '2175835430', {'weight': 1}), ('2150011549', '1208012566', {'weight': 2}), ('2150011549', '177795906', {'weight': 2}), ('2150011549', '2156635946', {'weight': 2}), ('2150011549', '2467761819', {'weight': 1}), ('2150011549', '2096079875', {'weight': 1}), ('2150011549', '2069070276', {'weight': 1}), ('2150011549', '2641050018', {'weight': 1}), ('2150011549', '2423330096', {'weight': 1}), ('2150011549', '1996855907', {'weight': 1})])