In [31]:
import csv
import functools
import json
import os
import random
import re
from time import time

import matplotlib.pyplot as plt
import networkx as nx
import nltk
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel

In [32]:
# Fetch the NLTK resources used below: 'punkt' for tokenization and
# 'stopwords' for the English stop-word list.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# English stop words as a set, plus a Porter stemmer instance.
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vedch\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vedch\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [8]:
import pickle

# Deserialize the pre-built graph object from disk.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# open .gpickle files that come from a trusted source.
with open('new_build_parallel/citation_graph.gpickle', 'rb') as graph_file:
    G = pickle.load(graph_file)


# Report the size of the loaded graph
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

Number of nodes: 6545
Number of edges: 32874


In [25]:
@functools.lru_cache(maxsize=None)
def _load_id_to_folder(mapping_path):
    """Load the node-id -> folder-name JSON mapping, parsing each file only once.

    The parsed dict is memoized per ``mapping_path`` so that repeated lookups
    (e.g. one per graph node) do not re-open and re-parse the JSON file. Call
    ``_load_id_to_folder.cache_clear()`` if the file changes during a session.
    """
    with open(mapping_path, "r") as f:
        return json.load(f)


def _read_text_if_exists(path):
    """Return the stripped text content of ``path``, or None if the file is absent."""
    if not os.path.exists(path):
        return None
    # errors='ignore' drops undecodable bytes instead of raising.
    with open(path, "r", encoding='utf-8', errors='ignore') as f:
        return f.read().strip()


def get_paper_info(node_id,
                   mapping_path="new_build_parallel/citation_graph_id_to_folder.json",
                   dataset_root="../dataset_papers/dataset_papers"):
    """Retrieve title and abstract information for a paper given its node ID.

    Args:
        node_id: The ID of the paper node in the citation graph. It is
            converted to a string before being looked up in the mapping.
        mapping_path: Path to the JSON file mapping node IDs (as strings) to
            paper folder names. Defaults to the notebook's build output.
        dataset_root: Directory that contains one sub-folder per paper.

    Returns:
        tuple: (title, abstract). Both are None when the node ID has no
        (non-empty) entry in the mapping; each is individually None when the
        corresponding ``title.txt`` / ``abstract.txt`` file does not exist.
    """
    # The mapping is cached, so calling this once per node stays cheap.
    id_to_folder = _load_id_to_folder(mapping_path)

    # JSON object keys are strings, hence the string conversion of node_id.
    folder_name = id_to_folder.get(f"{node_id}")
    if not folder_name:
        return None, None

    folder_path = os.path.join(dataset_root, folder_name)
    title = _read_text_if_exists(os.path.join(folder_path, "title.txt"))
    abstract = _read_text_if_exists(os.path.join(folder_path, "abstract.txt"))

    return title, abstract

### Create Train-Test Split

In [37]:
import networkx as nx
import pandas as pd
import random
from sklearn.model_selection import train_test_split

# Seed Python's RNG so the negative sampling below is reproducible
random.seed(42)


def sample_non_edges(graph, nodes, n_samples, exclude=frozenset()):
    """Draw ``n_samples`` distinct node pairs that are not edges of ``graph``.

    Args:
        graph: Object whose ``has_edge(u, v)`` tells whether a pair is a real edge.
        nodes: Sequence of node IDs from which both endpoints are drawn.
        n_samples: Number of distinct (u, v) pairs to return.
        exclude: Collection of (u, v) pairs that must not be returned, e.g.
            negatives already assigned to another split.

    Returns:
        set: ``n_samples`` distinct (u, v) tuples with ``u != v``, none of which
        is an edge of ``graph`` or a member of ``exclude``.
    """
    sampled = set()  # a set, so duplicate draws are not counted twice
    while len(sampled) < n_samples:
        u, v = random.choice(nodes), random.choice(nodes)
        if u != v and (u, v) not in exclude and not graph.has_edge(u, v):
            sampled.add((u, v))
    return sampled


# Step 1: Extract all real edges (positive samples)
edges = list(G.edges())

# Step 2: Train-test split on positive edges
train_edges, test_edges = train_test_split(edges, test_size=0.2, random_state=42)

# Step 3: Create training graph (with test edges removed)
train_G = G.copy()
train_G.remove_edges_from(test_edges)

# Step 4: Sample negative edges (non-edges)
nodes = list(train_G.nodes())

# Both negative sets are validated against the FULL graph G. Checking training
# negatives against train_G only would allow a held-out test edge to be drawn
# as a training negative (label 0 in train, label 1 in test).
train_non_edges = sample_non_edges(G, nodes, len(train_edges))
# Exclude the training negatives so no pair appears in both splits.
test_non_edges = sample_non_edges(G, nodes, len(test_edges), exclude=train_non_edges)

# Sanity checks: no pair is shared across splits with any label.
assert train_non_edges.isdisjoint(test_non_edges), "negative pair present in both splits"
assert train_non_edges.isdisjoint(test_edges), "held-out positive edge used as training negative"

# Step 5: Prepare training and testing datasets
X_train = train_edges + list(train_non_edges)
y_train = [1] * len(train_edges) + [0] * len(train_non_edges)

X_test = test_edges + list(test_non_edges)
y_test = [1] * len(test_edges) + [0] * len(test_non_edges)

# Step 6: Create DataFrames (one row per node pair, plus its 0/1 label)
train_df = pd.DataFrame(X_train, columns=['source', 'target']).assign(label=y_train)
test_df = pd.DataFrame(X_test, columns=['source', 'target']).assign(label=y_test)

# Step 7: Print dataset statistics
print("\nTraining Set Statistics:")
print(f"Total samples: {len(train_df)}")
print(f"Positive samples: {train_df['label'].sum()}")
print(f"Negative samples: {len(train_df) - train_df['label'].sum()}")

print("\nTest Set Statistics:")
print(f"Total samples: {len(test_df)}")
print(f"Positive samples: {test_df['label'].sum()}")
print(f"Negative samples: {len(test_df) - test_df['label'].sum()}")

# Optionally preview the data
print("\nTraining Set Preview:")
display(train_df)
print("\nTest Set Preview:")
display(test_df)



Training Set Statistics:
Total samples: 52598
Positive samples: 26299
Negative samples: 26299

Test Set Statistics:
Total samples: 13150
Positive samples: 6575
Negative samples: 6575

Training Set Preview:


Unnamed: 0,source,target,label
0,4862,615,1
1,87,1150,1
2,91,4125,1
3,2349,6532,1
4,5547,1947,1
...,...,...,...
52593,2235,3092,0
52594,6403,4819,0
52595,3922,6388,0
52596,134,3347,0



Test Set Preview:


Unnamed: 0,source,target,label
0,3152,4222,1
1,772,471,1
2,6538,2791,1
3,4119,10,1
4,2060,1455,1
...,...,...,...
13145,5553,219,0
13146,3442,567,0
13147,1757,3733,0
13148,790,1729,0


### Feature Engineering

In [38]:
# Define useful information for the nodes
# Example: build id_to_title using your get_paper_info function
# Build node-id -> title and node-id -> abstract lookups for every graph node.
# Missing (None) or empty values are stored as "" so both dicts hold only strings.
id_to_title, id_to_abstract = {}, {}
for node in G.nodes():
    title, abstract = get_paper_info(node)
    id_to_title[node] = title or ""
    id_to_abstract[node] = abstract or ""


In [39]:
# Bare expression: the notebook renders the full node-id -> title dict as the cell output.
id_to_title

{0: 'Diffusion Twigs with Loop Guidance for Conditional Graph Generation',
 1: 'CoPriv: Network/Protocol Co-Optimization for Communication-Efficient Private Inference',
 2: 'Anonymous Bandits for Multi-User Systems',
 3: 'Adversarially Robust Dense-Sparse Tradeoffs via Heavy-Hitters',
 4: 'Diffusion Priors for Variational Likelihood Estimation and Image Denoising',
 5: 'Expanding Sparse Tuning for Low Memory Usage',
 6: 'Better Mini-Batch Algorithms via Accelerated Gradient Methods',
 7: 'Conditional Adversarial Domain Adaptation',
 8: 'Strategic Classification in the Dark',
 9: 'Towards Practical Preferential Bayesian Optimization with Skew Gaussian Processes',
 10: 'Understanding Anomaly Detection with Deep Invertible Networks through\n  Hierarchies of Distributions and Features',
 11: 'Learning Policies for Contextual Submodular Prediction',
 12: 'Learning and Testing Causal Models with Interventions',
 13: 'Reward-Free RL is No Harder Than Reward-Aware RL in Linear Markov Decision 