In [2]:
import os
import re
import json
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from difflib import get_close_matches

In [12]:
# Locations of the per-paper folders and of the parsed-citation JSON file.
dataset_path = "dataset_papers/dataset_papers"
json_path = "parsed_citations.json"

# Every entry of `dataset_path` is treated as one paper, and its name is used
# verbatim as the paper id / graph node key.
# - Do NOT run os.path.splitext over these names: an id such as "0709.0928v1"
#   contains a dot, so splitext would split it into ("0709", ".0928v1") and
#   silently truncate the id.
# - os.listdir returns entries in arbitrary order; sorting once makes the node
#   insertion order (and which paper wins when two titles collide in
#   `title_to_paper`) reproducible across runs and machines.
nodes = sorted(os.listdir(dataset_path))

# Directed graph whose nodes are paper ids carrying `title` / `abstract` attributes.
G = nx.DiGraph()

for paper in nodes:
    paper_path = os.path.join(dataset_path, paper)
    # Each paper folder is expected to provide title.txt and abstract.txt;
    # a missing file raises, which keeps dataset problems visible.
    with open(os.path.join(paper_path, "title.txt"), "r", encoding="utf-8") as f:
        title = f.read().strip()
    with open(os.path.join(paper_path, "abstract.txt"), "r", encoding="utf-8") as f:
        abstract = f.read().strip()
    G.add_node(paper, title=title, abstract=abstract)

# paper id -> upper-cased title (upper-casing makes title comparison case-insensitive).
all_titles = {
    node_id: attrs["title"].upper() for node_id, attrs in G.nodes(data=True)
}
# Reverse lookup: upper-cased title -> paper id. If two papers share a title,
# the one that comes later in `all_titles` wins.
title_to_paper = {
    upper_title: node_id for node_id, upper_title in all_titles.items()
}

In [13]:
def title_match(title, title_to_paper):
    """Resolve a citation title to the id of an already-known paper.

    Args:
        title: Title text of the cited work (any letter case).
        title_to_paper: Mapping of UPPER-CASED title -> paper id.

    Returns:
        The paper id for an exact (case-insensitive) title match, otherwise
        the id of the closest title with a difflib similarity of at least
        0.9, otherwise None.
    """
    title = title.upper()
    if title in title_to_paper:
        return title_to_paper[title]
    # get_close_matches returns a (possibly empty) *list* of candidate strings,
    # so the dictionary must be indexed with its first element, not the list.
    best_match = get_close_matches(title, title_to_paper.keys(), n=1, cutoff=0.9)
    if best_match:
        print(f"Best match for {title}: {best_match}")
        return title_to_paper[best_match[0]]
    return None

def add_edges(G, paper_id, citations, all_titles):
    """Add one edge ``paper_id -> cited paper`` to ``G`` for every citation.

    For each citation the target node is chosen as follows:
      1. if the citation key is already a node of ``G``, that node;
      2. otherwise, a known paper whose title matches (see ``title_match``);
      3. otherwise, a new node is created under the citation key.

    Args:
        G: Graph exposing ``nodes``, ``add_node`` and ``add_edge``.
        paper_id: Id of the citing paper (edge source).
        citations: Iterable of dicts with the keys ``"key"`` and ``"title"``.
        all_titles: Mapping of paper id -> UPPER-CASED title. It is updated in
            place with every node created in step 3.
    """
    # `all_titles` is keyed by paper id, but title_match needs the opposite
    # direction (title -> paper id). Build that reverse index once per call and
    # keep it in sync below; handing `all_titles` itself to title_match would
    # compare titles against ids and never find a match.
    title_to_paper = {
        known_title: known_id for known_id, known_title in all_titles.items()
    }
    for citation in citations:
        cited_paper_id = citation["key"]
        cited_paper_title = citation["title"].upper()
        if cited_paper_id in G.nodes:
            G.add_edge(paper_id, cited_paper_id)
            continue
        best_match_id = title_match(cited_paper_title, title_to_paper)
        if best_match_id is not None:
            G.add_edge(paper_id, best_match_id)
            continue
        # Unknown work: register it so later citations of the same title
        # resolve to this node instead of creating duplicates.
        G.add_node(cited_paper_id, title=cited_paper_title)
        G.add_edge(paper_id, cited_paper_id)
        all_titles[cited_paper_id] = cited_paper_title
        title_to_paper[cited_paper_title] = cited_paper_id

def visualize_graph(G, output_path, seed=None):
    """Draw the citation graph, save it to ``output_path`` and display it.

    Args:
        G: networkx graph to draw.
        output_path: File path handed to ``plt.savefig``.
        seed: Optional seed forwarded to ``nx.spring_layout``. The spring
            layout starts from random positions, so without a seed the
            picture changes on every run; pass an int to get a reproducible
            figure. The default (None) keeps the previous unseeded behavior.
    """
    plt.figure(figsize=(12, 8))
    pos = nx.spring_layout(G, k=0.5, iterations=50, seed=seed)
    nx.draw(
        G,
        pos,
        with_labels=True,
        node_size=700,
        node_color="lightblue",
        font_size=10,
        font_weight="bold",
    )
    plt.title("Citation Network")
    # Save before show(): showing may release the figure, after which
    # savefig could write an empty canvas.
    plt.savefig(output_path)
    plt.show()

In [None]:
import random

# Parse the citation file up front; the handle is only needed for the read,
# so the graph is populated after the `with` block has closed it.
with open(json_path, "r", encoding="utf-8") as f:
    json_data = json.load(f)

# Each record names a citing paper and the list of works it cites.
for paper in json_data:
    paper_id = paper["paper_id"]
    print(paper_id)
    citations = paper["citations"]
    add_edges(G, paper_id, citations, all_titles)

0709.0928v1
0804.1302v1
0809.1493v1
0809.2085v1
0811.1790v1
0906.2027v2
0909.1884v2
0910.5461v1
0911.5372v1
1003.0024v1
1005.0027v2
1005.1860v2
1005.5581v2
1006.0868v2
1008.5325v4
1010.3091v2
1010.3812v2
1011.0097v1
1011.0415v1
1012.1501v2
1102.0836v2
1103.0790v1
1103.2816v2
1104.2018v1
1104.5525v1
1105.0697v1
1106.0800v3
1106.1622v1
1106.2436v3
1106.2774v1
1106.4574v1
1106.4729v1
1107.1283v2
1107.1744v2
1107.3258v1
1107.4080v1
1107.4976v2
1107.4985v1
1108.2401v3
1108.4217v1
1108.6211v2
1109.0367v1
1109.1990v1
1109.2415v2
1109.3701v2
1110.4300v1
1110.4411v1
1110.6416v1
1111.0352v2
1111.2664v1
1112.0391v2
1112.4394v1
1203.4523v2
1204.3251v2
1204.5043v2
1204.5243v2
1205.0079v2
1205.0288v2
1205.0622v1
1205.1671v1
1205.2171v2
1205.2874v3
1205.4213v2
1205.4810v3
1205.6432v2
1206.1800v1
1206.1898v2
1206.5162v2
1206.6389v3
1207.2491v1
1207.4404v1
1207.4747v4
1209.0833v1
1209.1076v1
1209.1077v1
1209.1145v1
1209.2434v1
1209.2784v1
1209.3056v1
1209.3230v1
1209.3352v4
1210.5196v1
1211.0439v1
1211