In [1]:
import pymupdf  
import argparse 
import sys
from pathlib import Path

In [2]:
import re

In [3]:
pdf_path = "C:/Users/dylan/Documents/Spatial/Networks/knowledge map/article.pdf"
pdf_path_obj = Path(pdf_path).resolve()
if not pdf_path_obj.exists():
    raise FileNotFoundError(f"PDF file not found: {pdf_path}")

if not pdf_path_obj.is_file():
    raise ValueError(f"Path is not a file: {pdf_path}")

In [4]:
def parse_text(pdf_path:str) ->list :
    try:
        doc = pymupdf.open(pdf_path) 
        page = doc[1]
        page_count = len(doc) - 1
        text = []   # list where each item is the text within each paragraph
        for page_num in range(page_count):
            page = doc[page_num]
            # a block contains text descriptions for each section in the page
            blocks = page.get_text("blocks", sort = False)
            # for every section in the page
            for block in blocks:
                # print("block #", block[5])
                # a block contains multiple paragraphs
                paragraphs = block[4]
                # a "." follwed by " " and then "\n" indicates a new paragraph within the text
                # get paragraphs, a list with each paragraph as an item
                paragraphs = re.split(r'(?<=\.) \r?\n', paragraphs)
                for paragraph in paragraphs:
                    paragraph = paragraph.replace("\n", "")
                    text.append(paragraph)
                # _ = [print(paragraph, '\n NEW PARAGRAPH \n') for paragraph in paragraphs] 
        print(f"[MAIN.PY DEBUG] Extracted {len(text)} paragraphs from {page_count} pages", file=sys.stderr, flush=True)
        return text
    except Exception as e:
        print(f"[MAIN.PY DEBUG] Error extracting PDF text: {e}", file=sys.stderr, flush=True)
        raise Exception(f"Failed to extract text from PDF: {e}")

In [5]:
text = parse_text(pdf_path)
# text[9] corresponds to first page Abstract

[MAIN.PY DEBUG] Extracted 150 paragraphs from 6 pages


In [6]:
import networkx as nx

In [11]:
semantic_path = "C:/Users/dylan/Documents/Spatial/Networks/knowledge map/Bibliographic-Network-Representation-of-Pdf/docs/semantic_net_1.2.gexf"
network = nx.read_gexf(semantic_path)

Visualization

In [23]:
# nx.write_network_text(graph = nx.Graph, path = semantic_path)
nx.write_network_text(network)

╟── visits mortality
╎   └── mortality
╎       ├── smooth
╎       │   ├── mortality rate ─ mortality
╎       │   │   ├── local ─ mortality
╎       │   │   │   ├── heat ─ mortality rate, mortality
╎       │   │   │   │   ├── heat exposure
╎       │   │   │   │   │   └── exposure ─ heat
╎       │   │   │   │   │       └── environmental ─ heat
╎       │   │   │   │   │           ├── vulnerability ─ heat, mortality
╎       │   │   │   │   │           │   ├── theme ─ heat
╎       │   │   │   │   │           │   │   ├── house
╎       │   │   │   │   │           │   │   ├── individual
╎       │   │   │   │   │           │   │   │   ├── predictor ─ heat
╎       │   │   │   │   │           │   │   │   │   ├── heat illness ─ heat
╎       │   │   │   │   │           │   │   │   │   │   └── illness ─ heat, predictor
╎       │   │   │   │   │           │   │   │   │   │       ├── hospital emergency
╎       │   │   │   │   │           │   │   │   │   │       │   ├── hospital ─ heat, illness
╎       

In [24]:
nx.write_network_text(network, sources = ['method']) # node of degree 1

╙── method
    └── stabilize
        └── smooth
            ├── mortality rate
            │   ├── local
            │   │   ├── heat ─ mortality rate
            │   │   │   ├── heat exposure
            │   │   │   │   └── exposure ─ heat
            │   │   │   │       └── environmental ─ heat
            │   │   │   │           ├── vulnerability ─ heat
            │   │   │   │           │   ├── theme ─ heat
            │   │   │   │           │   │   ├── house
            │   │   │   │           │   │   ├── individual
            │   │   │   │           │   │   │   ├── predictor ─ heat
            │   │   │   │           │   │   │   │   ├── heat illness ─ heat
            │   │   │   │           │   │   │   │   │   └── illness ─ heat, predictor
            │   │   │   │           │   │   │   │   │       ├── hospital emergency
            │   │   │   │           │   │   │   │   │       │   ├── hospital ─ heat, illness
            │   │   │   │           │   │   │   │   │       │   

In [12]:
type(network.nodes)
network.nodes
nodes = list(network.nodes)

In [13]:
len(list(network.nodes))
print(len(nodes))
print(nodes)
type(text)

204
['heat exposure', 'house', 'theme', 'census tract', 'smooth', 'human', 'disproportionately', 'table', 'hospital admission', 'health', 'local', 'supplementary data', 'study', 'multiple', 'empirical baye', 'vulnerable areas', 'metropolitan areas', 'stabilize', 'hospital emergency', 'higher social', 'statistical analysis', 'statistical relationship', 'heat related morbidity', 'socio economic status', 'socioeconomic status', 'spatially', 'visits mortality', 'additionally', 'identify', 'schwartz', 'social vulnerability', 'empirical bayesian', 'white', 'wilhelmi', 'heat morbidity', 'tool', 'linke', 'heat related mortality', 'individual', 'generally', 'data', 'heat illness', 'high', 'hazardous heat event', 'heat related death', 'total heat', 'geographically weight', 'cramp', 'caus', 'method', 'level', 'status', 'observe', 'unstable', 'bivariate', 'elderly population', 'metropolitan', 'distribution', 'determinant', 'heat', 'death', 'county level', 'department', 'public health', 'illness', 

list

In [14]:
import numpy as np

In [None]:
# we can add node, edge info using nx and don't actually need to fill in the biblio matrix.
# however, using the biblio matrix as a model helps conceptualize how we are adding edge data.
b_matrix = np.zeros((len(nodes), len(nodes)))

In [36]:
b_matrix.shape

(204, 204)

In [108]:
test = set()
test.add('joe')
test.add('john')
test2 = test - {'john'}
type(test2)

set

In [None]:
# initialize an undirected graph with self edges allowed
# The network will store node data: (id: string)
#                    and edge data: (node_id:string, node_id:string, weight: integer)           
biblio_net = nx.MultiGraph()    
# biblio_dict will store each keyword as a key and as value a set of keywords that the key cites (occurs in same paragraph)
biblio_dict = {}   # keyword (str) : cocitations (set)
info_before = {}  # same structure as biblio_dict, but contains citation info before the new paragraph is read 

for paragraph in text:
    # keep track of keywords in paragraph so edge data can be added
    kwords_in_pg = set()
    keyword_count = {}
    info_before = biblio_dict
    for keyword in nodes:
        matches = re.findall(keyword, paragraph)
        # if the keyword is found in the paragraph
        if matches:    # if the matches list is non-empty
            kwords_in_pg.add(keyword)
            keyword_count[keyword] = len(matches)

    # update biblio_dict with any new unique keyword citations
    for keyword in kwords_in_pg:
        try:
            updated_citations = (biblio_dict[keyword] | kwords_in_pg) - {keyword}
            biblio_dict.update({keyword: updated_citations})
        # execpt KeyError if the keyword has not yet occured in article and is absent in biblio_dict
        except KeyError:
            biblio_dict[keyword] = kwords_in_pg - {keyword}

    # now that citation info is updated, the node and edge data can be added/updated
    for keyword in kwords_in_pg:
        # Add node data to network
        # if the keyword already exists as node in the network
        if biblio_net.has_node(keyword):
            try:
                biblio_net.nodes[keyword]['count'] += keyword_count[keyword]
            # if a KeyError occurs, its because the node has been added as edge data and not node data with 'count' attribute
            except KeyError:
                biblio_net.add_node(keyword, count = keyword_count[keyword])
        else:
            try:
                biblio_net.add_node(keyword, count = keyword_count[keyword])
            except Exception as e:
                raise Exception('Some error HERE {e}')

        # Add edge data to network
        for target_kword in (kwords_in_pg - {keyword}):
            # if there is no edge data for keyword, target_kword
            if biblio_net.get_edge_data(keyword, target_kword) is None:
                edge = (keyword, target_kword)
                # *edge is the unpacked edges, weight is the edge attribute of the num of citations in common
                cocitations = len(kwords_in_pg) - 2
                biblio_net.add_edge(*edge, weight = cocitations)
            # else if there is edge data and cocitations already exist between keyword, target_kword
            else:
                edge = (keyword, target_kword)
                # get a set of all new cocitations in the paragraph between keyword and target_kword
                new_cocitations = biblio_dict[target_kword] - info_before[target_kword]
                # retrieve past cocitations stored as edge data
                past_cocitations = biblio_net.get_edge_data(keyword,target_kword)[0]['weight']
                # now get a count of total cocitations so far in the paper 
                total_cocitations = len(new_cocitations) + past_cocitations 
                # we will avoid multi-edges in the network, so remove and replace the existing edge
                biblio_net.remove_edge(keyword, target_kword)
                biblio_net.add_edge(*edge, weight = total_cocitations)

        # Check case to add/update the data for a self edge
        if biblio_net.get_edge_data(keyword, keyword) is None:
            edge = (keyword, keyword)
            cocitations = len(kwords_in_pg) - 1
            biblio_net.add_edge(*edge, weight = cocitations)
        else:
            edge = (keyword, keyword)
            new_cocitations = biblio_dict[keyword] - info_before[keyword]
            past_cocitations = biblio_net.get_edge_data(keyword,keyword)[0]['weight']
            total_cocitations = len(new_cocitations) + past_cocitations
            biblio_net.remove_edge(keyword, keyword)
            biblio_net.add_edge(*edge, weight = total_cocitations)
        

TypeError: 'NoneType' object is not subscriptable

In [112]:
G = nx.MultiGraph()
# G.add_node('joe', count=2)
# G.add_node('john', count = 4)
e = ('joe','john')
G.add_edge(*e, weight =4)
# G.nodes[1]
# G.nodes['joe']['count'] += 3
if (G.has_node('joe')):
    print("yes")

# c = (1,3)
# G.add_edge(*c, weight = 7)
# G.add_edge(1,1, weight = 2)
# G.add_edge(1,1, weight = 4)
# d = (3,1)
# G.add_edge(*d, weight = 7)
# G.remove_edge(3,1)
# print(G.edges.data())

# print(G.nodes[1])

# G.has_node(0)
# get the first instance at index 0 of the (1,1) edge and access the value for the weight key
# weight = G.get_edge_data(1,2)[0]['weight']    
# weight = G.get_edge_data(1,1) 
# print(weight)
# weight = G.edges.data('weight')
# print(weight)
# _ = print(u,v,w) for u,v,w in G.edges.data

yes


In [38]:
nx.write_gexf(G, path = "C:/Users/dylan/Documents/Spatial/Networks/knowledge map/Bibliographic-Network-Representation-of-Pdf/path.gexf")