Permalink
Cannot retrieve contributors at this time
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
140 lines (113 sloc)
4.55 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import codecs | |
| import os | |
| import re | |
| import sys | |
| from functools import partial | |
| import frontmatter | |
| import networkx as nx | |
| from bs4 import BeautifulSoup | |
# Directory the script was launched from.
# NOTE(review): currently only referenced by the commented-out path
# normalization inside build_graph — confirm before removing.
THIS_DIR = os.getcwd()
def build_graph(input_dir, directed=True):
    """Walk an Obsidian vault and build a graph of its wikilinks.

    Every ``.md`` file under *input_dir* becomes a node; every
    ``[[label]]`` found in its body becomes an edge to the referenced
    file (when the label resolves via the inverted index).

    Args:
        input_dir: root directory of the vault to scan.
        directed: when True (default) build a ``nx.DiGraph``,
            otherwise an undirected ``nx.Graph``.

    Returns:
        Tuple ``(G, page_ref)``: the networkx graph and the reference
        dict. NOTE(review): ``page_ref`` deliberately ends up holding
        BOTH name->id entries (via ``get_set_id``) and id->name entries
        (assigned in ``add_node``), so numeric ids are not consecutive;
        preserved as-is because callers may depend on the returned
        mapping — confirm before normalizing.
    """
    G = nx.DiGraph() if directed else nx.Graph()

    # Inverted index, used to handle Obsidian's 'shortest path' strategy.
    exclude = {'.git', '.obsidian', '.trash'}
    names_relpath = build_inverted_index(input_dir, exclude)
    get_path = partial(label_to_path, names_relpath)

    page_ref = {}

    def get_set_id(value):
        # Assign a fresh numeric id the first time *value* is seen.
        if value not in page_ref:
            page_ref[value] = len(page_ref) + 1
        return page_ref[value]

    # Generating the graph.
    for root, dirs, files in os.walk(input_dir, topdown=True):
        # Prune excluded directories in place so os.walk skips them.
        dirs[:] = [d for d in dirs if d not in exclude]
        for file_name in files:
            source_file = os.path.join(root, file_name)
            # Fix: the original computed os.path.splitext(file_name)
            # twice; once is enough.
            name, extension = os.path.splitext(file_name)
            with codecs.open(source_file, 'r', encoding='utf-8') as f:

                def add_node(fm):
                    content = fm.content
                    metadata = fm.metadata
                    # The vault-relative path is the unique node key.
                    unique_name = os.path.relpath(source_file, start=input_dir)
                    _id = get_set_id(unique_name)
                    page_ref[_id] = unique_name  # reverse mapping id -> name
                    node = {
                        'id': _id,
                        'title': get_title(metadata, content, source_file),
                        'source_file': source_file,
                        'metadata': metadata,
                    }
                    # Add the node, then one edge per resolvable wikilink.
                    G.add_nodes_from([
                        (_id, node),
                    ])
                    for label in get_links(content):
                        link = get_path(label)
                        if link is not None:
                            G.add_edge(_id, get_set_id(link))

                if extension == '.md':
                    try:
                        fm = frontmatter.load(f)
                        add_node(fm)
                    except AttributeError as err:
                        print("AttributeError error: {0}".format(err))
                        print("while processing front-matter of: {}".format(source_file))
                    # Fix: was a bare `except:`, which also swallows
                    # SystemExit/KeyboardInterrupt. Keep the original
                    # best-effort report-and-continue behaviour.
                    except Exception:
                        print("Unexpected error:", sys.exc_info()[0])
                        print("while processing front-matter of: {}".format(source_file))
    return G, page_ref
def build_inverted_index(input_dir, exclude):
    """Index the vault by bare file name.

    Maps every file name found under *input_dir* to the list of
    vault-relative directories that contain a file of that name
    (supports Obsidian's shortest-path link resolution).

    Args:
        input_dir: root directory to walk.
        exclude: directory names to skip entirely.

    Returns:
        dict of file name -> list of relative directory paths.
    """
    index = {}
    for current_dir, dirs, files in os.walk(input_dir, topdown=True):
        # Prune excluded directories in place so os.walk skips them.
        dirs[:] = [d for d in dirs if d not in exclude]
        rel_dir = os.path.relpath(current_dir, start=input_dir)
        for entry in files:
            index.setdefault(entry, []).append(rel_dir)
    return index
def label_to_path(inverted_index, label):
    """Resolve a wikilink *label* to a vault-relative file path.

    Args:
        inverted_index: file name -> list of directories, as produced by
            ``build_inverted_index``.
        label: text inside ``[[...]]``; may carry an ``|alias`` suffix,
            and may or may not include an extension or a directory path.

    Returns:
        The vault-relative path of the referenced file, or ``None`` when
        the name is unknown.

    Raises:
        ValueError: when a bare name matches files in several
            directories (ambiguous shortest-path reference).
    """
    # If the label contains a pipe '|', it has an alias; only the part
    # before the pipe identifies the target. (The always-true assert on
    # the token count was removed.)
    target = label.split(sep='|', maxsplit=1)[0]
    # Obsidian omits the '.md' extension for markdown notes.
    name, label_extension = os.path.splitext(target)
    file = f'{name}.md' if label_extension == '' else target

    # Obsidian's way: a bare name is unique, so we look up its path;
    # a label that already carries a path is taken as-is.
    directory, filename = os.path.split(file)
    if filename not in inverted_index:
        # The reference does not exist.
        return None
    if directory == '':
        # Look up the (unique) directory of the bare name.
        paths = inverted_index[filename]
        if len(paths) != 1:
            raise ValueError(f'There are ambiguous paths for {label!r}: {paths}')
        path = '' if paths[0] == '.' else paths[0]
        return os.path.join(path, filename)
    # BUG FIX: the original returned only `filename` here, silently
    # dropping the directory component of path-qualified labels — the
    # stated intent ("we do nothing") is to keep the path as given.
    return file
def get_links(contents):
    """Return every ``[[...]]`` wikilink label in *contents*, in order.

    Labels are returned verbatim, including any ``|alias`` suffix.
    """
    return re.findall(r'\[\[([^\]\]]+)\]\]', contents)
def get_title(metadata, content, source_file):
    """Choose a display title for a note.

    Preference order: the front-matter ``title`` field (returned even if
    it is None), then the text of the first ``<h1>`` in *content*, then
    the bare file name of *source_file*.
    """
    # Front-matter wins whenever the key is present.
    if 'title' in metadata:
        return metadata['title']
    # Fall back to the first <h1> heading, if any.
    heading = BeautifulSoup(content, 'html.parser').find('h1')
    if heading is not None:
        return heading.get_text()
    # Last resort: the file's own name.
    return os.path.split(source_file)[1]