In [None]:
import praw
import mistune
import os
import yaml
from datetime import datetime 

PROJECTS_DIR = './projects'
OUTPUT_DIR = './output'

os.makedirs(PROJECTS_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [2]:
# Create a Reddit instance
reddit = praw.Reddit('HFY2EPUB', config_interpolation='basic')

## HFY2EPUB: A Job For A Deathworlder (by u/Lanzen_Jars)

### Parsing the wiki for links and saving them locally
https://www.reddit.com/r/HFY/wiki/series/a_job_for_a_deathworlder/

In [3]:
project_name = 'A Job For A Deathworlder'
project_path = os.path.join(PROJECTS_DIR, project_name)
if not os.path.exists(project_path):
    os.makedirs(project_path)


In [8]:
# Extract the links to the chapters from wiki
SUBREDDIT = reddit.subreddit('HFY')
WIKI_PAGE = SUBREDDIT.wiki['series/a_job_for_a_deathworlder']
WIKI_PAGE.content_md

"[**Lanzen_Jars**](/r/HFY/wiki/authors/lanzen_jars)\n\n##**Discord**##\nhttps://discord.gg/GeHjWEaTuh\n\n##**Patreon**##\n[Patreon](https://www.patreon.com/Lanzen_Jars)\n\n##**A Job For A Deathworlder**\n* [A job for a deathworlder [Chapter one]](https://www.reddit.com/r/HFY/comments/m71tyw/a_job_for_a_deathworlder_chapter_one/) \n* [A job for a deathworlder [Chapter 2]](https://www.reddit.com/r/HFY/comments/m7sqqd/a_job_for_a_deathworlder_chapter_2/) \n* [A job for a deahtworlder [Chapter 3]](https://www.reddit.com/r/HFY/comments/mbgi4e/a_job_for_a_deahtworlder_chapter_3/) \n* [A job for a deathworlder [Chapter 4]](https://www.reddit.com/r/HFY/comments/mgh2va/a_job_for_a_deathworlder_chapter_4/) \n* [A job for a deathworlder [Chapter 5]](https://www.reddit.com/r/HFY/comments/mlc75v/a_job_for_a_deathworlder_chapter_5/) \n* [A job for a deathworlder [Chapter 6]](https://www.reddit.com/r/HFY/comments/mq2smh/a_job_for_a_deathworlder_chapter_6/)\n* [A job for a deathworlder [Chapter 7] [Pa

In [9]:
import re

def fix_headings(text):
    # Use a regular expression to add a space after # if it's missing
    return re.sub(r'(^|\s)(#+)(?=[^\s#])', r'\1\2 ', text)

# Example usage
markdown_text = "#Heading 1\n##Heading 2"
fixed_text = fix_headings(markdown_text)
print(fixed_text)

# Heading 1
## Heading 2


In [4]:
# Extract the links to the chapters from wiki
SUBREDDIT = reddit.subreddit('HFY')
WIKI_PAGE = SUBREDDIT.wiki['series/a_job_for_a_deathworlder']

class SectionLinkRenderer(mistune.HTMLRenderer):
    def __init__(self, section_title):
        super().__init__()
        self.links =[]
        self.in_section = False
        self.section_title = section_title

    def strong(self, text):
        return text
    
    def emphasis(self, text):
        return text

    def heading(self, text, level, **attrs):
        if text.strip().lower() == self.section_title.strip().lower():
            self.in_section = True
        else:
            self.in_section = False
        return f'<h{level}>{text}</h{level}>'
    
    def link(self, text, url, title=None):
        if self.in_section:
            text = text.replace('/','-')
            self.links.append((url, text))
        return super().link(url, text, title)

def get_chapter_links(section_name):
    renderer = SectionLinkRenderer(section_name)
    markdown = mistune.create_markdown(renderer=renderer)
    markdown(WIKI_PAGE.content_md.replace('##', '## '))
    return renderer.links

chapter_links = get_chapter_links('A Job For A Deathworlder')

# Saving the chapter links to a YAML file with date
revision_date = datetime.fromtimestamp(WIKI_PAGE.revision_date).strftime('%Y-%m-%d')

file_name = f'wiki_page_{revision_date}.yaml'
file_path = os.path.join(project_path, 'wiki', file_name)

if not os.path.exists(os.path.join(project_path, 'wiki')):
    os.makedirs(os.path.join(project_path, 'wiki'))

data = {
    'revision_date': revision_date,
    'timestamp': WIKI_PAGE.revision_date,
    'chapters': [{"url": link[0], "text": link[1]} for link in chapter_links]
}

with open(file_path, 'w', encoding='utf8') as yaml_file:
    yaml.dump(data, yaml_file, allow_unicode=True)

### Comparison of downloaded chapters and ones in the wiki file

Download missing chapters and save them in the raw directory.


In [5]:
import queue
from glob import glob

raw_dir = os.path.join(project_path, 'raw')
if not os.path.exists(raw_dir):
    os.makedirs(raw_dir)

downloaded_chapters = set()
for chapter in os.listdir(raw_dir):
    if chapter.endswith('.md'):
        downloaded_chapters.add(os.path.splitext(chapter)[0])

wiki_pages = glob(os.path.join(project_path, 'wiki', '*.yaml'))
wiki_pages.sort(key=os.path.getmtime, reverse=True)
wiki_chapters = []

with open(wiki_pages[0], 'r', encoding='utf8') as yaml_file:
    wiki_data = yaml.safe_load(yaml_file)
    for chapter in wiki_data['chapters']:
        wiki_chapters.append(chapter)

chapter_queue = queue.Queue()
for chapter in wiki_chapters:
    if chapter['text'] not in downloaded_chapters:
        chapter_queue.put(chapter)

In [6]:
def download_chapter(url, text):
    try:
        submission = reddit.submission(url=url)

        file_name = f"{text}.md"
        file_path = os.path.join(raw_dir, file_name)

        content = []
        content.append(submission.selftext)
        
        post_author = submission.author.name
        top_level_comments = submission.comments.list()

        top_author_comment = None
        for comment in top_level_comments:
            if comment.author and comment.author.name == post_author:
                content.append(comment.body)
                top_author_comment = comment
                break
        
        if not top_author_comment:
            print(f"No top-level comment by the author found for {text}.")
            with open(file_path, 'w', encoding='utf8') as f:
                f.write('\n'.join(content))
                print(f"Downloaded: {file_name}")
            return

        for reply in top_author_comment.replies:
            if reply.author and reply.author.name == post_author:
                content.append(reply.body)
                top_author_comment = reply

        with open(file_path, 'w', encoding='utf8') as f:
            f.write('\n\n ----- \n\n'.join(content))
            print(f"Downloaded: {file_name}")

    except Exception as e:
        print(f"Error downloading {text}: {e}")

while not chapter_queue.empty():
    chapter = chapter_queue.get()
    download_chapter(chapter['url'], chapter['text'])
    chapter_queue.task_done()

Downloaded: A job for a deathworlder [Chapter 224].md
Downloaded: A job for a deathwolrder [Chapter 225].md


### Preprocessing files before conversion

In [68]:
import re
processed_path = os.path.join(project_path, 'processed')
if not os.path.exists(processed_path):
    os.makedirs(processed_path)

raw_chapters = glob(os.path.join(raw_dir, '*.md'))

def process_file(file_path):
    # Read the file content
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Find the chapter line
    chapter_line_index = None
    for i, line in enumerate(lines):
        if line.startswith('#') and 'Chapter' in line:
            chapter_line_index = i
            break
        elif line.startswith('Chapter'):
            chapter_line_index = i
            break
        elif line.startswith('*Chapter'):
            chapter_line_index = i
            lines[i] = line[1:-1]
            break
        elif line.startswith('**Chapter'):
            chapter_line_index = i
            lines[i] = line[2:-2]
            break

    if chapter_line_index is None:
        print(f"No chapter line found in {file_path}")
        file_name = os.path.basename(file_path)
        file_name_match = re.search(r'\[Chapter (\d+)(?: (.+))?\]', file_name)
        if file_name_match:
            chapter_number_from_file = file_name_match.group(1)
            chapter_name_from_file = file_name_match.group(2).strip() if file_name_match.group(2) else ''
            lines[0] = f'# Chapter {chapter_number_from_file} {chapter_name_from_file}\n'
            chapter_line_index = 0
        else:
            print(f"Could not determine chapter line for {file_path}")
            return

    # Extract the chapter line
    chapter_line = lines[chapter_line_index].strip()

    # Process the chapter line
    if not chapter_line.startswith('#'):
        chapter_line = f'# {chapter_line}'

    # Extract chapter number and name from the line
    chapter_text = chapter_line.replace('#', '').strip()
    chapter_match = re.match(r'Chapter (\d+)(?: (.+))?', chapter_text)
    if not chapter_match:
        print(f"Could not parse chapter line: {chapter_line}")
        return

    chapter_number = chapter_match.group(1)
    chapter_name = chapter_match.group(2).strip() if chapter_match.group(2) else ''

    # If chapter name is missing, get it from the file name
    if not chapter_name:
        file_name = os.path.basename(file_path)
        file_name_match = re.search(r'\[Chapter (\d+)(?: (.+))?\]', file_name)
        if file_name_match:
            chapter_number_from_file = file_name_match.group(1)
            chapter_name_from_file = f'Chapter {chapter_number_from_file}'
            chapter_line = f'# {chapter_name_from_file}'

    # Trim the top lines
    processed_lines = [chapter_line + '\n'] + lines[chapter_line_index + 1:]

    # Save the new file
    with open(os.path.join(processed_path, f'{os.path.basename(file_path).replace('deaht', 'death')}'), 'w', encoding='utf-8') as new_file:
        new_file.writelines(processed_lines)

    print(f"Processed file saved as {os.path.basename(file_path)}.md")

def process_chp_one(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    chapter_lines = lines[4:]
    chapter_lines[0] = '# Chapter 1 - A Job For A Deathworlder\n'

    with open(os.path.join(processed_path, 'A job for a deathworlder [Chapter 1].md'), 'w', encoding='utf-8') as new_file:
        new_file.writelines(chapter_lines)

processed_files = glob(os.path.join(processed_path, '*.md'))
for raw_chapter in raw_chapters:
    if os.path.basename(raw_chapter).replace('deaht', 'death') in [os.path.basename(f) for f in processed_files]: continue
    if 'one' in raw_chapter.lower(): 
        process_chp_one(raw_chapter)
        continue
    process_file(raw_chapter)


Processed file saved as A job for a deahtworlder [Chapter 108].md.md
Processed file saved as A job for a deahtworlder [Chapter 12].md.md
Processed file saved as A job for a deahtworlder [Chapter 139].md.md
Processed file saved as A job for a deahtworlder [Chapter 179].md.md
Processed file saved as A job for a deahtworlder [Chapter 181].md.md
Processed file saved as A job for a deahtworlder [Chapter 3].md.md
Processed file saved as A job for a deahtworlder [Chapter 53].md.md


In [None]:
import subprocess
import yaml
import os
from glob import glob
import tempfile
import re

processed_path = os.path.join(project_path, 'processed')
chapters = glob(os.path.join(processed_path, '*.md'))
chapters.sort(key=lambda x: int(re.search(r'Chapter (\d+)', os.path.basename(x)).group(1)))

# Add a YAML header to the first chapter file
wiki_pages = glob(os.path.join(project_path, 'wiki', '*.yaml'))
wiki_pages.sort(key=os.path.getmtime, reverse=True)
with open(wiki_pages[0], 'r', encoding='utf8') as yaml_file:
    with open(os.path.join(project_path, 'title.yaml'), 'r', encoding='utf8') as title_file:
        title_data = yaml.safe_load(title_file)
    wiki_data = yaml.safe_load(yaml_file)
    title_data['date'] = wiki_data['revision_date']
    with open(os.path.join(project_path, 'title.yaml'), 'w', encoding='utf8') as title_file:
        yaml.dump(title_data, title_file, allow_unicode=True)

# Create a temporary file to hold the combined content
with tempfile.NamedTemporaryFile(suffix='.md', delete=False) as combined_md_file:
    # Write the YAML header to the combined file
    with open(os.path.join(project_path, 'title.yaml'), 'r', encoding='utf8') as title_file:
        title_data = yaml.safe_load(title_file)
        combined_md_file.write('---\n'.encode('utf-8'))
        combined_md_file.write(yaml.dump(title_data, allow_unicode=True).encode('utf-8'))
        combined_md_file.write('---\n\n'.encode('utf-8'))
        print('Added YAML header to combined file.')
    
    for md_file in chapters:
        with open(md_file, 'r', encoding='utf-8') as f:
            # Write each file's content to the combined file
            combined_md_file.write(f.read().encode('utf-8'))
            # Add a newline between different files (optional)
            combined_md_file.write(b'\n\n')
            print(f"Added {os.path.basename(md_file)} to combined file.")

    combined_md_path = combined_md_file.name

pandoc_command = ['pandoc', combined_md_path, '-o', f'./output/{project_name}.epub']

try:
    # Execute the Pandoc command and capture stderr
    result = subprocess.run(
        pandoc_command,
        check=True,
        stderr=subprocess.PIPE,  # Capture stderr
        text=True  # Decode the output as text
    )
    print(f"Successfully converted {project_name} to EPUB.")
except subprocess.CalledProcessError as e:
    print(f"Failed to convert {project_name}: {e}")
    print("Error details:")
    print(e.stderr)  # Print the stderr output
finally:
    os.unlink(combined_md_path)

Added A job for a deathworlder [Chapter 1].md to combined file.
Added A job for a deathworlder [Chapter 2].md to combined file.
Added A job for a deathworlder [Chapter 3].md to combined file.
Added A job for a deathworlder [Chapter 4].md to combined file.
Added A job for a deathworlder [Chapter 5].md to combined file.
Added A job for a deathworlder [Chapter 6].md to combined file.
Added A job for a deathworlder [Chapter 7] [Part 1].md to combined file.
Added A job for a deathworlder [Chapter 7] [Part 2].md to combined file.
Added A job for a deathworlder [Chapter 8].md to combined file.
Added A job for a deathworlder [Chapter 9].md to combined file.
Added A job for a deathworlder [Chapter 10].md to combined file.
Added A job for a deathworlder [Chapter 11].md to combined file.
Added A job for a deathworlder [Chapter 12].md to combined file.
Added A job for a deathworlder [Chapter 13].md to combined file.
Added A job for a deathworlder [Chapter 14].md to combined file.
Added A job for a