In [1]:
# Relative path of the source PDF that the extraction cell reads.
# NOTE(review): a later cell rebinds `file_path` to 'stories.txt'; re-running
# the extraction cell after that one would pass 'stories.txt' to
# extract_text_from_pdf instead of this PDF.
file_path = 'data/english.pdf'

In [2]:
import os
# Working directory that the notebook's relative paths (e.g. 'data/english.pdf')
# resolve against. Set the CHATBOT_PROJECT_ROOT environment variable to run the
# notebook on another machine; the original hard-coded location is the fallback.
PROJECT_ROOT = os.environ.get('CHATBOT_PROJECT_ROOT', '/home/lakiet/Projects/personal/chatbot')
os.chdir(PROJECT_ROOT)

In [3]:
from pypdf import PdfReader  

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Each page's extracted text is followed by two newline characters, so the
    result ends with that separator whenever the document has at least one
    page. A document without pages yields an empty string.
    """
    pages = PdfReader(pdf_path).pages
    return "".join(page.extract_text() + "\n\n" for page in pages)


def write_to_txt(file_path, text):
    """Write ``text`` to ``file_path`` as UTF-8 and print a confirmation.

    The file is opened in "w" mode, so existing content is replaced.
    """
    with open(file_path, mode="w", encoding="utf-8") as out_file:
        out_file.write(text)
    print(f"Text successfully written to {file_path}")


In [4]:
# Extract the text of the PDF at `file_path` and save it as data/output.txt.
# NOTE(review): the parsing cells further down read 'stories.txt', not this
# output file — confirm how data/output.txt becomes stories.txt.
txt = extract_text_from_pdf(file_path)
write_to_txt("data/output.txt", txt)

Text successfully written to data/output.txt


In [None]:
import json

def parse_stories_to_json(file_path):
    """Parse a plain-text story collection into a JSON string.

    Expected layout: the first line of the file is the title of the first
    story. A story ends at its author line, recognised as a non-blank line
    shorter than 30 characters (after stripping) that has a blank line directly
    before and directly after it. The line two positions after an author line
    is taken as the title of the next story. Every other line is stripped and
    appended to the current story's text, joined with newlines.

    An author line on the very last line of the file has no following blank
    line, so it stays part of the last story's text.

    Args:
        file_path: Path to the UTF-8 text file to parse.

    Returns:
        A JSON array (indent=4) of objects holding "title", plus "story" and
        "author" when they were found. An empty file yields "[]".
    """
    # Read as UTF-8 explicitly rather than with the locale's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    n = len(lines)
    if n == 0:
        # An empty file has no title line to read.
        return json.dumps([], indent=4)

    stories = []
    # The first line is the title of the first story.
    current_story = {'title': lines[0].strip()}
    i = 1

    while i < n:
        line = lines[i].strip()

        # A blank line is never an author: without this check the middle of
        # three consecutive blank lines would end the story with an empty author.
        is_author = (
            line != ''
            and len(line) < 30
            and i + 1 < n
            and lines[i - 1].strip() == ''
            and lines[i + 1].strip() == ''
        )

        if is_author:
            # Close the story that this author line terminates.
            if current_story:
                stories.append(current_story)
                current_story = {}

            if stories:
                stories[-1]['author'] = line

            # Skip the blank line after the author; the next line is the title.
            i += 2
            if i < n:
                current_story['title'] = lines[i].strip()
                i += 1
        else:
            # Ordinary line: accumulate it into the current story's text.
            if 'story' not in current_story:
                current_story['story'] = line
            else:
                current_story['story'] += '\n' + line
            i += 1

    # The last story is still open when the input runs out.
    if current_story:
        stories.append(current_story)

    return json.dumps(stories, indent=4)

# Example usage
# NOTE(review): this rebinds `file_path`, which an earlier cell set to the PDF
# path; re-running the extraction cell afterwards would pass 'stories.txt' to
# extract_text_from_pdf.
file_path = 'stories.txt'  # Replace with your file path
json_output = parse_stories_to_json(file_path)
# Echo the full JSON document to the cell output.
print(json_output)

# Optionally, save the JSON to a file
with open('stories.json', 'w') as json_file:
    json_file.write(json_output)

In [13]:
import json

def is_blank(line):
    """Return True when ``line`` is empty or contains only whitespace."""
    return not line.strip()

def parse_txt_to_json(filepath, max_author_len=30):
    """Parse a plain-text story collection into a JSON string.

    Expected layout: the first non-blank line is the title of the first story.
    A story ends at its author line, recognised as a non-blank line whose
    stripped length is below ``max_author_len`` and which has a blank line
    directly before and directly after it. After an author line, blank lines
    are skipped and the next non-blank line becomes the next story's title.
    For the final story, the last non-blank line collected is taken as its
    author regardless of length.

    Args:
        filepath: Path to the UTF-8 text file to parse.
        max_author_len: Author lines must be strictly shorter than this many
            characters after stripping. Defaults to 30.

    Returns:
        A JSON array (indent=4) of objects with the keys "title", "story" and
        "author", in that order. A file with no non-blank line yields "[]".
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = [line.rstrip('\n') for line in f]

    n = len(lines)
    stories = []

    # Skip any initial blank lines.
    i = 0
    while i < n and is_blank(lines[i]):
        i += 1

    if i == n:
        # No non-blank line at all: there is nothing to parse.
        return json.dumps(stories, indent=4)

    # The very first non-blank line is the title of the first story.
    current_story = {"title": lines[i].strip(), "story": "", "author": ""}
    story_lines = []
    i += 1

    while i < n:
        line = lines[i]

        # i >= 1 here (the title line was consumed), so lines[i - 1] is valid.
        is_author = (
            not is_blank(line)
            and len(line.strip()) < max_author_len
            and is_blank(lines[i - 1])
            and i + 1 < n
            and is_blank(lines[i + 1])
        )

        if not is_author:
            # Ordinary line: part of the current story's text.
            story_lines.append(line)
            i += 1
            continue

        # The author line closes the current story.
        current_story["story"] = "\n".join(story_lines).strip()
        current_story["author"] = line.strip()
        stories.append(current_story)

        # Prepare for the next story.
        current_story = {"title": "", "story": "", "author": ""}
        story_lines = []

        # Skip the blank line after the author, then any further blank lines.
        i += 2
        while i < n and is_blank(lines[i]):
            i += 1
        if i < n:
            # The next non-blank line is the title of the next story.
            current_story["title"] = lines[i].strip()
        i += 1

    # The final author line has no blank line after it, so the loop above
    # collected it as story text; recover it here.
    if current_story["title"]:
        # Remove any trailing blank lines.
        while story_lines and is_blank(story_lines[-1]):
            story_lines.pop()
        # The last non-blank line, if any, is the author.
        if story_lines:
            current_story["author"] = story_lines.pop().strip()
        current_story["story"] = "\n".join(story_lines).strip()
        stories.append(current_story)

    return json.dumps(stories, indent=4)

# Example usage:
# Guarded so the conversion only runs when this code executes as the main module.
# NOTE(review): the input here is 'stories.txt', while the extraction cell
# writes 'data/output.txt' — confirm which file is meant to be parsed.
if __name__ == "__main__":
    filepath = "stories.txt"  # path to your text file
    json_output = parse_txt_to_json(filepath)
    
    # Persist the JSON string returned by the parser.
    with open('stories.json', 'w') as json_file:
        json_file.write(json_output)
