Data Load

In [1]:
import json

In [2]:
# Load the list of Gutenberg source descriptions. JSON text is UTF-8, so the
# encoding is pinned explicitly instead of relying on the platform's locale
# default (which is not UTF-8 everywhere).
with open('data/gutenberg.json', 'r', encoding='utf-8') as source:
    data = json.load(source)

# Show the first entry as a quick sanity check of the loaded structure.
data[0]

{'doc_id': 1,
 'url': 'https://www.gutenberg.org/files/1342/1342-0.txt',
 'author': 'Jane Austen',
 'origin': 'Pride and Prejudice',
 'start_line': 39,
 'end_line': 14567}

In [3]:
from urllib.request import Request, urlopen
import re

In [4]:
# Test data: one entry per Gutenberg text that is fetched further down.
# `start_line` / `end_line` are used as slice bounds on the downloaded text's
# lines; `search_pattern` has two capture groups (title, content).

# Aesop: a non-empty title line, one blank line, then the body, which runs
# lazily until five newlines that are followed by another "title + blank line"
# pair, or until the end of the text.
_aesop_pattern = re.compile(r'([^\n]+)\n\n(.*?)(?=\n{5}(?=[^\n]+\n\n)|$)', re.DOTALL)

# Grimm: a title made of upper-case letters, digits, spaces and a few
# punctuation characters, three newlines, then the body, which runs lazily
# until five newlines or the end of the text.
_grimm_pattern = re.compile(r"([A-Z0-9 ,.'!?-]+)\n{3}(.*?)(?=\n{5}|$)", re.DOTALL)

data_dict = [
    dict(
        url="https://www.gutenberg.org/files/21/21-0.txt",
        author="Aesop",
        origin="Aesop’s Fables",
        start_line=39,
        end_line=4777,
        search_pattern=_aesop_pattern,
    ),
    dict(
        url="https://www.gutenberg.org/files/2591/2591-0.txt",
        author="Jacob and Wilhelm Grimm",
        origin="Grimms' Fairy Tales",
        start_line=123,
        end_line=9239,
        search_pattern=_grimm_pattern,
    ),
]

In [5]:
from document import Document

In [6]:
def split_chapters(lines):
    """Split a sequence of text lines into ``(title, content)`` chapters.

    The lines are joined with newlines and the result is cut wherever four or
    more consecutive newlines occur. Within each piece, every line is stripped
    and empty lines are dropped; the first remaining line becomes the title
    and the rest, re-joined with newlines, becomes the content.

    Args:
        lines: Iterable of strings, one per line of the source text.

    Returns:
        A list of ``(title, content)`` tuples in document order. Pieces that
        contain no non-blank line are skipped, so empty input yields ``[]``.
    """
    full_text = '\n'.join(lines)

    chapters = []
    for raw in re.split(r'\n{4,}', full_text.strip()):
        parts = [line.strip() for line in raw.split('\n') if line.strip()]
        if not parts:
            continue

        title, *body = parts
        # Every element of `body` is already stripped and non-empty, so the
        # joined content needs no further trimming.
        chapters.append((title, '\n'.join(body)))

    return chapters

In [7]:
# For every configured source: download the text, keep only the configured
# line window, split it into (title, content) pairs with the source's regex
# and print a short preview.
# The loop variable is intentionally not named `data`, so the list loaded from
# data/gutenberg.json earlier in the notebook is not overwritten by this cell.
for source_cfg in data_dict:
    print('='*10)
    with urlopen(Request(url=source_cfg['url'])) as response:
        raw_text = response.read().decode('utf-8')

    lines = raw_text.splitlines()
    relevant_lines = lines[source_cfg['start_line']:source_cfg['end_line']]
    relevant_text = '\n'.join(relevant_lines)

    # Both search patterns define two groups, so findall yields 2-tuples.
    matches = source_cfg['search_pattern'].findall(relevant_text)
    print(matches[:2])

    fables = [
        {
            'title': title.strip(),
            'content': content.strip(),
            'author': source_cfg['author'],
            'origin': source_cfg['origin'],
        }
        for title, content in matches
    ]

    print(len(fables))

    # Preview: first three fables, content cut to 150 characters.
    for fable in fables[:3]:
        print(f"\nTitle: {fable['title']}\n")
        print(f"Content: {fable['content'][:150]}...\n")

[('The Wolf And The Lamb', 'WOLF, meeting with a Lamb astray from the fold, resolved not to lay\nviolent hands on him, but to find some plea to justify to the Lamb the\nWolf’s right to eat him. He thus addressed him: “Sirrah, last year you\ngrossly insulted me.” “Indeed,” bleated the Lamb in a mournful tone\nof voice, “I was not then born.” Then said the Wolf, “You feed in my\npasture.” “No, good sir,” replied the Lamb, “I have not yet tasted\ngrass.” Again said the Wolf, “You drink of my well.” “No,” exclaimed the\nLamb, “I never yet drank water, for as yet my mother’s milk is both food\nand drink to me.” Upon which the Wolf seized him and ate him up, saying,\n“Well! I won’t remain supperless, even though you refute every one of my\nimputations.” The tyrant will always find a pretext for his tyranny.'), ('The Bat And The Weasels', 'A BAT who fell upon the ground and was caught by a Weasel pleaded to be\nspared his life. The Weasel refused, saying that he was by nature the\nenemy of al

In [8]:
from my_module import load_collection_from_url

In [10]:
# Load the first configured source (Aesop) through the project helper,
# forwarding each configuration value as the keyword argument of the same name.
first_source = data_dict[0]
documents = load_collection_from_url(
    url=first_source['url'],
    author=first_source['author'],
    origin=first_source['origin'],
    start_line=first_source['start_line'],
    end_line=first_source['end_line'],
    search_pattern=first_source['search_pattern'],
)

In [11]:
# Print a short summary of the first five loaded documents: identifying
# fields, the first 100 characters of the raw text and the first 20 terms.
preview_docs = documents[:5]
for doc in preview_docs:
    summary = f"""
        document_id = {doc.document_id}.
        title = {doc.title}
        Author = {doc.author}
        origin = {doc.origin}
        raw_text = {doc.raw_text[:100]}
        terms = {doc.terms[:20]}"""
    print(summary)


        document_id = 0.
        title = The Wolf And The Lamb
        Author = Aesop
        origin = Aesop’s Fables
        raw_text = WOLF, meeting with a Lamb astray from the fold, resolved not to lay
violent hands on him, but to fin
        terms = ['WOLF,', 'meeting', 'with', 'a', 'Lamb', 'astray', 'from', 'the', 'fold,', 'resolved', 'not', 'to', 'lay', 'violent', 'hands', 'on', 'him,', 'but', 'to', 'find']

        document_id = 1.
        title = The Bat And The Weasels
        Author = Aesop
        origin = Aesop’s Fables
        raw_text = A BAT who fell upon the ground and was caught by a Weasel pleaded to be
spared his life. The Weasel 
        terms = ['A', 'BAT', 'who', 'fell', 'upon', 'the', 'ground', 'and', 'was', 'caught', 'by', 'a', 'Weasel', 'pleaded', 'to', 'be', 'spared', 'his', 'life.', 'The']

        document_id = 2.
        title = The Ass And The Grasshopper
        Author = Aesop
        origin = Aesop’s Fables
        raw_text = AN ASS having heard some Gr

In [12]:
# Whitespace-tokenise the first document's raw text. A no-argument split()
# already treats newlines like any other whitespace, so no prior
# replace('\n', ' ') is needed.
print(documents[0].raw_text.split())

['WOLF,', 'meeting', 'with', 'a', 'Lamb', 'astray', 'from', 'the', 'fold,', 'resolved', 'not', 'to', 'lay', 'violent', 'hands', 'on', 'him,', 'but', 'to', 'find', 'some', 'plea', 'to', 'justify', 'to', 'the', 'Lamb', 'the', 'Wolf’s', 'right', 'to', 'eat', 'him.', 'He', 'thus', 'addressed', 'him:', '“Sirrah,', 'last', 'year', 'you', 'grossly', 'insulted', 'me.”', '“Indeed,”', 'bleated', 'the', 'Lamb', 'in', 'a', 'mournful', 'tone', 'of', 'voice,', '“I', 'was', 'not', 'then', 'born.”', 'Then', 'said', 'the', 'Wolf,', '“You', 'feed', 'in', 'my', 'pasture.”', '“No,', 'good', 'sir,”', 'replied', 'the', 'Lamb,', '“I', 'have', 'not', 'yet', 'tasted', 'grass.”', 'Again', 'said', 'the', 'Wolf,', '“You', 'drink', 'of', 'my', 'well.”', '“No,”', 'exclaimed', 'the', 'Lamb,', '“I', 'never', 'yet', 'drank', 'water,', 'for', 'as', 'yet', 'my', 'mother’s', 'milk', 'is', 'both', 'food', 'and', 'drink', 'to', 'me.”', 'Upon', 'which', 'the', 'Wolf', 'seized', 'him', 'and', 'ate', 'him', 'up,', 'saying,', 