Data Load

In [1]:
import json

In [2]:
with open('data/gutenberg.json','r') as source:
    data = json.load(source)

data[0]

{'doc_id': 1,
 'url': 'https://www.gutenberg.org/files/1342/1342-0.txt',
 'author': 'Jane Austen',
 'origin': 'Pride and Prejudice',
 'start_line': 39,
 'end_line': 14567}

In [3]:
from my_module import load_collection_from_url

docs = load_collection_from_url(
    url=data[0]['url'],
    author=data[0]['author'],
    origin=data[0]['origin'],
    start_line=data[0]['start_line'],
    end_line=data[0]['end_line'],
    search_pattern=None)


In [4]:
# Test data

data_dict = [
        {
                "url": "https://www.gutenberg.org/files/21/21-0.txt",
                "author": "Aesop",
                "origin": "Aesop’s Fables",
                "start_line": 39,
                "end_line": 4777
        },
        {
                "url" : "https://www.gutenberg.org/files/2591/2591-0.txt",
                "author" : "Jacob and Wilhelm Grimm",
                "origin" : "Grimms' Fairy Tales",
                "start_line" : 123,
                "end_line" : 9239
        }
]

In [5]:
from urllib.request import Request, urlopen
import re

In [6]:
def split_chapters(lines):
    full_text = '\n'.join(lines)
    raw_chapters = re.split(r'\n{4,}', full_text.strip())
    print(raw_chapters)

    chapters = []
    for raw in raw_chapters:
        parts = [line.strip() for line in raw.split('\n') if line.strip()]
        if not parts:
            continue
        
        title = parts[0]
        content = '\n'.join(parts[1:])
        chapters.append((title, content.strip()))
    
    return chapters

In [7]:
for data in data_dict:
    print('='*10)
    with urlopen(Request(url=data['url'])) as response:
        raw_text = response.read().decode('utf-8')
        chapter_text = raw_text.splitlines()[data.get('start_line'):data.get('end_line')]
        # print(chapter_text)
        chapters = split_chapters(chapter_text)
        # print('Total Chapters : ', chapters.__len__())
        # for (title, chapter) in chapters[:5]:
        #     print(title, chapter, sep='\n')
        #     print('-'*10)


['THE GOLDEN BIRD\n\n\nA certain king had a beautiful garden, and in the garden stood a tree\nwhich bore golden apples. These apples were always counted, and about\nthe time when they began to grow ripe it was found that every night one\nof them was gone. The king became very angry at this, and ordered the\ngardener to keep watch all night under the tree. The gardener set his\neldest son to watch; but about twelve o’clock he fell asleep, and in\nthe morning another of the apples was missing. Then the second son was\nordered to watch; and at midnight he too fell asleep, and in the morning\nanother apple was gone. Then the third son offered to keep watch; but\nthe gardener at first would not let him, for fear some harm should come\nto him: however, at last he consented, and the young man laid himself\nunder the tree to watch. As the clock struck twelve he heard a rustling\nnoise in the air, and a bird came flying that was of pure gold; and as\nit was snapping at one of the apples with it

In [8]:
from my_module import load_collection_from_url

In [9]:
documents = load_collection_from_url(url=data_dict[0]['url'],author=data_dict[0]['author'],origin=data_dict[0]['origin'],start_line=data_dict[0]['start_line'],end_line=data_dict[0]['end_line'])

In [13]:
for doc in documents[:5]:
    print(f"""
        document_id = {doc.document_id}.
        title = {doc.title}
        Author = {doc.author}
        origin = {doc.origin}
        raw_text = {doc.raw_text[:100]}
        terms = {doc.terms[:20]}""")


        document_id = 0.
        title = The Wolf And The Lamb
        Author = Aesop
        origin = Aesop’s Fables
        raw_text = WOLF, meeting with a Lamb astray from the fold, resolved not to lay
violent hands on him, but to fin
        terms = ['WOLF,', 'meeting', 'with', 'a', 'Lamb', 'astray', 'from', 'the', 'fold,', 'resolved', 'not', 'to', 'lay', 'violent', 'hands', 'on', 'him,', 'but', 'to', 'find']

        document_id = 1.
        title = The Bat And The Weasels
        Author = Aesop
        origin = Aesop’s Fables
        raw_text = A BAT who fell upon the ground and was caught by a Weasel pleaded to be
spared his life. The Weasel 
        terms = ['A', 'BAT', 'who', 'fell', 'upon', 'the', 'ground', 'and', 'was', 'caught', 'by', 'a', 'Weasel', 'pleaded', 'to', 'be', 'spared', 'his', 'life.', 'The']

        document_id = 2.
        title = The Ass And The Grasshopper
        Author = Aesop
        origin = Aesop’s Fables
        raw_text = AN ASS having heard some Gr

In [11]:
print(documents[0].raw_text.replace('\n',' ').split())

['WOLF,', 'meeting', 'with', 'a', 'Lamb', 'astray', 'from', 'the', 'fold,', 'resolved', 'not', 'to', 'lay', 'violent', 'hands', 'on', 'him,', 'but', 'to', 'find', 'some', 'plea', 'to', 'justify', 'to', 'the', 'Lamb', 'the', 'Wolf’s', 'right', 'to', 'eat', 'him.', 'He', 'thus', 'addressed', 'him:', '“Sirrah,', 'last', 'year', 'you', 'grossly', 'insulted', 'me.”', '“Indeed,”', 'bleated', 'the', 'Lamb', 'in', 'a', 'mournful', 'tone', 'of', 'voice,', '“I', 'was', 'not', 'then', 'born.”', 'Then', 'said', 'the', 'Wolf,', '“You', 'feed', 'in', 'my', 'pasture.”', '“No,', 'good', 'sir,”', 'replied', 'the', 'Lamb,', '“I', 'have', 'not', 'yet', 'tasted', 'grass.”', 'Again', 'said', 'the', 'Wolf,', '“You', 'drink', 'of', 'my', 'well.”', '“No,”', 'exclaimed', 'the', 'Lamb,', '“I', 'never', 'yet', 'drank', 'water,', 'for', 'as', 'yet', 'my', 'mother’s', 'milk', 'is', 'both', 'food', 'and', 'drink', 'to', 'me.”', 'Upon', 'which', 'the', 'Wolf', 'seized', 'him', 'and', 'ate', 'him', 'up,', 'saying,', 