# Bibliothek

Python library to download, organize and analyze books from archive.org

It is composed by three main classes:
- `Library`: Here is where books are cataloged and stored.
- `Librarian`: A librarian is a class that allows you to interact with a `Library`
- `Book`: It contains the information related to a book.
- `Historian`: It analyzes book's compositions.

In [4]:
import json
import os
from pathlib import Path # implement paths!
import sqlite3
import tarfile
import tempfile
import zipfile
# os.environ['OPENCV_IO_ENABLE_JASPER']='TRUE' # enable jasper
import cv2
import pandas as pd
from tqdm import tqdm
from internetarchive import get_item, download

class Library:
    "Stores books and books catalog"

    def load_library(self, library_location):
        "Loads the library"
        self.library_location = Path(library_location)
        self.name = self.library_location.stem

        self.catalog_df = pd.read_csv(self.library_location / "catalog.csv")
        print(f'Library {self.name} loaded')
        print(f'Books: {len(self.catalog_df)}')

    def open_library(self, name):
        self.name = name
        "Creates a database and directory to store books" 
        # Create database and catalog table
        create_library(self.name)

    def add_book(self, book_id: str):
        "Adds a book to the library"
        book_directory = self.library_location / book_id
       
        # Check if book is already in the catalog
        if os.path.exists(book_directory):
            raise Exception("Book already in the library")
        if book_id in self.catalog_df.identifier:
            raise Exception("Book already in the catalog") 
        
        # Get book metadata
        book_metadata = get_item(book_id).item_metadata
        book_date = book_metadata['metadata']['date']
        book_subject = book_metadata['metadata']['subject']
        book_title = book_metadata['metadata']['title']
        book_year = book_metadata['metadata']['year']
        book_language = book_metadata['metadata']['language']

        # Download book
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_dir = Path(temp_dir)
            download_book(book_id, temp_dir)
            for file in os.listdir(temp_dir / book_id):
                print(temp_dir / book_id / file)
                extract(temp_dir / book_id / file, temp_dir / 'extracted')
                for file in os.listdir(temp_dir / 'extracted'):
                    convert_jp2_to_png(temp_dir / 'extracted', book_directory)
        
        # Add book to the catalog
        book_df = pd.DataFrame(
            {'identifier': book_id,
             'title': book_title,
             'subject': book_subject,
             'date': book_date,
             'year': book_year,
             'language': book_language}
        )
        self.catalog_df = pd.concat([self.catalog_df, book_df])
        self.catalog_df.to_csv(self.library_location / "catalog.csv", index=False)
        print('Book added to the catalog')

# Library utility functions
def check_if_library_exists(library_name: str) -> bool:
    "Checks if the library exists"
    return os.path.exists(library_name)

def create_library(name: str, directory='.'):
    "Creates a .csv file with the books catalog"
    # Check if the library exists
    print('- - - Creating library - - -')
    print('Name:', name)
    library_path = Path(directory) / name
    print('Location:', library_path)
    if check_if_library_exists(library_path):
        raise Exception(f"The library {name} already exists")

    # Create directories
    os.makedirs(library_path / "books")
    
    # Create the library catalog
    library_df =  pd.DataFrame(columns=['identifier', 'date', 'subject', 'title', 'year', 'language'])
    library_df.to_csv(library_path / 'catalog.csv', index=False)
    
    print('Done')

# Download and process book
def download_book(book_id, dest_path, verbose=True):
    print(f'Downloading {book_id}')
    download(
        book_id,
        destdir=dest_path,
        formats="Single Page Processed JP2 ZIP",
        verbose=verbose,
    )

def extract(file, dest_path):
    print(f'Extracting {file} to {dest_path}')
    if file.suffix == ".zip":
        with zipfile.ZipFile(file, "r") as zip_ref:
            zip_ref.extractall(dest_path)
    if file.suffix == ".tar":
        tar = tarfile.open(file)
        tar.extractall(dest_path)
        tar.close()
    print('Done')


def convert_jp2_to_png(jp2_directory, png_directory):
    print('Converting images to png')
    image_files_list = list(jp2_directory.rglob('*.jp2'))
    for file in tqdm(image_files_list):
        print(file)
        image = cv2.imread(str(file))
        filename = str((Path(png_directory) / file.stem).with_suffix('.png'))
        cv2.imwrite(filename, image)
    print('Done')

In [10]:
library = Library()
library.open_library('alexandria')

- - - Creating library - - -
Name: alexandria
Location: alexandria
Done


In [17]:
download_book('1200thedresdencodex1200ad', 'test')

Downloading 1200thedresdencodex1200ad
1200thedresdencodex1200ad:
 downloaded 1200thedresdencodex1200ad/1200 The Dresden Codex 1200 AD_jp2.zip to test/1200thedresdencodex1200ad/1200 The Dresden Codex 1200 AD_jp2.zip


In [2]:
library = Library()
library.load_library('alexandria')

for file in os.listdir('test/1200thedresdencodex1200ad'):
    extract(Path('test/1200thedresdencodex1200ad') / file, 'test/extracted')
    for directory in os.listdir('test/extracted'):
        convert_jp2_to_png(Path('test/extracted') / directory, '1200thedresdencodex1200ad')


Library alexandria loaded
Books: 0
Extracting test/1200thedresdencodex1200ad/1200 The Dresden Codex 1200 AD_jp2.zip to test/extracted
Done
Converting images to png


0it [00:00, ?it/s]


Done
Converting images to png


  0%|          | 0/78 [00:00<?, ?it/s]image too large
error: cannot decode code stream
  0%|          | 0/78 [00:00<?, ?it/s]

test/extracted/1200 The Dresden Codex 1200 AD_jp2/1200 The Dresden Codex 1200 AD_0034.jp2





error: OpenCV(4.5.5) /Users/runner/miniforge3/conda-bld/libopencv_1644507859398/work/modules/imgcodecs/src/loadsave.cpp:801: error: (-215:Assertion failed) !_img.empty() in function 'imwrite'


In [1]:
library = Library()
library.load_library('alexandria')
# library.add_book('1200thedresdencodex1200ad')

NameError: name 'Library' is not defined

In [5]:
image = cv2.imread("test/extracted/1200 The Dresden Codex 1200 AD_jp2/1200 The Dresden Codex 1200 AD_0000.jp2")

image too large
error: cannot decode code stream
