# Download TEI files

## 1 Installing needed packages

When running on a remote JupyterLab instance, the required packages have to be installed explicitly:

In [None]:
# Install a pip package in the current Jupyter kernel
!pip install --quiet requests==2.31.0
!pip install --quiet pandas==2.1.4

from utils import (
    fetch_and_extract_zip,
    fetch_and_format_xml,
    download_ntvmr_transcripts,
    download_ntvmr_manuscripts,
    get_docID_set,
)
from concurrent.futures import ProcessPoolExecutor
import time
import os
import glob

## 2 Download from IGNTP (transcriptions only)

In [None]:
# Zip archives to download, grouped by host.
urls = [
    # iohannes.com: one archive per manuscript type
    "http://www.iohannes.com/transcriptions/XML/greek/papyri.zip",
    "http://www.iohannes.com/transcriptions/XML/greek/majuscules.zip",
    "http://www.iohannes.com/transcriptions/XML/greek/minuscules.zip",
    "http://www.iohannes.com/transcriptions/XML/greek/lectionaries.zip",
    # Disabled epistulae.org locations; the itseeweb URLs below list the same
    # three books (plus Romans and 1Cor).
    # "http://www.epistulae.org/downloads/Galatians_Greek_Transcriptions.zip",
    # "http://www.epistulae.org/downloads/Ephesians_Greek_transcriptions.zip",
    # "http://www.epistulae.org/downloads/Philippians_Greek_transcriptions.zip",
    # itseeweb.cal.bham.ac.uk: one archive per book
    "https://itseeweb.cal.bham.ac.uk/epistulae/downloads/Romans_Greek_transcriptions.zip",
    "https://itseeweb.cal.bham.ac.uk/epistulae/downloads/Galatians_Greek_transcriptions.zip",
    "https://itseeweb.cal.bham.ac.uk/epistulae/downloads/Ephesians_Greek_transcriptions.zip",
    "https://itseeweb.cal.bham.ac.uk/epistulae/downloads/Philippians_Greek_transcriptions.zip",
    "https://itseeweb.cal.bham.ac.uk/epistulae/downloads/1Cor_Greek_transcriptions.zip",
]

# Every archive is handed to fetch_and_extract_zip with the same target folder.
igntp_dir = "../data/transcriptions/igntp"
for url in urls:
    fetch_and_extract_zip(url, igntp_dir)

# The basetext files are not needed. Collect every XML file whose name
# contains "basetext" anywhere below ../data/transcriptions/ (all subfolders,
# not only igntp) and delete them one by one, printing each path first.
basetext_pattern = os.path.join("../data/transcriptions/", "**", "*basetext*.xml")
files_to_delete = glob.glob(basetext_pattern, recursive=True)
for file_path in files_to_delete:
    print(f"Delete {file_path}")
    os.remove(file_path)

## 3 Download from NTVMR

First, we need to set up the ranges of documents to download.

In [None]:
# Automated approach: fetch_and_format_xml is given the NTVMR metadata-list
# URL, an output path and an error-log path; the docIDs are then read from
# that same output file.
url = "https://ntvmr.uni-muenster.de/community/vmr/api/metadata/liste/get/"
metadata_file = "../data/manuscripts/metadata_list.xml"
metadata_error_log = "../data/manuscripts/errors.log"

fetch_and_format_xml(url, metadata_file, metadata_error_log)
docID_set = get_docID_set(metadata_file, all=False)

# Manual alternative (disabled): hard-code one docID range per manuscript
# type and concatenate them into a single list.
# papyri = list(range(10001, 10151))
# majuscules = list(range(20001, 20451))
# minuscules = list(range(30001, 33021))
# lectionaries = list(range(40001, 43001))

# merge lists of docIDs
# docID_list = papyri + majuscules + minuscules + lectionaries

### 3.1 Download manuscript data

Because there are many JSON files to download, running the downloads in parallel is recommended for a speedup. Set `max_workers` to the number of CPU cores you want to use.

In [None]:
# Settings for the manuscript download.
data_path = "../data/manuscripts/ntvmr"
error_log_file = "../data/manuscripts/errors.log"
overwrite = True
# Number of worker processes used for the parallel download.
max_workers = 16

# Create the directory if it doesn't exist
os.makedirs(data_path, exist_ok=True)

# Record the start time
tic = time.time()
# TODO: instead of tic-toc use tqdm for a progress bar

# Fan the downloads out over a pool of worker processes: one task per docID.
# The futures are kept (mapped to their docID) so that failures can be
# reported afterwards instead of being silently discarded.
with ProcessPoolExecutor(max_workers=max_workers) as executor:
    futures = {
        executor.submit(
            download_ntvmr_manuscripts, docID, data_path, error_log_file, overwrite
        ): docID
        for docID in docID_set
    }

# Leaving the `with` block waits until every submitted task has finished.
# Record the end time
toc = time.time()

# An exception raised inside a worker is stored on its future and never shown
# unless it is asked for, so report each one here.
# NOTE(review): download_ntvmr_manuscripts presumably writes the errors it
# handles itself to error_log_file; this only covers exceptions that escape it.
failed_docIDs = []
for future, docID in futures.items():
    error = future.exception()
    if error is not None:
        failed_docIDs.append(docID)
        print(f"Download failed for docID {docID}: {error!r}")
if failed_docIDs:
    print(f"{len(failed_docIDs)} of {len(futures)} downloads raised an exception.")

# Calculate the runtime
runtime = toc - tic
print(f"The script took {runtime:.4f} seconds to execute.")

### 3.2 Download transcription data

Because there are many TEI files to download, running the downloads in parallel is recommended for a speedup. Set `max_workers` to the number of CPU cores you want to use.

In [None]:
# Settings for the transcription download.
data_path = "../data/transcriptions/ntvmr"
error_log_file = "../data/transcriptions/error.log"
overwrite = True
# Number of worker processes used for the parallel download.
max_workers = 16

# Create the directory if it doesn't exist
os.makedirs(data_path, exist_ok=True)

# Record the start time
tic = time.time()
# TODO: instead of tic-toc use tqdm for a progress bar

# Fan the downloads out over a pool of worker processes: one task per docID.
# The futures are kept (mapped to their docID) so that failures can be
# reported afterwards instead of being silently discarded.
with ProcessPoolExecutor(max_workers=max_workers) as executor:
    futures = {
        executor.submit(
            download_ntvmr_transcripts, docID, data_path, error_log_file, overwrite
        ): docID
        for docID in docID_set
    }

# Leaving the `with` block waits until every submitted task has finished.
# Record the end time
toc = time.time()

# An exception raised inside a worker is stored on its future and never shown
# unless it is asked for, so report each one here.
# NOTE(review): download_ntvmr_transcripts presumably writes the errors it
# handles itself to error_log_file; this only covers exceptions that escape it.
failed_docIDs = []
for future, docID in futures.items():
    error = future.exception()
    if error is not None:
        failed_docIDs.append(docID)
        print(f"Download failed for docID {docID}: {error!r}")
if failed_docIDs:
    print(f"{len(failed_docIDs)} of {len(futures)} downloads raised an exception.")

# Calculate the runtime
runtime = toc - tic
print(f"The script took {runtime:.4f} seconds to execute.")