# Download TEI files

## 1 Installing and importing needed modules

When running on a remote JupyterLab instance, the required packages must be installed explicitly:

In [None]:
# Install the pinned third-party packages into the environment of the current
# Jupyter kernel. `sys.executable` is the path of the Python interpreter that
# runs this kernel, so pip is invoked through that interpreter rather than
# through whichever `pip` happens to be first on the shell PATH.
import sys

# The leading `!` is IPython syntax: the rest of the line is run as a shell
# command, with `{...}` replaced by the value of the Python expression.
!{sys.executable} -m pip install requests==2.31.0
!{sys.executable} -m pip install pandas==2.1.4

from utils import (
    fetch_and_extract_zip,
    fetch_and_format_xml,
    download_ntvmr_transcripts,
    download_ntvmr_manuscripts,
    get_docID_set,
)
from concurrent.futures import ProcessPoolExecutor
import getpass
import time
import os
import xml.etree.ElementTree as ET

## 3 Download from IGNTP (transcriptions only)

In [None]:
# Zip archives to fetch. Entries that are commented out are kept for
# reference only and are not downloaded.
urls = [
    "http://www.iohannes.com/transcriptions/XML/greek/papyri.zip",
    "http://www.iohannes.com/transcriptions/XML/greek/majuscules.zip",
    "http://www.iohannes.com/transcriptions/XML/greek/minuscules.zip",
    "http://www.iohannes.com/transcriptions/XML/greek/lectionaries.zip",
    # "http://www.epistulae.org/downloads/Galatians_Greek_Transcriptions.zip",
    # "http://www.epistulae.org/downloads/Ephesians_Greek_transcriptions.zip",
    # "http://www.epistulae.org/downloads/Philippians_Greek_transcriptions.zip",
    "https://itseeweb.cal.bham.ac.uk/epistulae/downloads/Romans_Greek_transcriptions.zip",
    "https://itseeweb.cal.bham.ac.uk/epistulae/downloads/Galatians_Greek_transcriptions.zip",
    "https://itseeweb.cal.bham.ac.uk/epistulae/downloads/Ephesians_Greek_transcriptions.zip",
    "https://itseeweb.cal.bham.ac.uk/epistulae/downloads/Philippians_Greek_transcriptions.zip",
    #"https://itseeweb.cal.bham.ac.uk/epistulae/downloads/1Cor_Greek_transcriptions.zip",
]

# Every archive is handed to the helper together with the same target directory.
igntp_dir = "../data/transcriptions/igntp"
for url in urls:
    fetch_and_extract_zip(url, igntp_dir)

# The Galatians basetext is not needed and only causes errors later on, so it
# is deleted; a missing file is reported instead of raising.
basetext_file = "../data/transcriptions/igntp/ecm_galatians/NT_GRC_basetext_Gal.xml"
try:
    os.remove(basetext_file)
except FileNotFoundError:
    print("nothing to delete")

## 4 Download from NTVMR

First, we need to set up the ranges of documents to download.

In [None]:
# The automated way: fetch the metadata list from the NTVMR API, store it
# locally, and derive the docIDs from the stored file.
url = "https://ntvmr.uni-muenster.de/community/vmr/api/metadata/liste/get/"
metadata_file = "../data/manuscripts/metadata_list.xml"

fetch_and_format_xml(url, metadata_file, "../data/manuscripts/errors.txt")
docID_set = get_docID_set(metadata_file)

# the manual way:
# set docID ranges for different manuscript types
# papyri = list(range(10001, 10151))
# majuscules = list(range(20001, 20451))
# minuscules = list(range(30001, 33021))
# lectionaries = list(range(40001, 43001))

# merge lists of docIDs
# docID_list = papyri + majuscules + minuscules + lectionaries

### 4.1 Download manuscript data

Because there are many JSON files to download, it is recommended to run the download in parallel for a speedup. Set `max_workers` to the number of CPU cores you want to use.

In [None]:
# Download the NTVMR manuscript data for every docID in `docID_set`, using a
# pool of worker processes.
data_path = "../data/manuscripts/ntvmr"
error_log_file = "../data/manuscripts/error.log"
overwrite = True  # passed through unchanged to download_ntvmr_manuscripts
max_workers = 16  # number of worker processes; adjust to the available CPU cores

# Create the directory if it doesn't exist
os.makedirs(data_path, exist_ok=True)

# Print the local start time in a human-readable form
print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

# Record the start time
tic = time.time()

# Submit one download task per docID to the process pool. The futures are
# kept (mapped to their docID) so that exceptions raised inside a worker can
# be reported below instead of being silently discarded.
futures = {}
with ProcessPoolExecutor(max_workers=max_workers) as executor:
    for docID in docID_set:
        future = executor.submit(
            download_ntvmr_manuscripts, docID, data_path, error_log_file, overwrite
        )
        futures[future] = docID

# Leaving the `with` block waits for all submitted tasks, so the end time
# can be recorded here.
toc = time.time()
# Calculate the runtime
runtime = toc - tic
print(f"The script took {runtime:.4f} seconds to execute.")

# All futures are finished at this point, so `exception()` returns
# immediately: None on success, otherwise the exception raised by the worker.
failures = {}
for future, docID in futures.items():
    error = future.exception()
    if error is not None:
        failures[docID] = error

if failures:
    print(f"{len(failures)} of {len(futures)} downloads raised an exception:")
    for docID, error in failures.items():
        print(f"  {docID}: {error!r}")

# takes ~10min on 16 cores

### 4.2 Download transcription data

Because there are many TEI files to download, it is recommended to run the download in parallel for a speedup. Set `max_workers` to the number of CPU cores you want to use.

In [None]:
# Download the NTVMR transcription data for every docID in `docID_set`, using
# a pool of worker processes.
data_path = "../data/transcriptions/ntvmr"
error_log_file = "../data/transcriptions/error.log"
overwrite = False  # passed through unchanged to download_ntvmr_transcripts
max_workers = 16  # number of worker processes; adjust to the available CPU cores

# Create the directory if it doesn't exist
os.makedirs(data_path, exist_ok=True)

# Print the local start time in a human-readable form
print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

# Record the start time
tic = time.time()

# Submit one download task per docID to the process pool. The futures are
# kept (mapped to their docID) so that exceptions raised inside a worker can
# be reported below instead of being silently discarded.
futures = {}
with ProcessPoolExecutor(max_workers=max_workers) as executor:
    for docID in docID_set:
        future = executor.submit(
            download_ntvmr_transcripts, docID, data_path, error_log_file, overwrite
        )
        futures[future] = docID

# Leaving the `with` block waits for all submitted tasks, so the end time
# can be recorded here.
toc = time.time()
# Calculate the runtime
runtime = toc - tic
print(f"The script took {runtime:.4f} seconds to execute.")

# All futures are finished at this point, so `exception()` returns
# immediately: None on success, otherwise the exception raised by the worker.
failures = {}
for future, docID in futures.items():
    error = future.exception()
    if error is not None:
        failures[docID] = error

if failures:
    print(f"{len(failures)} of {len(futures)} downloads raised an exception:")
    for docID, error in failures.items():
        print(f"  {docID}: {error!r}")

# takes ~27min on 16 cores