# Hebrew OT

In [None]:
%%bash
# Download the four STEPBible TOTHT (Translators OT Hebrew Tagged text) files.
# Redirect with '>' rather than '>>' so that re-running this cell overwrites each
# file instead of appending a second copy of the data to it.
curl -L https://raw.githubusercontent.com/STEPBible/STEPBible-Data/master/Older%20Formats/TOTHT%20Gen-Deu%20-%20Translators%20OT%20Hebrew%20Tagged%20text%20-%20STEPBible.org%20CC%20BY.txt > ../data/STEP/Gen_Deu.txt
curl -L https://raw.githubusercontent.com/STEPBible/STEPBible-Data/master/Older%20Formats/TOTHT%20Jos-Est%20-%20Translators%20OT%20Hebrew%20Tagged%20text%20-%20STEPBible.org%20CC%20BY.txt > ../data/STEP/Jos_Est.txt
curl -L https://raw.githubusercontent.com/STEPBible/STEPBible-Data/master/Older%20Formats/TOTHT%20Job-Sng%20-%20Translators%20OT%20Hebrew%20Tagged%20text%20-%20STEPBible.org%20CC%20BY.txt > ../data/STEP/Job_Sng.txt
curl -L https://raw.githubusercontent.com/STEPBible/STEPBible-Data/master/Older%20Formats/TOTHT%20Isa-Mal%20-%20Translators%20OT%20Hebrew%20Tagged%20text%20-%20STEPBible.org%20CC%20BY.txt > ../data/STEP/Isa_Mal.txt

In [None]:
# Directory holding the downloaded STEP files, and the full path of each
# Hebrew tagged-text file inside it.
prefix = '../data/STEP/'
paths = [
    prefix + file_name
    for file_name in ('Gen_Deu.txt', 'Jos_Est.txt', 'Job_Sng.txt', 'Isa_Mal.txt')
]
import json
# U+0591 is the Hebrew accent etnahta (atnach); get_half_lines splits a verse
# after the word that carries it.
ATNACH = '\u0591'

In [None]:
# Helpers that split a verse at the atnach accent and parse a STEP Hebrew tagged-text file into verse records
def get_half_lines(line):
    """Split a verse into two half-lines after the word carrying the atnach.

    Args:
        line: The verse text as one whitespace-separated string.

    Returns:
        A ``(half_a, half_b)`` tuple of strings. ``half_a`` runs up to and
        including the last word that contains ``ATNACH``; ``half_b`` holds the
        remaining words. Both halves are re-joined with single spaces. When no
        word contains ``ATNACH``, ``line`` is returned unchanged as ``half_a``
        and ``half_b`` is the empty string.
    """
    words = line.split()
    split = None
    for i, w in enumerate(words):
        if ATNACH in w:
            # No break on purpose: if several words match, the last one wins.
            split = i
    if split is not None:
        return ' '.join(words[:split + 1]), ' '.join(words[split + 1:])
    return line, ''


def _verse_record(heb_ref, eng_ref, words):
    """Build the output record for one verse from its collected word texts."""
    line = ' '.join(words)
    half_a, half_b = get_half_lines(line)
    return {
        'heb_ref': heb_ref,
        'eng_ref': eng_ref,
        'line': line,
        'half_a': half_a,
        'half_b': half_b,
    }


def parse_data_file(input_file, keep_sep=False, priority='Hebrew', header_lines=45):
    """Group the word rows of a STEP Hebrew tagged-text file into verse records.

    Each data row is tab-separated into six columns: Hebrew reference, English
    reference, pointed text, accented text, morphology and extended Strongs.
    Only the two references and the accented text are used. The part of a
    reference before the first ``-`` identifies the verse; consecutive rows
    with the same verse are joined into one record.

    Args:
        input_file: Path of the UTF-8 text file to read.
        keep_sep: If true, every ``/`` in the accented text is replaced by a
            space; otherwise it is removed.
        priority: ``'Hebrew'`` or ``'English'``. Selects which reference column
            decides where one verse ends and the next begins.
        header_lines: Number of leading lines to skip before data rows are
            expected (defaults to 45, the value that used to be hard-coded).

    Returns:
        A list of dicts with keys ``heb_ref``, ``eng_ref``, ``line``,
        ``half_a`` and ``half_b``. The two references are those of the first
        row of the verse; ``half_a``/``half_b`` come from ``get_half_lines``.

    Raises:
        ValueError: If ``priority`` is not ``'Hebrew'`` or ``'English'``, or if
            a data row has more than six tab-separated columns.
    """
    if priority not in ('Hebrew', 'English'):
        raise ValueError(
            f"priority must be 'Hebrew' or 'English', got {priority!r}"
        )

    parsed_data = []
    # The grouping key is tracked separately from the stored references so
    # that English priority compares English verse against English verse.
    current_key = None
    current_heb_ref = None
    current_eng_ref = None
    current_text = []
    separator = ' ' if keep_sep else ''

    with open(input_file, 'r', encoding='utf-8') as f:
        for line_number, raw_line in enumerate(f):
            if line_number < header_lines:
                continue

            columns = raw_line.strip().split('\t')
            if len(columns) < 6:
                continue  # Skip lines that do not have enough data

            heb_ref, eng_ref, _pointed, accented, _morphology, _strongs = columns

            # Extract the verse part of each reference, ignoring the word number
            heb_verse = heb_ref.split('-')[0]
            eng_verse = eng_ref.split('-')[0]
            accented = accented.replace('/', separator)

            key = heb_verse if priority == 'Hebrew' else eng_verse
            if key == current_key:
                current_text.append(accented)
                continue

            # A new verse starts here: store the previous one, if any.
            if current_key is not None:
                parsed_data.append(
                    _verse_record(current_heb_ref, current_eng_ref, current_text)
                )
            current_key = key
            current_heb_ref = heb_verse
            current_eng_ref = eng_verse
            current_text = [accented]

    # Store the last collected verse, using the references tracked for it
    # rather than whatever the final row happened to carry.
    if current_key is not None:
        parsed_data.append(
            _verse_record(current_heb_ref, current_eng_ref, current_text)
        )
    return parsed_data

In [None]:
# Parse every Hebrew file in order and pool the verse records into one list.
data = []
for step_path in paths:
    data += parse_data_file(step_path, priority='Hebrew', keep_sep=True)


In [None]:
%%bash
# Fetch the four STEPBible TAHOT (Translators Amalgamated Hebrew OT) files,
# following redirects and overwriting any earlier download.
curl --location https://raw.githubusercontent.com/STEPBible/STEPBible-Data/master/Translators%20Amalgamated%20OT%2BNT/TAHOT%20Gen-Deu%20-%20Translators%20Amalgamated%20Hebrew%20OT%20-%20STEPBible.org%20CC%20BY.txt > ../data/STEP/Gen_Deu_eng.txt
curl --location https://raw.githubusercontent.com/STEPBible/STEPBible-Data/master/Translators%20Amalgamated%20OT%2BNT/TAHOT%20Jos-Est%20-%20Translators%20Amalgamated%20Hebrew%20OT%20-%20STEPBible.org%20CC%20BY.txt > ../data/STEP/Jos_Est_eng.txt
curl --location https://raw.githubusercontent.com/STEPBible/STEPBible-Data/master/Translators%20Amalgamated%20OT%2BNT/TAHOT%20Job-Sng%20-%20Translators%20Amalgamated%20Hebrew%20OT%20-%20STEPBible.org%20CC%20BY.txt > ../data/STEP/Job_Sng_eng.txt
curl --location https://raw.githubusercontent.com/STEPBible/STEPBible-Data/master/Translators%20Amalgamated%20OT%2BNT/TAHOT%20Isa-Mal%20-%20Translators%20Amalgamated%20Hebrew%20OT%20-%20STEPBible.org%20CC%20BY.txt > ../data/STEP/Isa_Mal_eng.txt


In [None]:
import pandas as pd

def _ot_translation_after(lines, index):
    """Return the cleaned "#_Translation" text on the row after ``lines[index]``.

    Returns ``None`` when there is no following row or it holds no
    ``#_Translation`` marker.
    """
    if index + 1 >= len(lines):
        return None
    pieces = lines[index + 1].split("#_Translation")
    if len(pieces) < 2:
        return None
    return pieces[1].strip().replace('\t', ' ')


def parse_eng_data_file(file_path):
    """Extract per-verse English translations from a STEP TAHOT file.

    A verse starts on a row beginning with ``"# "`` whose first tab-separated
    field is the reference, either plain (``Gen.1.1``) or carrying a differing
    Hebrew chapter/verse (``Gen.31.55 (Heb: 32.1)``). The row directly after
    it is expected to contain ``#_Translation`` followed by the translation
    fields. A later row beginning with ``"#_<eng_ref>"`` continues the same
    verse, and the translation row after it is appended to the verse's text.

    Rows beginning with ``"# "`` that are not followed by a ``#_Translation``
    row are skipped instead of raising ``IndexError``.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A list of dicts with keys ``eng_ref``, ``heb_ref`` and ``translation``.
        ``heb_ref`` equals ``eng_ref`` unless the reference carries a
        ``(Heb: ...)`` part.
    """
    parsed_data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    curr_ref = None
    for i, line in enumerate(lines):
        if line.startswith("# "):
            translation = _ot_translation_after(lines, i)
            if translation is None:
                continue  # not a verse header (or a truncated file)

            ref = line.split("\t")[0].split("# ")[1].strip()
            tokens = ref.split()
            eng_ref = tokens[0] if len(tokens) > 1 else ref
            if len(tokens) > 1 and 'Heb: ' in ref:
                # formatted as Gen.31.55 (Heb: 32.1): keep the book from the
                # English reference and take chapter.verse from the Heb part
                heb_numbers = ref.split('Heb: ')[1].replace(')', '')
                heb_ref = eng_ref.split('.')[0] + '.' + heb_numbers
            else:
                heb_ref = eng_ref

            curr_ref = eng_ref
            parsed_data.append({
                'eng_ref': eng_ref,
                'heb_ref': heb_ref,
                'translation': translation,
            })
        elif curr_ref is not None and line.startswith(f"#_{curr_ref}"):
            # A subsequent row carrying the rest of the current verse: add its
            # translation to the last entry in parsed_data.
            extra_translation = _ot_translation_after(lines, i)
            if extra_translation is not None:
                parsed_data[-1]['translation'] += " " + extra_translation

    return parsed_data

# Parse each English (TAHOT) file in order and pool the records.
prefix = '../data/STEP/'
paths = ['Gen_Deu_eng.txt', 'Jos_Est_eng.txt', 'Job_Sng_eng.txt', 'Isa_Mal_eng.txt']
eng_data = []
for eng_path in (prefix + file_name for file_name in paths):
    eng_data += parse_eng_data_file(eng_path)

# Display the first few records as a sanity check.
eng_data[:5]



In [None]:
# let's make it into a dataframe, merge on eng_ref, and then hopefully we have a perfectly aligned dataframe
import pandas as pd

# Build a DataFrame from the parsed Hebrew verse records and preview it.
heb = pd.DataFrame.from_records(data)
heb.head()

In [None]:
# Build a DataFrame from the parsed English translation records and preview it.
eng = pd.DataFrame.from_records(eng_data)
eng.head()

In [None]:
# pd.merge defaults to an inner join, so only verses whose eng_ref appears in
# both frames are kept.
all_OT = pd.merge(heb, eng, on='eng_ref')
# Both frames carry a heb_ref column, so this creates heb_ref_x (from heb) and
# heb_ref_y (from eng). We want to make certain that these are the same; if so,
# one of them can be dropped.
all_OT.head()

In [None]:
# Show rows with a missing eng_ref. Comparing a Series with `== None` is
# evaluated elementwise and is False for every row, so use isna() instead.
all_OT[all_OT['eng_ref'].isna()]

In [None]:
# Keep the Hebrew-side reference under the plain name heb_ref, discard the
# English-side duplicate, and put the columns in a fixed order.
all_OT = all_OT.drop(columns=['heb_ref_y']).rename(columns={'heb_ref_x': 'heb_ref'})
all_OT = all_OT[['eng_ref', 'heb_ref', 'line', 'translation', 'half_a', 'half_b']]
all_OT.head()

In [None]:
# Split each translation to mirror the Hebrew half-lines: when the Hebrew verse
# has a second half, cut the translation at the midpoint of its word list;
# otherwise keep the whole translation as the first half.
half_a = []
half_b = []
for translation, hebrew_second_half in zip(all_OT['translation'], all_OT['half_b']):
    if hebrew_second_half == "":
        half_a.append(translation)
        half_b.append("")
        continue
    words = translation.split()
    midpoint = len(words) // 2
    half_a.append(" ".join(words[:midpoint]))
    half_b.append(" ".join(words[midpoint:]))

all_OT['trans_a'] = half_a
all_OT['trans_b'] = half_b

all_OT.head()


In [None]:
# Convert the aligned frame to a list of dicts, one per row.
all_data = all_OT.to_dict('records')


In [None]:
# Write the output as JSON Lines: one JSON object per aligned verse per line.
with open('../data/STEP/OT_aligned_sep.json', 'w', encoding='utf-8') as out_f:
    for record in all_data:
        out_f.write(json.dumps(record, ensure_ascii=False))
        out_f.write('\n')


# Greek OT

In [None]:
import xml.etree.ElementTree as ET
import json

# Parse the LXX XML file
tree = ET.parse('../data/LXX/LXX.xml')  # replace with your file name
root = tree.getroot()

# Collect one entry per verse, walking BIBLEBOOK > CHAPTER > VERS.
output = []
for biblebook in root.findall('BIBLEBOOK'):
    book_number = biblebook.get('bnumber')

    for chapter in biblebook.findall('CHAPTER'):
        chapter_number = chapter.get('cnumber')

        for verse in chapter.findall('VERS'):
            # Join the text of every <gr> child, skipping elements without text.
            verse_text = ' '.join(gr.text for gr in verse.findall('gr') if gr.text)
            output.append({
                "book_num": book_number,
                "chapter": chapter_number,
                "verse": verse.get('vnumber'),
                "text": verse_text,
            })

# Write the output as JSON Lines. The path matches the file that the alignment
# step below reads back ('../data/LXX/LXX.json'); it was previously written to
# 'data/LXX.jsonl', which that step never looked at.
with open('../data/LXX/LXX.json', 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(entry, ensure_ascii=False) + '\n' for entry in output)

print("Finished processing the XML file.")


## Now we align it to English

In [None]:
import pandas as pd
# Load the per-verse LXX records; lines=True reads one JSON object per line.
df = pd.read_json('../data/LXX/LXX.json', lines=True)
df.head()

In [None]:
# Full English book name -> three-character abbreviation, in canonical order.
NAMES = {
    'Genesis': 'Gen',
    'Exodus': 'Exo',
    'Leviticus': 'Lev',
    'Numbers': 'Num',
    'Deuteronomy': 'Deu',
    'Joshua': 'Jos',
    'Judges': 'Jdg',
    'Ruth': 'Rut',
    '1 Samuel': '1Sa',
    '2 Samuel': '2Sa',
    '1 Kings': '1Ki',
    '2 Kings': '2Ki',
    '1 Chronicles': '1Ch',
    '2 Chronicles': '2Ch',
    'Ezra': 'Ezr',
    'Nehemiah': 'Neh',
    'Esther': 'Est',
    'Job': 'Job',
    'Psalm': 'Psa',
    'Proverbs': 'Pro',
    'Ecclesiastes': 'Ecc',
    'Song of Solomon': 'Sng',
    'Isaiah': 'Isa',
    'Jeremiah': 'Jer',
    'Lamentations': 'Lam',
    'Ezekiel': 'Ezk',
    'Daniel': 'Dan',
    'Hosea': 'Hos',
    'Joel': 'Jol',
    'Amos': 'Amo',
    'Obadiah': 'Oba',
    'Jonah': 'Jon',
    'Micah': 'Mic',
    'Nahum': 'Nam',
    'Habakkuk': 'Hab',
    'Zephaniah': 'Zep',
    'Haggai': 'Hag',
    'Zechariah': 'Zec',
    'Malachi': 'Mal',
}

In [None]:
# 1-based position of each book in NAMES -> its abbreviation.
num2name = dict(enumerate(NAMES.values(), start=1))
num2name

In [None]:
# Translate each book number into its abbreviation; a number missing from
# num2name raises KeyError.
df['book'] = df['book_num'].apply(lambda x: num2name[x])

# Build a "Book.chapter.verse" reference string for every row.
df['eng_ref'] = df['book'] + "." + df['chapter'].astype(str) + "." + df['verse'].astype(str)
df.head()

In [None]:
# Save only the reference and text columns, one JSON object per line, keeping
# non-ASCII characters unescaped.
df[['eng_ref', 'text']].to_json('../data/LXX/LXX_aligned.json', lines=True, orient="records", force_ascii=False)

# Greek NT

In [None]:

!curl -L https://raw.githubusercontent.com/STEPBible/STEPBible-Data/refs/heads/master/Translators%20Amalgamated%20OT%2BNT/TAGNT%20Mat-Jhn%20-%20Translators%20Amalgamated%20Greek%20NT%20-%20STEPBible.org%20CC-BY.txt >> data/Mat_Jhn.txt
!curl -L https://raw.githubusercontent.com/STEPBible/STEPBible-Data/refs/heads/master/Translators%20Amalgamated%20OT%2BNT/TAGNT%20Act-Rev%20-%20Translators%20Amalgamated%20Greek%20NT%20-%20STEPBible.org%20CC-BY.txt >> data/Act_Rev.txt


In [None]:
def _nt_translation_after(lines, index):
    """Return the cleaned "#_Translation" text on the row after ``lines[index]``.

    Returns ``None`` when there is no following row or it holds no
    ``#_Translation`` marker.
    """
    if index + 1 >= len(lines):
        return None
    pieces = lines[index + 1].split("#_Translation")
    if len(pieces) < 2:
        return None
    return pieces[1].strip().replace('\t', ' ')


def _nt_text_fields(fields):
    """Join the text fields that follow the reference field with spaces.

    A second field containing ``[`` is dropped as well, so the text starts
    with the field after it.
    """
    if len(fields) > 1 and '[' in fields[1]:
        fields = fields[1:]
    return " ".join(fields[1:]).strip()


def parse_eng_data_file(file_path):
    """Extract per-verse Greek text and English translation from a TAGNT file.

    A verse starts on a row beginning with ``"# "``. Its first tab-separated
    field is the reference (only the first whitespace-separated token is kept
    as ``eng_ref``); the remaining fields, minus an optional bracketed second
    field, are the verse text. The row directly after it is expected to
    contain ``#_Translation`` followed by the translation fields. A later row
    beginning with ``"#_<eng_ref>"`` continues the same verse; its text and
    the translation row after it are appended to the verse.

    Header or continuation rows that are not followed by a ``#_Translation``
    row are skipped instead of raising ``IndexError``.

    NOTE: this definition has the same name as the Hebrew-OT parser defined
    earlier in the notebook and replaces it once this cell has run.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A list of dicts with keys ``eng_ref``, ``text`` and ``translation``.
    """
    parsed_data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    curr_ref = None
    for i, line in enumerate(lines):
        if line.startswith("# "):
            translation = _nt_translation_after(lines, i)
            if translation is None:
                continue  # not a verse header (or a truncated file)

            # maxsplit=1 keeps the whole row even if "# " occurs again later.
            parts = line.split("# ", 1)[1].split('\t')
            ref = parts[0]
            ref_tokens = ref.split()
            eng_ref = ref_tokens[0] if len(ref_tokens) > 1 else ref

            curr_ref = eng_ref
            parsed_data.append({
                'eng_ref': eng_ref,
                'text': _nt_text_fields(parts),
                'translation': translation,
            })
        elif curr_ref is not None and line.startswith(f"#_{curr_ref}"):
            # A subsequent row carrying the rest of the current verse: add its
            # text and translation to the last entry in parsed_data.
            extra_translation = _nt_translation_after(lines, i)
            if extra_translation is None:
                continue
            parts = line.split("#_", 1)[1].split('\t')
            parsed_data[-1]['text'] += " " + _nt_text_fields(parts)
            parsed_data[-1]['translation'] += " " + extra_translation

    return parsed_data

In [None]:
# Parse both Greek NT files in order and pool the verse records.
data = []
for nt_path in ('data/Mat_Jhn.txt', 'data/Act_Rev.txt'):
    data += parse_eng_data_file(nt_path)

# Display the first few records as a sanity check.
data[:5]

In [None]:
data[40]  # spot check: confirm that a translation spanning multiple rows was joined correctly

In [None]:
import json
# Write the NT records as JSON Lines: one JSON object per verse per line.
with open('../data/STEP/NT_aligned.json', 'w', encoding='utf-8') as out_f:
    for record in data:
        out_f.write(json.dumps(record, ensure_ascii=False))
        out_f.write('\n')