In [None]:
# a RELiC-type approach to retrieving narrative episodes. We take passages which have a cross-reference between them, apply a contrastive loss to their embeddings
# and then use out-of-domain "text" queries with descriptions of type-scenes. 

# let's try this.
# things I would need to figure out, taking a context window around the cross-reference, and then applying a contrastive loss to the embeddings of the two passages.

# first let's write a function to parse cross_references.txt

In [15]:
## Helpers
import re

book2id = {k : i for i, k in enumerate("""GEN
EXO
LEV
NUM
DEU
JOS
JDG
RUT
1SA
2SA
1KI
2KI
1CH
2CH
EZR
NEH
EST
JOB
PSA
PRO
ECC
SOS
ISA
JER
LAM
EZE
DAN
HOS
JOE
AMO
OBA
JON
MIC
NAH
HAB
ZEP
HAG
ZEC
MAL
MAT
MAR
LUK
JOH
ACT
ROM
1CO
2CO
GAL
EPH
PHP
COL
1TH
2TH
1TI
2TI
TIT
PHM
HEB
JAM
1PE
2PE
1JO
2JO
3JO
JDE
REV""".split())}
id2book = {i : b for b, i in book2id.items()}

mapping = {
    "1KGS" : "1KI",
    "2KGS" : "2KI",    
    "JAS" : "JAM",
    "JUDG" : "JDG",
    "PS" : "PSA",
    "JUDE" : "JDE",
    "SONG" : "SOS",
    "PHIL" : "PHP",
    "PHLM" : "PHM",    
    "SON" : "SOS",
    "PHI" : "PHP",
    "JUD" : "JDE"
}

class Location(dict):
    def __init__(self, value):
        if isinstance(value, str):
            book, chapter, verse = re.sub(r"^b\.", "", value).upper().split(".")
            book3 = mapping.get(book, book[:3])
            assert book3 in book2id
            self["book"] = book3
            self["chapter"] = int(chapter)
            self["verse"] = int(verse)
        elif isinstance(value, (dict,)):
            for k in ["book", "chapter", "verse"]:
                self[k] = value[k]
        else:
            raise Exception("Not sure how to turn '{}' into a location".format(value))

    def __cmp__(self, other):
        a = (book2id[self["book"]], self["chapter"], self["verse"])
        b = (book2id[other["book"]], other["chapter"], other["verse"])
        return -1 if a < b else 0 if a == b else 1

    def __gt__(self, other):
        return (book2id[self["book"]], self["chapter"], self["verse"]) > (book2id[other["book"]], other["chapter"], other["verse"])

    def __le__(self, other):
        return (book2id[self["book"]], self["chapter"], self["verse"]) <= (book2id[other["book"]], other["chapter"], other["verse"])

    def __lt__(self, other):
        return (book2id[self["book"]], self["chapter"], self["verse"]) < (book2id[other["book"]], other["chapter"], other["verse"])    

    def __hash__(self):
        return hash(repr(self))

In [3]:
import pandas as pd

cross_refs = pd.read_csv('../data/cross_references.txt', sep='\t', header=0)
# keep only those lines for which votes > 50
votes_thres = 50
cross_refs = cross_refs[cross_refs['Votes'] > votes_thres]

In [4]:
cross_refs.head()

Unnamed: 0,From Verse,To Verse,Votes
5,Gen.1.1,Neh.9.6,56
10,Gen.1.1,Exod.20.11,88
18,Gen.1.1,John.1.1-John.1.3,191
19,Gen.1.1,Isa.45.18,141
23,Gen.1.1,Heb.11.3,167


In [17]:
# we can see some of the cross-references are to a span of verses, we will take the first verse of that span.
# 436 cross-refs link to a span of at least 2 verses. 436 one-to-many references
cross_refs['To Verse'] = cross_refs['To Verse'].apply(lambda x: x.split("-")[0])
# check if that worked
sum(cross_refs['To Verse'].apply(lambda x: len(x.split("-")) > 1))
# good, no more multi-verse references

0

In [26]:
# modified from Tom's script
books = set()
labels = set()
golds = {}
num_golds = 0

for i, row in cross_refs.iterrows():
    a = Location(row['From Verse'])
    b = Location(row['To Verse'])
    labels.add(a)
    labels.add(b)
    books.add(a["book"])
    books.add(b["book"])
    if a < b:
        src, tgt = a, b
    else:
        src, tgt = b, a
    key = (src, tgt)
    golds[src] = golds.get(src, {})
    
    golds[src][tgt] = row['Votes']
    num_golds += 1

In [29]:
golds

{{'book': 'GEN',
  'chapter': 1,
  'verse': 1}: {{'book': 'NEH', 'chapter': 9, 'verse': 6}: 56, {'book': 'EXO',
   'chapter': 20,
   'verse': 11}: 88, {'book': 'JOH',
   'chapter': 1,
   'verse': 1}: 143, {'book': 'ISA', 'chapter': 45, 'verse': 18}: 141, {'book': 'HEB',
   'chapter': 11,
   'verse': 3}: 167, {'book': 'JOB',
   'chapter': 38,
   'verse': 4}: 88, {'book': 'ACT', 'chapter': 17, 'verse': 24}: 78, {'book': '2PE',
   'chapter': 3,
   'verse': 5}: 58, {'book': 'ISA',
   'chapter': 44,
   'verse': 24}: 52, {'book': 'ISA', 'chapter': 42, 'verse': 5}: 88, {'book': 'COL',
   'chapter': 1,
   'verse': 16}: 84, {'book': 'REV',
   'chapter': 4,
   'verse': 11}: 114, {'book': 'HEB', 'chapter': 1, 'verse': 10}: 105},
 {'book': 'GEN',
  'chapter': 2,
  'verse': 24}: {{'book': 'MAT', 'chapter': 19, 'verse': 3}: 53},
 {'book': 'GEN',
  'chapter': 3,
  'verse': 15}: {{'book': 'ROM', 'chapter': 16, 'verse': 20}: 77},
 {'book': 'GEN',
  'chapter': 15,
  'verse': 6}: {{'book': 'JAM', 'chapte