# Get Data

In [1]:
from collections import *
from tf.fabric import Fabric
from IPython.display import display, HTML

# point Text-Fabric at the 'Hebrew/etcbc4c' data module
TF = Fabric(modules='Hebrew/etcbc4c')
# request only the features that the cells below read
api = TF.load('''
              book chapter verse 
              function pdp vt
              lex lex_utf8 g_word_utf8
              mother tab
              ''')
# put the API handles into the notebook namespace;
# presumably this is where F, L, T and E used further down come from
api.makeAvailableIn(globals())

This is Text-Fabric 2.3.0
Api reference : https://github.com/ETCBC/text-fabric/wiki/Api
Tutorial      : https://github.com/ETCBC/text-fabric/blob/master/docs/tutorial.ipynb
Data sources  : https://github.com/ETCBC/text-fabric-data
Data docs     : https://etcbc.github.io/text-fabric-data
Shebanq docs  : https://shebanq.ancient-data.org/text
Slack team    : https://shebanq.slack.com/signup
Questions? Ask shebanq@ancient-data.org for an invite to Slack
109 features found and 0 ignored
  0.00s loading features ...
   |     0.01s B book                 from /Users/Cody/github/text-fabric-data/Hebrew/etcbc4c
   |     0.01s B chapter              from /Users/Cody/github/text-fabric-data/Hebrew/etcbc4c
   |     0.01s B verse                from /Users/Cody/github/text-fabric-data/Hebrew/etcbc4c
   |     0.18s B g_word_utf8          from /Users/Cody/github/text-fabric-data/Hebrew/etcbc4c
   |     0.17s B lex_utf8             from /Users/Cody/github/text-fabric-data/Hebrew/etcbc4c
   |     0.08s

In [2]:
def getLabel(clauseNode):
    '''
    turn the node number of a clause atom into a readable reference
    return a string of the form book.chapter.verse.clause,
    where clause is the 1-based position of the clause atom
    among the clause atoms of its verse
    '''
    # the first hit going upwards is taken for each enclosing level
    book, chapter, verse = (L.u(clauseNode, otype=level)[0]
                            for level in ('book', 'chapter', 'verse'))
    # where does this clause atom stand inside its own verse?
    position = list(L.d(verse, otype='clause_atom')).index(clauseNode) + 1
    parts = (F.book.v(book), F.chapter.v(chapter), F.verse.v(verse), position)
    return '{}.{}.{}.{}'.format(*parts)
    
def getEtcbcData(corpus):
    '''
    Gather the needed data from the Text-Fabric module for one book;
    label clauses with a simple reference tag (ex. Psalms.1.1.1)

    corpus -- the name of the book as a string,
              compared against the value of the 'book' feature

    return an OrderedDict keyed by clause label, filled in the order in which
    the clause atoms of the book are delivered; each value is a dict with:
        'text'            -- the text of the words of the clause atom
        'timePhraseText'  -- tuple with the text of each phrase
                             whose function is 'Time'
        'daughters'       -- list with the labels of the clause atoms
                             (with a non-zero tab) that have this one as mother
        'mother'          -- label of the first mother clause atom, or None
        'timeCategories'  -- an empty list (nothing is filled in here)
        'indentation'     -- the value of the 'tab' feature
        'etcbcClauseNode' -- the Text-Fabric node number of the clause atom

    raise ValueError if no book node carries the requested name
    '''
    # the corpus argument is a string;
    # find the node number with a feature value that matches the corpus string
    # next() gets a default so that an unknown name is reported with a clear
    # ValueError instead of leaking a bare StopIteration
    bookNode = next((book for book in F.otype.s('book') if F.book.v(book) == corpus), None)
    if bookNode is None:
        raise ValueError('no book named {!r} found in the loaded data'.format(corpus))

    # now we pull all the clause node numbers from the corpus:
    clauseNodes = L.d(bookNode, otype='clause_atom')

    # getLabel does several lookups per call and the same node can be needed
    # up to three times (as clause, as daughter and as mother),
    # so every label is computed once and remembered
    labels = {}

    def labelOf(node):
        if node not in labels:
            labels[node] = getLabel(node)
        return labels[node]

    # create a mapping for each mother clause to its daughter clauses
    # in Text-Fabric that information is stored the other way around
    # so we build up that data with a defaultdict with a list value:
    motherToDaughters = defaultdict(list)
    for clause in clauseNodes:
        if F.tab.v(clause) == 0: # do not store parallel daughter clauses
            continue
        daughter = labelOf(clause)
        for mother in E.mother.f(clause):
            motherToDaughters[mother].append(daughter)

    # we will store all of the clause data in this dict:
    clauses = OrderedDict()

    # iterate over the clause node numbers and gather the data to be returned
    # the data is stored in the clauses ordered dictionary
    for clause in clauseNodes:
        phraseNodes = L.d(clause, otype='phrase')
        timePhrases = tuple(phrase for phrase in phraseNodes if F.function.v(phrase) == 'Time')
        mothers = E.mother.f(clause)
        clauses[labelOf(clause)] = {
                                'text': T.text(L.d(clause, otype='word')),
                                'timePhraseText': tuple(T.text(L.d(phrase, otype='word'))
                                                        for phrase in timePhrases),
                                'daughters': motherToDaughters.get(clause, []),
                                'mother': labelOf(mothers[0]) if mothers else None,
                                'timeCategories':[],
                                'indentation': F.tab.v(clause),
                                'etcbcClauseNode': clause #!! REMOVE LATER
                                }
    return clauses

In [3]:
def climbClauseTree(clause, clauseDict, span, coverage):
    '''
    walk down the clause tree below clause and build the time span
    if a time marker occurs inside the tree, break the span:
    a daughter with its own time marker is skipped together with
    everything below it

    clause     -- label of the clause to start from (it is not added itself)
    clauseDict -- dict of clause label -> clause data; the keys
                  'daughters' and 'timePhraseText' are read
    span       -- list that accrues the labels as the walk descends the tree,
                  parents before their daughters, daughters in stored order
    coverage   -- set that tracks which clauses are accounted for

    span and coverage are modified in place; nothing is returned

    The walk keeps its own stack rather than recursing: every link of a
    mother-daughter chain would otherwise cost one Python stack frame,
    and a long chain would end in a RecursionError.
    '''
    # one iterator per level that is still being worked on;
    # the innermost level sits at the end of the list
    pending = [iter(clauseDict[clause]['daughters'])]
    while pending:
        for daughter in pending[-1]:
            if clauseDict[daughter]['timePhraseText']:
                # a time marker starts a span of its own
                continue
            span.append(daughter)
            coverage.add(daughter)
            # descend first, the remaining siblings are picked up afterwards
            pending.append(iter(clauseDict[daughter]['daughters']))
            break
        else:
            # this level is exhausted, go back up
            pending.pop()

def buildTimeSpans(clauseDict):
    '''
    visit the clauses in order and open a time span at every clause that
    carries a time phrase and is not yet part of an earlier span
    the span is filled by the climbClauseTree function
    return an OrderedDict: first clause of the span -> list of its clauses
    '''
    covered = set()
    spansByStart = OrderedDict()
    for label in clauseDict:
        if label in covered:
            continue
        if not clauseDict[label]['timePhraseText']:
            continue
        # the opening clause is always the first member of its own span
        members = [label]
        climbClauseTree(label, clauseDict, members, covered)
        spansByStart[label] = members
    return spansByStart

In [4]:
def buildDisplay(clauseDict, spanDict):
    '''
    compile HTML code to display:
        1. plain text clauses
        2. indent clauses based on relationship to each other
        3. shade each clause within each time-span

    clauseDict -- dict of clause label -> clause data; the keys
                  'text' and 'indentation' are read
    spanDict   -- dict of first clause label -> list of the labels in the span

    return one string: a <div> with a line for every clause
    the clauses of a span are written together at the place of the first
    clause of the span and share a background color;
    the two colors alternate from one span to the next
    '''
    templates = {
                 'div' : '<div style="text-align: right; padding-right: 450px">{data}</div>',
                 'clause' : '<span{style}>{clause}</span>{label}<br>\n',
                 'colorStyle' : ' style="background-color:{color}"',
                 # a character reference has to end with a semicolon,
                 # leaving it out is a parse error in HTML
                 'tab' : '&nbsp;' * 4,
                 'colors' : ('#fffdaf','#f9c9a7')
                }

    def renderClause(label, style):
        # one display line: the (indented) text followed by the label
        data = clauseDict[label]
        return templates['clause'].format(style = style,
                                          label = templates['tab'] + label,
                                          clause = data['text'] + templates['tab'] * data['indentation'])

    inTimeSpan = set(cl for span in spanDict
                     for cl in spanDict[span])

    # the lines are collected in a list and joined once at the end
    lines = []
    # start on the second color, so that the first span gets the first one
    currentColor = templates['colors'][1]

    for clause in clauseDict:
        if clause in spanDict:
            members = spanDict[clause]
            if members:
                # switch to the color that the previous span did not use
                currentColor = next(col for col in templates['colors'] if col != currentColor)
                colorStyle = templates['colorStyle'].format(color=currentColor)
                lines.extend(renderClause(sClause, colorStyle) for sClause in members)
        elif clause not in inTimeSpan:
            lines.append(renderClause(clause, ''))
        # any other clause was already written as part of its span

    return templates['div'].format(data=''.join(lines))

In [17]:
# gather the clause data of the book named 'Genesis'
test = getEtcbcData('Genesis')

In [18]:
# build the time spans from the clause data gathered above
spans = buildTimeSpans(test)

In [19]:
body = buildDisplay(test, spans)

# write the display to a page of its own
# the page declares UTF-8 and holds Hebrew text, so the file is explicitly
# encoded as UTF-8 instead of relying on the default encoding of the platform
with open('test.html', 'w', encoding='utf-8') as file:
    file.write('<HTML>')
    file.write('<HEAD>')
    # the charset declaration belongs inside the head
    file.write('<meta charset="UTF-8">')
    file.write('</HEAD>')
    file.write('<BODY>')
    file.write(body)
    file.write('</BODY>')
    file.write('</HTML>')

# Test Field

In [495]:
# PRINT SPANS:
# pprint is not imported by the setup cell, so it is brought in here;
# without it this cell fails with a NameError on a fresh kernel
from pprint import pprint

pprint(list(spans.items())[:100])

[('Genesis.1.1.1',
  ['Genesis.1.1.1',
   'Genesis.1.2.1',
   'Genesis.1.2.2',
   'Genesis.1.2.3',
   'Genesis.1.3.1',
   'Genesis.1.3.2',
   'Genesis.1.3.3',
   'Genesis.1.4.1',
   'Genesis.1.4.2',
   'Genesis.1.4.3',
   'Genesis.1.5.1',
   'Genesis.1.5.2',
   'Genesis.1.5.3',
   'Genesis.1.5.4',
   'Genesis.1.5.5',
   'Genesis.1.6.1',
   'Genesis.1.6.2',
   'Genesis.1.6.3',
   'Genesis.1.7.1',
   'Genesis.1.7.2',
   'Genesis.1.7.3',
   'Genesis.1.7.4',
   'Genesis.1.7.5',
   'Genesis.1.7.6',
   'Genesis.1.8.1',
   'Genesis.1.8.2',
   'Genesis.1.8.3',
   'Genesis.1.8.4',
   'Genesis.1.9.1',
   'Genesis.1.9.2',
   'Genesis.1.9.3',
   'Genesis.1.9.4',
   'Genesis.1.10.1',
   'Genesis.1.10.2',
   'Genesis.1.10.3',
   'Genesis.1.10.4',
   'Genesis.1.11.1',
   'Genesis.1.11.2',
   'Genesis.1.11.3',
   'Genesis.1.11.4',
   'Genesis.1.11.5',
   'Genesis.1.11.6',
   'Genesis.1.11.7',
   'Genesis.1.11.8',
   'Genesis.1.12.1',
   'Genesis.1.12.2',
   'Genesis.1.12.3',
   'Genesis.1.12.4',
   'G

In [494]:
# look at the data stored for one clause
test['Genesis.2.4.3']

{'daughters': ['Genesis.2.4.4', 'Genesis.2.5.1', 'Genesis.2.7.1'],
 'etcbcClauseNode': 514731,
 'indentation': 1,
 'mother': 'Genesis.2.4.1',
 'text': 'בְּיֹ֗ום ',
 'timeCategories': [],
 'timePhraseText': ()}

In [225]:
# all clause atoms that lie in Genesis 2:7, produced lazily
genesis2v7 = (
    cl for cl in F.otype.s('clause_atom')
    if F.book.v(L.u(cl, otype='book')[0]) == 'Genesis'
    if F.chapter.v(L.u(cl, otype='chapter')[0]) == 2
    if F.verse.v(L.u(cl, otype='verse')[0]) == 7
)
# only the first of them is examined
tClause = next(genesis2v7)
tMom = E.mother.f(tClause)

# the words of its (first) mother
words = L.d(tMom[0], otype='word')

T.text(words)

'בְּיֹ֗ום '

In [496]:
# the node of the book named 'Genesis' (the first match is taken)
corpus = next(node for node in F.otype.s('book') if F.book.v(node) == 'Genesis')

# the distinct texts of all phrases in that book whose function is 'Time'
timePhrases = {
    T.text(L.d(phrase, otype='word'))
    for phrase in L.d(corpus, otype='phrase')
    if F.function.v(phrase) == 'Time'
}


In [497]:
# show the set of time phrase texts collected in the previous cell
timePhrases

{'אֶ֖מֶשׁ ',
 'אֶ֣מֶשׁ׀ ',
 'אַ֣חֲרֵי כֵ֔ן ',
 'אַֽחֲרֵי־כֵ֗ן ',
 'אַֽחֲרֵי־כֵ֞ן ',
 'אַחֲרֵ֖י מֹ֣ות אַבְרָהָ֑ם ',
 'אַחֲרֵי֙ הַדְּבָרִ֣ים הָאֵ֔לֶּה ',
 'אַחֲרֵי֙ מֹ֣ות אַבְרָהָ֔ם ',
 'אַחֲרֵי־כֵ֑ן ',
 'אַחֲרֵי־כֵ֥ן ',
 'אַחֲרֵי־כֵן֙ ',
 'אַחֲרֵי־כֵן֩ ',
 'אַחֲרֵיהֶ֖ם ',
 'אַחֲרֵיהֶ֗ן ',
 'אַחֲרֵיהֶן֙ ',
 'אַחַ֖ר ',
 'אַחַ֗ר ',
 'אַחַ֣ר ',
 'אַחַ֣ר הַמַּבּ֑וּל ',
 'אַחַ֥ר הַמַּבּֽוּל׃ ',
 'אַחַ֥ר הַמַּבּֽוּל׃ פ ',
 'אַחַר֙ ',
 'אַחַר֙ הַדְּבָרִ֣ים הָאֵ֔לֶּה ',
 'אַרְבַּ֥ע וּשְׁלֹשִׁ֖ים שָׁנָ֑ה ',
 'אַרְבַּ֥ע מֵאֹ֖ות שָׁנָֽה׃ ',
 'אַרְבַּֽע־עֶשְׂרֵ֤ה שָׁנָה֙ ',
 'אַרְבָּעִ֣ים יֹ֔ום וְאַרְבָּעִ֖ים לָ֑יְלָה ',
 'אַרְבָּעִ֣ים יֹ֔ום וְאַרְבָּעִ֖ים לָֽיְלָה׃ ',
 'אַרְבָּעִ֣ים שָׁנָ֔ה וּשְׁמֹנֶ֥ה מֵאֹ֖ות שָׁנָ֑ה ',
 'אַרְבָּעִ֥ים יֹ֖ום ',
 'אָ֖ז ',
 'אָ֣ז ',
 'אָ֤ז ',
 'אָ֥ז ',
 'אָֽמֶשׁ׃ ',
 'בְאַרְבַּע֩ עֶשְׂרֵ֨ה שָׁנָ֜ה ',
 'בְיָמָיו֙ ',
 'בַבֹּ֔קֶר ',
 'בַבֹּ֨קֶר֙ ',
 'בַחֹ֨דֶשׁ֙ הַשֵּׁנִ֔י ',
 'בַיֹּ֛ום ',
 'בַיֹּ֨ום הַשְּׁלִישִׁ֜י ',
 'בָעֶ֔רֶב ',
 'בְּאֶחָ֣ד לַחֹ֔דֶשׁ ',
 'בְּאַחֲרִ֥ית