<img align="right" src="images/tf-small.png" width="90"/>
<img align="right" src="images/etcbc.png" width="100"/>

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, re, collections
from utils import prs_set

In [3]:
from tf.app import use

In [4]:
# Destination of the generated TF module: makeMentionModule saves to
# {BASE}/{ORG}/{REPO}, in the module directory {RELATIVE}/<data version>.
BASE = os.path.expanduser('~/github')
ORG = 'cmerwich'
REPO = 'participant-analysis'
RELATIVE = 'test/tf'

In [5]:
# Load the BHSA app (data version 'c') together with the
# cmerwich/bh-reference-system module; the commented-out entries are extra
# modules that are currently not requested here.
# hoist=globals() is what makes the TF API handles used in the cells below
# (F, L, T, E, N, TF) available as global names.
A = use(
    'bhsa', version='c',
    mod=(
        #'etcbc/phono/tf,'
        #'etcbc/valence/tf,'
        #'etcbc/lingo/heads/tf,'
        'cmerwich/bh-reference-system/tf'
    ),
    hoist=globals(),
)

	connecting to online GitHub repo annotation/app-bhsa ... connected
Using TF-app in /Users/Christiaan/text-fabric-data/annotation/app-bhsa/code:
	rv1.2=#5fdf1778d51d938bfe80b37b415e36618e50190c (latest release)
	connecting to online GitHub repo etcbc/bhsa ... connected
Using data in /Users/Christiaan/text-fabric-data/etcbc/bhsa/tf/c:
	rv1.6=#bac4a9f5a2bbdede96ba6caea45e762fe88f88c5 (latest release)
	connecting to online GitHub repo etcbc/phono ... connected
Using data in /Users/Christiaan/text-fabric-data/etcbc/phono/tf/c:
	r1.2=#1ac68e976ee4a7f23eb6bb4c6f401a033d0ec169 (latest release)
	connecting to online GitHub repo etcbc/parallels ... connected
Using data in /Users/Christiaan/text-fabric-data/etcbc/parallels/tf/c:
	r1.2=#395dfe2cb69c261862fab9f0289e594a52121d5c (latest release)
	connecting to online GitHub repo cmerwich/bh-reference-system ... connected
Using data in /Users/Christiaan/text-fabric-data/cmerwich/bh-reference-system/tf/c:
	rv1.0 (latest release)


In [6]:
def compute_text(MY_BOOK, MY_CHAPTERS):
    """Collect the nodes that count as mentions, chapter by chapter.

    For every selected chapter the following nodes are gathered, in this order:

    1. phrases whose ``typ`` is ``'NP'``;
    2. phrase atoms whose ``rela`` is ``'Appo'``;
    3. words whose ``pdp`` is one of ``verb``, ``subs``, ``art``, ``nmpr``,
       ``prps``, ``prde``, plus words with ``pdp == 'prep'`` whose ``pgn_prs``
       value occurs in ``prs_set``.

    Parameters
    ----------
    MY_BOOK : collection of str, or None
        Book names (as returned by ``T.bookName``) to include; a falsy value
        selects every book.
    MY_CHAPTERS : collection, or None
        Values of the ``chapter`` feature to include; a falsy value selects
        every chapter.

    Returns
    -------
    (results, highlights) : (list of tuple, dict)
        ``results`` holds one tuple per selected chapter: the chapter node
        followed by the collected nodes. ``highlights`` maps every collected
        node to a colour name.
    """
    # Colour per `pdp` value for the words that are collected unconditionally.
    word_colors = {
        'verb': 'springgreen',
        'subs': 'skyblue',
        'art': 'skyblue',
        'nmpr': 'tomato',
        'prps': 'palegoldenrod',
        'prde': 'royalblue',
    }

    results = []
    highlights = {}

    for book in F.otype.s('book'):
        # Decide once per book instead of once per chapter.
        if MY_BOOK and T.bookName(book) not in MY_BOOK:
            continue

        for chn in L.d(book, 'chapter'):
            if MY_CHAPTERS and F.chapter.v(chn) not in MY_CHAPTERS:
                continue

            # Grow a list and freeze it at the end: repeated tuple
            # concatenation copies the whole tuple for every added node.
            members = [chn]

            for phrase in L.d(chn, 'phrase'):
                if F.typ.v(phrase) == 'NP':
                    members.append(phrase)
                    highlights[phrase] = 'skyblue'

            for phr_atom in L.d(chn, 'phrase_atom'):
                if F.rela.v(phr_atom) == 'Appo':
                    members.append(phr_atom)
                    highlights[phr_atom] = 'yellow'

            for w in L.d(chn, 'word'):
                pdp = F.pdp.v(w)
                if pdp in word_colors:
                    color = word_colors[pdp]
                elif pdp == 'prep' and F.pgn_prs.v(w) in prs_set:
                    # Only prepositions need the pgn_prs lookup.
                    color = 'DarkGoldenrod'
                else:
                    continue
                members.append(w)
                highlights[w] = color

            results.append(tuple(members))
    return (results, highlights)

In [7]:
# Number of phrase nodes that have a (truthy) value for the `det` feature.
sum(1 for phrase in F.otype.s('phrase') if F.det.v(phrase))

253207

In [8]:
# Frequency of each `det` value, restricted to phrase nodes.
F.det.freqList(nodeTypes={'phrase'})

(('NA', 139311), ('det', 83886), ('und', 30010))

In [9]:
# Frequency of each `det` value, restricted to phrase_atom nodes.
F.det.freqList(nodeTypes={'phrase_atom'})

(('NA', 140906), ('det', 94558), ('und', 32077))

In [10]:
# Node types whose own `det` value is used by makeMentionModule; for any other
# node type `det` is taken from the enclosing phrase_atom or phrase.
PHRASELIKE = {'phrase', 'phrase_atom'}

def transPrs(value):
    """Translate a ``prs`` feature value into a 0/1 suffix flag.

    Parameters
    ----------
    value : str or None
        The ``prs`` value of a node.

    Returns
    -------
    int
        0 when ``value`` is ``'absent'``, ``'n/a'`` or missing (``None``);
        1 for any other value.
    """
    # A missing value must not be reported as "has a suffix".
    if value is None or value in ('absent', 'n/a'):
        return 0
    return 1

def transDet(value):
    """Translate a ``det`` feature value into a numeric flag.

    Returns 1 for ``'det'``, 0 for ``'und'`` and ``None`` for anything else.
    """
    if value == 'det':
        return 1
    if value == 'und':
        return 0
    return None

def makeMentionModule(results, highlights):
    """Save a TF module that adds a ``mention`` node type to the loaded data.

    Every node listed in ``results`` (all tuple members except the first)
    becomes one new node of type ``mention``, numbered consecutively after
    ``F.otype.maxNode``. The module is written with ``TF.save`` to
    ``{BASE}/{ORG}/{REPO}`` under ``{RELATIVE}/{A.version}``.

    Saved features:

    * ``otype``/``oslots`` - the existing values extended with the mentions;
    * ``nlink`` - edge from a mention to the node it is based on;
    * ``mtype`` - node type of the node the mention is based on;
    * ``color`` - highlight colour, only when ``highlights`` has a truthy one;
    * ``mDet`` - result of ``transDet`` on the ``det`` value, only when it is
      not ``None``;
    * ``mSuf`` - result of ``transPrs`` on the ``prs`` value, for mentions
      based on word nodes only.

    Parameters
    ----------
    results : iterable of tuple
        Tuples whose first member is skipped and whose remaining members are
        the nodes to turn into mentions (as produced by ``compute_text``).
    highlights : dict
        Maps nodes to colour names.
    """
    mNode = F.otype.maxNode  # mention nodes are numbered after the last existing node
    color = {}
    mtype = {}
    nlink = {}
    mDet = {}
    mSuf = {}
    # Start from the existing node types and slot containment so that the
    # saved otype/oslots cover the old nodes as well as the new mentions.
    otype = {n: F.otype.v(n) for n in N()}
    oslots = {n: E.oslots.s(n) for n in range(F.otype.maxSlot + 1, F.otype.maxNode + 1)}

    for tup in results:
        # tup[0] is skipped: in the tuples built by compute_text it is the chapter node.
        for n in tup[1:]:
            mNode += 1
            nType = F.otype.v(n)
            otype[mNode] = 'mention'
            nlink[mNode] = {n}
            # A word is its own slot; other nodes contribute all their slots.
            oslots[mNode] = {n} if nType == 'word' else set(E.oslots.s(n))
            mtype[mNode] = nType

            if nType == 'word':
                mSuf[mNode] = transPrs(F.prs.v(n))

            if nType in PHRASELIKE:
                det = transDet(F.det.v(n))
            else:
                # Take det from the first phrase_atom above n and fall back
                # to the first phrase above n when that yields nothing.
                det = transDet(F.det.v(L.u(n, otype='phrase_atom')[0]))
                if det is None:
                    det = transDet(F.det.v(L.u(n, otype='phrase')[0]))
            if det is not None:
                mDet[mNode] = det

            mColor = highlights.get(n)
            if mColor:
                color[mNode] = mColor

    # NOTE(review): mDet and mSuf hold the integers 0/1 but are declared with
    # valueType 'str'; left unchanged because switching to 'int' would alter
    # the values that consumers of the saved module load - confirm and adjust.
    metaData = {
        '': dict(
            moduleAuthor='Christiaan M. Erwich',
            moduleReason='add a new node type for mentions',
        ),
        'otype': dict(
            valueType='str',
        ),
        'mtype': dict(
            valueType='str',
            description='node type of the node on which the mention is based',
        ),
        'nlink': dict(
            valueType='str',
            description='link to node on which the mention is based',
        ),
        'oslots': dict(
            valueType='str',
        ),
        'color': dict(
            valueType='str',
            description='type of mention, coded as a color name',
        ),
        'mDet': dict(
            valueType='str',
            description='determinedness of a mention',
        ),
        'mSuf': dict(
            valueType='str',
            description='if mention has suffix',
        ),
    }
    nodeFeatures = dict(otype=otype, mtype=mtype, color=color, mDet=mDet, mSuf=mSuf)
    edgeFeatures = dict(oslots=oslots, nlink=nlink)
    TF.save(
        nodeFeatures=nodeFeatures,
        edgeFeatures=edgeFeatures,
        metaData=metaData,
        location=f'{BASE}/{ORG}/{REPO}',
        module=f'{RELATIVE}/{A.version}',
    )

In [11]:
# Book filter: a set of book names, e.g. {'Psalms'}; None selects all books.
MY_BOOK = None

# Chapter filter: a set of chapter numbers, e.g. set(range(1, 2)); None selects all chapters.
MY_CHAPTERS = None

# Collect the mention candidates and their highlight colours for the selection.
(results, highlights) = compute_text(MY_BOOK, MY_CHAPTERS)

In [12]:
# Turn the collected nodes into `mention` nodes and save them as a TF module.
makeMentionModule(results, highlights)

  0.00s VALIDATING oslots feature
  0.21s VALIDATING oslots feature
  0.21s maxSlot=     426584
  0.21s maxNode=    1770970
  0.51s OK: oslots is valid
   |     0.72s T color                to /Users/Christiaan/github/cmerwich/participant-analysis/test/tf/c
   |     0.54s T mDet                 to /Users/Christiaan/github/cmerwich/participant-analysis/test/tf/c
   |     0.48s T mSuf                 to /Users/Christiaan/github/cmerwich/participant-analysis/test/tf/c
   |     0.65s T mtype                to /Users/Christiaan/github/cmerwich/participant-analysis/test/tf/c
   |     1.14s T otype                to /Users/Christiaan/github/cmerwich/participant-analysis/test/tf/c
   |     1.11s T nlink                to /Users/Christiaan/github/cmerwich/participant-analysis/test/tf/c
   |     5.62s T oslots               to /Users/Christiaan/github/cmerwich/participant-analysis/test/tf/c
