# 1. Import standard-library and third-party packages

In [1]:
import sys, os, re, pickle, csv, collections
from collections import *
from IPython.display import HTML
from pprint import pprint
from functools import reduce

from tf.fabric import Fabric
from tf.transcription import Transcription
from tf.extra.bhsa import Bhsa

import numpy as np
from pandas import DataFrame, read_csv
import pandas as pd
from random import random
import matplotlib.pyplot as plt

# 2. Import data

In [6]:
# Corpus version; it is interpolated into both module paths below.
VERSION = 'c'

# Root directory that holds the Text-Fabric data repositories.
DATABASE = '~/github/etcbc'

# Module paths, relative to DATABASE.
BHSA = f'bhsa/tf/{VERSION}'
# Extra features: download them from the author's GitHub before running this cell.
REFERENCE = f'bh-reference-system/tf/{VERSION}'

# Build the Fabric object over both modules; silent=False keeps the loading banner.
TF = Fabric(
    modules=[BHSA, REFERENCE],
    locations=[DATABASE],
    silent=False,
)

This is Text-Fabric 5.4.1
Api reference : https://dans-labs.github.io/text-fabric/Api/General/
Tutorial      : https://github.com/Dans-labs/text-fabric/blob/master/docs/tutorial.ipynb
Example data  : https://github.com/Dans-labs/text-fabric-data

119 features found and 0 ignored


# 3. Load the necessary features

In [7]:
# Load the features requested for this notebook. The names are passed to
# TF.load() as one whitespace-separated string.
# NOTE(review): `lex` is listed twice (second and third line of the string).
# NOTE(review): the pgn_* features are presumably the extra features supplied
# by the REFERENCE module -- confirm against that repository.
api = TF.load('''
    otype
    lex book chapter verse
    nu ps gn prs ls lex gloss
    function sp typ pdp language 
    pgn_prps pgn_prde pgn_verb 
    pgn_verb_prs pgn_prs
''', silent=True)

# Put the API handles into the notebook's global namespace; later cells use
# `F`, `L` and `T` without defining them anywhere else.
api.makeAvailableIn(globals())

# BHSA helper object for this corpus version. `B` is not referenced again in
# the cells visible here.
B = Bhsa(api, 'coref annotated', version=VERSION)

**Documentation:** <a target="_blank" href="https://etcbc.github.io/bhsa" title="{provenance of this corpus}">BHSA</a> <a target="_blank" href="https://etcbc.github.io/bhsa/features/hebrew/c/0_home.html" title="{CORPUS.upper()} feature documentation">Feature docs</a> <a target="_blank" href="https://dans-labs.github.io/text-fabric/Api/Bhsa/" title="BHSA API documentation">BHSA API</a> <a target="_blank" href="https://dans-labs.github.io/text-fabric/Api/General/" title="text-fabric-api">Text-Fabric API 5.4.1</a> <a target="_blank" href="https://dans-labs.github.io/text-fabric/Api/General/#search-templates" title="Search Templates Introduction and Reference">Search Reference</a>

# 4. Retrieve data from Text-Fabric

In [25]:
# Selection of the corpus slice to export.
# MY_BOOK: set of book names, compared against T.bookName() in get_coref_info().
MY_BOOK = {'Psalms'} # Set any Hebrew Bible Book
# MY_CHAPTERS: set of chapter numbers; set(range(1,6)) selects chapters 1 through 5.
MY_CHAPTERS = set(range(1,6)) # Set any range in 150 chapters of the Psalms

# Containers for the ordered word nodes and the per-word info rows. The export
# cell rebinds both names to the values returned by get_coref_info().
words_list = []
coref_info_dict = {}

def _pgn_of(w):
    '''Return the pgn tag for word node `w`, or '-' when no pgn feature is set.

    The features are consulted in this order of precedence: `pgn_prps`,
    `pgn_prde`, `pgn_verb_prs` (only when both `pgn_verb` and `pgn_prs` are
    set), `pgn_verb`, `pgn_prs`.
    '''
    pgn_prps = F.pgn_prps.v(w)
    if pgn_prps:
        return pgn_prps

    pgn_prde = F.pgn_prde.v(w)
    if pgn_prde:
        return pgn_prde

    pgn_verb = F.pgn_verb.v(w)
    pgn_prs = F.pgn_prs.v(w)
    if pgn_verb and pgn_prs:
        # Both are present: use the combined feature value as-is.
        return F.pgn_verb_prs.v(w)

    return pgn_verb or pgn_prs or '-'


def _word_info(w, index):
    '''Build the info row for word node `w` at position `index` in its verse.

    Row layout: [wordnode, index, book, chapter, verse, hbtext, gloss, pgn,
    pdp, phrase type, lexical set].
    '''
    book, chapter, verse = T.sectionFromNode(w)
    hbtext = T.text([w], fmt='text-orig-plain')

    # The gloss lives on the lexeme node; join alternative glosses
    # ("a, b") with '-' so the value contains no ', ' separators.
    lex = L.u(w, 'lex')[0]
    gloss = F.gloss.v(lex).replace(', ', '-')

    # The phrase type is a feature of the phrase that contains the word.
    phrase = L.u(w, 'phrase')[0]

    # The literal string 'none' is shown as '-' in the dataset.
    ls = F.ls.v(w)

    return [
        w, index, book, chapter, verse, hbtext,
        gloss,
        _pgn_of(w),
        F.pdp.v(w),
        F.typ.v(phrase),
        '-' if ls == 'none' else ls,
    ]


def get_coref_info(books=None, chapters=None):
    '''Collect the information needed for coreference annotation.

    Walks book > chapter > verse > phrase > word and records one row per
    word (see `_word_info` for the row layout). The word index restarts at
    0 in every verse.

    Fresh containers are built on every call, so calling the function (or
    re-running the cell that calls it) repeatedly never duplicates rows.

    Parameters
    ----------
    books : collection of str, optional
        Book names to include, compared against `T.bookName()`.
        Defaults to the module-level `MY_BOOK`.
    chapters : collection of int, optional
        Chapter numbers to include. Defaults to the module-level
        `MY_CHAPTERS`.

    Returns
    -------
    (list, dict)
        The selected word nodes in traversal order, and a dict mapping each
        word node to its info row.
    '''
    if books is None:
        books = MY_BOOK
    if chapters is None:
        chapters = MY_CHAPTERS

    word_nodes = []
    info_by_word = {}

    for book in F.otype.s('book'):
        if T.bookName(book) not in books:
            continue

        for chn in L.d(book, 'chapter'):
            if F.chapter.v(chn) not in chapters:
                continue

            for vn in L.d(chn, 'verse'):
                index = 0  # position of the word within this verse, starting at 0

                for phrn in L.d(vn, 'phrase'):
                    for w in L.d(phrn, 'word'):
                        # Each word is processed exactly once; the row does
                        # not depend on words that come later in the verse.
                        word_nodes.append(w)
                        info_by_word[w] = _word_info(w, index)
                        index += 1

    return word_nodes, info_by_word

# 5. Make dataset - Export to CSV

In [26]:
# Collect the selected words (in order) and their info rows.
words_list, coref_info_dict = get_coref_info()

# Column names. Data rows carry 11 values, so the final 'coref' column has no
# value in the exported rows (presumably it is filled in during annotation).
header = ['wordnode', 'index', 'book', 'chapter', 'verse', 'hbword', 'gloss', 'pgn', 'pdp_pos', 'phrase_type', 'lexical_set', 'coref']

# encoding='utf-8' writes the Hebrew text the same way on every platform.
# newline='' hands line-ending control to the csv module, which is told to
# keep plain '\n' terminators.
with open('coref_to_annotate_wouter.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(header)

    for node in words_list:
        # str() keeps every value rendered exactly as text (e.g. None -> 'None');
        # csv.writer quotes any field that contains a comma or a quote so the
        # column layout cannot be broken by a gloss or text value.
        writer.writerow([str(element) for element in coref_info_dict[node]])