In [None]:
!pip install -U spacy

In [None]:
!pip install 'numpy<2'

In [None]:
import os
from os import path
import spacy
import json
from json import loads
from spacy.tokens import Token
from spacy.tokens import DocBin
import sys
import re
import itertools

In [None]:
!python3 -m spacy download ro_core_news_lg

In [None]:
# Load the Romanian pipeline downloaded in the previous cell; it is used
# below to build the EMPTY_TOKEN and HYPHEN_TOKEN sentinel tokens.
ron_nlp = spacy.load('ro_core_news_lg')
# Bare expression: in a notebook cell this displays the value of the
# pipeline's `_path` attribute.
ron_nlp._path

In [None]:

# Lower-cased word forms matched by get_is_rwp (weak pronouns).
syllabic_rwp = ['mă', 'te', 'lu', 'se', 'mi', 'ți', 'i', 'și', 'ne', 'vă', 'le', 'ni', 'vi', 'li']
# the o-RWP behaves as an auxiliary verb with vocalic onset
vocalic_rwp = ['o']
# Lower-cased word forms matched by get_is_rwv (weak verbs).
syllabic_rwv = ['su', 'i']
# Lower-cased word forms matched by get_is_neg (negation).
syllabic_neg = ['nu']
# Lower-cased word forms matched by get_is_auxiliary_verb (subject to its POS filter).
auxiliary_verb = ['am', 'ai', 'a', 'ați', 'au', 'aș', 'ar', 'oi', 'om', 'oți', 'or', 'voi', 'vei', 'va', 'vom', 'veți', 'vor']

# Characters counted as vowels by the vowel_* getters (compared against lower-cased text).
ron_vowels = "aeiouăîâ"

# Placeholder returned by get_lnbor/get_rnbor when a token has no neighbour
# on that side; its POS is forced to PUNCT.
EMPTY_TOKEN = ron_nlp('#')[0]
EMPTY_TOKEN.pos_ = "PUNCT" 
# Only the `.text` of this token is used, as the hyphen in generated surface forms.
HYPHEN_TOKEN =  ron_nlp('-')[0]

# Define getter functions
def get_is_rwp(token):
    """Return True when the token's lower-cased text is listed in
    ``syllabic_rwp`` and the token is not tagged as a coordinating
    conjunction (guards against the conjunction reading of 'și')."""
    if token.pos_ == "CCONJ":
        return False
    return token.lower_ in syllabic_rwp

def get_is_vocalic_rwp(token):
    """Return True when the token's lower-cased text is listed in
    ``vocalic_rwp`` and the token is not tagged as a determiner
    (guards against the article reading, as in 'o casă')."""
    if token.pos_ == "DET":
        return False
    return token.lower_ in vocalic_rwp

def get_is_rwv(token):
    """Return True when the token's lower-cased text is listed in ``syllabic_rwv``."""
    word = token.lower_
    return word in syllabic_rwv

def get_is_neg(token):
    """Return True when the token's lower-cased text is listed in ``syllabic_neg``."""
    word = token.lower_
    return word in syllabic_neg

def get_is_auxiliary_verb(token):
    """Return True when the token's lower-cased text is listed in
    ``auxiliary_verb`` and its POS is none of VERB, PART or DET."""
    if token.lower_ not in auxiliary_verb:
        return False
    return token.pos_ not in ("VERB", "PART", "DET")

def get_is_obligatory_host(token):
    """Return True for a token that is either the vocalic weak pronoun or
    an auxiliary verb whose first character is a vowel."""
    if get_is_vocalic_rwp(token):
        return True
    return get_is_auxiliary_verb(token) and get_vowel_initial_char(token)

# In the context of Romanian weak pronouns, a linear order part is an item that can occur in both pre- and post-verbal position,
# such as the sequence "le" in "Le faci." vs. "Fă-le!" or the sequence "mi le" in "Mi le dai." vs. "Dă-mi-le!"
# ==> pointer to article

def get_is_linear_order_part(token):
    """Return True when the token is a weak pronoun, the vocalic weak
    pronoun, an auxiliary verb or a weak verb (checked in that order)."""
    checks = (get_is_rwp, get_is_vocalic_rwp, get_is_auxiliary_verb, get_is_rwv)
    return any(check(token) for check in checks)

def get_vowel_initial_char(token):
    """Return True when the first character of the lower-cased token text
    is one of ``ron_vowels``."""
    first_char = token.lower_[0]
    return first_char in ron_vowels

def get_vowel_final_char(token):
    """Return True when the last character of the lower-cased token text
    is one of ``ron_vowels``."""
    word = token.lower_
    last_char = word[len(word) - 1]
    return last_char in ron_vowels

def get_vowel_initial(token):
    """Return True when the lower-cased token text starts with one of
    ``ron_vowels`` but does not start with the sequence 'iu'."""
    word = token.lower_
    if word[0] not in ron_vowels:
        return False
    return not word.startswith('iu')

def get_vowel_final(token):
    """Return True when the lower-cased token text ends in one of
    ``ron_vowels`` but not in any of the sequences 'ui', 'oi' or 'eu'."""
    word = token.lower_
    if word[len(word) - 1] not in ron_vowels:
        return False
    return not word.endswith(('ui', 'oi', 'eu'))

def get_is_first_syllable_stressed(token):
    """Return True for the hard-coded word forms treated as stressed on
    their first syllable.

    'umbli' and 'altuia' match regardless of POS; 'am', 'aflu' and 'arde'
    match only when tagged VERB, and 'este' only when tagged AUX.
    """
    word = token.lower_
    if word in ('umbli', 'altuia'):
        return True
    # Word forms that count only under one specific POS tag.
    required_pos = {'am': 'VERB', 'este': 'AUX', 'aflu': 'VERB', 'arde': 'VERB'}
    return word in required_pos and token.pos_ == required_pos[word]

def get_is_last_syllable_stressed(token):
    """Placeholder getter: always reports False, whatever the token."""
    lastSyllableStressed = False
    return lastSyllableStressed

# Set extension on the Token with getter
# Register every getter above as a computed `token._.<name>` attribute.
# force=True overwrites an existing registration, so the cell can be re-run.
for _ext_name, _ext_getter in (
    ("is_rwp", get_is_rwp),
    ("is_vocalic_rwp", get_is_vocalic_rwp),
    ("is_rwv", get_is_rwv),
    ("is_neg", get_is_neg),
    ("is_auxiliary_verb", get_is_auxiliary_verb),
    ("is_obligatory_host", get_is_obligatory_host),
    ("is_linear_order_part", get_is_linear_order_part),
    ("vowel_initial_char", get_vowel_initial_char),
    ("vowel_final_char", get_vowel_final_char),
    ("vowel_initial", get_vowel_initial),
    ("vowel_final", get_vowel_final),
    ("is_first_syllable_stressed", get_is_first_syllable_stressed),
    ("is_last_syllable_stressed", get_is_last_syllable_stressed),
):
    Token.set_extension(_ext_name, getter=_ext_getter, force=True)
# Do not leave the loop variables behind in the notebook namespace.
del _ext_name, _ext_getter

def get_asyllabic_form(underlying_representation, sandhi):
    """Map an underlying clitic form to the form used under sandhi.

    Parameters:
        underlying_representation: lower-cased clitic, e.g. "mă" or "lu".
        sandhi: only the value "OBLIGATORY" is distinguished, and only
            for "se".

    Returns:
        The mapped form, or "" for a form that is not in the table.
    """
    # Disallow the forms "s-aduce", allow only "se-aduce", to decrease possible
    # ambiguity between the RWP "se" and the subjunction "să" as in
    # "s-aduc" vs. "să aduca": "se" is reduced only under obligatory sandhi.
    if underlying_representation == "se":
        return "s" if sandhi == "OBLIGATORY" else "se"

    forms = {
        # forms that lose their vowel
        "mă": "m",
        "vă": "v",
        "nu": "n",
        "lu": "l",
        "su": "s",
        # forms that are returned unchanged
        "o": "o",
        "te": "te",
        "ne": "ne",
        "le": "le",
        "mi": "mi",
        "ți": "ți",
        "i": "i",
        "și": "și",
        "ni": "ni",
        "vi": "vi",
        "li": "li",
    }
    return forms.get(underlying_representation, "")

# coping with ni => ne, vi => vă, and li => le
def get_interim_form(underlying_representation):
    """Rewrite the i-forms "ni", "vi" and "li" to "ne", "vă" and "le";
    every other input is returned unchanged."""
    rewrites = {"ni": "ne", "vi": "vă", "li": "le"}
    return rewrites.get(underlying_representation, underlying_representation)
    
# get immediate left token
def get_lnbor(token):
    """Return the token immediately to the left, or ``EMPTY_TOKEN`` when
    the token is the first one in its document."""
    if token.i <= 0:
        return EMPTY_TOKEN
    return token.nbor(-1)
    
# get immediate right token
def get_rnbor(token):
    """Return the token immediately to the right, or ``EMPTY_TOKEN`` when
    the token is the last one in its document."""
    last_index = len(token.doc) - 1
    if token.i >= last_index:
        return EMPTY_TOKEN
    return token.nbor(1)

def is_rightmost_in_cluster(token):
    """Return True when the token is a weak pronoun or weak verb and its
    right neighbour is neither."""
    right = get_rnbor(token)
    token_is_weak = token._.is_rwp or token._.is_rwv
    return token_is_weak and not right._.is_rwp and not right._.is_rwv

def is_leftmost_in_cluster(token):
    """Return True when the token is a weak pronoun or weak verb and its
    left neighbour is neither."""
    left = get_lnbor(token)
    token_is_weak = token._.is_rwp or token._.is_rwv
    return token_is_weak and not left._.is_rwp and not left._.is_rwv

def is_singleton_in_cluster(token):
    """Return True when the token is both the rightmost and the leftmost
    member of its cluster, i.e. the cluster consists of this token only."""
    if not is_rightmost_in_cluster(token):
        return False
    return is_leftmost_in_cluster(token)

def is_preverbal(token):
    """Decide whether ``token`` stands in preverbal position.

    Walks leftwards from ``token`` across the run of linear-order parts it
    belongs to and inspects the first token to the left of that run:

    * if that token is tagged VERB or INTJ (and the item at the left edge
      of the run is not itself tagged VERB), the token is postverbal and
      False is returned;
    * otherwise True is returned.

    When the run reaches the start of the document there is nothing to the
    left that could host the token, so it is preverbal. The previous
    implementation read ``doc[-1]`` at that point, which wraps around to
    the LAST token of the document and gave wrong answers (or ``None``)
    whenever the document did not end in punctuation.

    Parameters:
        token: object exposing ``.i``, ``.doc``, ``.pos_`` and the
            ``._.is_linear_order_part`` extension.

    Returns:
        bool: True for preverbal, False for postverbal.
    """
    theDoc = token.doc
    for lt in range(token.i, -1, -1):
        if lt == 0:
            # Start of the document: no left neighbour exists. Do not index
            # theDoc[-1] here, it would wrap to the end of the document.
            return True
        left = theDoc[lt - 1]
        if token._.is_linear_order_part and left._.is_linear_order_part:
            # Still inside the run of linear-order parts: keep walking left.
            continue
        return left.pos_ not in ('VERB', 'INTJ') or theDoc[lt].pos_ == 'VERB'
    # Not reachable for token.i >= 0 (the loop always returns by lt == 0);
    # kept so that the function has an explicit fall-through value.
    return True



In [None]:

def filterOutput(originalOutput):
    """Drop unwanted variants and duplicates from the generated output.

    A variant is discarded when the blacklist pattern matches anywhere in
    it. The remaining variants are de-duplicated, keeping the first
    occurrence of each and preserving their original order.

    Parameters:
        originalOutput: iterable of generated variant strings.

    Returns:
        list of the surviving, unique variants in first-seen order.
    """
    pattern01 = re.compile(r' î..-|.{2,}-.{2}-mb|într-o-|nu-ntr|\S{2,}-nainte|-avea|de-a se |aduce-napoi|[aeiouăîâ]-î|putea-neca')

    kept = (opt for opt in originalOutput if not pattern01.search(opt))
    # dict keys keep insertion order, which gives an order-preserving
    # de-duplication in linear time instead of a list-membership loop.
    return list(dict.fromkeys(kept))


In [None]:

def indexOutput(filteredOutput, id, targets):
    """Key each generated variant by example id and 1-based position.

    Parameters:
        filteredOutput: sequence of variant strings.
        id: example id as a string; it is concatenated into the keys.
        targets: accepted for signature symmetry with getEvaluation; not used.

    Returns:
        dict mapping 'ex<id>_o<position>' to the variant at that position.
    """
    return {
        'ex' + id + '_o' + str(position): variant
        for position, variant in enumerate(filteredOutput, start=1)
    }
    

In [None]:

def getEvaluation(filteredOutput, id, targets):
    """Score the generated variants against the expected targets.

    Parameters:
        filteredOutput: sequence of generated variant strings.
        id: example id as a string; it is concatenated into the keys.
        targets: dict whose values are the expected variants.

    Returns:
        A two-item list ``[verdicts, progress]``: ``verdicts`` maps
        'ex<id>_e<position>' to 'ok' (variant is among the target values)
        or 'ko'; ``progress`` is the string
        '<done|todo> _ <#outputs>|<#ok> > <#targets>', where 'done' means
        the three counts are all equal.
    """
    verdicts = {
        'ex' + id + '_e' + str(position): ('ok' if variant in targets.values() else 'ko')
        for position, variant in enumerate(filteredOutput, start=1)
    }

    n_targets = len(targets)
    n_outputs = len(filteredOutput)
    n_ok = list(verdicts.values()).count('ok')

    status = 'done' if n_ok == n_targets == n_outputs else 'todo'
    progress = f'{status} _ {n_outputs}|{n_ok} > {n_targets}'

    return [verdicts, progress]
    

In [None]:

# Output directories for the log file and the generated JSON.
logDir = 'logDir_1'
genDir = 'genDir_1'

# exist_ok=True creates the directory only when it is missing, without a
# separate existence check.
os.makedirs(logDir, exist_ok=True)
os.makedirs(genDir, exist_ok=True)

# output version suffix to compare and debug if needed
v_suffix = '88'

logFile = path.join(logDir, 'logfile_' + v_suffix + '.log')
genFile = path.join(genDir, 'genOpt_' + v_suffix + '.json')

# Serialized spaCy documents that serve as generation input.
iputDocs = "spacyAnnotation/rwp_352_corrected_annotation.spacy"
spacyDocuments = DocBin().from_disk(iputDocs)

# Example database; the generation loop reads rwpDB['rwp_db'][...]['targets']
# and adds the output/evaluation entries to it.
iputRWP = "input_rwp_352.json"
# Read explicitly as UTF-8 (the result is written back as UTF-8 as well), so
# the Romanian diacritics do not depend on the platform's default encoding.
with open(iputRWP, encoding='utf-8') as input_json_file:
    rwpDB = json.load(input_json_file)

# Blank Romanian pipeline; only its vocab is used, to deserialize the DocBin.
blank_nlp_ro = spacy.blank("ro")
# When True, one line per token (index, text, POS, tag, dep) goes to the log.
pos_debug = True
# When True, the generated output is scored against the targets.
addEvaluation = True


In [None]:

# Main generation loop.
# For every annotated document: compute, per token, the list of admissible
# surface forms; build all combinations; normalise spacing, hyphens and
# capitalisation; filter them; and store output (and optionally evaluation)
# in rwpDB, which is finally dumped to genFile.

def _log_line(*values):
    """Append one ``print``-formatted line to ``logFile``.

    The file is opened and closed per call via ``with`` so the handle is
    always released (the former ``print(..., file=open(logFile, 'a'))``
    never closed it explicitly), and it is written as UTF-8 so that
    Romanian diacritics do not depend on the platform default encoding.
    """
    with open(logFile, 'a', encoding='utf-8') as log_handle:
        print(*values, file=log_handle)


for ix,current_doc in enumerate(spacyDocuments.get_docs(blank_nlp_ro.vocab)):
    doc_length = len(current_doc)
    # zero-padded 1-based example number, e.g. '001'; used in the rwpDB keys
    f_ix = f'{ix+1:03}'
    doc_header = "<" + str(f_ix) + ">" + " . . . . . . . . . . . . . . . . . . ."
    doc_summary = "[" + str(doc_length) + " items] " + str(current_doc)
    print(doc_header)
    _log_line(doc_header)
    print(doc_summary)
    _log_line(doc_summary)
    _log_line("........................")
    # one list of alternative surface forms per token, in token order
    output_tokens = []
    in_tokens = []
    output_variants = []
    
    for token in current_doc:
        if pos_debug:
            _log_line(str(token.i) + " " + token.text + " " + token.lower_ + " " + \
            token.pos_ + " " + token.tag_ + " " + token.dep_)

        r_nbor = get_rnbor(token)
        l_nbor = get_lnbor(token)
        surface_form = ""
        interim_form = ""
        postverbal_hyphen = ""
        surface_forms = []
        
        # 1. current token is a Linear Order Part (LOP), i.e., can occur both pre- and post-verbally
        if token._.is_linear_order_part:
            is_postverbal_token = not is_preverbal(token) and not r_nbor.pos_ == 'VERB'
            if is_postverbal_token:
                # postverbal items are attached to the preceding item with a hyphen
                postverbal_hyphen = HYPHEN_TOKEN.text

            # 1.1 current token is a Romanian Weak Pronoun (RWP) or a Romanian Weak Verb (RWV)
            if  token._.is_rwp or token._.is_rwv:
                # 1.1.1 RWP rightmost item in the cluster
                if is_rightmost_in_cluster(token):
                    
                    # cope with ni => ne, vi => vă, and li => le
                    interim_form = get_interim_form(token.lower_)
                    
                    # 1.1.1.1 obligatory sandhi to the right
                    if r_nbor._.is_obligatory_host:
                        surface_form = get_asyllabic_form(interim_form, "OBLIGATORY") + HYPHEN_TOKEN.text
                        surface_forms.append(postverbal_hyphen+surface_form)

                    # 1.1.1.2 no obligatory host to the right
                    else:
                        
                        # 1.1.1.2.1 obligatory sandhi to the left for the u- and i-forms (lu ==> -l, mi ==> -mi)
                        if l_nbor._.is_rwp:
                            # 1.1.1.2.1.1 e- or ă-forms
                            if ('e' in interim_form) or ('ă' in interim_form):
                                surface_form = interim_form
                                surface_forms.append(postverbal_hyphen+surface_form)
                                ## check for optional sandhi
                                if r_nbor._.vowel_initial_char and r_nbor._.vowel_initial and not \
                                   r_nbor._.is_first_syllable_stressed and not r_nbor.lower_.startswith('o') and not\
                                   r_nbor.lower_.startswith('e'):
                                    surface_form = get_asyllabic_form(interim_form, "OPTIONAL") + HYPHEN_TOKEN.text
                                    surface_forms.append(postverbal_hyphen+surface_form)

                                
                            # 1.1.1.2.1.2 i- or u-forms
                            else:
                                surface_form = HYPHEN_TOKEN.text + get_asyllabic_form(interim_form, "OBLIGATORY")
                                surface_forms.append(postverbal_hyphen+surface_form)
                        
                        # rightmost AND leftmost item ==> single RWP/RWV
                        # check context for different syllabic hosts: to the right, to the left or î-prothetic vowel
                        # 1.1.1.2.2 no obligatory sandhi to the left
                        else:
                            # 1.1.1.2.2.1 e- or ă-forms
                            if ('e' in interim_form) or  ('ă' in interim_form):
                                surface_form = interim_form
                                surface_forms.append(postverbal_hyphen+surface_form)
                                ## check for optional sandhi
                                if r_nbor._.vowel_initial_char and r_nbor._.vowel_initial and not \
                                   r_nbor._.is_first_syllable_stressed and not r_nbor.lower_.startswith('o') and not\
                                   r_nbor.lower_.startswith('e'):
                                    surface_form = get_asyllabic_form(interim_form, "OPTIONAL") + HYPHEN_TOKEN.text
                                    surface_forms.append(postverbal_hyphen+surface_form)
                                # ni-e, vi-e, li-e sete    
                                if token.lower_.endswith('i') and ('-pd-' in token.tag_) and \
                                   r_nbor.lower_ == 'e' and r_nbor.pos_ == 'AUX':
                                    surface_form = token.text + HYPHEN_TOKEN.text
                                    surface_forms.append(postverbal_hyphen+surface_form)
                            # 1.1.1.2.2.2 i- or u-forms
                            else:
                                ## check pre- or post-verbal position
                                if is_postverbal_token:
                                    surface_form = get_asyllabic_form(interim_form, "OBLIGATORY")
                                    surface_forms.append(postverbal_hyphen+surface_form)
                                else:
                                    # preverbal: î-prothetic form (e.g. lu ==> îl)
                                    surface_form = "î" + get_asyllabic_form(interim_form, "OBLIGATORY")
                                    surface_forms.append(postverbal_hyphen+surface_form)
                                    # TODO: refine this
                                    if r_nbor._.vowel_initial_char and r_nbor._.vowel_initial and not \
                                       r_nbor.lower_.startswith('î') and not r_nbor._.is_first_syllable_stressed:
                                        surface_form = get_asyllabic_form(interim_form, "OBLIGATORY") + HYPHEN_TOKEN.text
                                        surface_forms.append(surface_form)
                                        _log_line("gogo " + str(r_nbor._.is_first_syllable_stressed))
                                    # disambiguate VERB.inf RWP VERB.pres: do not attach a RWP to an infinitive ("*a vedea-i cade greu") 
                                    if l_nbor._.vowel_final and not (l_nbor.pos_ == 'VERB' and l_nbor.tag_.startswith('Vmii') and \
                                                                    r_nbor.pos_ == 'VERB' and r_nbor.tag_.startswith('Vmip')):
                                        surface_form = HYPHEN_TOKEN.text + get_asyllabic_form(interim_form, "OBLIGATORY")
                                        surface_forms.append(surface_form)
                                    if r_nbor.pos_ == 'VERB' and r_nbor.lower_.startswith('î'):
                                        surface_form = token.text + HYPHEN_TOKEN.text
                                        surface_forms.append(surface_form)
                                    if r_nbor.pos_ == 'VERB' and r_nbor.lower_.startswith('î') and token.lower_ == 'lu':
                                        surface_form = get_asyllabic_form(interim_form, "OBLIGATORY") + HYPHEN_TOKEN.text
                                        surface_forms.append(surface_form)

                # 1.1.2 RWP not rightmost item in the cluster
                else:
                    surface_form = token.text
                    surface_forms.append(postverbal_hyphen+surface_form)

            # 1.2 current token is not (RWP or RWV), but a LOP, i.e., an auxiliary verb or the o-RWP
            else:
                surface_form = token.text
                surface_forms.append(postverbal_hyphen+surface_form)

            output_tokens.append(surface_forms)

        # 2. current token is not a LOP
        else:
            # adjust gerund forms
            # ex: dând l afară ==> dându-l afară
            if token.pos_ == 'VERB' and token.tag_ == 'Vmg' and r_nbor._.is_rwp:
                surface_form = token.text + 'u'
                surface_forms.append(surface_form)
            # adjust the imperative forms
            # ex: uită o/împușcă o ==> uit-o/împușc-o
            elif token.pos_ == 'VERB' and token.lower_.endswith('ă') and len(token.lower_) > 2 and r_nbor._.is_vocalic_rwp:
                surface_form = token.lower_[:-1]
                surface_forms.append(surface_form)
            # leave the token as it is
            # ex: Îți dau mere. ==> Îți dau mere.
            else: 
                surface_forms.append(token.text)

            # optional sandhi of negation
            # ex: nu am văzut ==> n-am văzut
            if token.lower_ == 'nu' and r_nbor._.vowel_initial_char and r_nbor._.vowel_initial and not r_nbor.lower_.startswith('î'):
                surface_form = token.lower_.replace('u','') + '-'
                surface_forms.append(surface_form)

            # optional sandhi of î-forms 'în'/'încolo'
            # ex: și încolo ==> și-ncolo
            if token.lower_.startswith('î') and l_nbor._.vowel_final_char and l_nbor._.vowel_final and l_nbor.lower_ != 'lu' :
                surface_form = token.text.replace('î','-',1)
                surface_forms.append(surface_form)

            # optional sandhi with ă-forms 'să'/'că' 
            # ex: vreau să o cumperi ==> vreau s-o cumperi
            if token.lower_.endswith('ă') and token.pos_ != 'VERB' and r_nbor._.is_vocalic_rwp:
                surface_form = token.text.replace('ă','-',1)
                surface_forms.append(surface_form)

            output_tokens.append(surface_forms)

    output_variants = []
    print(output_tokens)

    # every combination of one surface form per token is a candidate sentence
    for l in itertools.product(*output_tokens):
        print(l)
        # glue hyphenated items together and collapse doubled hyphens
        xput = f'{" ".join(l).replace("- ", "-").replace(" -", "-").replace("--", "-")}'
        # no space before punctuation
        xput = f'{xput.replace(" ,", ",").replace(" ?", "?").replace(" .",".").replace(" !", "!")}'
        # sentence-initial capital
        xput = xput[:1].upper() + xput[1:]
        xput = re.compile(r"\s+").sub(" ", xput).strip()
        output_variants.append(xput)
    
    filteredOutput = filterOutput(output_variants)
    
    _log_line("........................")
    _log_line(filteredOutput)
    print("................................................................")

    targets = rwpDB['rwp_db']['ex'+f_ix]['targets']

    # insert generation output into the rwpDB
    rwpDB['rwp_db']['ex'+f_ix]['output'] = indexOutput(filteredOutput, f_ix, targets)

    # insert generation evaluation into the rwpDB
    if addEvaluation:
        (genEval, progress) = getEvaluation(filteredOutput, f_ix, targets)
        rwpDB['rwp_db']['ex'+f_ix]['evaluation'] = genEval
        rwpDB['rwp_db']['ex'+f_ix]['ex'+f_ix+'_progress'] = progress

    # in loop: ___for current_doc___
    _log_line("-------------------------------------------")

_log_line("===========================================")
   
# persist the enriched database; ensure_ascii=False keeps diacritics readable
with open(genFile, 'w', encoding='utf-8') as f:
    json.dump(rwpDB, f, ensure_ascii=False, indent=2)
