<a href="https://colab.research.google.com/github/elleish/sakha-language-tools/blob/main/lexc_parsing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Python morphological generator based on apertium

<ul><li>Apertium git <a href="https://github.com/apertium/apertium-sah">github.com/apertium/apertium-sah</a></li>
<li>Apertium online service <a href="https://beta.apertium.org/index.eng.html#analysis?aLang=sah">beta.apertium.org</a></li>
<li>Apertium explained <a href="https://blogs.helsinki.fi/language-technology/files/2016/09/FINMT2016-francis-tyers.pdf">blogs.helsinki.fi/language-technology</a></li>
<li>Starting a new language with HFST <a href="https://wiki.apertium.org/wiki/Starting_a_new_language_with_HFST">wiki.apertium.org</a>
</ul>

In [1]:
# downloading apertium from github
!wget https://raw.githubusercontent.com/apertium/apertium-sah/master/apertium-sah.sah.lexc

--2023-10-23 05:39:42--  https://raw.githubusercontent.com/apertium/apertium-sah/master/apertium-sah.sah.lexc
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1841535 (1.8M) [text/plain]
Saving to: ‘apertium-sah.sah.lexc’


2023-10-23 05:39:42 (21.8 MB/s) - ‘apertium-sah.sah.lexc’ saved [1841535/1841535]



In [2]:
# loading downloaded apertium.lexc for s
f = open("apertium-sah.sah.lexc", "r")
lexc = f.read()
print(lexc[:320])

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!       М О Р Ф О Л О Г И Ч Е С К И Й · Т Р А Н С Д У К Т О Р · Д Л Я       !!
!!                       Я К У Т С К О Г О · Я З Ы К А                       !!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!



In [3]:
def clear(x):
    return x.split(';')[0].split('!')[0].split('#')[0].strip()



lexc_lines = lexc.split('\n')
tree = dict()
header, block = '__empty', []

for line in lexc_lines:
    if line.startswith("LEXICON Root") or line.startswith("Multichar_Symbols") or line.startswith("LEXICON"):
        line = line.split("LEXICON ")[-1]
        tree[header] = block.copy()
        block = []
        header = clear(line)
    else:
        if line != '' and not line.startswith('!'):
            block.append(clear(line))

tree.keys()


dict_keys(['__empty', 'Multichar_Symbols', 'Root', 'CLIT-EMPH', 'CLITICS-NO-COP', 'CLITICS-INCL-COP', 'COPULA', 'CASES-OBL', 'CASES-POSS-3SP', 'CASES-POSS-12SG', 'CASES-NOM', 'POSS-OBL', 'POSS-PX3PL-OBL-SG', 'POSS-PX3PL-OBL-PL', 'POSS-OBL-PL', 'POSS-OBL-SG', 'POSS-NOM', 'POSS-NOM-ENDINGS', 'POSS-PX3PL-NOM-SG', 'POSS-PX3PL-NOM-PL', 'POSS-NOM-PL', 'POSS-NOM-SG', 'ATTR-SUBST', 'GENPOSS-ETC', 'CASES-ETC', 'N-INFL-COMMON-SG', 'N-INFL-COMMON-PL', 'CASES', 'GER-SUBST', 'GER-SUBST-NOM', 'SUBST', 'LII-POSTPOSITION', 'FULL-NOMINAL-INFLECTION', 'N1', 'N1-IRREG-PL', 'N-COMPOUND-PX-COMMON', 'N5', 'NP-COMMON', 'NP-ANT-M', 'NP-ANT-F', 'NP-PAT-VICH', 'NP-COG-OBIN-FEM', 'NP-COG-OB', 'NP-COG-IN', 'NP-COG-M', 'NP-COG-MF', 'NP-PAT-M', 'NP-TOP', 'NP-TOP-RUS', 'NP-TOP-ASSR', 'NP-TOP-COMPOUND', 'NP-TOP-ABBR', 'NP-ORG', 'NP-AL', 'A1', 'A2', 'A3', 'A4', 'A9', 'NUM', 'NUM-DIGIT', 'NUM-ORD', 'NUM-COLL', 'NUM-ROMAN', 'PRON-MIN', 'PRON-EN', 'PRON-KINI', 'PRON-BIHIGI', 'PRON-EHIGI', 'PRON-KINILER', 'PRON-DEM-BU-COM

In [25]:
def it_is_node(x):
    x = str(x)
    if len(x) > 0:
        return x==x.upper() and x in tree.keys()
    else:
        return False

def triple(x):
   tempa, tempb, tempc = "", "", ""
   if ';' in x:
      x = x.split(';', 1)[0]
   if ':' in x:
      tempa, x = x.split(':', 1)
   if ' ' in x:
      tempb, tempc = x.split(' ', 1)
   else:
      tempc = x
   return tempa.strip(), tempb.strip(), tempc.strip()



def go(x, depth, morph, surface):
    if depth > 16 :
        print('depth16')
        return
    print('┃  ' * depth, x)
    visited = []
    if x in tree.keys():
        for line in tree[x]:
            tag, form, node = triple(line)
            if it_is_node(node) and not node in visited:
                print('┃  ' * max(0, depth-1), node)
                print('┃  ' * max(0, depth-1) + '┠──', (morph + tag).replace("%",""), (surface + form).replace("%",""))
                visited.append(node)
                if not node in ('CLITICS-NO-COP', 'COPULA'):
                    go(node, depth + 1, morph + tag, surface + form)

    return

In [26]:
go('Verbs', 1, '', '')

┃   Verbs
 V-AUX
┠── эр эр
┃  ┃   V-AUX
┃   V-COMMON
┃  ┠── эр<vaux> эр
┃  ┃  ┃   V-COMMON
┃  ┃   V-FINITE-REGULAR_NEGATIVE
┃  ┃  ┠── эр<vaux> эр
┃  ┃  ┃  ┃   V-FINITE-REGULAR_NEGATIVE
┃  ┃  ┃   V-PERS-IFI
┃  ┃  ┃  ┠── эр<vaux><ifi> эр>{T}
┃  ┃  ┃  ┃  ┃   V-PERS-IFI
┃  ┃  ┃  ┃   CLITICS-NO-COP
┃  ┃  ┃  ┃  ┠── эр<vaux><ifi><p1><sg> эр>{T}>{I}м
┃  ┃   V-FINITE-IRREGULAR_NEGATIVE
┃  ┃  ┠── эр<vaux> эр
┃  ┃  ┃  ┃   V-FINITE-IRREGULAR_NEGATIVE
┃  ┃  ┃   V-PERS-S1
┃  ┃  ┃  ┠── эр<vaux><aor> эр>{A}{р}
┃  ┃  ┃  ┃  ┃   V-PERS-S1
┃  ┃  ┃  ┃   V-PERS-S1-P12
┃  ┃  ┃  ┃  ┠── эр<vaux><aor> эр>{A}{р}
┃  ┃  ┃  ┃  ┃  ┃   V-PERS-S1-P12
┃  ┃  ┃  ┃  ┃   CLITICS-NO-COP
┃  ┃  ┃  ┃  ┃  ┠── эр<vaux><aor><p1><sg> эр>{A}{р}>{B}{I}н
┃  ┃  ┃  ┃   V-PERS-S1-P3
┃  ┃  ┃  ┃  ┠── эр<vaux><aor> эр>{A}{р}
┃  ┃  ┃  ┃  ┃  ┃   V-PERS-S1-P3
┃  ┃  ┃  ┃  ┃   CLITICS-NO-COP
┃  ┃  ┃  ┃  ┃  ┠── эр<vaux><aor><p3><sg> эр>{A}{р}
┃  ┃  ┃   V-PERS-S2
┃  ┃  ┃  ┠── эр<vaux><plu> эр>{B}{I}т
┃  ┃  ┃  ┃  ┃   V-PERS-S2
┃  ┃  ┃  ┃   CLITICS

In [27]:
go('V-AUX', 1, '', '')

┃   V-AUX
 V-COMMON
┠── <vaux> 
┃  ┃   V-COMMON
┃   V-FINITE-REGULAR_NEGATIVE
┃  ┠── <vaux> 
┃  ┃  ┃   V-FINITE-REGULAR_NEGATIVE
┃  ┃   V-PERS-IFI
┃  ┃  ┠── <vaux><ifi> >{T}
┃  ┃  ┃  ┃   V-PERS-IFI
┃  ┃  ┃   CLITICS-NO-COP
┃  ┃  ┃  ┠── <vaux><ifi><p1><sg> >{T}>{I}м
┃   V-FINITE-IRREGULAR_NEGATIVE
┃  ┠── <vaux> 
┃  ┃  ┃   V-FINITE-IRREGULAR_NEGATIVE
┃  ┃   V-PERS-S1
┃  ┃  ┠── <vaux><aor> >{A}{р}
┃  ┃  ┃  ┃   V-PERS-S1
┃  ┃  ┃   V-PERS-S1-P12
┃  ┃  ┃  ┠── <vaux><aor> >{A}{р}
┃  ┃  ┃  ┃  ┃   V-PERS-S1-P12
┃  ┃  ┃  ┃   CLITICS-NO-COP
┃  ┃  ┃  ┃  ┠── <vaux><aor><p1><sg> >{A}{р}>{B}{I}н
┃  ┃  ┃   V-PERS-S1-P3
┃  ┃  ┃  ┠── <vaux><aor> >{A}{р}
┃  ┃  ┃  ┃  ┃   V-PERS-S1-P3
┃  ┃  ┃  ┃   CLITICS-NO-COP
┃  ┃  ┃  ┃  ┠── <vaux><aor><p3><sg> >{A}{р}
┃  ┃   V-PERS-S2
┃  ┃  ┠── <vaux><plu> >{B}{I}т
┃  ┃  ┃  ┃   V-PERS-S2
┃  ┃  ┃   CLITICS-NO-COP
┃  ┃  ┃  ┠── <vaux><plu><p1><sg> >{B}{I}т>{I}м
┃  ┃   V-PERS-NEGFUT
┃  ┃  ┠── <vaux><neg><fut> >{I}{A}
┃  ┃  ┃  ┃   V-PERS-NEGFUT
┃  ┃   V-PERS-S1-P12
┃  ┃  ┠─

In [21]:
[triple(a,b,c) for a,b,c in tree['Verbs']]

ValueError: ignored

In [None]:
print([tree['LEXICON N5'],\
tree['LEXICON SUBST'],\
tree['LEXICON LII-POSTPOSITION'] \
])
print(tree['LEXICON N-INFL-COMMON-PL'])

[['%<n%>%<attr%>:', '%<n%>:%{☭%} SUBST', '%<n%>: SUBST', '%<n%>:%{☭%} LII-POSTPOSITION'], ['N-INFL-COMMON-SG', '%<pl%>:%>%{L%}%{A%}р N-INFL-COMMON-PL'], ['%+лыы%<post%>:%>%{L%}%{I%}%{I%} CLITICS-NO-COP']]
['POSS-NOM-PL', 'POSS-OBL-PL', 'CASES']


In [None]:
go('LEXICON V-IV')

%<v%>%<iv%>: V-COMMON
V-COMMON
V-FINITE-REGULAR_NEGATIVE
%<ifi%>:%>%{T%} V-PERS-IFI
%>%{T%} V-PERS-IFI
%<neg%>:%>%{B%}%{A%} V-FINITE-REGULAR_NEGATIVE
%>%{B%}%{A%} V-FINITE-REGULAR_NEGATIVE
V-FINITE-IRREGULAR_NEGATIVE
%<aor%>:%>%{A%}%{р%} V-PERS-S1
%>%{A%}%{р%} V-PERS-S1
%<neg%>%<aor%>:%>%{B%}%{A%}т V-PERS-S1
%>%{B%}%{A%}т V-PERS-S1
%<past%>:%>%{B%}%{I%}т V-PERS-S1
%>%{B%}%{I%}т V-PERS-S1
%<neg%>%<past%>:%>%{B%}%{A%}т%{A%}х V-PERS-S1
%>%{B%}%{A%}т%{A%}х V-PERS-S1
%<plu%>:%>%{B%}%{I%}т V-PERS-S2
%>%{B%}%{I%}т V-PERS-S2
%<neg%>%<plu%>:%>%{B%}%{A%}т%{A%}х V-PERS-S2
%>%{B%}%{A%}т%{A%}х V-PERS-S2
%<pii%>:%>%{A%}%{Р%}% эт V-PERS-S2
%>%{A%}%{Р%}% эт V-PERS-S2
%<neg%>%<pii%>:%>%{B%}%{A%}т% эт V-PERS-S2
%>%{B%}%{A%}т% эт V-PERS-S2
%<pii%>:%>%{A%}%{Р%} V-PERS-S2
%>%{A%}%{Р%} V-PERS-S2
%<neg%>%<pii%>:%>%{B%}%{A%}т V-PERS-S2
%>%{B%}%{A%}т V-PERS-S2
%<epis%>:%>%{B%}%{I%}тт%{A%}%{A%}х V-PERS-S1
%>%{B%}%{I%}тт%{A%}%{A%}х V-PERS-S1
%<aor%>%<nec%>:%>%{A%}р%>д%{A%}%{A%}х V-PERS-S1
%>%{A%}р%>д%{A%}%{A%}х 

In [None]:
tree['LEXICON V-IV']

['%<v%>%<iv%>: V-COMMON', '%<v%>%<tv%>: V-DER']

In [None]:
tree['LEXICON N-INFL-COMMON-SG']

['POSS-NOM-SG', 'POSS-OBL-SG', 'CASES']

In [None]:
[tree['LEXICON POSS-NOM-SG'], tree['LEXICON POSS-OBL-SG'], tree['LEXICON CASES']]

[['POSS-NOM', 'POSS-PX3PL-NOM-SG'],
 ['POSS-OBL', 'POSS-PX3PL-OBL-SG'],
 ['CASES-NOM', 'CASES-OBL']]

In [None]:
[[tree['LEXICON POSS-NOM'], tree['LEXICON POSS-PX3PL-NOM-SG']],
 [tree['LEXICON POSS-OBL'], tree['LEXICON POSS-PX3PL-OBL-SG']],
 [tree['LEXICON CASES-NOM'], tree['LEXICON CASES-OBL']]]

[[['CASES-ETC', 'POSS-NOM-ENDINGS'],
  ['%<px3pl%>:%>%{L%}%{A%}р%{A%} CASES-NOM',
   '%<px3sg%>:%>%{t%}%{A%} CASES-NOM']],
 [['CASES-ETC',
   '%<px1sg%>:%>%{B%}%{I%} CASES-POSS-12SG',
   '%<px1sg%>%<dat%>:%>%{B%}%{A%}р',
   '%<px2sg%>:%>%{G%}%{I%} CASES-POSS-12SG',
   '%<px2sg%>%<dat%>:%>%{G%}%{A%}р',
   '%<px1pl%>:%>%{B%}%{I%}т%{I%} CASES-POSS-3SP',
   '%<px2pl%>:%>%{G%}%{I%}т%{I%} CASES-POSS-3SP'],
  ['%<px3pl%>:%>%{L%}%{A%}р%{I%} CASES-POSS-3SP',
   '%<px3sg%>:%>%{t%}%{I%} CASES-POSS-3SP']],
 [['%<nom%>: CLITICS-INCL-COP'],
  ['%<par%>:%>%{T%}%{A%}',
   '%<dat%>:%>%{G%}%{A%}',
   '%<acc%>:%>%{n%}%{I%}',
   '%<abl%>:%>%{t%}т%{A%}н',
   '%<ins%>:%>%{i%}н%{A%}н',
   '%<com%>:%>%{L%}%{I%}%{I%}н',
   '%<comp%>:%>%{T%}%{A%}%{A%}ҕ%{A%}р']]]

In [None]:
tree['LEXICON CLITICS-INCL-COP']

['', 'CLITICS-NO-COP', 'COPULA']

In [None]:
tree['LEXICON CLITICS-INCL-COP']

['', 'CLITICS-NO-COP', 'COPULA']