Skip to content

Commit

Permalink
Minor update to OE docs and Syllabifier constructor (#829)
Browse files Browse the repository at this point in the history
* integrated OE tagger into core CLTK

* minor docs fix

* fixed RST docs for OE

* fixed RST docs for OE

* split tests into general nlp subdirectory and language-specific subdirectory

* gathered language-specific tests for OE and ME

* renamed test directories to be discoverable by nose

* fixed paths for test files

* changed name of OE syllabifier to old_english (on the model of old_norse) and updated docs for OE

* fixed high german test

* minor fixes to tests, added a test for OE syllabification

* handled pandas deprecation warning

* fixed doctest to use middle_high_german key
  • Loading branch information
free-variation authored and kylepjohnson committed Sep 17, 2018
1 parent 09f3640 commit c809655
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 10 deletions.
12 changes: 6 additions & 6 deletions cltk/phonology/syllabify.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def __init__(self, low_vowels=None, mid_vowels=None, high_vowels=None, flaps=Non

self.break_geminants = break_geminants

if language == 'middle english':
if language == 'middle_english':
hierarchy = [[] for _ in range(len(set(ME_Syllabifier.values())))]

for k in ME_Syllabifier:
Expand All @@ -84,7 +84,7 @@ def __init__(self, low_vowels=None, mid_vowels=None, high_vowels=None, flaps=Non
self.set_hierarchy(hierarchy)
self.set_vowels(hierarchy[0])

elif language == 'old english':
elif language == 'old_english':
hierarchy = [[] for _ in range(len(set(OE_Syllabifier.values())))]

for k in OE_Syllabifier:
Expand All @@ -93,7 +93,7 @@ def __init__(self, low_vowels=None, mid_vowels=None, high_vowels=None, flaps=Non
self.set_hierarchy(hierarchy)
self.set_vowels(hierarchy[0])

elif language == 'middle high german':
elif language == 'middle_high_german':
hierarchy = [[] for _ in range(len(set(MHG_Syllabifier.values())))]

for k in MHG_Syllabifier:
Expand Down Expand Up @@ -207,17 +207,17 @@ def syllabify_SSP(self, word):
Additionally, you can utilize the language parameter:
>>> s = Syllabifier(language='middle high german')
>>> s = Syllabifier(language='middle_high_german')
>>> s.syllabify('lobebæren')
['lo', 'be', 'bæ', 'ren']
>>> s = Syllabifier(language='middle english')
>>> s = Syllabifier(language='middle_english')
>>> s.syllabify("huntyng")
['hun', 'tyng']
>>> s = Syllabifier(language='old english')
>>> s = Syllabifier(language='old_english')
>>> s.syllabify("arcebiscop")
['ar', 'ce', 'bis', 'cop']
Expand Down
4 changes: 2 additions & 2 deletions cltk/stem/sanskrit/indian_syllabifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,8 @@ def get_lang_data(self):
tamil_csv = os.path.join(csv_dir_path, 'tamil_script_phonetic_data.csv')
tamil_phonetic_data = pd.read_csv(tamil_csv, encoding='utf-8')

all_phonetic_vectors = all_phonetic_data.ix[:, PHONETIC_VECTOR_START_OFFSET:].as_matrix()
tamil_phonetic_vectors = tamil_phonetic_data.ix[:, PHONETIC_VECTOR_START_OFFSET:].as_matrix()
all_phonetic_vectors = all_phonetic_data.ix[:, PHONETIC_VECTOR_START_OFFSET:].values
tamil_phonetic_vectors = tamil_phonetic_data.ix[:, PHONETIC_VECTOR_START_OFFSET:].values

phonetic_vector_length = all_phonetic_vectors.shape[1]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def test_middle_high_german_syllabification(self):
"""
Test Middle High German syllabification
"""
mhg_syllabifier = Syllabifier(language='middle high german')
mhg_syllabifier = Syllabifier(language='middle_high_german')
syllabified = mhg_syllabifier.syllabify('lobebæren')
target = ['lo', 'be', 'bæ', 'ren']

Expand Down
5 changes: 5 additions & 0 deletions cltk/tests/test_languages/test_old_english.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from cltk.corpus.utils.importer import CorpusImporter
from cltk.corpus.swadesh import Swadesh
from cltk.tag.pos import POSTag
from cltk.phonology.syllabify import Syllabifier

__author__ = ["John Stewart <johnstewart@aya.yale.edu>", ]

Expand All @@ -27,6 +28,10 @@ def test_swadesh_old_english(self):
match = swadesh.words()[0]
self.assertEqual(first_word, match)

def test_syllabification_old_english(self):
s = Syllabifier(language='old_english')
self.assertEqual(s.syllabify('geardagum'), ['gear', 'da', 'gum'])

def test_pos_unigram_old_english(self):
"""Test tagging Old English POS with unigram tagger."""
tagger = POSTag('old_english')
Expand Down
13 changes: 12 additions & 1 deletion docs/old_english.rst
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,17 @@ The reverse process is also possible:
In [3]: t.transliterate('Hƿæt Ƿe Gardena in geardagum', 'Anglo-Saxon')
Out[3]: 'ᚻᚹᚫᛏ ᚹᛖ ᚷᚪᚱᛞᛖᚾᚪ ᛁᚾ ᚷᛠᚱᛞᚪᚷᚢᛗ'
Syllabification
===============

There is a facility for using the pre-specified sonority hierarchy for Old English to syllabify words.

.. code-block:: python
In [1]: from cltk.phonology.syllabify import Syllabifier
In [2]: s = Syllabifier(language='old_english')
In [3]: s.syllabify('geardagum')
Out[3]: ['gear', 'da', 'gum']
POS tagging
===========

Expand All @@ -132,7 +143,7 @@ There are a number of different pre-trained models available for POS tagging of
* Conditional Random Field (CRF) model
* Perceptron model

(Bigram and trigram models are also available, but unsuitable due to low accuracy.)
(Bigram and trigram models are also available, but unsuitable due to low recall.)

The taggers were trained from annotated data from the `The ISWOC Treebank <http://iswoc.github.io/>`_ (license: Creative Commons Attribution-NonCommercial-ShareAlike 3.0 License).

Expand Down

0 comments on commit c809655

Please sign in to comment.