Skip to content
Permalink
Browse files

Merge branch 'master' into ci-docs-doctest

  • Loading branch information...
todd-cook committed May 4, 2018
2 parents 86ffa96 + b0f519c commit b21e7778b2d649e8a2f2749b0dac17f9f7671179
Showing with 1,655 additions and 265 deletions.
  1. +9 −1 .coveragerc
  2. +5 −0 .travis.yml
  3. +2 −2 cltk/corpus/greek/beta_to_unicode.py
  4. +10 −0 cltk/corpus/gujarati/corpora.py
  5. +1 −0 cltk/corpus/middle_english/__init__.py
  6. +80 −0 cltk/corpus/middle_english/alphabet.py
  7. +1 −0 cltk/corpus/middle_high_german/__init__.py
  8. +59 −0 cltk/corpus/middle_high_german/alphabet.py
  9. +12 −0 cltk/corpus/old_english/alphabet.py
  10. +32 −3 cltk/corpus/swadesh.py
  11. +11 −7 cltk/corpus/utils/importer.py
  12. +11 −11 cltk/lemmatize/latin/backoff.py
  13. +2 −0 cltk/prosody/latin/ScansionConstants.py
  14. +39 −1 cltk/prosody/latin/Syllabifier.py
  15. +1 −0 cltk/stem/middle_english/__init__.py
  16. +101 −0 cltk/stem/middle_english/stem.py
  17. +1 −0 cltk/stem/middle_high_german/__init__.py
  18. +98 −0 cltk/stem/middle_high_german/stem.py
  19. +1 −1 cltk/stop/classical_hindi/stops.py
  20. +1 −0 cltk/stop/middle_high_german/__init__.py
  21. +229 −0 cltk/stop/middle_high_german/stops.py
  22. +1 −1 cltk/stop/sanskrit/stops.py
  23. +260 −139 cltk/stop/stop.py
  24. +45 −3 cltk/tests/test_corpus.py
  25. +9 −3 cltk/tests/test_stem.py
  26. +149 −3 cltk/tests/test_stop.py
  27. +15 −0 cltk/tests/test_tokenize.py
  28. +0 −34 cltk/tokenize/indian_tokenizer.py
  29. +27 −5 cltk/tokenize/sentence.py
  30. +47 −29 cltk/tokenize/word.py
  31. +5 −3 docs/bengali.rst
  32. +11 −0 docs/french.rst
  33. +36 −6 docs/hindi.rst
  34. +4 −1 docs/index.rst
  35. +7 −3 docs/marathi.rst
  36. +65 −0 docs/middle_english.rst
  37. +115 −0 docs/middle_high_german.rst
  38. +61 −0 docs/multilingual.rst
  39. +14 −0 docs/old_english.rst
  40. +17 −0 docs/old_norse.rst
  41. +17 −0 docs/old_portguese.rst
  42. +20 −4 docs/sanskrit.rst
  43. +6 −4 docs/telugu.rst
  44. +17 −0 docs/tocharian_b.rst
  45. +1 −1 setup.py
@@ -1,4 +1,12 @@
[report]
omit =
*/python?.?/*
*/site-packages/nose/*
*/site-packages/nose/*
exclude_lines =
pragma: no cover
def __repr__
if self\.debug
if 0:
if __name__ == .__main__.:
# raise AssertionError
# raise NotImplementedError
@@ -21,6 +21,9 @@ before_script:
- pip install python-Levenshtein
- pip install gensim # for word2vec.py
- pip install Sphinx
- pip install numpy
- pip install scipy
- pip install scikit-learn

script:
# Notes on nose:
@@ -36,6 +39,8 @@ after_success:
deploy:
provider: pypi
user: kyle_johnson
edge:
branch: v1.8.45
on:
branch: master
password:
@@ -322,10 +322,10 @@
(r'S ', 'ς '),
(r'S$', 'ς'),
(r'S:', 'ς:'),
(r'S\.', 'ς\.'), # pylint: disable=anomalous-backslash-in-string
(r'S\.', 'ς.'),
(r'S,', 'ς,'),
(r'S;', 'ς;'),
(r'S\'', 'ς\''),
(r'S\'', "ς'"),
(r'S-', 'ς-'),
(r'S_', 'ς_'),
#
@@ -0,0 +1,10 @@
"""Gujarati language corpora available for download or loading locally.
All remote corpora are hosted on GitHub under the cltk organization account, e.g.:
'http://github.com/cltk' + name
"""
GUJARATI_CORPORA = [
{'name': 'gujarati_text_wikisource',
'origin': 'https://github.com/cltk/gujarati_text_wikisource.git',
'location': 'remote',
'type': 'text'},
]
@@ -0,0 +1,80 @@
import re
"""
Sources:
From Old English to Standard English: A Course Book in Language Variation Across Time, Dennis Freeborn
https://web.cn.edu/kwheeler/documents/ME_Pronunciation.pdf
https://en.wikipedia.org/wiki/Middle_English_phonology
"""

ALPHABET = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'x', 'y', 'æ', 'ð', 'þ', 'ƿ']

"""
The consonant sounds of Middle English are categorized as follows:
Stops: ⟨/b/, /p/, /d/, /t/, /g/, /k/⟩
Affricates and Fricatives: ⟨/ǰ/, /č/, /v/, /f/, /ð/, /θ/, /z/, /s/, /ž/, /š/, /c̹/, /x/, /h/⟩
Nasals: ⟨/m/, /n/, /ɳ/⟩
Lateral Resonants: ⟨/l/⟩
Medial Resonants: ⟨/r/, /y/, /w/⟩
Thorn (þ) was gradually replaced by the digraph "th", while Eth (ð), which had already fallen out of use by the 14th century, was
later replaced by "d".
Wynn (ƿ) is the predecessor of "w". Modern transliterations usually replace it with "w" to avoid confusion with
the strikingly similar "p".
"""

CONSONANTS = ['b', 'c', 'd', 'f', 'g', 'h', 'l', 'm', 'n', 'p', 'r', 's', 't', 'x', 'ð', 'þ', 'ƿ']

"""
The vowel sounds in Middle English are divided into:
Long Vowels: ⟨/a:/, /e/, /e̜/, /i/ , /ɔ:/, /o/ , /u/⟩
Short Vowels: ⟨/a/, /ɛ/, /I/, /ɔ/, /U/, /ə/⟩
"""

VOWELS = ['a', 'e', 'i', 'o', 'u', 'y', 'æ']

"""
As established rules for ME orthography were effectively nonexistent, compiling a definitive list of diphthongs is non-trivial. The
following aims to compile a list of the most commonly used diphthongs.
"""

# NOTE(review): 'o' is a single vowel letter, unlike every other (two-letter) entry in this list —
# possibly a typo; confirm against the sources cited at the top of this module.
DIPTHONGS = ['ai', 'au', 'aw', 'ay', 'ei', 'eu', 'ew', 'ey', 'iu', 'iw', 'o', 'oi', 'ou', 'ow', 'oy', 'uw']


def normalize_middle_english(text, to_lower=True, alpha_conv=True, punct=True):
    """
    Normalize a Middle English string.

    The steps always run in the same order (lowercasing, letter conversion,
    punctuation removal); each one is switched on or off by its flag.

    :param text: str text to be normalized
    :param to_lower: bool lowercase the whole text
    :param alpha_conv: bool rewrite æ -> ae, þ -> th, ð -> th, and ȝ -> y when it is
        not preceded by a word character, gh otherwise
    :param punct: bool strip the characters . " ; , : [ ] ( ) ! & ? ‘
    :return: str the normalized text

    >>> normalize_middle_english('Whan Phebus in the CraBbe had neRe hys cours ronne', to_lower = True)
    'whan phebus in the crabbe had nere hys cours ronne'

    >>> normalize_middle_english('I pray ȝow þat ȝe woll', alpha_conv = True)
    'i pray yow that ye woll'

    >>> normalize_middle_english("furst, to begynne:...", punct = True)
    'furst to begynne'
    """
    result = text.lower() if to_lower else text

    if alpha_conv:
        for old, new in (("æ", "ae"), ("þ", "th"), ("ð", "th")):
            result = result.replace(old, new)
        # Word-initial yogh is handled first; any yogh still present afterwards
        # sits inside or at the end of a word.
        result = re.sub(r'(?<!\w)(?=\w)ȝ', 'y', result)
        result = result.replace("ȝ", "gh")

    if not punct:
        return result

    return re.sub(r"[\.\";\,\:\[\]\(\)!&?‘]", "", result)
@@ -0,0 +1,59 @@
"""Sources: Schreibkonventionen des klassischen Mittelhochdeutschen - Ad fontes Simone Berchtold, Deutsches Seminar
https://de.wikipedia.org/wiki/Mittelhochdeutsch"""

#Alphabet of Middle High German

# c is used only at the beginning of loanwords and is pronounced the same as k (e.g. calant, cappitain)

# Double consonants are pronounced the same way as their corresponding letters in Modern Standard German (e.g. pp/p)

# schl, schm, schn, schw are written in MHG as sl, sm, sn, sw

ALPHABET = ["a", "ë", "e", "i", "o", "u", "ä", "ö", "ü", "â", "ê", "î", "ô", "û", "æ", "œ", "iu", "b", "d", "g", "h", "f", "c", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "z","ȥ"]

# The consonants of Middle High German are categorized as:
# Stops: ⟨p t k/c/q b d g⟩
# Affricates: ⟨pf/ph tz/z⟩
# Fricatives: ⟨v f s ȥ sch ch h⟩
# Nasals: ⟨m n⟩
# Liquids: ⟨l r⟩
# Semivowels: ⟨w j⟩

# NOTE(review): "ȥ" appears in ALPHABET and is listed among the fricatives in the comment above,
# but it is not included in this list — confirm whether the omission is intentional.
CONSONANTS = ["b", "d", "g", "h", "f", "c", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "z"]

VOWELS = ["a", "ë", "e", "i", "o", "u", "ä", "ö", "ü", "â", "ê", "î", "ô", "û", "æ", "œ", "iu"]

SHORT_VOWELS = ["a", "ë", "e", "i", "o", "u", "ä", "ö", "ü"]

# æ (also seen as ae), œ (also seen as oe) and iu denote the use of Umlaut over â, ô and û respectively

LONG_VOWELS = ["â", "ê", "î", "ô", "û", "æ", "œ", "iu"]

DIPTHONGS = ["ei", "ie", "ou", "öu", "uo", "üe"]

# ȥ or ʒ is used in modern handbooks and grammars to indicate the s or s-like sound which arose from Germanic t in the High German consonant shift.

import re

def normalize_middle_high_german(text, to_lower_all = True, to_lower_beginning = False, alpha_conv = True, punct = True):
    """
    Normalize a Middle High German string.

    The steps run in the order of the flags below; each is applied only when
    its flag is true.

    :param text: str text to be normalized
    :param to_lower_all: bool convert the whole text to lowercase
    :param to_lower_beginning: bool lowercase only the first character of the text
        and the first word character following ".", "?" or "!" plus one whitespace
    :param alpha_conv: bool convert alphabet to canonical form: macron vowels
        (ē ī ā ō ū) become circumflex vowels, "ae" becomes æ and "oe" becomes œ
    :param punct: bool remove the characters . " ; , : [ ] ( ) ! & ? ‘
    :return: str the normalized text

    >>> normalize_middle_high_german("Dô erbiten si der nahte und fuoren über Rîn")
    'dô erbiten si der nahte und fuoren über rîn'

    >>> normalize_middle_high_german("", to_lower_all = False, to_lower_beginning = True)
    ''
    """

    if to_lower_all:
        text = text.lower()

    if to_lower_beginning:
        # Slice rather than index so that an empty string does not raise IndexError.
        text = text[:1].lower() + text[1:]
        text = re.sub(r"(?<=[\.\?\!]\s)(\w)", lambda match: match.group(1).lower(), text)

    if alpha_conv:
        for macron, circumflex in (("ē", "ê"), ("ī", "î"), ("ā", "â"), ("ō", "ô"), ("ū", "û")):
            text = text.replace(macron, circumflex)
        text = text.replace("ae", "æ").replace("oe", "œ")

    if punct:
        text = re.sub(r"[\.\";\,\:\[\]\(\)!&?‘]", "", text)

    return text
@@ -0,0 +1,12 @@
"""Old English alphabet."""

# digits [1-10]
DIGITS = ['ān', 'tƿeġen', 'þrēo', 'fēoƿer', 'fīf', 'seox', 'seofon', 'eahta', 'niġon', 'tīen']

ALPHABET = ['a', 'æ', 'b', 'c', 'd', 'ð', 'e', 'f', 'g', 'h', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'w', 'ƿ', 'x', 'y', 'þ']

# NOTE(review): the third entry is an empty string ('') — possibly a character lost in encoding; confirm.
# Also differs from ALPHABET: 'm' is absent here, and 'ȝ' is present here but not in ALPHABET — verify.
CONSONANTS = ['b', 'c', '', 'cg', 'd', 'ð', 'f', 'ff', 'ȝ', 'g', 'h', 'l', 'n', 'p', 'r', 's', 'ss', 'sc', 't', 'þ', 'þþ', 'ƿ', 'w']

VOWELS = ['a', 'æ', 'e', 'i', 'o', 'u', 'y']

DIPTHONGS = ['ea', 'eo', 'ie']
Oops, something went wrong.

0 comments on commit b21e777

Please sign in to comment.
You can’t perform that action at this time.