Merge remote-tracking branch 'upstream/master'

cltk · Apr 4, 2019 · d838370 · d838370
2 parents 0c5d165 + fb61d10
commit d838370
Show file tree

Hide file tree

Showing 34 changed files with 2,724 additions and 430 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,9 +1,12 @@
 sudo: required
 
+dist: xenial
+
 language: python
 
 python:
 - '3.6'
+- '3.7'
 
 before_install:
 - sudo rm -f /etc/boto.cfg
@@ -23,13 +26,14 @@ before_script:
 - pip install numpy
 - pip install scipy
 - pip install scikit-learn
+- pip install nose-timer
 
 script:
   # Notes on nose:
   # Travis CI pre-installs `nose`
   # https://github.com/coagulant/coveralls-python#nosetests
   # http://nose.readthedocs.org/en/latest/plugins/skip.html
-- nosetests --no-skip --with-coverage --cover-package=cltk --with-doctest
+- nosetests --no-skip --with-coverage --cover-package=cltk --with-doctest --with-timer
 - ( cd docs && make doctest; )
 
 after_success:

diff --git a/cltk/contributors.md b/cltk/contributors.md
@@ -0,0 +1,7 @@
+# Contributors
+CLTK Core authors, ordered alphabetically by first name
+
+## key
+* val1
+* val2
+
diff --git a/cltk/corpus/greek/corpora.py b/cltk/corpus/greek/corpora.py
@@ -66,5 +66,10 @@
      'origin': 'https://github.com/cltk/First1KGreek',
      'location': 'remote',
      'type': 'text'},
+    {'name': 'greek_text_tesserae',
+     'encoding': 'utf-8',
+     'markup': 'plaintext', #modified plaintext with Tesserae-style citations
+     'origin': 'https://github.com/cltk/greek_text_tesserae.git',
+     'location': 'remote',
+     'type': 'text'},
 ]
-
diff --git a/cltk/corpus/hindi/alphabet.py b/cltk/corpus/hindi/alphabet.py
@@ -21,7 +21,7 @@
 
 
 # The semivowels are also part of the Hindi script
-SEMIVOWELS = ['य ','र ','ल' ,'व']    
+SEMIVOWELS = ['य','र','ल' ,'व']    
 
 #There are three sibilants:
 SIBILANTS = ['श','ष','स']
@@ -31,3 +31,23 @@
 # Anusvara is used for final velar nasal sound, Visarga adds voiceless breath after vowel and Candrabindu is used to nasalize vowels 
 
 MODIFIERS = ['◌্','◌ঁ','◌ং','◌ঃ']
+
+# Classification of consonants according to how their sound is produced
+
+VELAR_CONSONANTS = [ 'क' , 'ख' , 'ग' , 'घ' , 'ङ' ]
+
+PALATAL_CONSONANTS = ['च' , 'छ' , 'ज' , 'झ' , 'ञ' ]
+
+RETROFLEX_CONSONANTS = ['ट' , 'ठ' , 'ड' , 'ढ' , 'ण']
+
+DENTAL_CONSONANTS = ['त' , 'थ' , 'द' , 'ध' , 'न' ]
+
+LABIAL_CONSONANTS = ['प' , 'फ' , 'ब' , 'भ' , 'म']
+
+SONORANT_CONSONANTS = ['य' , 'र' , 'ल' , 'व']
+
+SIBILANT_CONSONANTS = ['श' , 'ष' , 'स']
+
+GUTTURAL_CONSONANT = ['ह']
+
+SIGNS= ['ॐ']
diff --git a/cltk/corpus/odia/alphabet.py b/cltk/corpus/odia/alphabet.py
@@ -1,16 +1,81 @@
 """Odia alphabet"""
-__author__ = 'Nishchith Shetty <inishchith[at]gmail[.]com>'
+__author__ = ['Nishchith Shetty <inishchith@gmail.com>']
 
-VOWELS = [
-    'ଅ', 'ଆ', 'ଇ', 'ଈ', 'ଉ', 'ଊ', 'ଋ',
-    'ୠ', 'ଌ', 'ୡ', 'ଏ', 'ଐ', 'ଓ', 'ଔ']
 
-STRUCTURED_CONSONANTS = [
-    'କ', 'ଖ', 'ଗ', 'ଘ', 'ଙ',
-    'ଚ', 'ଛ', 'ଜ', 'ଝ', 'ଞ',
-    'ଟ', 'ଠ', 'ଡ', 'ଢ', 'ଣ',
-    'ତ', 'ଥ', 'ଦ', 'ଧ', 'ନ',
-    'ପ', 'ଫ', 'ବ', 'ଭ', 'ମ']
+# Oriya Unicode Standard
+
+VOWELS = {  # Odia vowel letters keyed by hexadecimal Unicode code point; keys must be unique
+    '0B05':'ଅ',
+    '0B06':'ଆ',
+    '0B07':'ଇ',
+    '0B08':'ଈ',
+    '0B09':'ଉ',
+    '0B0A':'ଊ',
+    '0B0B':'ଋ',
+    '0B60':'ୠ',  # own code point, not a shared 'N/A' key: a repeated key drops the earlier entry
+    '0B0C':'ଌ',
+    '0B61':'ୡ',  # own code point, not a shared 'N/A' key
+    '0B0F':'ଏ',
+    '0B10':'ଐ',
+    '0B13':'ଓ',
+    '0B14':'ଔ'
+    }
+
+STRUCTURED_CONSONANTS = {
+    '0B15':'କ',
+    '0B16':'ଖ',
+    '0B17':'ଗ',
+    '0B18':'ଘ', 
+    '0B19':'ଙ',
+    '0B1A':'ଚ',
+    '0B1B':'ଛ',
+    '0B1C':'ଜ',
+    '0B1D':'ଝ',
+    '0B1E':'ଞ',
+    '0B1F':'ଟ',
+    '0B20':'ଠ',
+    '0B21':'ଡ',
+    '0B22':'ଢ',
+    '0B23':'ଣ',
+    '0B24':'ତ',
+    '0B25':'ଥ',
+    '0B26':'ଦ',
+    '0B27':'ଧ',
+    '0B28':'ନ',
+    '0B2A':'ପ',
+    '0B2B':'ଫ',
+    '0B2C':'ବ',
+    '0B2D':'ଭ',
+    '0B2E':'ମ',
+    '0B2F':'ଯ',
+    '0B30':'ର',
+    '0B32':'ଲ',
+    '0B33':'ଳ',
+    '0B35':'ଵ',
+    '0B36':'ଶ',
+    '0B37':'ଷ',
+    '0B38':'ସ',
+    '0B39':'ହ'
+    }
+
+# The structured consonants are grouped into five classes according to where the tongue touches the palate of the mouth.
+# These consonants are shown here with their IAST transcriptions.
+
+VELAR_CONSONANTS = [ 'କ', 'ଖ', 'ଗ', 'ଘ', 'ଙ' ]
+VELAR_CONSONANTS_PRONONCIATION = [ 'ka', 'kha', 'ga', 'gha', 'ṅa']
+
+PALATAL_CONSONANTS = ['ଚ', 'ଛ',	'ଜ', 'ଝ', 'ଞ']
+PALATAL_CONSONANTS_PRONOUNCIATION = [ 'ca', 'cha', 'ja', 'jha', 'ña']
+
+RETROFLEX_CONSONANTS = ['ଟ', 'ଠ', 'ଡ', 'ଢ', 'ଣ']
+RETROFLEX_CONSONANTS_PRONOUNCIATION = [ 'ṭa', 'ṭha', 'ḍa', 'ḍha', 'ṇa']
+
+DENTAL_CONSONANTS = [ 'ତ', 'ଥ', 'ଦ', 'ଧ', 'ନ']
+DENTAL_CONSONANTS_PRONOUNCIATION = [ 'ta', 'tha', 'da', 'dha', 'na']
+
+LABIALS_CONSONANTS = ['ପ', 'ଫ','ବ', 'ଭ', 'ମ']
+LABIALS_CONSONANTS_PRONOUNCIATION = [ 'pa', 'pha', 'ba', 'bha','ma']
+
 
 UNSTRUCTURED_CONSONANTS = [
     'ଯ', 'ୟ', 'ର', 'ଲ', 'ଳ', 'ୱ',
@@ -19,3 +84,11 @@
 NUMERALS = [
     '୦', '୧', '୨', '୩', '୪',
     '୫', '୬', '୭', '୮', '୯']
+EXTRA_NUMERICAL_SYMBOLS = ['୵',	'୶', '୷','୲', '୳','୴']
+EXTRA_NUMERICAL_SYMBOLS_DESC = ['1/16', '1/8', '3/16', '1/4','1/2',	'3/4']
+
+# Anusvara is used for final velar nasal sound, 
+# Visarga adds voiceless breath after vowel
+# Candrabindu is used to nasalize vowels 
+
+MODIFIERS = ['◌্','◌ঁ','◌ং','◌ঃ']