diff --git a/cltk/corpus/tamil/alphabet.py b/cltk/corpus/tamil/alphabet.py index 77a37ea32..50516dd30 100644 --- a/cltk/corpus/tamil/alphabet.py +++ b/cltk/corpus/tamil/alphabet.py @@ -1,10 +1,14 @@ -"""The Tamil language has 41 letters 8 VOWELS and 33 CONSTANTS""" +"""The Tamil language has 41 letters 8 VOWELS and 33 CONSTANTS. + +`GRANTHA_CONSONANTS` are from the Grantha script, which was used between the 6th and 20th centuries to write Sanskrit and the classical language Manipravalam. +""" __author__ = ['Dilshan Abeysinghe'] __license__ = 'MIT License. See LICENSE.' -VOWELS=['அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ',' எ', 'ஏ', 'ஐ', 'ஒ', 'ஓ', 'ஔ'] -CONSTANTS=['க்', 'ங்', 'ச்', 'ஞ்', 'ட்', 'ண்', 'த்', 'ந்', 'ப்', 'ம்', 'ய்', 'ர்', 'ல்', 'வ்', 'ழ்', 'ள்', 'ற்', 'ன்'] +VOWELS = ['அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ',' எ', 'ஏ', 'ஐ', 'ஒ', 'ஓ', 'ஔ'] + +CONSTANTS = ['க்', 'ங்', 'ச்', 'ஞ்', 'ட்', 'ண்', 'த்', 'ந்', 'ப்', 'ம்', 'ய்', 'ர்', 'ல்', 'வ்', 'ழ்', 'ள்', 'ற்', 'ன்'] -GRANTHA_CONSONANTS=['ஜ்', 'ஶ்', 'ஷ்', 'ஸ்', 'ஹ்', 'க்ஷ்'] +GRANTHA_CONSONANTS = ['ஜ்', 'ஶ்', 'ஷ்', 'ஸ்', 'ஹ்', 'க்ஷ்'] diff --git a/cltk/corpus/tamil/tamil_cltk_documentation.rst b/cltk/corpus/tamil/tamil_cltk_documentation.rst new file mode 100644 index 000000000..c37bace40 --- /dev/null +++ b/cltk/corpus/tamil/tamil_cltk_documentation.rst @@ -0,0 +1,37 @@ +Tamil +***** + +Tamil is a Dravidian language predominantly spoken by the Tamil people of India and Sri Lanka. It is one of the longest-surviving classical languages in the world. Recorded Tamil literature has been documented for over 2000 years. The earliest period of Tamil literature, Sangam literature, is dated from ca. 300 BC – AD 300. It has the oldest extant literature among Dravidian languages. The earliest epigraphic records found on rock edicts and hero stones date from around the 3rd century BC. More than 55% of the epigraphical inscriptions (about 55,000) found by the Archaeological Survey of India are in the Tamil language. 
Tamil language inscriptions written in Brahmi script have been discovered in Sri Lanka, and on trade goods in Thailand and Egypt. (Source: `Wikipedia <https://en.wikipedia.org/wiki/Tamil_language>`_) + + +Alphabet +========= + +.. code-block:: python + + In [1]: from cltk.corpus.tamil.alphabet import VOWELS, CONSTANTS, GRANTHA_CONSONANTS + + In [2]: print(VOWELS) + Out[2]: ['அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ',' எ', 'ஏ', 'ஐ', 'ஒ', 'ஓ', 'ஔ'] + + In [3]: print(CONSTANTS) + Out[3]: ['க்', 'ங்', 'ச்', 'ஞ்', 'ட்', 'ண்', 'த்', 'ந்', 'ப்', 'ம்', 'ய்', 'ர்', 'ல்', 'வ்', 'ழ்', 'ள்', 'ற்', 'ன்'] + + In [4]: print(GRANTHA_CONSONANTS) + Out[4]: ['ஜ்', 'ஶ்', 'ஷ்', 'ஸ்', 'ஹ்', 'க்ஷ்'] + + + +Corpora +======= + +Use ``CorpusImporter()`` or browse the `CLTK GitHub organization <https://github.com/cltk>`_ (anything beginning with ``tamil_``) to discover available Tamil corpora. + +.. code-block:: python + + In [1]: from cltk.corpus.utils.importer import CorpusImporter + + In [2]: c = CorpusImporter('tamil') + + In [3]: c.list_corpora + Out[3]: ['tamil_text_ptr_tipitaka']