Permalink
Browse files

Merge branch 'master' into master

  • Loading branch information...
siddharthkv7 committed Feb 6, 2019
2 parents 3c6b70a + 093e433 commit 66fb393241c8e470b0181c4ae6edb4e0dee2eb92
Showing with 3,369 additions and 872 deletions.
  1. +7 −8 README.md
  2. +6 −39 cltk/corpus/latin/__init__.py
  3. +0 −1 cltk/corpus/latin/corpora.py
  4. +301 −0 cltk/corpus/latin/latin_library_corpus_types.py
  5. +314 −0 cltk/corpus/latin/perseus_corpus_types.py
  6. +41 −0 cltk/corpus/old_norse/syllabifier.py
  7. +363 −0 cltk/corpus/readers.py
  8. +3 −0 cltk/inflection/old_norse/pronouns.py
  9. +21 −0 cltk/inflection/utils.py
  10. +1 −1 cltk/lemmatize/latin/backoff.py
  11. +130 −0 cltk/lemmatize/old_english/lemma.py
  12. +105 −8 cltk/phonology/old_norse/transcription.py
  13. +101 −10 cltk/phonology/syllabify.py
  14. +27 −8 cltk/phonology/utils.py
  15. +0 −64 cltk/prosody/latin/Verse.py
  16. +27 −22 cltk/prosody/latin/{HendecasyllableScanner.py → hendecasyllable_scanner.py}
  17. +59 −47 cltk/prosody/latin/{HexameterScanner.py → hexameter_scanner.py}
  18. +43 −25 cltk/prosody/latin/{MetricalValidator.py → metrical_validator.py}
  19. +37 −22 cltk/prosody/latin/{PentameterScanner.py → pentameter_scanner.py}
  20. +3 −3 cltk/prosody/latin/{ScansionConstants.py → scansion_constants.py}
  21. +14 −4 cltk/prosody/latin/{ScansionFormatter.py → scansion_formatter.py}
  22. +161 −51 cltk/prosody/latin/{StringUtils.py → string_utils.py}
  23. +69 −46 cltk/prosody/latin/{Syllabifier.py → syllabifier.py}
  24. +66 −0 cltk/prosody/latin/verse.py
  25. +104 −61 cltk/prosody/latin/{VerseScanner.py → verse_scanner.py}
  26. +1 −0 cltk/prosody/middle_high_german/__init__.py
  27. +53 −0 cltk/prosody/middle_high_german/verse.py
  28. +16 −16 cltk/prosody/old_norse/verse.py
  29. +1 −1 cltk/stem/latin/stem.py
  30. +39 −0 cltk/stop/classical_chinese.py
  31. +62 −0 cltk/stop/latin.py
  32. 0 cltk/stop/latin/__init__.py
  33. +0 −102 cltk/stop/latin/stops.py
  34. +84 −164 cltk/stop/stop.py
  35. +50 −0 cltk/tests/test_languages/test_old_english.py
  36. +2 −2 cltk/tests/test_languages/test_old_norse.py
  37. +7 −0 cltk/tests/test_nlp/contributors.md
  38. +159 −13 cltk/tests/test_nlp/test_corpus.py
  39. +92 −68 cltk/tests/test_nlp/test_phonology.py
  40. +4 −4 cltk/tests/test_nlp/test_scansion.py
  41. +79 −51 cltk/tests/test_nlp/test_stop.py
  42. +31 −0 cltk/utils/featurization.py
  43. +21 −1 cltk/utils/file_operations.py
  44. +402 −0 cltk/utils/matrix_corpus_fun.py
  45. +1 −1 cltk/vector/word2vec.py
  46. +1 −1 docs/about.rst
  47. +2 −2 docs/akkadian.rst
  48. +1 −1 docs/ancient_egyptian.rst
  49. +99 −0 docs/corpus_readers.rst
  50. +36 −0 docs/greek.rst
  51. +1 −0 docs/index.rst
  52. +3 −3 docs/installation.rst
  53. +75 −16 docs/latin.rst
  54. +38 −0 docs/old_english.rst
  55. +4 −4 requirements.txt
  56. +2 −2 setup.py
@@ -4,13 +4,12 @@

[![Build Status](https://travis-ci.org/cltk/cltk.svg?branch=master)](https://travis-ci.org/cltk/cltk) [![codecov.io](http://codecov.io/github/cltk/cltk/coverage.svg?branch=master)](http://codecov.io/github/cltk/cltk?branch=master)

[![Join the chat at https://gitter.im/cltk/cltk](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/cltk/cltk?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) <a href="https://zenhub.io"><img src="https://raw.githubusercontent.com/ZenHubIO/support/master/zenhub-badge.png"></a>
[![Join the chat at https://gitter.im/cltk/cltk](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/cltk/cltk?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)


## About

The Classical Language Toolkit (CLTK) offers natural language processing (NLP) support for the languages of Ancient, Classical, and Medieval Eurasia. Greek and Latin functionality are currently most complete. The goals of the CLTK are to:

The Classical Language Toolkit (CLTK) offers natural language processing (NLP) support for the languages of Ancient, Classical, and Medieval Eurasia. Greek, Latin, Akkadian, and the Germanic languages are currently most complete. The goals of the CLTK are to:
* compile analysis-friendly corpora;
* collect and generate linguistic data;
* act as a free and open platform for generating scientific research.
@@ -50,7 +49,7 @@ Each major release of the CLTK is given a [DOI](http://en.wikipedia.org/wiki/Dig

Thus, please cite core software as something like:
```
Kyle P. Johnson et al.. (2014-2017). CLTK: The Classical Language Toolkit. DOI 10.5281/zenodo.<current_release_id>
Kyle P. Johnson et al. (2014-2019). CLTK: The Classical Language Toolkit. DOI 10.5281/zenodo.<current_release_id>
```

A style-neutral BibTeX entry would look like this:
@@ -60,7 +59,7 @@ author = {Kyle P. Johnson et al.},
title = {CLTK: The Classical Language Toolkit},
howpublished = {\url{https://github.com/cltk/cltk}},
note = {{DOI} 10.5281/zenodo.<current_release_id>},
year = {2014--2017},
year = {2014--2019},
}
```

@@ -72,11 +71,11 @@ year = {2014--2017},

We are thankful for the following organizations that have offered support:

* Google Summer of Code (sponsoring two students, 2016 & 2017)
* JetBrains (licenses for PyCharm and WebStorm)
* Google Summer of Code (sponsoring two students in each of 2016 and 2017, and three students in 2018)
* JetBrains (licenses for PyCharm)
* Google Cloud Platform (with credits for the Classical Language Archive and API)


## License

The CLTK is Copyright (c) 2017 Kyle P. Johnson, under the MIT License. See [LICENSE](https://github.com/cltk/cltk/blob/master/LICENSE) for details.
The CLTK is Copyright (c) 2014-2019 Kyle P. Johnson, under the MIT License. See [LICENSE](https://github.com/cltk/cltk/blob/master/LICENSE) for details.
@@ -1,44 +1,11 @@
# CLTK: Latin Corpus Readers

__author__ = ['Patrick J. Burns <patrick@diyclassics.org>']
__license__ = 'MIT License. See LICENSE.'

"""
CLTK Latin corpus readers
CLTK: Corpus Latin properties
"""

import os.path
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

from cltk.tokenize.sentence import TokenizeSentence
from cltk.tokenize.word import WordTokenizer

# Would like to have this search through a CLTK_DATA environment variable
# Better to use something like make_cltk_path in cltk.utils.file_operations?
home = os.path.expanduser('~')
cltk_path = os.path.join(home, 'cltk_data')
if not os.path.isdir(cltk_path):
os.makedirs(cltk_path)

word_tokenizer = WordTokenizer('latin')
__author__ = ['Patrick J. Burns <patrick@diyclassics.org>', 'Todd Cook <todd.g.cook@gmail.com>']
__license__ = 'MIT License. See LICENSE.'

if os.path.exists(cltk_path + 'latin/model/latin_models_cltk/tokenizers/sentence'):
sent_tokenizer = TokenizeSentence('latin')
else:
punkt_param = PunktParameters()
abbreviations = ['c', 'l', 'm', 'p', 'q', 't', 'ti', 'sex', 'a', 'd', 'cn', 'sp', "m'", 'ser', 'ap', 'n', 'v', 'k', 'mam', 'post', 'f', 'oct', 'opet', 'paul', 'pro', 'sert', 'st', 'sta', 'v', 'vol', 'vop']
punkt_param.abbrev_types = set(abbreviations)
sent_tokenizer = PunktSentenceTokenizer(punkt_param)

# Latin Library
try:
latinlibrary = PlaintextCorpusReader(cltk_path + '/latin/text/latin_text_latin_library',
'.*\.txt',
word_tokenizer=word_tokenizer,
sent_tokenizer=sent_tokenizer,
encoding='utf-8')
pass
except IOError as e:
pass
# print("Corpus not found. Please check that the Latin Library is installed in CLTK_DATA.")
abbreviations = ['c', 'l', 'm', 'p', 'q', 't', 'ti', 'sex', 'a', 'd', 'cn', 'sp', "m'", 'ser',
'ap', 'n', 'v', 'k', 'mam', 'post', 'f', 'oct', 'opet', 'paul', 'pro', 'sert',
'st', 'sta', 'v', 'vol', 'vop']
@@ -24,7 +24,6 @@
'type': 'text'},
{'encoding': 'latin-1',
'markup': 'beta_code',
'name': '',
'location': 'local',
'name': 'phi5',
'origin': None,
Oops, something went wrong.

0 comments on commit 66fb393

Please sign in to comment.