Skip to content

Commit

Permalink
Update corpus files to include Tesserae corpora (#904)
Browse files (browse the repository at this point in the history)
* Update corpus files

* Add Tesserae reader for Latin

* Fix reference to Latin corpus
  • Loading branch information
diyclassics authored and kylepjohnson committed May 2, 2019
1 parent ed9c025 commit 0ecefb0
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 3 deletions.
8 changes: 7 additions & 1 deletion cltk/corpus/latin/corpora.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -85,5 +85,11 @@
{'location': 'remote',
'type': 'text',
'name': 'latin_text_poeti_ditalia',
'origin': 'https://github.com/cltk/latin_text_poeti_ditalia.git'}
'origin': 'https://github.com/cltk/latin_text_poeti_ditalia.git'},
{'name': 'latin_text_tesserae',
'encoding': 'utf-8',
'markup': 'plaintext', #modified plaintext with Tesserae-style citations
'origin': 'https://github.com/cltk/latin_text_tesserae.git',
'location': 'remote',
'type': 'text'},
]
14 changes: 12 additions & 2 deletions cltk/corpus/readers.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -23,9 +23,13 @@

# TODO add your corpus here:
SUPPORTED_CORPORA = {
'latin': ['latin_text_latin_library', 'latin_text_perseus'],
'latin': ['latin_text_latin_library',
'latin_text_perseus',
'latin_text_tesserae',
],
'greek': ['greek_text_perseus',
'greek_text_tesserae']
'greek_text_tesserae',
]
} # type: Dict[str, List[str]]


Expand Down Expand Up @@ -62,6 +66,12 @@ def get_corpus_reader(corpus_name: str = None, language: str = None) -> CorpusRe
word_tokenizer=the_word_tokenizer,
target_language='latin') # perseus also contains English

if corpus_name == 'latin_text_tesserae':
return TesseraeCorpusReader(root=root, fileids=r'.*\.tess',
sent_tokenizer=sentence_tokenizer,
word_tokenizer=the_word_tokenizer,
)

if language == 'greek':
if corpus_name == 'greek_text_perseus':
valid_json_root = os.path.join(root, 'cltk_json') #: we only support this subsection
Expand Down

0 comments on commit 0ecefb0

Please sign in to comment.