Add CorpusReader for Greek and Latin Perseus CLTK JSON files #361 #615 (#854)

* Initial releases with unit tests and doctests * Added sections and preliminary documentation for: Scansion of Poetry About the use of macrons in poetry HexameterScanner Hexameter ScansionConstants Syllabifier Metrical Validator ScansionFormatter StringUtils module. Made minor formatting corrections elsewhere to quiet warnings encountered while transpiling the rst file during testing and verification. * Corrected documentation & doctest comments that were causing errors. Doctests run with an added command line switch: nosetests --no-skip --with-coverage --cover-package=cltk --with-doctest * fixing broken doctest comment * correcting documentation comment that causes doctest to err * Corrections to make the build pass: 1. Added install gensim to the travis build script; its absence is causing an error in word2vec.py during the build. 2. Modified transcription.py so that the macronizer is initialized on instantiation of the Transcriber class and not at the module level; the macronizer file is 32MB and this also seems to cause an error with travis, as github does not make large files displayable, and so it may not be available for the build. The macronizer object has been made a component of "self." * moved package import inside of main so that it does not prevent the build from completing; soon, we should move to update the dependencies of word2vec; gensim pulls in boto, which isn't python3 compliant; there is a boto3 version which we may be able to slot in, but perhaps a larger question is: is boto necessary?
* correcting documentation * add JsonFile Corpus Reader for Perseus Greek and Latin cltk json corpora; add better corpus reader documentation; correct annotations and package naming; unit tests for JsonFile Corpus Readers * improved documentation and a fix for tests * remove unnecessary coercion to int for sorting sections and subsections * switch print statement to log statement * corrected JsonFileCorpusReader to work with arbitrary levels of nested dictionaries * add perseus corpus types file for assemble_corpus functionality; revise assemble_corpus method to just return a CorpusReader instead of a tuple of CorpusReader and input params; correct latin library corpus types; revised test_corpus.py file to use setUp; removed the download_test_corpora file; changed the travis script
cltk · Jan 16, 2019 · e74a70d · e74a70d
1 parent 79123d9
commit e74a70d
Show file tree

Hide file tree

Showing 14 changed files with 829 additions and 270 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -24,7 +24,6 @@ before_script:
 - pip install numpy
 - pip install scipy 
 - pip install scikit-learn
-- python cltk/tests/download_test_corpora.py
 
 script:
   # Notes on nose:

diff --git a/cltk/corpus/latin/latin_library_corpus_types.py b/cltk/corpus/latin/latin_library_corpus_types.py
@@ -13,7 +13,6 @@
 # ontology map directories
 
 corpus_directories_by_type = {
-
     'republican': [
         './caesar',
         './lucretius',
@@ -25,7 +24,7 @@
         './ovid',
         './horace',
         './vergil',
-        './hyginus',
+        './hyginus'
     ],
     'early_silver': [
         './martial',
@@ -41,11 +40,11 @@
     'late_silver': [
         './suetonius',
         './gellius',
-        './apuleius'
+        './apuleius',
         './justin',
         './apicius',
         './fulgentius',
-        './orosius',
+        './orosius'
     ],
     'old': [
         './plautus'
@@ -66,11 +65,11 @@
         './prudentius',
         './tertullian',
         './kempis',
-        './leothegreat',
+        './leothegreat'
     ],
     'medieval': [
         './boethiusdacia',
-        './dante',
+        './dante'
     ],
     'renaissance': [
     ],
@@ -85,36 +84,35 @@
         './may',
         './melanchthon',
         './xylander',
-        './campion',
+        './campion'
     ],
     #: uncategorized
     'misc':
-        [
+        ['./alanus',
-            './alanus',
+         './albertanus',
-            './albertanus',
+         './albertofaix',
-            './albertofaix',
+         './aquinas',
-            './aquinas',
+         './ammianus',
-            './ammianus',
+         './arnobius',
-            './arnobius',
+         './capellanus',
-            './capellanus',
+         './cato',
-            './cato',
+         './claudian',
-            './claudian',
+         './curtius',
-            './curtius',
+         './eutropius',
-            './eutropius',
+         './frontinus',
-            './frontinus',
+         './gestafrancorum',
-            './gestafrancorum',
+         './justinian',
-            './justinian',
+         './lactantius',
-            './lactantius',
+         './martinbraga',
-            './martinbraga',
+         './mirandola',
-            './mirandola',
+         './ottofreising',
-            './ottofreising',
+         './pauldeacon',
-            './pauldeacon',
+         './sha',
-            './sha',
+         './theodosius',
-            './theodosius',
+         './voragine',
-            './voragine',
+         './walter',
-            './walter',
+         './williamtyre'
-            './williamtyre'
+         ],
-        ],
     'early': []
 }
 
@@ -145,7 +143,7 @@
         'varro.rr1.txt',
         'varro.rr2.txt',
         'varro.rr3.txt',
-        'sulpicia.txt',
+        'sulpicia.txt'
     ],
     'augustan': [
         'resgestae.txt',
@@ -169,7 +167,7 @@
         'propertius1.txt',
         'tibullus1.txt',
         'tibullus2.txt',
-        'tibullus3.txt',
+        'tibullus3.txt'
     ],
     'early_silver': [
         'pliny.ep1.txt',
@@ -228,7 +226,7 @@
         'valmax8.txt',
         'valmax9.txt',
         'vell1.txt',
-        'vell2.txt',
+        'vell2.txt'
     ],
     'late_silver': [
     ],
@@ -240,15 +238,15 @@
         'ter.hecyra.txt',
         'ter.phormio.txt',
         'andronicus.txt',
-        'enn.txt',
+        'enn.txt'
     ],
     'early': [
         '12tables.txt'
     ],
     'medieval': [
         'anselmepistula.txt',
         'anselmproslogion.txt',
-        'carm.bur.txt',
+        'carm.bur.txt'
     ],
     'christian': [
         'anon.martyrio.txt',
@@ -285,12 +283,12 @@
         'regula.txt',
         'sedulius.txt',
         'sulpiciusseverus.txt',
-        'vorag.txt',
+        'vorag.txt'
     ],
     'renaissance': [
         'petrarch.ep1.txt',
         'petrarch.numa.txt',
-        'petrarch.rom.txt',
+        'petrarch.rom.txt'
     ],
     'neo_latin': [
         'spinoza.ethica1.txt',
@@ -299,5 +297,5 @@
         'spinoza.ethica4.txt',
         'spinoza.ethica5.txt'
     ],
-    'misc':[]
+    'misc': []
 }