Minor modification for TLGU (#984)

* light pylint formatting for word.py * make multiling tokenization docs comprehensible * mk example docs * minor adjustments for tlgu * rm print statement Co-authored-by: Travis CI <travis@travis-ci.org> Co-authored-by: Kyle P. Johnson <kyle.p.johnson>
cltk · Jun 14, 2020 · a722963 · a722963
1 parent 326f299
commit a722963
Show file tree

Hide file tree

Showing 3 changed files with 12 additions and 14 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
+# pyenv
+.python-version
+
 # mypy
 .mypy_cache
 

diff --git a/cltk/corpus/greek/tlgu.py b/cltk/corpus/greek/tlgu.py
@@ -28,7 +28,7 @@
     'line_tab': '-B',
     'higher_levels': '-X',
     'lower_levels': '-Y',
-    'no_spaces': '-N',  # break_lines
+    'no_spaces': '-N',  # rm_newlines
     'citation_debug': '-C',
     'code_debug': '-S',
     'verbose': '-V',
@@ -102,15 +102,15 @@ def _check_install(self):
                         logger.error('TLGU install with sudo failed.')
 
     def convert(self, input_path=None, output_path=None, markup=None,
-                break_lines=False, divide_works=False, latin=False,
+                rm_newlines=False, divide_works=False, latin=False,
                 extra_args=None):
         """
         :param input_path: TLG filepath to convert.
         :param output_path: filepath of new converted text.
         :param markup: Specificity of inline markup. Default None removes all
         numerical markup; 'full' gives most detailed, with reference numbers
         included before each text line.
-        :param break_lines: No spaces; removes line ends and hyphens before an
+        :param rm_newlines: No spaces; removes line ends and hyphens before an
          ID code; hyphens and spaces before page and column ends are retained.
         :param divide_works: Each work (book) is output as a separate file in
         the form output_file-xxx.txt; if an output file is not specified, this
@@ -134,7 +134,7 @@ def convert(self, input_path=None, output_path=None, markup=None,
         if markup == 'full':
             full_args = ['v', 'w', 'x', 'y', 'z']
             [tlgu_options.append(x) for x in full_args]  # pylint: disable=W0106
-        if break_lines:
+        if rm_newlines:
             tlgu_options.append('N')
         if divide_works:
             tlgu_options.append('W')
@@ -174,12 +174,11 @@ def convert(self, input_path=None, output_path=None, markup=None,
                          exc)
             raise
 
-    def convert_corpus(self, corpus, markup=None, break_lines=False, divide_works=False, latin=None, extra_args=None):  # pylint: disable=W0613
+    def convert_corpus(self, corpus, markup=None, latin=None):  # pylint: disable=W0613
         """Look for imported TLG or PHI files and convert them all to
         ``~/cltk_data/greek/text/tlg/<plaintext>``.
-        TODO: Should this and/or convert() be static?
         TODO: Add markup options to input.
-        TODO: Do something with break_lines, divide_works, and extra_args or rm them
+        TODO: Add rm_newlines, divide_works, and extra_args
         """
         orig_path_rel = get_cltk_data_dir() + '/originals'
         orig_path = os.path.expanduser(orig_path_rel)
@@ -218,7 +217,7 @@ def convert_corpus(self, corpus, markup=None, break_lines=False, divide_works=Fa
             target_txt_path = os.path.join(target_txt_dir, txt)
             try:
                 self.convert(orig_txt_path, target_txt_path, markup=None,
-                             break_lines=False, divide_works=False, latin=latin,
+                             rm_newlines=False, divide_works=False, latin=latin,
                              extra_args=None)
             except Exception as exception:
                 logger.error("Failed to convert file '%s' to '%s': %s", orig_txt_path, target_txt_path, exception)

diff --git a/docs/greek.rst b/docs/greek.rst
@@ -332,7 +332,7 @@ You may also convert individual files, with options for how the conversion happe
 
    In [4]: t.convert('~/Downloads/corpora/TLG_E/TLG0003.TXT', '~/Documents/thucydides.txt', markup='full')
 
-   In [5]: t.convert('~/Downloads/corpora/TLG_E/TLG0003.TXT', '~/Documents/thucydides.txt', break_lines=True)
+   In [5]: t.convert('~/Downloads/corpora/TLG_E/TLG0003.TXT', '~/Documents/thucydides.txt', rm_newlines=True)
 
    In [6]: t.convert('~/Downloads/corpora/TLG_E/TLG0003.TXT', '~/Documents/thucydides.txt', divide_works=True)
 
@@ -343,16 +343,12 @@ For ``convert()``, plain arguments may be sent directly to the ``TLGU``, as well
 
    In [7]: t.convert('~/Downloads/corpora/TLG_E/TLG0003.TXT', '~/Documents/thucydides.txt', extra_args=['p', 'B'])
 
-Concerning text normalization: Even after plaintext conversion, the TLG will still need some cleanup. The CLTK contains some helpful code for `post-TLGU cleanup <http://docs.cltk.org/en/latest/greek.html#text-cleanup>`_.
+Even after plaintext conversion, the converted TLG texts will still need some cleanup. The CLTK contains some code for `post-TLGU cleanup <http://docs.cltk.org/en/latest/greek.html#text-cleanup>`_.
 
 You may read about these arguments in `the TLGU manual <https://github.com/cltk/tlgu/blob/master/tlgu.1.pdf?raw=true>`_.
 
 Once these files are created, see `TLG Indices <http://docs.cltk.org/en/latest/greek.html#tlg-indices>`_ below for accessing these newly created files.
 
-See also `Text Cleanup <http://docs.cltk.org/en/latest/greek.html#text-cleanup>` for removing extraneous non-textual characters from these files.
-
-
-
 
 Corpus Readers
 ==============