Skip to content

Commit

Permalink
Minor modification for TLGU (#984)
Browse files Browse the repository at this point in the history
* light pylint formatting for word.py

* make multiling tokenization docs comprehensible

* mk example docs

* minor adjustments for tlgu

* rm print statement

Co-authored-by: Travis CI <travis@travis-ci.org>
Co-authored-by: Kyle P. Johnson <kyle.p.johnson>
  • Loading branch information
kylepjohnson and Travis CI committed Jun 14, 2020
1 parent 326f299 commit a722963
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 14 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# pyenv
.python-version

# mypy
.mypy_cache

Expand Down
15 changes: 7 additions & 8 deletions cltk/corpus/greek/tlgu.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
'line_tab': '-B',
'higher_levels': '-X',
'lower_levels': '-Y',
'no_spaces': '-N', # break_lines
'no_spaces': '-N', # rm_newlines
'citation_debug': '-C',
'code_debug': '-S',
'verbose': '-V',
Expand Down Expand Up @@ -102,15 +102,15 @@ def _check_install(self):
logger.error('TLGU install with sudo failed.')

def convert(self, input_path=None, output_path=None, markup=None,
break_lines=False, divide_works=False, latin=False,
rm_newlines=False, divide_works=False, latin=False,
extra_args=None):
"""
:param input_path: TLG filepath to convert.
:param output_path: filepath of new converted text.
:param markup: Specificity of inline markup. Default None removes all
numerical markup; 'full' gives most detailed, with reference numbers
included before each text line.
:param break_lines: No spaces; removes line ends and hyphens before an
:param rm_newlines: No spaces; removes line ends and hyphens before an
ID code; hyphens and spaces before page and column ends are retained.
:param divide_works: Each work (book) is output as a separate file in
the form output_file-xxx.txt; if an output file is not specified, this
Expand All @@ -134,7 +134,7 @@ def convert(self, input_path=None, output_path=None, markup=None,
if markup == 'full':
full_args = ['v', 'w', 'x', 'y', 'z']
[tlgu_options.append(x) for x in full_args] # pylint: disable=W0106
if break_lines:
if rm_newlines:
tlgu_options.append('N')
if divide_works:
tlgu_options.append('W')
Expand Down Expand Up @@ -174,12 +174,11 @@ def convert(self, input_path=None, output_path=None, markup=None,
exc)
raise

def convert_corpus(self, corpus, markup=None, break_lines=False, divide_works=False, latin=None, extra_args=None): # pylint: disable=W0613
def convert_corpus(self, corpus, markup=None, latin=None): # pylint: disable=W0613
"""Look for imported TLG or PHI files and convert them all to
``~/cltk_data/greek/text/tlg/<plaintext>``.
TODO: Should this and/or convert() be static?
TODO: Add markup options to input.
TODO: Do something with break_lines, divide_works, and extra_args or rm them
TODO: Add rm_newlines, divide_works, and extra_args
"""
orig_path_rel = get_cltk_data_dir() + '/originals'
orig_path = os.path.expanduser(orig_path_rel)
Expand Down Expand Up @@ -218,7 +217,7 @@ def convert_corpus(self, corpus, markup=None, break_lines=False, divide_works=Fa
target_txt_path = os.path.join(target_txt_dir, txt)
try:
self.convert(orig_txt_path, target_txt_path, markup=None,
break_lines=False, divide_works=False, latin=latin,
rm_newlines=False, divide_works=False, latin=latin,
extra_args=None)
except Exception as exception:
logger.error("Failed to convert file '%s' to '%s': %s", orig_txt_path, target_txt_path, exception)
Expand Down
8 changes: 2 additions & 6 deletions docs/greek.rst
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ You may also convert individual files, with options for how the conversion happe
In [4]: t.convert('~/Downloads/corpora/TLG_E/TLG0003.TXT', '~/Documents/thucydides.txt', markup='full')
In [5]: t.convert('~/Downloads/corpora/TLG_E/TLG0003.TXT', '~/Documents/thucydides.txt', break_lines=True)
In [5]: t.convert('~/Downloads/corpora/TLG_E/TLG0003.TXT', '~/Documents/thucydides.txt', rm_newlines=True)
In [6]: t.convert('~/Downloads/corpora/TLG_E/TLG0003.TXT', '~/Documents/thucydides.txt', divide_works=True)
Expand All @@ -343,16 +343,12 @@ For ``convert()``, plain arguments may be sent directly to the ``TLGU``, as well
In [7]: t.convert('~/Downloads/corpora/TLG_E/TLG0003.TXT', '~/Documents/thucydides.txt', extra_args=['p', 'B'])
Concerning text normalization: Even after plaintext conversion, the TLG will still need some cleanup. The CLTK contains some helpful code for `post-TLGU cleanup <http://docs.cltk.org/en/latest/greek.html#text-cleanup>`_.
Even after plaintext conversion, the TLG will still need some cleanup. The CLTK contains some code for `post-TLGU cleanup <http://docs.cltk.org/en/latest/greek.html#text-cleanup>`_.

You may read about these arguments in `the TLGU manual <https://github.com/cltk/tlgu/blob/master/tlgu.1.pdf?raw=true>`_.

Once these files are created, see `TLG Indices <http://docs.cltk.org/en/latest/greek.html#tlg-indices>`_ below for accessing these newly created files.

See also `Text Cleanup <http://docs.cltk.org/en/latest/greek.html#text-cleanup>`_ for removing extraneous non-textual characters from these files.




Corpus Readers
==============
Expand Down

0 comments on commit a722963

Please sign in to comment.