Permalink
Browse files

0.0.0.1

  • Loading branch information...
bert
bert committed Jan 12, 2014
1 parent 4c96d27 commit a697b9f914f59f16fd1aca39702f18f8ed29426b
Showing with 336 additions and 0 deletions.
  1. +53 −0 .gitignore
  2. +20 −0 LICENSE
  3. +2 −0 README.md
  4. 0 cltk/__init__.py
  5. +218 −0 cltk/compiler.py
  6. +9 −0 examples/try.py
  7. +34 −0 setup.py
View
@@ -0,0 +1,53 @@
+*.py[cod]
+
+# C extensions
+*.so
+
+# Packages
+*.egg
+*.egg-info
+dist
+build
+eggs
+parts
+bin
+var
+sdist
+develop-eggs
+.installed.cfg
+lib
+lib64
+__pycache__
+
+# Installer logs
+pip-log.txt
+
+# Unit test / coverage reports
+.coverage
+.tox
+nosetests.xml
+
+# Translations
+*.mo
+
+# Mr Developer
+.mr.developer.cfg
+.project
+.pydevproject
+
+PHI5
+PHI7
+TLG_E
+
+phi5.json
+phi7.json
+tlg.json
+
+phi5.pickle
+phi7.pickle
+tlg.pickle
+
+classics_corpus_compiler.log
+
+*\~
+venv
View
20 LICENSE
@@ -0,0 +1,20 @@
+The MIT License (MIT)
+
+Copyright (c) 2013 Kyle P. Johnson
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
View
@@ -0,0 +1,2 @@
+Classical Language Toolkit
+==========================
View
No changes.
View
@@ -0,0 +1,218 @@
+""" Creates a dictionary of PHI author numbers and their associated names. """
+
+import json
+import logging
+import os
+import re
+
class Compile(object):
    """Make JSON files out of the TLG & PHI corpus disks.

    Each ``open_index_*`` method parses the corpus's AUTHTAB.DIR index into
    a dict mapping file names to author/collection names; each
    ``dump_txts_*`` method then reads every indexed .TXT file (Latin-1),
    strips it to ASCII, and writes the whole corpus as one JSON file in
    ``project_root``.
    """

    def __init__(self, corpora_root='.', project_root='.'):
        """Remember the corpora/project roots and configure file logging.

        :param corpora_root: directory containing the TLG_E, PHI5, and
            PHI7 corpus directories
        :param project_root: directory receiving the JSON output files and
            the compiler log
        """
        self.corpora_root = corpora_root
        self.project_root = project_root
        log_path = os.path.join(self.project_root,
                                'classics_corpus_compiler.log')
        logging.basicConfig(filename=log_path,
                            level=logging.INFO,
                            format='%(asctime)s %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')

    def uppercase_files(self):
        """Uppercase the file names of all three corpora in place.

        Builds full paths with ``os.path.join`` instead of ``os.chdir``;
        the chdir-based version broke with a relative ``corpora_root``
        because every chdir after the first resolved from inside the
        previous corpus directory.
        """
        for directory in ('TLG_E', 'PHI5', 'PHI7'):
            corpus_dir = os.path.join(self.corpora_root, directory)
            for filename in os.listdir(corpus_dir):
                os.rename(os.path.join(corpus_dir, filename),
                          os.path.join(corpus_dir, filename.upper()))

    def _read_authtab(self, directory):
        """Return the Latin-1-decoded AUTHTAB.DIR text of *directory*.

        Returns None (after logging) when the index file cannot be read.
        """
        index_path = os.path.join(self.corpora_root, directory, 'AUTHTAB.DIR')
        try:
            with open(index_path, 'rb') as index_opened:
                return index_opened.read().decode('latin-1')
        except IOError:
            logging.error('Failed to open %s index file AUTHTAB.DIR',
                          directory)
            return None

    def open_index_phi7(self):
        """Create a dict of PHI7 file names -> collection names.

        Returns {} when the index cannot be read.  (The previous version
        stored the result in a module global; a failed read then left the
        global unset and crashed the dump methods with NameError.)
        """
        logging.info('Starting PHI7 index parsing.')
        index_dict = {}
        index_read = self._read_authtab('PHI7')
        if index_read is not None:
            # 'ÿ' separates records; the slice drops header/trailer junk.
            for entry in index_read.split('ÿ')[2:-9]:
                if not entry:
                    continue
                # NOTE(review): the 'ƒ*' literals look like mojibake of
                # '\x83*' markup codes — verify against the disk bytes.
                cleaned = entry.replace('ƒl', '').replace('ƒg', '')\
                    .replace('ƒh', '').replace('>', '').replace(']]', ']')
                # Skip records that are not single-author text files.
                if re.search('.*Library.*|.*Inscriptions .*|.*Bibliography.*',
                             cleaned):
                    continue
                number, name = cleaned.split(' ', 1)
                index_dict[number] = name
            logging.info('Finished PHI7 index parsing.')
        return index_dict

    def open_index_phi5(self):
        """Create a dict of PHI5 file names -> author names; {} on error."""
        logging.info('Starting PHI5 index parsing.')
        index_dict = {}
        index_read = self._read_authtab('PHI5')
        if index_read is not None:
            for entry in index_read.split('ÿ')[1:-21]:
                if not entry:
                    continue
                # Strip binary markup; '&1' must be removed before '&'.
                cleaned = entry.replace('\x83l', '')\
                    .replace('€', '; ').replace('&1', '')\
                    .replace('&', '')
                label, name = cleaned.split(' ', 1)
                index_dict[label] = name
            logging.info('Finished PHI5 index parsing.')
        return index_dict

    def open_index_tlg(self):
        """Create a dict of TLG file names -> author names; {} on error."""
        logging.info('Starting TLG index parsing.')
        index_dict = {}
        index_read = self._read_authtab('TLG_E')
        if index_read is not None:
            for entry in index_read.split('ÿ')[1:-7]:
                if not entry:
                    continue
                cleaned = entry.replace(' &1', ' ').replace('&', '')\
                    .replace(' 1', ' ').replace('-1', '-').replace('[2', '[')\
                    .replace(']2', ']').replace('1Z', '').replace('1P', 'P')\
                    .replace('1D', 'D').replace('1L', 'L').replace('€', ' ')
                label, name = cleaned.split(' ', 1)
                index_dict[label] = name
            logging.info('Finished TLG index parsing.')
        return index_dict

    def _dump_corpus(self, directory, index_dict, json_name):
        """Read each indexed .TXT, strip to ASCII, and save as *json_name*.

        Unreadable .TXT files are logged and skipped; the JSON is written
        to ``project_root`` and its presence confirmed afterwards.
        """
        corpus_dict = {}
        for file_name, abbrev in index_dict.items():
            txt_path = os.path.join(self.corpora_root, directory,
                                    file_name + '.TXT')
            try:
                with open(txt_path, 'rb') as txt_opened:
                    txt_read = txt_opened.read().decode('latin-1')
                corpus_dict[abbrev] = remove_non_ascii(txt_read)
            except IOError:
                logging.error('Failed to open %s file %s of author %s',
                              directory, file_name, abbrev)
        json_path = os.path.join(self.project_root, json_name)
        try:
            with open(json_path, 'w') as json_opened:
                json_opened.write(json.dumps(corpus_dict))
        except IOError:
            logging.error('Failed to create and/or write to file %s.',
                          json_name)
        self.confirm_json_present(directory)

    def dump_txts_phi7(self):
        """Compile the PHI7 corpus into phi7.json."""
        logging.info('Starting PHI7 corpus compilation.')
        self._dump_corpus('PHI7', self.open_index_phi7(), 'phi7.json')
        logging.info('Finished PHI7 corpus compilation.')

    def dump_txts_phi5(self):
        """Compile the PHI5 corpus into phi5.json."""
        logging.info('Starting PHI5 corpus compilation.')
        self._dump_corpus('PHI5', self.open_index_phi5(), 'phi5.json')
        logging.info('Finished PHI5 corpus compilation.')

    def dump_txts_tlg(self):
        """Compile the TLG corpus into tlg.json."""
        logging.info('Starting TLG corpus compilation.')
        self._dump_corpus('TLG_E', self.open_index_tlg(), 'tlg.json')
        logging.info('Finished TLG corpus compilation.')

    def confirm_json_present(self, directory):
        """Log whether the JSON output for *directory* exists on disk.

        Unknown directory names are logged as errors instead of raising
        UnboundLocalError as the previous version did.
        """
        logging.info('Confirming JSON file saved.')
        json_names = {'PHI7': 'phi7.json', 'PHI5': 'phi5.json',
                      'TLG_E': 'tlg.json'}
        json_name = json_names.get(directory)
        if json_name is not None and \
                os.path.isfile(os.path.join(self.project_root, json_name)):
            logging.info('%s JSON file is present.', directory)
        else:
            logging.error('%s JSON file is not present.', directory)
+
def remove_non_ascii(input_string):
    """Strip every non-ASCII character (ord >= 128) from *input_string*.

    Technique from http://stackoverflow.com/a/1342373
    """
    ascii_only = (char for char in input_string if ord(char) < 128)
    return "".join(ascii_only)
+
def clear_log():
    """Truncate the compiler log file, creating it if necessary."""
    try:
        # Opening in 'w' mode truncates (or creates) the file.
        with open('classics_corpus_compiler.log', 'w'):
            pass
        logging.info('Cleared log if present.')
    except IOError:
        logging.error('Failed to clear log.')
View
@@ -0,0 +1,9 @@
"""Example driver for the CLTK corpus compiler."""
# This commit adds the module as cltk/compiler.py, so the import path is
# cltk.compiler (the old 'cltk.corpus.compiler' path could not resolve).
from cltk.compiler import Compile

if __name__ == '__main__':
    # Point these at the local corpora and output directories.
    # c = Compile()
    c = Compile('/home/kyle/Downloads/project_dir/corps',
                '/home/kyle/Downloads/project_dir')
    # c.uppercase_files()
    # c.dump_txts_phi7()
    c.dump_txts_phi5()
    # c.dump_txts_tlg()
View
@@ -0,0 +1,34 @@
+"""Config for PyPI"""
+import os
+from setuptools import setup, find_packages
+
setup(
    author='Kyle P. Johnson',
    author_email='kyle@kyle-p-johnson.com',
    classifiers=[
        'Development Status :: 1 - Planning',
        'Environment :: Console',
        'Intended Audience :: Education',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: MIT License',
        'Natural Language :: English',
        'Natural Language :: Greek',
        'Natural Language :: Latin',
        'Operating System :: OS Independent',
        'Operating System :: POSIX',
        'Programming Language :: Python :: 3.3',
        'Topic :: Text Processing',
        'Topic :: Text Processing :: General',
        'Topic :: Text Processing :: Linguistic',
        'Topic :: Utilities',
    ],
    description='NLP support for Ancient Greek and Latin',
    keywords=['nlp', 'ancient greek', 'latin', 'tlg', 'phi', 'literature'],
    license='MIT',
    # Ship the actual README text on PyPI; the previous value was the
    # literal placeholder string 'README'.
    long_description=(open('README.md').read()
                      if os.path.exists('README.md')
                      else 'NLP support for Ancient Greek and Latin'),
    name='cltk',
    packages=find_packages(),
    url='https://github.com/kylepjohnson/cltk',
    version='0.0.0.1',
    zip_safe=True,
)

0 comments on commit a697b9f

Please sign in to comment.