Skip to content
Permalink
Browse files

0.0.0.1

  • Loading branch information
bert
bert committed Jan 12, 2014
1 parent 4c96d27 commit a697b9f914f59f16fd1aca39702f18f8ed29426b
Showing with 336 additions and 0 deletions.
  1. +53 −0 .gitignore
  2. +20 −0 LICENSE
  3. +2 −0 README.md
  4. 0 cltk/__init__.py
  5. +218 −0 cltk/compiler.py
  6. +9 −0 examples/try.py
  7. +34 −0 setup.py
@@ -0,0 +1,53 @@
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64
__pycache__

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

PHI5
PHI7
TLG_E

phi5.json
phi7.json
tlg.json

phi5.pickle
phi7.pickle
tlg.pickle

classics_corpus_compiler.log

*\~
venv
20 LICENSE
@@ -0,0 +1,20 @@
The MIT License (MIT)

Copyright (c) 2013 Kyle P. Johnson

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,2 @@
Classical Language Toolkit
=========================
No changes.
@@ -0,0 +1,218 @@
"""Compile the TLG and PHI corpus disks into JSON files of author texts."""

import json
import logging
import os
import re

class Compile(object):
    """Make JSON files out of TLG & PHI disks.

    Each corpus is read from its own sub-directory of ``corpora_root``
    (``TLG_E``, ``PHI5`` or ``PHI7``). The ``AUTHTAB.DIR`` file in that
    directory is parsed into a ``{file label: author name}`` index, every
    ``<label>.TXT`` file is read, reduced to ASCII, and the resulting
    ``{author name: text}`` mapping is written as JSON into
    ``project_root``.
    """

    # Name of the author index file read from every corpus directory.
    _INDEX_FILE = 'AUTHTAB.DIR'

    # Corpus directory -> JSON file written into the project root.
    _JSON_NAMES = {
        'PHI7': 'phi7.json',
        'PHI5': 'phi5.json',
        'TLG_E': 'tlg.json',
    }

    # PHI7 index entries matching this pattern are left out of the index.
    _PHI7_SKIP = re.compile(
        '.*Library.*|.*Inscriptions .*|.*Bibliography.*')

    # (old, new) substitutions applied, in order, to every index entry.
    # The index is decoded as latin-1, so bytes 0x80 and 0x83 appear as
    # U+0080 and U+0083; they are written as escapes because the cp1252
    # look-alikes (U+20AC, U+0192) can never occur in latin-1 output.
    _PHI7_REPLACEMENTS = (
        ('\x83l', ''), ('\x83g', ''), ('\x83h', ''), ('>', ''),
        (']]', ']'),
    )
    _PHI5_REPLACEMENTS = (
        ('\x83l', ''), ('\x80', '; '), ('&1', ''), ('&', ''),
    )
    _TLG_REPLACEMENTS = (
        (' &1', ' '), ('&', ''), (' 1', ' '), ('-1', '-'), ('[2', '['),
        (']2', ']'), ('1Z', ''), ('1P', 'P'), ('1D', 'D'), ('1L', 'L'),
        ('\x80', ' '),
    )

    def __init__(self, corpora_root='.', project_root='.'):
        """Store the two root directories and configure logging.

        Args:
            corpora_root: directory containing the corpus directories.
            project_root: directory receiving the JSON files and the
                ``classics_corpus_compiler.log`` log file.
        """
        self.corpora_root = corpora_root
        self.project_root = project_root
        local_project_save = os.path.join(self.project_root,
                                          'classics_corpus_compiler.log')
        # basicConfig does nothing when the root logger already has
        # handlers, so only the first configuration in a process wins.
        logging.basicConfig(filename=local_project_save,
                            level=logging.INFO,
                            format='%(asctime)s %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')

    def uppercase_files(self):
        """Uppercase the file names inside every corpus directory.

        Files are renamed through their full paths, so the process
        working directory is left alone and a relative ``corpora_root``
        keeps working for all three directories.

        Raises:
            OSError: if a corpus directory is missing or a rename fails.
        """
        for directory in ('TLG_E', 'PHI5', 'PHI7'):
            corpus_dir = os.path.join(self.corpora_root, directory)
            for filename in os.listdir(corpus_dir):
                new = filename.upper()
                if new != filename:
                    os.rename(os.path.join(corpus_dir, filename),
                              os.path.join(corpus_dir, new))

    def _parse_index(self, directory, label, keep, replacements, skip=None):
        """Parse one corpus's AUTHTAB.DIR into {file label: author name}.

        Args:
            directory: corpus directory name below ``corpora_root``.
            label: corpus name used in log messages.
            keep: slice selecting the index records to parse.
            replacements: ordered (old, new) pairs applied to each record.
            skip: optional compiled pattern; matching records are dropped.

        Returns:
            The index dictionary, or None when the index cannot be read.
        """
        logging.info('Starting %s index parsing.', label)
        index_path = os.path.join(self.corpora_root, directory,
                                  self._INDEX_FILE)
        try:
            with open(index_path, 'rb') as index_opened:
                index_read = index_opened.read().decode('latin-1')
        except IOError:
            logging.error('Failed to open %s index file AUTHTAB.DIR', label)
            return None
        index_dict = {}
        # Records are separated by byte 0xFF.
        for entry in index_read.split('\xff')[keep]:
            if not entry:
                continue
            for old, new in replacements:
                entry = entry.replace(old, new)
            if skip is not None and skip.search(entry):
                continue
            number, separator, name = entry.partition(' ')
            if not separator:
                # No space means no author name to pair with the label.
                logging.warning('Skipping malformed %s index entry %r',
                                label, entry)
                continue
            index_dict[number] = name
        logging.info('Finished %s index parsing.', label)
        return index_dict

    def _dump_txts(self, directory, label, index, write_error):
        """Read every indexed .TXT file and save the corpus as JSON.

        Args:
            directory: corpus directory name below ``corpora_root``.
            label: corpus name used in log messages.
            index: {file label: author name} mapping, or None.
            write_error: message logged when the JSON cannot be written.
        """
        logging.info('Starting %s corpus compilation.', label)
        if index is None:
            # Without an index there is nothing to read; do not write an
            # empty JSON file over a previous good one.
            logging.error('Aborting %s corpus compilation: no index.', label)
            return
        texts = {}
        # NOTE(review): texts are keyed by author name, so two index
        # entries sharing a name overwrite each other -- confirm that
        # names are unique within a corpus.
        for file_name, abbrev in index.items():
            files_path = os.path.join(self.corpora_root, directory,
                                      file_name + '.TXT')
            try:
                with open(files_path, 'rb') as txt_opened:
                    txt_read = txt_opened.read().decode('latin-1')
            except IOError:
                logging.error('Failed to open %s file %s of author %s',
                              label, file_name, abbrev)
                continue
            texts[abbrev] = remove_non_ascii(txt_read)
        json_path = os.path.join(self.project_root,
                                 self._JSON_NAMES[directory])
        try:
            with open(json_path, 'w') as json_opened:
                json_opened.write(json.dumps(texts))
        except IOError:
            logging.error(write_error)
        self.confirm_json_present(directory)
        logging.info('Finished %s corpus compilation.', label)

    def open_index_phi7(self):
        """Create a dictionary of PHI7 file labels and collection names.

        Returns:
            ``{label: name}``, or None if AUTHTAB.DIR cannot be opened.
        """
        # presumably the slice drops header/trailer records -- TODO confirm
        return self._parse_index('PHI7', 'PHI7', slice(2, -9),
                                 self._PHI7_REPLACEMENTS, self._PHI7_SKIP)

    def dump_txts_phi7(self):
        """Read the PHI7 texts, reduce them to ASCII, write phi7.json."""
        self._dump_txts('PHI7', 'PHI7', self.open_index_phi7(),
                        'Failed to create and/or write to file phi7.json.')

    def open_index_phi5(self):
        """Create a dictionary of PHI5 file labels and author names.

        Returns:
            ``{label: name}``, or None if AUTHTAB.DIR cannot be opened.
        """
        # presumably the slice drops header/trailer records -- TODO confirm
        return self._parse_index('PHI5', 'PHI5', slice(1, -21),
                                 self._PHI5_REPLACEMENTS)

    def dump_txts_phi5(self):
        """Read the PHI5 texts, reduce them to ASCII, write phi5.json."""
        self._dump_txts('PHI5', 'PHI5', self.open_index_phi5(),
                        'Failed to create and write to file phi5.json.')

    def open_index_tlg(self):
        """Create a dictionary of TLG file labels and author names.

        Returns:
            ``{label: name}``, or None if AUTHTAB.DIR cannot be opened.
        """
        # presumably the slice drops header/trailer records -- TODO confirm
        return self._parse_index('TLG_E', 'TLG', slice(1, -7),
                                 self._TLG_REPLACEMENTS)

    def dump_txts_tlg(self):
        """Read the TLG texts, reduce them to ASCII, write tlg.json."""
        self._dump_txts('TLG_E', 'TLG', self.open_index_tlg(),
                        'Failed to create and write to file tlg.json.')

    def confirm_json_present(self, directory):
        """Log whether the JSON file for a corpus exists in project_root.

        Only existence is checked; the file is not opened or parsed.

        Args:
            directory: 'PHI7', 'PHI5' or 'TLG_E'. Any other value is
                logged as an error instead of raising.
        """
        logging.info('Confirming JSON file saved.')
        json_name = self._JSON_NAMES.get(directory)
        if json_name is None:
            logging.error('%s is not a known corpus directory.', directory)
            return
        if os.path.isfile(os.path.join(self.project_root, json_name)):
            logging.info('%s JSON file is present.', directory)
        else:
            logging.error('%s JSON file is not present.', directory)

def remove_non_ascii(input_string):
    """Return input_string with every character of code point >= 128 dropped.

    Approach taken from http://stackoverflow.com/a/1342373
    """
    kept = []
    for char in input_string:
        if ord(char) <= 127:
            kept.append(char)
    return "".join(kept)

def clear_log(log_path='classics_corpus_compiler.log'):
    """Truncate the compiler log file, creating it if it does not exist.

    Args:
        log_path: path of the log file to empty. The default is the log
            file name resolved against the current working directory;
            pass the full path when the log lives elsewhere.

    A failure to open the file is logged rather than raised.
    """
    try:
        # Opening in 'w' mode is what empties the file.
        with open(log_path, 'w'):
            pass
    except IOError:
        logging.error('Failed to clear log.')
    else:
        logging.info('Cleared log if present.')
@@ -0,0 +1,9 @@
# Example: compile one corpus disk to JSON.
# The compiler module is cltk/compiler.py, so it is imported as
# cltk.compiler.
from cltk.compiler import Compile

# Compile() with no arguments uses the current directory for both the
# corpora and the project; here both roots are given explicitly.
#c = Compile()
c = Compile('/home/kyle/Downloads/project_dir/corps', '/home/kyle/Downloads/project_dir')
# Uncomment the steps to run; only the PHI5 compilation is enabled.
#c.uppercase_files()
#c.dump_txts_phi7()
c.dump_txts_phi5()
#c.dump_txts_tlg()

@@ -0,0 +1,34 @@
"""Config for PyPI"""
import os
from setuptools import setup, find_packages

# Publish the README text as the long description. Fall back to the
# short description when README.md is not shipped next to setup.py.
_README_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'README.md')
try:
    with open(_README_PATH) as readme_opened:
        LONG_DESCRIPTION = readme_opened.read()
except IOError:
    LONG_DESCRIPTION = 'NLP support for Ancient Greek and Latin'

setup(
    author='Kyle P. Johnson',
    author_email='kyle@kyle-p-johnson.com',
    classifiers=[
        'Development Status :: 1 - Planning',
        'Environment :: Console',
        'Intended Audience :: Education',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: MIT License',
        'Natural Language :: English',
        'Natural Language :: Greek',
        'Natural Language :: Latin',
        'Operating System :: OS Independent',
        'Operating System :: POSIX',
        'Programming Language :: Python :: 3.3',
        'Topic :: Text Processing',
        'Topic :: Text Processing :: General',
        'Topic :: Text Processing :: Linguistic',
        'Topic :: Utilities',
    ],
    description='NLP support for Ancient Greek and Latin',
    keywords=['nlp', 'ancient greek', 'latin', 'tlg', 'phi', 'literature'],
    license='MIT',
    long_description=LONG_DESCRIPTION,
    name='cltk',
    packages=find_packages(),
    url='https://github.com/kylepjohnson/cltk',
    version='0.0.0.1',
    zip_safe=True,
)

0 comments on commit a697b9f

Please sign in to comment.
You can’t perform that action at this time.