-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdoc_info.py
112 lines (100 loc) · 4.25 KB
/
doc_info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python
from philologic import PhiloDB, SqlToms
import re
import time
import sqlite3
import unicodedata
from difflib import get_close_matches
class DocInfo(object):
    """Helper class meant to provide various information on documents.

    It wraps a PhiloLogic database directory (``path + db``) and offers
    convenience lookups built on the PhiloLogic library:

    * metadata for an object id (``get_metadata``),
    * object ids for a set of metadata values (``get_obj_id``),
    * a short text excerpt around a query hit (``get_excerpt``).
    """

    # Strips the (probably truncated) first token of an excerpt.
    cut_begin = re.compile(r'\A[^ ]* ')
    # Strips the trailing two tokens of an excerpt, including a leading run
    # of '<' characters, so a tag cut off by the read window is not left over.
    cut_end = re.compile(r'<*[^ ]* [^ ]*\Z')

    # An excerpt starts this many bytes before the hit and is this long.
    EXCERPT_LEAD = 200
    EXCERPT_LENGTH = 400

    def __init__(self, db, query=None, path='/var/lib/philologic/databases/'):
        """Open the metadata store and, when a query is given, run it.

        db    -- database name, appended to ``path`` to form the db directory
        query -- optional whitespace-separated search words; the first word
                 is searched immediately
        path  -- directory holding the PhiloLogic databases
        """
        self.db_path = path + db
        self.toms = SqlToms.SqlToms(self.db_path + '/toms.db', 7)
        # Defaults so that the excerpt helpers degrade gracefully (return
        # None) instead of raising AttributeError when no query was given.
        self.query = []
        self.patterns = []
        self.word = 0
        self.hitlist = []
        if query:
            self.query = query.split()
            # NOTE(review): words are interpolated into the pattern without
            # re.escape, so regex metacharacters in a query word are active.
            # Left as-is in case query words are meant to be patterns.
            self.patterns = [re.compile(r'(?iu)(\A|\W)(%s)(\W)' % word)
                             for word in self.query]
            self.philo_search()

    def philo_search(self):
        """Query the PhiloLogic database and retrieve a hitlist.

        Searches for the query word at index ``self.word`` and stores the
        result in ``self.hitlist``. Requires a non-empty ``self.query``.
        """
        db = PhiloDB.PhiloDB(self.db_path, 7)
        self.hitlist = db.query(self.query[self.word])
        # Short pause before update(); presumably gives the search time to
        # produce hits -- confirm against the PhiloLogic API.
        time.sleep(.05)
        self.hitlist.update()

    def get_metadata(self, obj_id, field):
        """Return the value of metadata ``field`` for object ``obj_id``."""
        return self.__get_info(obj_id=obj_id, field=field)

    def get_obj_id(self, **metadata_info):
        """Return the list of philo ids matching the given metadata values."""
        return self.__get_info(**metadata_info)

    def __get_info(self, obj_id=False, field=None, **metadata_info):
        """Shared implementation of ``get_metadata`` and ``get_obj_id``.

        With a truthy ``obj_id``: return ``field`` for that object. A string
        id ('1-2-3' or '1 2 3') is split into components and the lookup walks
        from the full id up through its ancestors until a ``str`` value is
        found; the last value looked up is returned if none is. Any other id
        type is used as the ``toms`` key directly.

        Without ``obj_id``: return the philo ids of objects matching
        ``metadata_info``. If nothing matches and a ``head`` value was given,
        fall back to the closest headwords in the database.
        """
        if obj_id:
            try:
                # Only the string conversion is guarded: a non-string id
                # (no .replace) is used unchanged as the key.
                parts = tuple(obj_id.replace('-', ' ').split())
            except AttributeError:
                return self.toms[obj_id][field]
            info = None
            level = len(parts)
            while level:
                info = self.toms[parts[:level]][field]
                if isinstance(info, str):
                    break
                level -= 1
            return info

        info = [hit['philo_id'] for hit in self.toms.query(**metadata_info)]
        head = metadata_info.get('head')
        # The fuzzy fallback only makes sense for a lookup by headword;
        # other lookups simply return the (empty) exact result.
        if not info and head is not None:
            info = self._closest_head_ids(head)
        return info

    def _closest_head_ids(self, head):
        """Return philo ids of up to five headwords closest to ``head``."""
        conn = sqlite3.connect(self.db_path + '/toms.db')
        try:
            cursor = conn.cursor()
            cursor.execute('select head from toms')
            # Map lowercased headword -> stored spelling so matching is
            # case-insensitive while the final lookup uses the exact value.
            heads_by_lower = dict(
                (row[0].lower(), row[0])
                for row in cursor.fetchall() if row[0] is not None)
            philo_ids = []
            # The candidates are lowercased, so the query must be as well.
            for candidate in get_close_matches(head.lower(),
                                               list(heads_by_lower), 5):
                cursor.execute("SELECT philo_id FROM toms WHERE head = ?;",
                               (heads_by_lower[candidate],))
                row = cursor.fetchone()
                if row is not None:
                    philo_ids.append(row[0])
            return philo_ids
        finally:
            conn.close()

    def get_excerpt(self, doc_id, highlight=False):
        """Return a text excerpt by querying PhiloLogic and using
        the byte offset to extract the passage.

        doc_id    -- object id string; only its first whitespace-separated
                     component (the document) is used
        highlight -- wrap query-word matches in a red ``<span>``

        If the document is not in the current hitlist, the other query words
        are searched in turn (wrapping around once to the starting word).
        Returns None when no query word has a hit in the document.
        """
        doc_id = doc_id.split()[0]
        index = self.binary_search(doc_id)
        # Bounded retry: one re-search per query word, instead of recursing
        # without limit when the document never matches.
        attempts = len(self.query)
        while index is None and attempts > 0:
            self.word = (self.word + 1) % len(self.query)
            self.philo_search()
            index = self.binary_search(doc_id)
            attempts -= 1
        # Compare against None explicitly: position 0 is a valid hit.
        if index is None:
            return None
        return self._read_excerpt(doc_id, index, highlight)

    def _read_excerpt(self, doc_id, index, highlight):
        """Read and clean the passage around hit ``index`` of the hitlist."""
        offsets = self.hitlist.get_bytes(self.hitlist[index])
        conc_start = max(offsets[0] - self.EXCERPT_LEAD, 0)
        text_path = (self.db_path + "/TEXT/"
                     + self.get_metadata(doc_id, 'filename'))
        with open(text_path) as text_file:
            text_file.seek(conc_start)
            text = text_file.read(self.EXCERPT_LENGTH)
        if highlight:
            for pattern in self.patterns:
                text = pattern.sub(
                    '\\1<span style="color: red">\\2</span>\\3', text)
        # Drop the partial tokens at both edges of the read window.
        text = self.cut_begin.sub('', text)
        text = self.cut_end.sub('', text)
        return text.replace('<s/>', '')

    def binary_search(self, doc_id, lo=0, hi=None):
        """Based on the Python bisect module.

        Return the position of a hit whose first element equals ``doc_id``
        within ``self.hitlist[lo:hi]``, or None if there is none.
        Assumes the hitlist is ordered by that first element and that it is
        comparable with ``doc_id`` -- TODO confirm the element type, since
        ``get_excerpt`` passes ``doc_id`` as a string.
        """
        if hi is None:
            hi = len(self.hitlist)
        while lo < hi:
            mid = (lo + hi) // 2
            midval = self.hitlist[mid][0]
            if midval < doc_id:
                lo = mid + 1
            elif midval > doc_id:
                hi = mid
            else:
                return mid
        return None