-
Notifications
You must be signed in to change notification settings - Fork 9
/
summarize.py
51 lines (40 loc) · 1.29 KB
/
summarize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# Produce summaries of laws from their titles using the
# TextRank algorithm provided by gensim.
from gensim.summarization import summarize as summarize_textrank
import codifier
import multiprocessing
import database
import string
import logging
# Module-level database handle, created once when this module is imported;
# job() writes its results to db.summaries.
db = database.Database()
# Filtering Heuristic: titles with more than this many whitespace-separated
# words are left out of the text handed to the summarizer.
MAX_TITLE_WORDS = 20
# Summarization job
def job(identifier):
    """Summarize one law's titles and store the result in the database.

    The law is looked up in ``codifier.codifier.laws``. Each of its titles is
    stripped of surrounding whitespace and of punctuation; titles with more
    than ``MAX_TITLE_WORDS`` words, and titles that are blank after cleaning,
    are dropped. The remaining titles are joined with ``'. '`` and passed to
    the TextRank summarizer with ``ratio=0.1``.

    A document ``{'_id': identifier, 'summary': ...}`` is then saved to
    ``db.summaries``. If the lookup or the summarization fails, the failure
    is logged as a warning and an empty summary is stored instead.

    Args:
        identifier: Key of the law in ``codifier.codifier.laws``.
    """
    punct = str.maketrans('', '', string.punctuation)
    # Default value so the document below can always be built. Previously
    # ``summary`` was unbound whenever the ``try`` body raised, which turned
    # every logged failure into an UnboundLocalError.
    summary = ''
    try:
        law = codifier.codifier.laws[identifier]
        cleaned = (
            title.strip().translate(punct) for title in law.titles.values()
        )
        titles = [
            title for title in cleaned
            if len(title.split()) <= MAX_TITLE_WORDS and title.strip()
        ]
        summary = summarize_textrank('. '.join(titles), ratio=0.1)
    except Exception as e:
        # Best effort: one bad law must not abort the whole pool.map run.
        # Only Exception is caught so KeyboardInterrupt/SystemExit still
        # propagate out of the worker.
        logging.warning('%s: %s', identifier, e)
    db.summaries.save({
        '_id': identifier,
        'summary': summary,
    })
# Summarize
def summarize(identifiers=None):
    """Run ``job`` for each law identifier across a pool of worker processes.

    Args:
        identifiers: Law identifiers to summarize. When omitted (or given as
            an empty list), every key of ``codifier.codifier.laws`` is used.
    """
    # ``== []`` is kept alongside the ``None`` check so callers that relied
    # on the old ``identifiers=[]`` default behave exactly as before.
    if identifiers is None or identifiers == []:
        identifiers = list(codifier.codifier.laws.keys())
    print(identifiers)
    # Leave one core free, but never request zero workers: Pool raises
    # ValueError for a process count below 1 (e.g. on a single-core machine).
    workers = max(1, multiprocessing.cpu_count() - 1)
    # map() blocks until every job has finished; leaving the ``with`` block
    # then shuts the worker processes down instead of leaking them.
    with multiprocessing.Pool(workers) as pool:
        pool.map(job, identifiers)
# Script entry point: when run directly, summarize every law known to codifier.
if __name__ == '__main__':
    summarize()