/
PageCleaner.py
56 lines (46 loc) · 1.58 KB
/
PageCleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# CTK: Cherokee Toolkit
#
# Authors:
# Alvaro Lopez Ortega <alvaro@alobbs.com>
#
# Copyright (C) 2009 Alvaro Lopez Ortega
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of version 2 of the GNU General Public
# License as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
#
# Sentinel strings delimiting a chunk of text that may occur more than once
# in a page. Uniq_Block() wraps a chunk in this pair; _remove_dupped_code()
# later searches for the pair, strips it, and drops repeated chunks.
PAGE_CLEAN_DUP_BEGIN = "\n___MAY_BE_DUPPED_BEGIN___\n"
PAGE_CLEAN_DUP_END = "\n___MAY_BE_DUPPED_END___\n"
def Uniq_Block(txt):
    """Return txt wrapped between the duplicate-block markers.

    The result is PAGE_CLEAN_DUP_BEGIN, followed by txt, followed by
    PAGE_CLEAN_DUP_END, so that the wrapped chunk can be recognised (and
    de-duplicated) when the page text is post-processed.
    """
    return PAGE_CLEAN_DUP_BEGIN + txt + PAGE_CLEAN_DUP_END
def Postprocess(txt):
    """Return txt after running the page clean-up pass over it.

    The only clean-up step performed here is _remove_dupped_code(), which
    strips the duplicate-block markers and drops repeated marked chunks.
    """
    cleaned = _remove_dupped_code(txt)
    return cleaned
def _remove_dupped_code(txt):
    """Strip the duplicate-block markers from txt, dropping repeated chunks.

    Each span delimited by PAGE_CLEAN_DUP_BEGIN and PAGE_CLEAN_DUP_END is
    unwrapped. The first time a given chunk content is seen it is kept
    (without its markers); every later chunk with identical content is
    removed together with its markers.

    Returns:
        The text once no PAGE_CLEAN_DUP_BEGIN marker remains in it.

    Raises:
        AssertionError: if a begin marker has no end marker after it.
    """
    seen = set()
    begin_len = len(PAGE_CLEAN_DUP_BEGIN)
    end_len = len(PAGE_CLEAN_DUP_END)

    # The text is rescanned from the start on every pass so that markers
    # contained in a chunk that was just unwrapped (nested blocks) are
    # processed as well.
    while True:
        # Find begin and end
        n1 = txt.find(PAGE_CLEAN_DUP_BEGIN)
        if n1 == -1:
            return txt

        # Search for the end marker from the begin marker onwards only.
        # Pairing the begin marker with an end marker located before it
        # would make the slices below wrap around, growing the text on
        # every pass and never terminating.
        n2 = txt.find(PAGE_CLEAN_DUP_END, n1)
        if n2 == -1:
            # Raised explicitly (rather than via `assert`) so the check is
            # still performed when Python runs with -O.
            raise AssertionError(
                "PAGE_CLEAN_DUP_BEGIN marker without a matching "
                "PAGE_CLEAN_DUP_END marker")

        # Remove tags
        maybe_dupped = txt[n1 + begin_len:n2]
        head = txt[:n1]
        tail = txt[n2 + end_len:]

        if maybe_dupped in seen:
            # Already emitted once: drop the chunk along with its markers.
            txt = head + tail
        else:
            txt = head + maybe_dupped + tail
            seen.add(maybe_dupped)