# convert2text.py
# Convert the HTML files found under SRC_DIR into plain-text files under DST_DIR.
import html2text
import string
import os
# Root directory that is walked recursively for input files.
SRC_DIR = 'data'
# Root directory for the converted text files; the sub-directory layout
# found under SRC_DIR is recreated beneath it.
DST_DIR = 'text'
def parseHTML(filename):
    """Return the second '##' section of an HTML file as plain text.

    The file is read, every character outside ``string.printable`` is
    dropped, and the remainder is converted with ``html2text.html2text``.
    The converted text is then scanned line by line:

    * each line starting with ``##`` bumps a heading counter;
    * lines are collected while exactly two headings have been seen, so
      the result starts with the second heading line itself;
    * scanning stops at a "Principals only." list item (not included) or
      at a line starting with ``post id:`` (included when it falls inside
      the second section).

    Args:
        filename: Path of the HTML file to convert.

    Returns:
        The collected lines joined with newlines; an empty string when the
        converted text contains fewer than two ``##`` lines.
    """
    # Read raw bytes and discard anything non-ASCII while decoding. Such
    # bytes would be removed by the printable filter anyway, and this keeps
    # the read from failing on files in an unexpected encoding.
    with open(filename, 'rb') as source:
        html = source.read().decode('ascii', 'ignore')

    # Build the string explicitly: filter() returns an iterator on Python 3.
    printable = set(string.printable)
    html = ''.join(ch for ch in html if ch in printable)
    txt = html2text.html2text(html)

    results = []
    heading_count = 0
    for line in txt.split('\n'):
        if line.startswith('##'):
            heading_count += 1
        # NOTE(review): marker literal kept exactly as found; confirm its
        # leading whitespace matches what html2text emits for list items.
        if line.startswith(" * Principals only."):
            break
        if heading_count == 2:
            results.append(line)
        # Checked after the append so the "post id:" line itself is kept.
        if line.startswith("post id:"):
            break
    return '\n'.join(results)
# Walk SRC_DIR and write a converted .txt file for every file found, into
# the matching directory under DST_DIR. Files whose target already exists
# are skipped, so the script can be re-run to pick up only new inputs.
_printable_chars = set(string.printable)
for src_dir, _subdirs, file_names in os.walk(SRC_DIR):
    # Use the path relative to SRC_DIR. str.lstrip() strips a *set of
    # characters*, not a prefix, so it can eat the start of a sub-directory
    # name when SRC_DIR contains a path separator.
    dst_dir = os.path.normpath(
        os.path.join(DST_DIR, os.path.relpath(src_dir, SRC_DIR)))
    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)
    print('Found dir %s - %s' % (src_dir, dst_dir))
    for src_name in file_names:
        # splitext() handles any extension length; slicing off four
        # characters only worked for names ending in "html".
        dst_name = os.path.splitext(src_name)[0] + ".txt"
        dst_full = os.path.join(dst_dir, dst_name)
        if os.path.exists(dst_full):
            print("Already have " + dst_full)
            continue
        print("Converting " + dst_full)
        txt = parseHTML(os.path.join(src_dir, src_name))
        # html2text may emit non-printable characters of its own, so filter
        # again; join() keeps this a string on Python 3 as well.
        txt = ''.join(ch for ch in txt if ch in _printable_chars)
        # Mode 'w' already truncates; the context manager closes the file
        # even if the write fails.
        with open(dst_full, 'w') as target:
            target.write(txt)
# Example: convert a single file and print the result.
# x = parseHTML('data/atlanta/sof/5180401068.html')
# print(x)