/
utils.py
95 lines (85 loc) · 3.13 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from xml.etree import ElementTree as ET
import base64
import datetime
import hashlib
def find_all_tags(fp, tags, progress_callback=None):
    """Stream XML from ``fp`` and yield every completed element named in ``tags``.

    The input is read in 1 MiB chunks and fed to an ``XMLPullParser`` that
    reports ``start`` and ``end`` events.  Whenever an ``end`` event arrives
    for an element whose tag is in ``tags``, a ``(tag, element)`` pair is
    yielded.

    The first element the parser starts is remembered as the document root
    and is cleared after every event, which drops the root's references to
    children attached so far (presumably to limit memory use on large
    files).

    Args:
        fp: Object with a ``read(size)`` method returning XML data; a falsy
            return value ends the stream.
        tags: Container of tag names to report (tested with ``in``).
        progress_callback: Optional callable invoked with ``len(chunk)``
            once all events of a chunk have been handled.

    Yields:
        ``(tag, element)`` tuples in document order.
    """
    pull_parser = ET.XMLPullParser(("start", "end"))
    block_size = 1024 * 1024
    document_root = None

    data = fp.read(block_size)
    while data:
        pull_parser.feed(data)
        for kind, element in pull_parser.read_events():
            # The very first "start" event belongs to the document root.
            if document_root is None and kind == "start":
                document_root = element
            if kind == "end" and element.tag in tags:
                yield element.tag, element
            # Detach whatever has been hung on the root so far.
            document_root.clear()
        if progress_callback is not None:
            progress_callback(len(data))
        data = fp.read(block_size)
def save_note(db, note):
    """Store one ``<note>`` element in ``db["notes"]`` and link its resources.

    The row holds ``title``, ``content`` (the note body re-serialised through
    ElementTree), ``created`` and ``updated`` (both passed through
    ``convert_datetime``), plus one column per child of ``<note-attributes>``
    when that element is present.  The row is inserted with ``hash_id="id"``,
    ``replace=True`` and ``alter=True``.

    Every ``<resource>`` child is saved with ``save_resource`` and linked to
    the note through ``db["note_resources"]``.  Resources for which
    ``save_resource`` returns ``None`` (no data) are skipped, so no link row
    with a NULL ``resource_id`` is written.

    Args:
        db: Database object indexable by table name; each table must offer
            the ``insert()`` keyword arguments used below (sqlite-utils
            style).
        note: ElementTree element for a single note.

    Raises:
        AttributeError: If ``<title>``, ``<created>``, ``<updated>`` or
            ``<content>`` is missing, or ``<content>`` has no text.
        xml.etree.ElementTree.ParseError: If the note content is still not
            well-formed XML after the ``&nbsp;`` clean-up.
    """
    title = note.find("title").text
    created = note.find("created").text
    updated = note.find("updated").text
    # Some content has &nbsp; which breaks the XML parser: XML does not
    # predefine that entity, so remove it before parsing.
    content_xml = note.find("content").text.replace("&nbsp;", "")
    content = ET.tostring(ET.fromstring(content_xml)).decode("utf-8")
    row = {
        "title": title,
        "content": content,
        "created": convert_datetime(created),
        "updated": convert_datetime(updated),
    }
    attributes = note.find("note-attributes")
    if attributes is not None:
        row.update({attribute.tag: attribute.text for attribute in attributes})
    note_id = db["notes"].insert(row, hash_id="id", replace=True, alter=True).last_pk
    # Now do the resources
    for resource in note.findall("resource"):
        resource_id = save_resource(db, resource)
        if resource_id is None:
            # Nothing was stored for this resource (it had no data), so
            # there is no row to reference from the link table.
            continue
        db["note_resources"].insert(
            {
                "note_id": note_id,
                "resource_id": resource_id,
            },
            pk=("note_id", "resource_id"),
            foreign_keys=("note_id", "resource_id"),
            replace=True,
        )
def save_resource(db, resource):
    """Store one ``<resource>`` element and return the MD5 of its data.

    The base64 payload of ``<data>`` is decoded and hashed.  Metadata goes to
    ``db["resources"]``, keyed by ``md5``: the ``mime``, ``width``,
    ``height`` and ``duration`` child texts (``None`` when absent), one
    column per child of ``<resource-attributes>``, and ``ocr``.  The raw
    bytes go to ``db["resources_data"]``.

    ``ocr`` is the space-joined text of every ``<t>`` element found in the
    XML document carried as the text of ``<recognition>``.  It is ``None``
    when ``<recognition>`` is missing or empty; ``<t>`` elements without
    text are ignored.

    Args:
        db: Database object indexable by table name; each table must offer
            the ``insert()`` keyword arguments used below (sqlite-utils
            style).
        resource: ElementTree element for a single resource.

    Returns:
        The hex MD5 digest of the decoded data, or ``None`` if ``<data>``
        has no text (nothing is written in that case).

    Raises:
        AssertionError: If the ``encoding`` attribute of ``<data>`` is not
            ``"base64"``.
        KeyError: If ``<data>`` has no ``encoding`` attribute.
        AttributeError: If there is no ``<data>`` element.
    """
    data_el = resource.find("data")
    encoding = data_el.attrib["encoding"]
    if encoding != "base64":
        # Raised explicitly (instead of using ``assert``) so the check still
        # runs under ``python -O``; the exception type is unchanged.
        raise AssertionError(
            "unsupported resource data encoding: {!r}".format(encoding)
        )
    if data_el.text is None:
        return None
    data = base64.b64decode(data_el.text)
    md5 = hashlib.md5(data).hexdigest()
    row = {
        "md5": md5,
    }
    for tag in ("mime", "width", "height", "duration"):
        child = resource.find(tag)
        row[tag] = child.text if child is not None else None
    attributes = resource.find("resource-attributes")
    if attributes is not None:
        row.update({attribute.tag: attribute.text for attribute in attributes})
    ocr = None
    recognition = resource.find("recognition")
    # An empty <recognition/> has no document to parse; treat it like a
    # missing element rather than crashing in ET.fromstring().
    if recognition is not None and recognition.text and recognition.text.strip():
        reco_root = ET.fromstring(recognition.text)
        # Skip <t> elements without text so str.join() never sees None.
        ocr = " ".join(t.text for t in reco_root.findall(".//t") if t.text)
    row["ocr"] = ocr
    db["resources"].insert(row, pk="md5", alter=True, replace=True)
    db["resources_data"].insert({"md5": md5, "data": data}, pk="md5", replace=True)
    return md5
def ensure_indexes(db):
    """Create the indexes and full-text search setup for the ``notes`` table.

    An index is requested on ``created`` and on ``updated`` (with
    ``if_not_exists=True``).  Afterwards, unless a ``notes_fts`` table
    already exists, full-text search over ``title`` and ``content`` is
    enabled with ``create_triggers=True``.

    Args:
        db: Database object indexable by table name (sqlite-utils style).
    """
    notes_table = db["notes"]
    notes_table.create_index(["created"], if_not_exists=True)
    notes_table.create_index(["updated"], if_not_exists=True)

    # FTS is already configured once the shadow table is there.
    if db["notes_fts"].exists():
        return
    notes_table.enable_fts(["title", "content"], create_triggers=True)
def convert_datetime(s):
    """Turn a compact ``YYYYMMDDTHHMMSSZ`` timestamp into an ISO 8601 string.

    The trailing ``Z`` is matched as a literal character, so the result
    carries no UTC offset, e.g. ``"20201011T123456Z"`` becomes
    ``"2020-10-11T12:34:56"``.

    Args:
        s: Timestamp string in the format ``%Y%m%dT%H%M%SZ``.

    Returns:
        The timestamp formatted by ``datetime.isoformat()``.

    Raises:
        ValueError: If ``s`` does not match the expected format.
    """
    compact_format = "%Y%m%dT%H%M%SZ"
    parsed = datetime.datetime.strptime(s, compact_format)
    return parsed.isoformat()