-
Notifications
You must be signed in to change notification settings - Fork 0
/
dompy.py
134 lines (97 loc) · 2.98 KB
/
dompy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
from lxml import html as lhtml
import urllib2
import sys
class Document:
    """Given an html page as a string, convert it to a document model.

    Aims to mimic the ``document`` variable in the browser.

    Attributes:
        body: the node tree built by ``collect_nodes`` from the parsed
            page's ``<body>`` element; most traversals start here.
        all: flat list of every node, as returned by ``parseTree(body)``.
        idMap: dict mapping an ``id`` attribute value to its node.
    """

    def __init__(self, html):
        """Parse ``html`` (a string) with lxml and build the node model."""
        lxml_tree = lhtml.fromstring(html)
        # The body node; in most cases, we start iterating from here.
        self.body = collect_nodes(lxml_tree.body)
        # A collection of ALL the nodes on the page, as one flat list.
        self.all = parseTree(self.body)
        self.idMap = self._build_hashMap()

    def _build_hashMap(self):
        """Return a dict of ``id`` -> node for every node in ``self.all``.

        Nodes without an id (``None`` or empty) are skipped.  When several
        nodes share an id, the one that comes first in ``self.all`` is kept,
        the way a browser's ``getElementById`` returns the first match
        instead of the last.
        """
        idMap = {}
        for node in self.all:
            if node.id:
                # setdefault keeps an already-registered node for this id.
                idMap.setdefault(node.id, node)
        return idMap

    def getElementById(self, id):
        """Return the node registered under ``id``, or ``None`` if absent.

        This is a plain dictionary lookup in ``self.idMap``.
        """
        return self.idMap.get(id)

    def getElementsByTagName(self, tagName):
        """Delegate to ``self.body.getElementsByTagName(tagName)``."""
        return self.body.getElementsByTagName(tagName)

    def getElementsByClassName(self, className):
        """Delegate to ``self.body.getElementsByClassName(className)``."""
        return self.body.getElementsByClassName(className)
class Node:
    """Implementation of javascript-style DOM element nodes.

    A node is built from an lxml-style element object ``elTag`` that must
    expose ``.tag``, ``.text``, ``.tail`` and ``.get(attribute_name)``.

    Attributes:
        tagName: the element's ``tag``.
        parentNode: the node this one was appended to, or ``None``.
        children: list of child nodes, in the order they were appended.
        id, className, title, href, value, name, action: the matching
            element attributes (``className`` comes from ``class``); each is
            whatever ``elTag.get`` returns for a missing attribute.
        innerText: the element's text, plus its tail text, plus the
            ``innerText`` of every appended child.  Stays ``None`` while
            there is no text at all.
    """

    def __init__(self, elTag, parentNode=None, children=None):
        """Copy tag name, common attributes and text out of ``elTag``.

        Args:
            elTag: the source element (see the class docstring).
            parentNode: optional parent node.
            children: optional list of child nodes.  The list is stored
                as-is (not copied); when omitted, a new empty list is used.
        """
        self.tagName = elTag.tag
        self.parentNode = parentNode
        # A fresh list per instance: a shared ``[]`` default would make every
        # node created without explicit children append into the same list.
        self.children = [] if children is None else children
        self.id = elTag.get("id")
        self.className = elTag.get("class")
        self.title = elTag.get('title')
        self.href = elTag.get('href')
        self.value = elTag.get('value')
        self.name = elTag.get('name')
        self.action = elTag.get('action')
        self.innerText = elTag.text
        if elTag.tail:
            # ``text`` is None when the element has no leading text, so
            # concatenate onto "" in that case instead of raising TypeError.
            self.innerText = (self.innerText or "") + elTag.tail

    def __repr__(self):
        """Return ``<dompy.Node:tag#id.class>``; id and class are optional."""
        ret = "<dompy.Node:{}".format(self.tagName)
        if self.id:
            ret += "#{}".format(self.id)
        if self.className:
            ret += ".{}".format(self.className)
        return ret + ">"

    def appendChild(self, child):
        """Append ``child`` to this node and absorb its text.

        Sets ``child.parentNode`` to this node and adds the child's
        ``innerText`` (if any) to the end of this node's ``innerText``.
        """
        self.children.append(child)
        child.parentNode = self
        if child.innerText:
            # This node may not have any text of its own yet (None).
            self.innerText = (self.innerText or "") + child.innerText

    def getElementsByTagName(self, tagName):
        """Return a flat list of this node and all descendants whose
        ``tagName`` equals ``tagName`` (depth-first, parents before
        children).  The list is empty when nothing matches."""
        ret = []
        if self.tagName == tagName:
            ret.append(self)
        for node in self.children:
            ret += node.getElementsByTagName(tagName)
        return ret

    def getElementsByClassName(self, className):
        """Find this node and/or descendants carrying the given class(es).

        ``className`` may hold several whitespace-separated class names; a
        node matches when its ``class`` attribute contains all of them as
        whole tokens (so ``"item"`` does not match ``class="items"``).

        Returns:
            ``None`` when nothing matches, the single matching node when
            there is exactly one match, otherwise a flat list of matching
            nodes (depth-first, parents before children).
        """
        matches = self._collect_by_class((className or "").split())
        if not matches:
            return None
        # If there is only one match, return it as a Node object rather
        # than a list of length 1.
        return matches[0] if len(matches) == 1 else matches

    def _collect_by_class(self, wanted):
        """Return a flat list of nodes in this subtree whose class tokens
        include every name in the list ``wanted``."""
        found = []
        own = self.className.split() if self.className else []
        # An empty ``wanted`` matches nothing rather than everything.
        if wanted and all(name in own for name in wanted):
            found.append(self)
        for node in self.children:
            found += node._collect_by_class(wanted)
        return found
def collect_nodes(tag):
    """
    Wrap ``tag`` in a Node and recursively attach a Node for each of the
    items produced by iterating over ``tag``.

    Every sub-item is fully converted (with its own descendants) before any
    of them is appended to the new node.  Returns the new node.
    """
    wrapped = Node(tag, children=[])
    converted = [collect_nodes(element) for element in tag]
    for sub_node in converted:
        wrapped.appendChild(sub_node)
    return wrapped
def parseTree(root):
    """Return ``root`` and all of its descendants as one flat list.

    Nodes appear depth-first with each parent before its children and
    siblings in their ``children`` order.  Every node must expose a
    ``children`` list.

    The walk uses an explicit stack instead of recursion, so it does not
    re-copy sub-lists at every level and is not bounded by the interpreter's
    recursion limit on deeply nested trees.
    """
    ret = []
    pending = [root]
    while pending:
        node = pending.pop()
        ret.append(node)
        # Push children reversed so the first child is popped (visited) next.
        pending.extend(reversed(node.children))
    return ret