-
Notifications
You must be signed in to change notification settings - Fork 19
/
HTMLDocument.java
221 lines (196 loc) · 6.04 KB
/
HTMLDocument.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
package de.l3s.boilerpipe.sax;
import java.io.ByteArrayInputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.xml.sax.InputSource;
/**
 * An {@link InputSourceable} for {@link HTMLFetcher}.
 *
 * <p>Holds the raw bytes of an HTML document together with the {@link Charset} those bytes are
 * encoded in. Besides exposing the document as a SAX {@link InputSource}, it can rewrite
 * {@code <img ...>} tags and {@code &#...;} character references into plain-text markers
 * ({@code #img#...#/img#} and {@code #esc#...#/esc#}) and turn those markers back into markup.
 *
 * <p>All instance-level conversions decode and re-encode the bytes with the document's own
 * charset, never with the platform default.
 *
 * @author Christian Kohlschütter
 */
public class HTMLDocument implements InputSourceable {
    /** Matches an {@code <img ...>} tag; group 1 is the attribute text without a trailing slash. */
    private static final Pattern PAT_IMAGE_TAG = Pattern.compile("<img (.*?)[/]?>");

    /** Matches an {@code #img#...#/img#} marker; group 1 is the URL-encoded attribute text. */
    private static final Pattern PAT_ENCODED_IMAGE_TAG = Pattern.compile("#img#(.*?)#/img#");

    /** Matches an {@code &#...;} character reference; group 1 is the text between the delimiters. */
    private static final Pattern PAT_ESC_CHAR = Pattern.compile("&#(.*?);");

    /** Matches an {@code #esc#...#/esc#} marker; group 1 is the URL-encoded reference body. */
    private static final Pattern PAT_ENCODED_ESC_CHAR = Pattern.compile("#esc#(.*?)#/esc#");

    private final Charset charset;
    private byte[] data;

    /**
     * Creates a document from already encoded bytes.
     *
     * @param data the raw document bytes (stored as-is, not copied)
     * @param charset the charset {@code data} is encoded in
     */
    public HTMLDocument(final byte[] data, final Charset charset) {
        this.data = data;
        this.charset = charset;
    }

    /**
     * Creates a document from a string; the bytes are stored as UTF-8.
     *
     * @param data the document text
     */
    public HTMLDocument(final String data) {
        final Charset cs = Charset.forName("utf-8");
        this.data = data.getBytes(cs);
        this.charset = cs;
    }

    /**
     * @return the charset the document bytes are encoded in
     */
    public Charset getCharset() {
        return charset;
    }

    /**
     * @return the document bytes (the internal array, not a copy)
     */
    public byte[] getData() {
        return data;
    }

    /**
     * Wraps the document bytes in a SAX {@link InputSource} whose encoding is set to the name of
     * this document's charset.
     *
     * @return a new input source reading from the current bytes
     */
    public InputSource toInputSource() {
        final InputSource is = new InputSource(new ByteArrayInputStream(data));
        is.setEncoding(charset.name());
        return is;
    }

    /**
     * Rewrites this document in place, encoding {@code <img >} tags as
     * {@code #img#<attributes>#/img#}.
     *
     * @see #encodeImageTagsAsText(String, String)
     */
    public void encodeImageTagsAsText() {
        this.data = encodeImageTagsAsText(decodedText(), charset.name()).getBytes(charset);
    }

    /**
     * Encodes {@code <img >} tags as {@code #img#<attributes>#/img#}.
     *
     * <p>The attribute text is URL-encoded. Only the first occurrence of each distinct encoded
     * tag is kept; later duplicates are removed from the text. The input is processed in a single
     * left-to-right pass, so inserted markers are never scanned again.
     *
     * @param htmlDataString the HTML text to process
     * @param encoding charset name used for URL-encoding; if it is not supported, the platform
     *     default is used instead
     * @return the text with image tags replaced by markers
     */
    public static String encodeImageTagsAsText(String htmlDataString, String encoding) {
        final Matcher matcher = PAT_IMAGE_TAG.matcher(htmlDataString);
        final StringBuffer out = new StringBuffer(htmlDataString.length());
        final Set<String> seenMarkers = new HashSet<String>();
        while (matcher.find()) {
            final String marker = "#img#" + urlEncode(matcher.group(1), encoding) + "#/img#";
            // Set.add reports false for a marker we have already emitted: drop the duplicate.
            final String replacement = seenMarkers.add(marker) ? marker : "";
            matcher.appendReplacement(out, Matcher.quoteReplacement(replacement));
        }
        matcher.appendTail(out);
        return out.toString();
    }

    /**
     * Rewrites this document in place, decoding {@code #img#<attributes>#/img#} markers back to
     * {@code <img >} tags.
     *
     * @see #restoreTextEncodedImageTags(String, String)
     */
    public void restoreTextEncodedImageTags() {
        this.data = restoreTextEncodedImageTags(decodedText(), charset.name()).getBytes(charset);
    }

    /**
     * Decodes {@code #img#<attributes>#/img#} markers as {@code <img >} tags.
     *
     * <p>The decoded attributes are inserted literally ({@code $} and {@code \} have no special
     * meaning) and are not scanned again, so attributes that themselves contain marker-like text
     * are restored exactly once.
     *
     * @param htmlDataString the text containing markers
     * @param encoding charset name used for URL-decoding; if it is not supported, the platform
     *     default is used instead
     * @return the text with markers replaced by {@code <img ...>} tags
     */
    public static String restoreTextEncodedImageTags(String htmlDataString, String encoding) {
        final Matcher matcher = PAT_ENCODED_IMAGE_TAG.matcher(htmlDataString);
        final StringBuffer out = new StringBuffer(htmlDataString.length());
        while (matcher.find()) {
            final String tag = "<img " + urlDecode(matcher.group(1), encoding) + ">";
            // Decoded attributes may contain '$' or '\\', which must not be read as group refs.
            matcher.appendReplacement(out, Matcher.quoteReplacement(tag));
        }
        matcher.appendTail(out);
        return out.toString();
    }

    /**
     * Rewrites this document in place, encoding {@code &#xxxx;} escaped chars as
     * {@code #esc#xxx#/esc#}.
     *
     * @see #encodeEscapedCharsAsText(String, String)
     */
    public void encodeEscapedCharsAsText() {
        this.data = encodeEscapedCharsAsText(decodedText(), charset.name()).getBytes(charset);
    }

    /**
     * Encodes {@code &#xxxx;} escaped chars as {@code #esc#xxx#/esc#}.
     *
     * <p>The text between {@code &#} and {@code ;} is URL-encoded. The input is processed in a
     * single left-to-right pass, so an inserted marker can never combine with a preceding
     * {@code &} to form a new match.
     *
     * @param htmlDataString the HTML text to process
     * @param encoding charset name used for URL-encoding; if it is not supported, the platform
     *     default is used instead
     * @return the text with character references replaced by markers
     */
    public static String encodeEscapedCharsAsText(String htmlDataString, String encoding) {
        final Matcher matcher = PAT_ESC_CHAR.matcher(htmlDataString);
        final StringBuffer out = new StringBuffer(htmlDataString.length());
        while (matcher.find()) {
            final String marker = "#esc#" + urlEncode(matcher.group(1), encoding) + "#/esc#";
            matcher.appendReplacement(out, Matcher.quoteReplacement(marker));
        }
        matcher.appendTail(out);
        return out.toString();
    }

    /**
     * Rewrites this document in place, decoding {@code #esc#xxx#/esc#} markers back to
     * {@code &#xxxx;} escaped chars.
     *
     * @see #restoreTextEncodedEscapedChars(String, String)
     */
    public void restoreTextEncodedEscapedChars() {
        this.data = restoreTextEncodedEscapedChars(decodedText(), charset.name()).getBytes(charset);
    }

    /**
     * Decodes {@code #esc#xxx#/esc#} markers as {@code &#xxxx;} escaped chars.
     *
     * <p>The decoded text is inserted literally and is not scanned again.
     *
     * @param htmlDataString the text containing markers
     * @param encoding charset name used for URL-decoding; if it is not supported, the platform
     *     default is used instead
     * @return the text with markers replaced by {@code &#...;} character references
     */
    public static String restoreTextEncodedEscapedChars(String htmlDataString, String encoding) {
        final Matcher matcher = PAT_ENCODED_ESC_CHAR.matcher(htmlDataString);
        final StringBuffer out = new StringBuffer(htmlDataString.length());
        while (matcher.find()) {
            final String reference = "&#" + urlDecode(matcher.group(1), encoding) + ";";
            matcher.appendReplacement(out, Matcher.quoteReplacement(reference));
        }
        matcher.appendTail(out);
        return out.toString();
    }

    /** Decodes the current bytes into text using this document's own charset. */
    private String decodedText() {
        return new String(data, charset);
    }

    /**
     * URL-encodes {@code value} with the named charset, falling back to the platform default
     * (after printing the stack trace) when the charset name is not supported.
     */
    @SuppressWarnings("deprecation")
    private static String urlEncode(String value, String encoding) {
        try {
            return URLEncoder.encode(value, encoding);
        } catch (UnsupportedEncodingException e) {
            // Best effort: report the bad charset name but keep processing the document.
            e.printStackTrace();
            return URLEncoder.encode(value);
        }
    }

    /**
     * URL-decodes {@code value} with the named charset, falling back to the platform default
     * (after printing the stack trace) when the charset name is not supported.
     */
    @SuppressWarnings("deprecation")
    private static String urlDecode(String value, String encoding) {
        try {
            return URLDecoder.decode(value, encoding);
        } catch (UnsupportedEncodingException e) {
            // Best effort: report the bad charset name but keep processing the document.
            e.printStackTrace();
            return URLDecoder.decode(value);
        }
    }
}