/
protocol.jr
170 lines (143 loc) · 4.18 KB
/
protocol.jr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
module org.commoncrawl.protocol.shared {
// A single header entry stored as a key/value pair of strings.
// Used as the element type of the headerItems vectors declared in
// ArcFileItem and CacheItem.
class ArcFileHeaderItem {
// header name
ustring itemKey = 1;
// header value
ustring itemValue = 2;
}
// One document record associated with an ARC file: its URI, fetch details,
// header items, content bytes, and the record's location in the ARC file.
// NOTE(review): the per-field descriptions below are inferred from the field
// names; units and encodings are not stated in this file — confirm against
// the code that produces/consumes these records.
class ArcFileItem {
// URI of the document
ustring uri = 1;
// IP address of the host, held as a string
ustring hostIP = 2;
// record timestamp — TODO confirm epoch and units (seconds vs. milliseconds)
long timestamp =3;
// mime type of the content
ustring mimeType =4;
// length of the record — presumably in bytes; confirm
int recordLength =5;
// header items as key/value pairs
vector<ArcFileHeaderItem> headerItems =6;
// document content bytes
buffer content =7;
// name of the ARC file this item is associated with
ustring arcFileName = 8;
// position of this item within the ARC file — presumably a byte offset; confirm.
// NOTE(review): declared as int, so the representable range depends on the
// width the code generator gives `int` — verify for large ARC files.
int arcFilePos =9;
// values are distinct powers of two (1, 2), so presumably they are OR-ed
// together into the `flags` field below — confirm
enum Flags {
TruncatedInDownload = 1;
TruncatedInInflate = 2;
}
// presumably a combination of Flags values — confirm
int flags = 10;
// size associated with this item in the ARC file — TODO confirm whether this
// is the size of the item or of the whole ARC file
int arcFileSize = 11;
}
// Metadata information extracted from the meta tags of an HTML document:
// robots directives, http pragmas, an optional meta refresh target, and any
// remaining meta tags in a flattened text form.
class HTMLMetaTags {
// values are distinct powers of two, so presumably they are OR-ed together
// into the `robotsFlags` byte below — confirm
enum RobotsFlags {
ALL = 1;
NO_INDEX = 2;
NO_FOLLOW = 4;
NO_ARCHIVE = 8;
}
// robots meta tag (presumably a combination of RobotsFlags values)
byte robotsFlags = 1;
enum Pragmas {
NO_CACHE = 1;
}
// http pragmas (presumably a combination of Pragmas values)
byte pragmas = 2;
// target URL, if a meta refresh tag was present
ustring optionalRefreshURL = 3;
// refresh time from the meta refresh tag — TODO confirm units
vint optionalRefreshTime = 4;
// NOTE(review): field id 5 is not used in this class — presumably a retired
// field; confirm before reusing the id.
// other meta tags in name<LF>value<LF> format
ustring otherMetaTags = 6;
}
// Metadata describing a single document stored in an ARC file: where it lives
// in the file, where it was fetched from, content type/charset details, HTTP
// response and cache information, and text extracted from selected HTML tags
// (meta, title, base, anchors).
class ArcFileMetadata {
// optional arc file name identifying source arc file
ustring arcFileName = 1;
// url identifying this document
ustring docURL = 2;
// offset of this document within the arc file
vint arcFileOffset = 3;
// length of this document within the arc file
vint arcFileLength = 4;
// ip address of server document was retrieved from
// (stored as an int — presumably a packed IPv4 address; confirm)
int sourceIPAddress =5;
// content type (normalized)
ustring contentType = 6;
// content length (as specified by headers or computed)
vint contentLength = 7;
// charset if available
ustring charset = 8;
// signature of the arc file if present
buffer signature = 9;
// simhash (if text content type)
long textSimHashValue = 10;
// NOTE(review): field id 11 is not used in this class — presumably a retired
// field; confirm before reusing the id.
enum Flags {
IS_ValidTextType = 1;
}
// flags (presumably a combination of Flags values)
byte flags = 12;
// http result information
// values are distinct powers of two, so presumably they are OR-ed together
// into the `httpResponseFlags` byte below — confirm
enum HTTPResponseFlags{
HEADER_MISSING = 1;
VERSION_MISSING = 2;
VERSION_0_9 = 4;
VERSION_1_0 = 8;
VERSION_1_1 = 16;
ENCODED_IS_GZIP = 32;
}
// presumably a combination of HTTPResponseFlags values
byte httpResponseFlags = 13;
// http result (status) code
vint httpResultCode = 14;
// charset detected via byte stream detector
ustring detectedCharset = 15;
// mapping of detected charset to java charset name
ustring javaCharset = 16;
// meta tag info
HTMLMetaTags metaTags = 17;
// title tag text
ustring titleText = 18;
// NOTE(review): field id 19 is not used in this class — presumably a retired
// field; confirm before reusing the id.
// optional base tag ...
ustring baseURL = 20;
// anchors, as a stream of anchor + anchor text values separated by line feeds
ustring anchorTags = 21;
// cache information
// NOTE(review): the fields below appear to mirror HTTP caching headers
// (Date, Last-Modified, ETag, max-age, Expires, Age); the units of the
// time-valued fields are not stated here — confirm.
vlong httpDate = 22;
vlong lastModifiedTime = 23;
ustring eTag = 24;
vlong maxAge = 25;
vlong expires = 26;
vlong age = 27;
// values are distinct powers of two, so presumably they are OR-ed together
// into the `cacheControlFlags` byte below — confirm
enum CacheControlFlags {
NO_CACHE = 1;
NO_STORE = 2;
VARY = 4;
MUST_REVALIDATE= 8;
PRIVATE = 16;
}
// presumably a combination of CacheControlFlags values
byte cacheControlFlags = 28;
}
// URL fingerprint record made of a domain hash, a url hash and a root domain
// hash.
// NOTE(review): the [nodirty] class attribute and the [key] field attributes
// are interpreted by the schema compiler, which is not visible here.
// Presumably [key] marks domainHash and urlHash as the fields that make up the
// record's key, and [nodirty] turns off dirty-field tracking in the generated
// code — confirm against the compiler.
class URLFPV2 [nodirty] {
// hash of the url's domain (key field)
[key] vlong domainHash = 1;
// hash of the url (key field)
[key] vlong urlHash = 2;
// hash of the root domain (not marked as a key field)
vlong rootDomainHash = 3;
}
// cc-cache data structure: one cached document with its url, fingerprint,
// origin, status flags, redirect target, header items and content.
class CacheItem {
// document url
ustring url = 1;
// url fingerprint
long urlFingerprint = 2;
// where the document came from
enum Source {
WebRequest = 1;
S3Cache = 2;
}
// document source (presumably one of the Source values — confirm)
byte source = 3;
// flags
// values are distinct powers of two, so presumably they are OR-ed together
// into the `flags` field below — confirm
enum Flags {
Flag_IsTemporaryRedirect = 1;
Flag_IsPermanentRedirect = 2;
Flag_IsCompressed = 4;
Flag_WasTruncatedDuringDownload = 8;
Flag_WasTruncatedDuringInflate = 16;
}
// presumably a combination of Flags values
int flags = 4;
// if this was a redirect, the final url
ustring finalURL = 5;
// parsed header items ...
// (element type is referenced by its fully qualified name; it is the
// ArcFileHeaderItem class declared in this same module)
vector<org.commoncrawl.protocol.shared.ArcFileHeaderItem> headerItems =6;
// content (if available)
buffer content =7;
}
}