This repository has been archived by the owner on Jan 10, 2023. It is now read-only.
/
document.h
493 lines (374 loc) · 14.9 KB
/
document.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SLING_NLP_DOCUMENT_DOCUMENT_H_
#define SLING_NLP_DOCUMENT_DOCUMENT_H_
#include <string>
#include <vector>
#include <unordered_map>
#include "sling/base/types.h"
#include "sling/frame/object.h"
#include "sling/frame/store.h"
#include "sling/nlp/document/token-properties.h"
#include "sling/string/text.h"
#include "sling/util/unicode.h"
namespace sling {
namespace nlp {
class Span;
class Document;
// Symbol names for documents.
struct DocumentNames : public SharedNames {
  // Binds the names declared below to the given store. The CHECK fails if
  // Bind() does not report success.
  DocumentNames(Store *store) {
    CHECK(Bind(store));
  }

  // Each Name is constructed with this name set and its symbol string.
  Name n_document{*this, "document"};
  Name n_title{*this, "title"};
  Name n_url{*this, "url"};
  Name n_text{*this, "text"};
  Name n_tokens{*this, "tokens"};
  Name n_mention{*this, "mention"};
  Name n_theme{*this, "theme"};

  Name n_index{*this, "index"};
  Name n_start{*this, "start"};
  Name n_size{*this, "size"};
  Name n_word{*this, "word"};
  Name n_break{*this, "break"};
  Name n_style{*this, "style"};

  Name n_begin{*this, "begin"};
  Name n_length{*this, "length"};
  Name n_evokes{*this, "evokes"};
};
// A token represents a range of characters in the document text. A token is a
// word or any other kind of lexical unit like punctuation, number, etc.
class Token {
public:
// Document that the token belongs to.
Document *document() const { return document_; }
// Handle for token in the store.
Handle handle() const { return handle_; }
// Index of token in document.
int index() const { return index_; }
// Text span for token in document text The [begin;end[ is a semi-open byte
// range of the UTF-8 encoded token in the document text, where begin is the
// index of the first byte of the token and end is the first byte after the
// token.
int begin() const { return begin_; }
int end() const { return end_; }
int size() const { return end_ - begin_; }
// Token word.
const string &word() const { return word_; }
// Break level before token.
BreakType brk() const { return brk_; }
// Token style change before token.
int style() const { return style_; }
// Lowest span covering the token.
Span *span() const { return span_; }
// Token fingerprint.
uint64 Fingerprint() const;
// Token case form.
CaseForm Form() const;
// Punctuation tokens etc. are skipped in phrase comparison.
bool skipped() const { return Fingerprint() == 1; }
// Check for initial token in a sentence.
bool initial() const { return index_ == 0 || brk_ >= SENTENCE_BREAK; }
private:
Document *document_; // document the token belongs to
Handle handle_; // handle for token in the store
int index_; // index of token in document
int begin_; // first byte position of token
int end_; // first byte position after token
string word_; // token word
BreakType brk_; // break level before token
int style_; // token style change before token
mutable uint64 fingerprint_; // fingerprint for token text
mutable CaseForm form_; // case form for token
Span *span_; // lowest span covering the token
friend class Document;
};
// A span represents a range of tokens in the document. The token span is
// represented as a mention frame which can record features of the mention as
// well as other frames that are evoked by this mention.
class Span {
 public:
  // Creates a span over the tokens [begin;end[ in the document.
  Span(Document *document, int begin, int end)
      : document_(document), begin_(begin), end_(end) {}

  // Returns the document that the span belongs to.
  Document *document() const { return document_; }

  // Returns the begin and end token. This is a half-open interval, so the
  // span covers the tokens in the range [begin;end[.
  int begin() const { return begin_; }
  int end() const { return end_; }

  // Returns the length of the span in number of tokens.
  int length() const { return end_ - begin_; }

  // Returns text for span.
  string GetText() const;

  // Returns true if this span contains the other span, i.e. the other span
  // starts no earlier and ends no later than this one.
  bool Contains(Span *other) const {
    return other->begin_ >= begin_ && other->end_ <= end_;
  }

  // Returns true if this span contains the token with the given index.
  bool Contains(int token) const {
    return token >= begin_ && token < end_;
  }

  // Returns true if this span is contained by the other span.
  bool ContainedBy(Span *other) const {
    return other->begin_ <= begin_ && other->end_ >= end_;
  }

  // Returns the mention frame for the token span.
  const Frame &mention() const { return mention_; }

  // Returns true if the span has been deleted from the document. This is the
  // case when the mention frame is invalid.
  bool deleted() const { return mention_.invalid(); }

  // Returns the enclosing parent span, or null if it is a top-level span.
  Span *parent() const { return parent_; }

  // Returns the left-most enclosed child span.
  Span *children() const { return children_; }

  // Returns the sibling span to the right. All the child spans of a parent are
  // linked together left-to-right through the sibling pointers.
  Span *sibling() const { return sibling_; }

  // Returns the outer-most containing span, found by following the parent
  // links until a span without a parent is reached. Returns this span itself
  // if it has no parent.
  Span *outer() {
    Span *top = this;
    for (Span *up = parent_; up != nullptr; up = up->parent_) top = up;
    return top;
  }

  // Adds frame evocation to span.
  void Evoke(const Frame &frame);
  void Evoke(Handle frame);

  // Replaces evoked frame.
  void Replace(Handle existing, Handle replacement);
  void Replace(const Frame &existing, const Frame &replacement) {
    Replace(existing.handle(), replacement.handle());
  }

  // Returns (the first) evoked frame of a certain type.
  Frame Evoked(Handle type) const;
  Frame Evoked(const Name &type) const;

  // Returns the first evoked frame.
  Frame Evoked() const;
  Handle evoked() const;

  // Returns all evoked frames.
  void AllEvoked(Handles *evoked) const;

  // Checks if span evokes a certain frame.
  bool Evokes(Handle frame) const;
  bool Evokes(const Frame &frame) const {
    return Evokes(frame.handle());
  }

  // Checks if span evokes a certain type of frame.
  bool EvokesType(Handle type) const;
  bool EvokesType(const Name &type) const;

  // Returns fingerprint for span phrase.
  uint64 Fingerprint() const;

  // Returns case form for span phrase.
  CaseForm Form() const;

  // Returns first/last token in span. These are defined after the Document
  // class because they look up tokens through the document.
  inline const Token &first() const;
  inline const Token &last() const;

  // Checks for initial span in a sentence, i.e. whether the first token of the
  // span is an initial token.
  bool initial() const { return first().initial(); }

 private:
  // Document that span belongs to.
  Document *document_;

  // Tokens covered by span. The span covers tokens in the interval [begin;end[,
  // i.e. begin is inclusive and end is exclusive.
  int begin_;
  int end_;

  // Mention frame for span.
  Frame mention_;

  // Span indexing.
  Span *parent_ = nullptr;    // enclosing parent span
  Span *sibling_ = nullptr;   // first sibling to the right enclosed by parent
  Span *children_ = nullptr;  // left-most enclosed sub-span

  // Span fingerprint and case form. This is lazily initialized and cached.
  mutable uint64 fingerprint_ = 0;
  mutable CaseForm form_ = CASE_INVALID;

  friend class Document;
};
// A document wraps a frame that contains the token, span, and frame
// annotations for the document.
class Document {
 public:
  // Create empty document.
  explicit Document(Store *store, const DocumentNames *names = nullptr);

  // Initialize document from frame.
  explicit Document(const Frame &top, const DocumentNames *names = nullptr);

  // Copy constructor for making a shallow copy of the whole document.
  Document(const Document &other, bool annotations);
  Document(const Document &other) : Document(other, true) {}

  // Make a shallow copy of parts of the document. Only annotations within the
  // token range are copied.
  Document(const Document &other, int begin, int end, bool annotations);

  // NOTE(review): a copy constructor and a destructor are declared, but no
  // copy assignment operator. Confirm whether assignment should be deleted or
  // implemented, given that the document holds raw Span and Slots pointers.
  ~Document();

  // Return document frame.
  const Frame &top() const { return top_; }

  // Return store for document.
  Store *store() const { return top_.store(); }

  // Update the document frame.
  void Update();

  // Return the document text.
  const string &text() const { return text_; }

  // Return document title.
  Text title() const { return top_.GetText(names_->n_title); }

  // Return document url.
  Text url() const { return top_.GetText(names_->n_url); }

  // Set document text. This will delete all existing tokens.
  void SetText(Handle text);
  void SetText(const String &text) { SetText(text.handle()); }
  void SetText(Text text);

  // Add token to document.
  // NOTE(review): begin and end default to -1, presumably meaning "not
  // specified" -- confirm against the implementation.
  void AddToken(Text word,
                int begin = -1, int end = -1,
                BreakType brk = SPACE_BREAK,
                int style = 0);

  // Returns the smallest enclosing span for [begin, end). If no such span
  // exists, then returns nullptr. If a crossing span exists, then returns
  // nullptr and sets 'crossing' to true.
  Span *EnclosingSpan(int begin, int end, bool *crossing);

  // Add new span to the document. The span is initialized with a mention frame
  // for the span. If the span already exists, the type is added to the mention
  // and the existing span is returned. Spans can be nested but are not allowed
  // to cross, in which case null is returned.
  Span *AddSpan(int begin, int end, Handle type);
  Span *AddSpan(int begin, int end, const Name &type) {
    return AddSpan(begin, end, type.Lookup(store()));
  }
  Span *AddSpan(int begin, int end) {
    return AddSpan(begin, end, Handle::nil());
  }

  // Deletes span from the document.
  void DeleteSpan(Span *span);

  // Returns the number of spans in the document.
  int num_spans() const { return static_cast<int>(spans_.size()); }

  // Return span in document.
  Span *span(int index) const { return spans_[index]; }

  // Return all spans in document. The list is returned by const reference,
  // like tokens() and themes(), so a call does not copy the whole vector.
  // NOTE(review): the reference refers to the document's own span list;
  // callers that add or delete spans while iterating over it should take a
  // copy first. Verify whether AddSpan/DeleteSpan modify this list.
  const std::vector<Span *> &spans() const { return spans_; }

  // Return the number of tokens in the document.
  int length() const { return static_cast<int>(tokens_.size()); }
  int num_tokens() const {  // deprecated
    return static_cast<int>(tokens_.size());
  }

  // Return token in the document.
  const Token &token(int index) const { return tokens_[index]; }

  // Return document tokens.
  const std::vector<Token> &tokens() const { return tokens_; }

  // Locate token index containing text position.
  int Locate(int position) const;

  // Return fingerprint for token in document.
  uint64 TokenFingerprint(int token) const {
    return tokens_[token].Fingerprint();
  }

  // Returns the fingerprint for [begin, end).
  uint64 PhraseFingerprint(int begin, int end) const;

  // Returns case form for phrase [begin, end).
  CaseForm PhraseForm(int begin, int end) const;

  // Returns the phrase text for span.
  string PhraseText(int begin, int end) const;

  // Finds span for phrase, or null if there is no matching span.
  Span *GetSpan(int begin, int end) const;

  // Returns lowest span at token position or null if no spans are covering the
  // token.
  Span *GetSpanAt(int index) const { return tokens_[index].span(); }

  // Adds thematic frame to document.
  void AddTheme(Handle handle);
  void AddTheme(const Frame &frame) { AddTheme(frame.handle()); }

  // Removes thematic frame from document.
  void RemoveTheme(Handle handle);
  void RemoveTheme(const Frame &frame) { RemoveTheme(frame.handle()); }

  // Returns list of thematic frames for document.
  const Handles &themes() const { return themes_; }

  // Add extra slots to document frame.
  void AddExtra(Handle name, Handle value);
  void AddExtra(const Name &name, Handle value) {
    AddExtra(name.handle(), value);
  }
  void AddExtra(const Name &name, Text value) {
    AddExtra(name.handle(), store()->AllocateString(value));
  }

  // Clears annotations (mentions and themes) from document.
  void ClearAnnotations();

  // Document schema.
  const DocumentNames *names() const { return names_; }

 private:
  // Inserts the span in the span index. If the span already exists, the
  // existing span is returned. Returns null if the new span crosses an existing
  // span.
  Span *Insert(int begin, int end);

  // Removes the span from the span index.
  void Remove(Span *span);

  // Document frame.
  Frame top_;

  // Document text.
  string text_;

  // Document tokens.
  std::vector<Token> tokens_;

  // If the tokens have been changed the Update() method will update the tokens
  // in the document frame.
  bool tokens_changed_ = false;

  // Document mention spans.
  std::vector<Span *> spans_;

  // List of thematic frames. These are frames that are not evoked by any
  // particular phrase in the text.
  Handles themes_;

  // Additional slots that should be added to document.
  Slots *extras_ = nullptr;

  // Document symbol names.
  const DocumentNames *names_;

  friend class Span;
};
// Iteration over parts of a document based on break level.
class DocumentIterator {
 public:
  // Sets up iteration over the parts of a document that are delimited by
  // tokens with a break level of at least `brk`. Parts whose first token has
  // any of the style bits in `skip` set are skipped. The iterator is
  // positioned on the first part that is not skipped.
  DocumentIterator(const Document *document, BreakType brk, int skip = 0)
      : document_(document), brk_(brk), skip_(skip) {
    next();
  }

  // Returns true while the iterator is positioned on a document part.
  bool more() const {
    return begin_ < document_->length();
  }

  // Advances to the next document part that is not skipped. After the last
  // part, begin() is no longer less than the document length, so more()
  // returns false.
  void next() {
    const int num_tokens = document_->length();
    while (begin_ < num_tokens) {
      // The next part starts where the previous one ended, and covers at
      // least one token.
      begin_ = end_;
      ++end_;
      if (begin_ >= num_tokens) return;

      // Extend the part until a token with a sufficient break level, or the
      // end of the document, is reached.
      while (end_ < num_tokens && document_->token(end_).brk() < brk_) {
        ++end_;
      }

      // Stop here unless the style of the first token matches the skip mask.
      const int style = document_->token(begin_).style();
      if ((style & skip_) == 0) return;
    }
  }

  // Returns the token range [begin;end[ of the current document part.
  int begin() const { return begin_; }
  int end() const { return end_; }

  // Returns the number of tokens in the current document part.
  int length() const { return end_ - begin_; }

 private:
  // Document to iterate over.
  const Document *document_;

  // Break level for document parts.
  BreakType brk_;

  // Token style mask for skipping parts.
  int skip_;

  // Current document part.
  int begin_ = 0;
  int end_ = 0;
};
// Document sentence iterator. Example:
//
// for (SentenceIterator s(document); s.more(); s.next()) {
// LOG(INFO) << "Sentence: " << s.begin() << " to " << s.end();
// }
//
class SentenceIterator : public DocumentIterator {
 public:
  // Iterates over the parts of the document delimited by SENTENCE_BREAK. The
  // skip mask is passed on unchanged to DocumentIterator.
  SentenceIterator(const Document *document, int skip = 0)
      : DocumentIterator(document, SENTENCE_BREAK, skip) {
  }
};
// Returns the first token covered by the span, i.e. the document token at
// index begin_.
inline const Token &Span::first() const {
  const Document &doc = *document_;
  return doc.token(begin_);
}
// Returns the last token covered by the span. Since end_ is exclusive, this is
// the document token at index end_ - 1.
inline const Token &Span::last() const {
  const int last_index = end_ - 1;
  return document_->token(last_index);
}
} // namespace nlp
} // namespace sling
#endif // SLING_NLP_DOCUMENT_DOCUMENT_H_