/**
* These thrift definitions provide general structures for storing
* collections of textual data with:
*
* - metadata generated by automatic taggers
*
* - annotations from humans, e.g. relevance judgments and labels
*
* - multiple transformed editions, e.g. clean_visible
*
*
* Log of Major Changes:
*
* December 2012: v0_2_0 replaced kba.thrift file used in TREC's
* Knowledge Base Acceleration evaluation in NIST's TREC 2012
* conference. http://trec-kba.org
*
* April 2013: v0_3_0 introduces the non-backwards-compatible change
* of making MentionID an i32, so it can be unique across the whole
* document instead of only within a sentence.
*
* March 2014: Amend v0_3_0 to add a FlagType to Label and Rating,
* replicate Rating's contents in Label, and make it possible to store
* a Label independently of its source StreamItem
*
*
* This is released as open source software under the MIT X11 license:
* Copyright (c) 2012-2014 Computable Insights.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
namespace java streamcorpus
namespace py streamcorpus
namespace cpp streamcorpus
/**
* StreamTime is a timestamp measured in seconds since the 1970 epoch.
* epoch_ticks is always in the UTC timezone. This is used in several
* structs below to record various moments in history.
*
* Implementations of these interfaces in various languages may
* provide convenience methods for ensuring that these two fields are
* consistent with each other.
*/
struct StreamTime {
1: double epoch_ticks,
2: string zulu_timestamp,
}
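/**
* Illustrative sketch (not part of the schema): building a StreamTime
* whose two fields agree, using the Python bindings that `thrift --gen
* py` would produce. The module path `streamcorpus.ttypes` is an
* assumption.
*
*   from datetime import datetime, timezone
*   from streamcorpus.ttypes import StreamTime
*
*   def make_stream_time(epoch_ticks):
*       # zulu_timestamp is the UTC rendering of the same instant
*       dt = datetime.fromtimestamp(epoch_ticks, tz=timezone.utc)
*       return StreamTime(
*           epoch_ticks=epoch_ticks,
*           zulu_timestamp=dt.strftime('%Y-%m-%dT%H:%M:%S.%fZ'))
*/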
/**
* AnnotatorID is used as a property in Annotator structs and also as
* a key on maps in ContentItem.
*
* It is just a string. There is no enum for it, so consistency and
* uniqueness depends on the system generating the AnnotatorID.
*
* AnnotatorID identifies the source of a Label or Rating object. It
* is not necessarily unique. We use these conventions:
*
* - Avoid whitespace.
*
* - email address is the best identifier
*
* - when a single email address is not appropriate, create a
* descriptive string, e.g. 'nist-trec-kba-2012-assessors'
*
* - 'author' means the person who wrote the original text
*/
typedef string AnnotatorID
/**
* An Annotator object describes a human (or possibly a set of humans)
* who generated the data stored in a Label or Rating object.
*/
struct Annotator {
1: AnnotatorID annotator_id,
/**
* Approximate time when the annotations/judgments/labels were rendered
* by a human. If this is missing, it means that the time was not
* recorded, which often happens when the author made the
* annotation.
*/
2: optional StreamTime annotation_time,
}
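/**
* Illustrative sketch (not part of the schema): an Annotator for a
* single human assessor, assuming generated Python bindings in
* `streamcorpus.ttypes` (module path is an assumption; the values are
* hypothetical).
*
*   from streamcorpus.ttypes import Annotator, StreamTime
*
*   annotator = Annotator(
*       annotator_id='jane.doe@example.com',   # email is the preferred form
*       annotation_time=StreamTime(
*           epoch_ticks=1393632000.0,
*           zulu_timestamp='2014-03-01T00:00:00.000000Z'),
*   )
*/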
/**
* Offset and OffsetType are used by Label (and by the tagging structs
* below) to identify the portion of a ContentItem to which a label or
* tag applies.
*
* LINES: annotation applies to a range of line numbers
*
* BYTES: annotation applies to a range of bytes
*
* CHARS: annotation applies to a range of chars, typically unicode chars
*
* XPATH_CHARS: annotation applies to a range defined by xpaths (with
* relative char offsets)
*/
enum OffsetType {
LINES = 0,
BYTES = 1,
CHARS = 2,
XPATH_CHARS = 3,
}
/**
* Offset specifies a range within a field of data in this ContentItem
*/
struct Offset {
/**
* see comments on OffsetType
*/
1: OffsetType type,
/**
* actual offset, which could be measured in bytes, chars, or lines.
* The data element identified by 'first' is included, and so is the
* element identified by first+length-1; the element at first+length
* is excluded.
*
* In set notation,
* [first:first+length-1]
*
* or equivalently
* [first:first+length)
*
* or in list slicing, like python's:
* [first:first+length]
*
* While thrift treats these as signed integers, negative values are
* meaningless in this context, i.e. we do not wrap past the end.
*
* N.B. When this is an xpath offset, `length` is always `0` and `first`
* is always the first xpath offset in correspondence with the `xpath`
* member.
*/
2: i64 first,
3: i32 length,
/**
* If this is an xpath offset, then this is set to the xpath address of the
* start text node. The relative start character offset is in `first`.
*/
4: optional string xpath,
/**
* name of the data element inside a ContentItem to which this label
* applies, e.g. 'raw', 'clean_html', or 'clean_visible'. Defaults to
* clean_visible, which is the most common case.
*/
5: optional string content_form = "clean_visible",
/**
* bytes specified by this offset extracted from the original; just
* to assist in debugging
*/
6: optional binary value,
/**
* If this is an xpath range, then this is set to the xpath address of the
* end text node. The relative end character offset is in `xpath2_offset`.
*
* Note that `xpath` and `first` have the same relationship as
* `xpath_end` and `xpath_end_offset`.
*/
7: optional string xpath_end,
/**
* If this is an xpath offset, then this is set to the ending xpath's
* relative char offset. (`first` contains the start offset.)
*
* Note that this offset participates in the half-open interval:
*
* [(xpath, first), (xpath_end, xpath_end_offset)).
*/
8: optional i64 xpath_end_offset,
}
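/**
* Illustrative sketch (not part of the schema): pulling the labeled
* span out of a ContentItem (defined below) using a CHARS offset,
* assuming generated Python bindings in `streamcorpus.ttypes`; the
* offset values are hypothetical.
*
*   from streamcorpus.ttypes import Offset, OffsetType
*
*   off = Offset(type=OffsetType.CHARS, first=120, length=9,
*                content_form='clean_visible')
*
*   def labeled_span(content_item, off):
*       # python slicing matches the half-open interval
*       # [first, first+length) described above
*       text = getattr(content_item, off.content_form)
*       return text[off.first:off.first + off.length]
*/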
/**
* Targets are "information targets," such as entities or topics,
* usually from a knowledge base, such as Wikipedia.
*/
struct Target {
/**
* unique string identifier, usually a URL into Wikipedia, Freebase,
* or some other structured reference system for info targets.
*/
1: string target_id,
/**
* kb_id is usually redundant if the target_id is a full URL,
* e.g. en.wikipedia.org
*/
2: optional string kb_id,
/**
* moment in history that the target_kb was accessed
*/
3: optional StreamTime kb_snapshot_time,
}
/**
* General purpose flags. These flags can be used to mark documents as
* meeting an extensible set of criteria.
*/
enum FlagType {
PROFILE = 0,
}
/**
* Labels are human-generated assertions about a portion of a document.
* For example, a human author might label their own text by inserting
* hyperlinks to Wikipedia, or a NIST assessor might record which
* tokens in a text mention a target entity.
*
* Label instances can be attached in three places:
* - Token.labels list
* - Sentence.labels list
* - ContentItem.labels map
*/
struct Label {
/**
* identifies the source of this Label
*/
1: Annotator annotator,
/**
* identifies the information need assessed by annotator
*/
2: Target target,
/**
* pointers to data to which this label applies. If empty, then
* label applies to the entire Token, Sentence, or ContentItem to
* which it is attached.
*/
3: optional map<OffsetType, Offset> offsets = {},
/**
* Labels are usually positive assertions that the token mentions
* the target_id. It is sometimes useful to collect negative
* assertions that a token is NOT the target_id, which can be
* indicated by setting Label.positive to False
*/
4: optional bool positive = true,
/**
* Save notes from Annotator about this Label
*/
5: optional string comments,
/**
* Record strings that are "mentions" of the target in this text.
*
* Note: there used to be a field 'contains mention' which would
* allow for a document to be labeled as about a thing without
* containing a string naming the thing. That hardly ever actually
* happened, but maybe someday it could be added back if needed.
*/
6: optional list<string> mentions,
/**
* numerical score assigned by annotator to "judge" or "rate" the
* utility of this StreamItem to addressing the target information
* need. The range and interpretation of relevance numbers depends
* on the annotator. relevance can represent a rank ordering or an
* enumeration such as -1=Garbage, 0=Neutral, 1=Useful, 2=Vital
*/
7: optional i16 relevance,
/**
* Stream ID for this label. This is the stream_id for the source
* StreamItem, if a label is stored independently from its original
* data.
*/
8: optional string stream_id,
/**
* General purpose flags. These flags can be used to mark documents
* as meeting an extensible set of criteria.
*/
9: optional list<FlagType> flags,
}
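/**
* Illustrative sketch (not part of the schema): a positive Label tying
* a mention of a Wikipedia entity to a CHARS offset in clean_visible,
* assuming generated Python bindings in `streamcorpus.ttypes`. The
* target, offsets, and the `content_item` variable are hypothetical.
*
*   from streamcorpus.ttypes import (
*       Annotator, Label, Offset, OffsetType, Target)
*
*   label = Label(
*       annotator=Annotator(annotator_id='nist-trec-kba-2012-assessors'),
*       target=Target(target_id='https://en.wikipedia.org/wiki/Nikola_Tesla'),
*       offsets={OffsetType.CHARS: Offset(type=OffsetType.CHARS,
*                                         first=512, length=12)},
*       positive=True,
*       mentions=['Nikola Tesla'],
*   )
*
*   # attach to a ContentItem's labels map, keyed by AnnotatorID
*   content_item.labels.setdefault(
*       label.annotator.annotator_id, []).append(label)
*/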
/**
* MentionID values are i32 and unique across a document. -1 is the
* "null" value. Making this i32 causes v0_3_0 to not be backward
* compatible with v0_2_0, because thrift does not (yet) have type
* promotion.
*/
typedef i32 MentionID
/**
* Different tagging tools have different strings for labeling the
* various common entity types. To avoid ambiguity, we define a
* canonical list here, which we will surely have to expand over time
* as new taggers recognize new types of entities.
*
* LOC: physical location
*
* MISC: uncategorized named entities, e.g. Civil War for Stanford CoreNLP
*/
enum EntityType {
PER = 0,
ORG = 1,
LOC = 2,
//MALE_PRONOUN = 3, // necessary but crufty
//FEMALE_PRONOUN = 4, // necessary but crufty
TIME = 5,
DATE = 6,
MONEY = 7,
PERCENT = 8,
MISC = 9,
GPE = 10,
FAC = 11,
VEH = 12,
WEA = 13,
phone = 14,
email = 15,
URL = 16,
CUSTOM_TYPE = 17,
LIST = 18,
RELIGION = 19,
NATIONALITY = 20,
TITLE = 21,
EVENT = 22,
}
enum MentionType {
NAME = 0,
PRO = 1,
NOM = 2,
}
enum Gender {
FEMALE = 0,
MALE = 1,
}
/**
* Attributes are based primarily on TAC KBP slot filling; see the slot
* descriptions, a copy of which is also saved in this directory:
* http://surdeanu.info/kbp2013/TAC_2013_KBP_Slot_Descriptions_1.0.pdf
*
* Only slots that are not resolvable to unique entities are listed
* here as attributes. Most slots are relations, so see RelationType.
*/
enum AttributeType {
PER_AGE = 0,
PER_GENDER = 1,
PER_ALTERNATE_NAMES = 3,
PER_CAUSE_OF_DEATH = 4,
PER_TITLE = 5,
PER_CHARGES = 6,
ORG_ALTERNATE_NAMES = 7,
ORG_NUMBER_OF_EMPLOYEES_MEMBERS = 8,
}
/**
* Description of an attribute of an entity discovered by a tagger in
* the text.
*/
struct Attribute {
/**
* The type of the attribute, see documentation for AttributeType
*/
1: optional AttributeType attribute_type,
/**
* UTF-8 string that tagger asserts as evidence of an attribute
*/
2: optional string evidence,
/**
* A normalized, strongly typed value derived from the evidence.
* The actual type must be determined by programmatically
* interpreting the attribute_type. For example,
* attribute_type==AttributeType.PER_GENDER implies that this value
* will be a string containing an integer index into the Gender
* enum.
*
* For attribute_type that imply a value of type date-time, the
* value is a zulu_timestamp string from a StreamTime instance.
*/
3: optional string value,
/**
* Zero-based index into the sentences array for this TaggerID
*/
4: optional i32 sentence_id,
/**
* Index into the mentions in the document. This identifies the
* mention to which the attribute applies
*/
5: optional MentionID mention_id,
}
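/**
* Illustrative sketch (not part of the schema): decoding the strongly
* typed `value` for a PER_GENDER attribute, assuming generated Python
* bindings in `streamcorpus.ttypes` (the `_VALUES_TO_NAMES` dict is
* part of Thrift's generated enum helpers; the field values are
* hypothetical).
*
*   from streamcorpus.ttypes import Attribute, AttributeType, Gender
*
*   attr = Attribute(attribute_type=AttributeType.PER_GENDER,
*                    evidence='his', value='1',   # '1' -> Gender.MALE
*                    sentence_id=3, mention_id=17)
*
*   if attr.attribute_type == AttributeType.PER_GENDER:
*       gender = Gender._VALUES_TO_NAMES[int(attr.value)]  # 'MALE'
*/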
/**
* Textual tokens identified by an NLP pipeline and marked up with
* metadata from automatic taggers and possibly also Labels from
* humans.
*/
struct Token {
/**
* zero-based index into the stream of tokens from a document
*/
1: i32 token_num,
/**
* actual token string, must always be a UTF8 encoded string, not a
* unicode string, because thrift stores them as 8-bit.
*/
2: string token,
/**
* offsets into the original data (see Offset.content_form)
*/
3: optional map<OffsetType, Offset> offsets = {},
/**
* zero-based index into the sentence, which is used for dependency
* parsed data
*/
4: optional i32 sentence_pos = -1,
/**
* lemmatization of the token, again must be UTF8
*/
5: optional string lemma,
/**
* part of speech labels defined by Penn TreeBank:
* http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
* Should probably convert this to an enum, analogous to EntityType
*/
6: optional string pos,
/**
* entity type from named entity recognizer (classifier)
*/
7: optional EntityType entity_type,
/**
* Identifier for each mention in this TaggerID's description of
* the document. It is unique at the document level. Serves two
* purposes:
*
* 1) Distinguishing multi-token mentions. Needed when the
* entity_type and equiv_id do not change between tokens that are
* part of separate mentions, e.g. "The senator is known to his
* friends as David, Davy, Zeus, and Mr. Elephant."
*
* 2) Referring to mentions used in Relation objects.
*/
8: optional MentionID mention_id = -1,
/**
* Within-doc coref chain ID. That is, identifier of equivalence
* class of co-referent tokens. Default is -1, meaning None.
*/
9: optional i32 equiv_id = -1,
/**
* parent sentence_pos in dependency parse. Default is -1, i.e. None
*/
10: optional i32 parent_id = -1,
/**
* grammatical relation label on path to parent in dependency parse,
* defined by whatever tagger was used -- should pick a canonical
* definition here and convert it to an enum.
*/
11: optional string dependency_path,
/**
* Labels attached to this token, defaults to an empty map
*/
12: optional map<AnnotatorID, list<Label>> labels = {},
/**
* Identify the type of mention, e.g. pronoun, description, proper name
*/
13: optional MentionType mention_type,
/**
* CUSTOM entity type from named entity recognizer (classifier). If
* used, then entity_type should be set to EntityType.CUSTOM_TYPE,
* i.e. 17.
*
* This is useful when a specialized tagger has a large number of
* unique entity types, such as entity:artefact:weapon:blunt. Rather
* than expand EntityType with many more subtypes, we can escape the
* protection of the enum and just use a string here.
*/
14: optional string custom_entity_type,
}
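/**
* Illustrative sketch (not part of the schema): grouping a sentence's
* tokens into mentions via mention_id. `mentions_in` is a hypothetical
* helper, not part of any streamcorpus library.
*
*   from collections import defaultdict
*
*   def mentions_in(sentence):
*       # tokens with mention_id == -1 are not part of any mention
*       groups = defaultdict(list)
*       for tok in sentence.tokens:
*           if tok.mention_id != -1:
*               groups[tok.mention_id].append(tok.token)
*       # e.g. {17: 'Mr. Elephant', 18: 'Chicago'}
*       return {mid: ' '.join(parts) for mid, parts in groups.items()}
*/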
struct Sentence {
/**
* tokens in this sentence
*/
1: list<Token> tokens = [],
/**
* map from AnnotatorID to lists of Label instances attached to this
* sentence, defaults to an empty map
*/
2: optional map<AnnotatorID, list<Label>> labels = {},
}
/**
* TaggerID is used as a key on maps in ContentItem.
*
* It is just a string. There is no enum for it, so consistency and
* uniqueness depends on the system generating the TaggerID.
*/
typedef string TaggerID
struct Tagging {
1: TaggerID tagger_id,
/**
* raw output of the tagging tool
*/
2: binary raw_tagging,
/**
* short human-readable description of configuration parameters
*/
3: optional string tagger_config,
/**
* short human-readable version string of the tagging tool
*/
4: optional string tagger_version,
/**
* time that tagging was generated
*/
5: optional StreamTime generation_time,
}
/**
* Description of a selector discovered by an extractor in the text.
*/
struct Selector {
/**
* what type of selector this is
*/
1: optional string selector_type
/**
* the selector string as it appears in the document
*/
2: string raw_selector
/**
* the selector string in a canonical form
*/
3: string canonical_selector
/**
* pointer to the selector string within the clean_visible document
*/
4: optional map<OffsetType, Offset> offsets = {},
/**
* optional metadata binary string, such as a JSON or CBOR blob,
* depends on the selector_type.
*/
5: optional binary metadata
}
enum ZoneType {
UNZONED = 0,
HEADER = 1,
TITLE = 2,
BODY = 3,
FOOTER = 4,
}
/**
* Description of a Zone discovered by an extractor in the text.
*/
struct Zone {
/**
* what type of zone this is
*/
1: ZoneType zone_type
/**
* For a given OffsetType provide a *list* of Offset objects
*/
2: map<OffsetType, list<Offset>> offsets = {},
}
/**
* RelationType is used in Relation to map relation "name" to type.
*
* Relations 0 through 50 borrow from ACE with these string replacements:
* s/-// and s/./_/
* http://projects.ldc.upenn.edu/ace/docs/English-Events-Guidelines_v5.4.3.pdf
*
* Relations 51 and above borrow from KBP slot filling
* http://surdeanu.info/kbp2013/TAC_2013_KBP_Slot_Descriptions_1.0.pdf
*
* Most entity slots are relations, so the PER_ and ORG_ and FAC_
* relations listed below are primary for slot filling.
*
* Many of the KBP-based slots are redundant or overlapping with the
* ACE-based slots. The KBP-based slots are generally simpler and
* were developed to support knowledge base population rather than
* single-document extraction (as ACE was). Therefore, for KB-focused
* tasks, we recommend using relations 51 and above.
*/
enum RelationType {
PHYS_Located = 0,
PHYS_Near = 1,
PARTWHOLE_Geographical = 2,
PARTWHOLE_Subsidiary = 3,
PARTWHOLE_Artifact = 4,
PERSOC_Business = 5,
PERSOC_Family = 6,
PERSOC_LastingPersonal = 7,
ORGAFF_Employment = 8,
ORGAFF_Ownership = 9,
ORGAFF_Founder = 10,
ORGAFF_StudentAlum = 11,
ORGAFF_SportsAffiliation = 12,
ORGAFF_InvestorShareholder = 13,
ORGAFF_Membership = 14,
ART_UserOwnerInventorManufacturer = 15,
GENAFF_CitizenResidentReligionEthnicity = 16,
GENAFF_OrgLocation = 17,
Business_DeclareBankruptcy = 18,
Business_EndOrg = 19,
Business_MergeOrg = 20,
Business_StartOrg = 21,
Conflict_Attack = 22,
Conflict_Demonstrate = 23,
Contact_PhoneWrite = 24,
Contact_Meet = 25,
Justice_Acquit = 26,
Justice_Appeal = 27,
Justice_ArrestJail = 28,
Justice_ChargeIndict = 29,
Justice_Convict = 30,
Justice_Execute = 31,
Justice_Extradite = 32,
Justice_Fine = 33,
Justice_Pardon = 34,
Justice_ReleaseParole = 35,
Justice_Sentence = 36,
Justice_Sue = 37,
Justice_TrialHearing = 38,
Life_BeBorn = 39,
Life_Die = 40,
Life_Divorce = 41,
Life_Injure = 42,
Life_Marry = 43,
Movement_Transport = 44,
Personnel_Elect = 45,
Personnel_EndPosition = 46,
Personnel_Nominate = 47,
Personnel_StartPosition = 48,
Transaction_TransferMoney = 49,
Transaction_TransferOwnership = 50,
PER_DATE_OF_BIRTH = 51,
PER_COUNTRY_OF_BIRTH = 52,
PER_STATEORPROVINCE_OF_BIRTH = 53,
PER_CITY_OF_BIRTH = 54,
PER_ORIGIN = 55,
PER_DATE_OF_DEATH = 56,
PER_COUNTRY_OF_DEATH = 57,
PER_STATEORPROVINCE_OF_DEATH = 58,
PER_CITY_OF_DEATH = 59,
PER_COUNTRIES_OF_RESIDENCE = 60,
PER_STATESORPROVINCES_OF_RESIDENCE = 61,
PER_CITIES_OF_RESIDENCE = 62,
PER_SCHOOLS_ATTENDED = 63,
PER_EMPLOYEE_OR_MEMBER_OF = 64,
PER_RELIGION = 65,
PER_SPOUSE = 66,
PER_CHILDREN = 67,
PER_PARENTS = 68,
PER_SIBLINGS = 69,
PER_OTHER_FAMILY = 70,
ORG_TOP_MEMBERS_EMPLOYEES = 71,
ORG_MEMBERS = 72,
ORG_MEMBER_OF = 73,
ORG_SUBSIDIARIES = 74,
ORG_PARENTS = 75,
ORG_FOUNDED_BY = 76,
ORG_DATE_FOUNDED = 77,
ORG_DATE_DISSOLVED = 78,
ORG_COUNTRY_OF_HEADQUARTERS = 79,
ORG_STATEORPROVINCE_OF_HEADQUARTERS = 80,
ORG_CITY_OF_HEADQUARTERS = 81,
ORG_SHAREHOLDERS = 82,
ORG_POLITICAL_OR_RELIGIOUS_AFFILIATION = 83,
ORG_WEBSITE = 84,
FAC_LOCATED = 85,
FAC_VISITED_BY = 86,
FAC_OWNER = 87,
PER_WON_AWARD = 88,
PER_MET_WITH = 89, // PER or ORG
PER_ATTENDED = 90, // meeting event
PER_VISITED = 91, // FAC (more general than attended)
ORG_ATTENDED = 92, // meeting event
ORG_VISITED = 93, // meeting event
PER_WEBSITE = 94,
PER_NATIONALITY = 95,
}
/**
* Description of a relation between two entities that a tagger
* discovered in the text.
*/
struct Relation {
/**
* The type of the relation, see documentation for RelationType
*
*/
1: optional RelationType relation_type,
/**
* Zero-based index into the sentences array for this TaggerID
*/
2: optional i32 sentence_id_1,
/**
* Index into the mentions in the document. This identifies the
* first argument (subject) of the relation. For example, the relation
* (Bob, PHYS_Located, Chicago)
* would have mention_id_1 point to Bob.
*/
3: optional MentionID mention_id_1,
/**
* Zero-based index into the sentences array for this TaggerID
*/
4: optional i32 sentence_id_2,
/**
* Index into the mentions in the document. This identifies the
* second argument (object) of the relation. For example, the relation
* (Bob, PHYS_Located, Chicago)
* would have mention_id_2 point to Chicago.
*/
5: optional MentionID mention_id_2,
// could add equiv_id_1 and equiv_id_2
}
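/**
* Illustrative sketch (not part of the schema): resolving a Relation's
* mention ids back to strings for one tagger's output, assuming
* generated Python bindings and the hypothetical `mentions_in` helper
* sketched after Token above; `content_item` and `tagger_id` are
* placeholders.
*
*   def relation_arguments(content_item, tagger_id, rel):
*       sentences = content_item.sentences[tagger_id]
*       arg1 = mentions_in(sentences[rel.sentence_id_1]).get(rel.mention_id_1)
*       arg2 = mentions_in(sentences[rel.sentence_id_2]).get(rel.mention_id_2)
*       # e.g. ('Bob', RelationType.PHYS_Located, 'Chicago')
*       return arg1, rel.relation_type, arg2
*/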
/**
* Description of a natural language used in text
*/
struct Language {
/**
* two letter code for the language
*/
1: string code,
2: optional string name,
}
/**
* ContentItem contains raw data, an indication of its character
* encoding, and various transformed versions of the raw data.
*/
struct ContentItem {
/**
* original download, raw byte array
*/
1: optional binary raw,
/**
* guessed from raw and/or headers, e.g. by python-requests.org
*/
2: optional string encoding,
/**
* Content-type header from fetching the data, or MIME type
*/
3: optional string media_type,
/**
* HTML-formatted version of raw with UTF8 encoding and no broken
* tags. All HTML-escaped characters are converted to their UTF8
* equivalents, except < > &, which remain escaped.
*/
4: optional string clean_html,
/**
* All tags stripped from clean_html and replaced with whitespace,
* so they have the same byte offsets. The only escaped characters
* are < > &, so that this can be treated as Character Data in XML:
* http://www.w3.org/TR/xml/#syntax
*
* Again: must be UTF8
*/
5: optional string clean_visible,
/**
* Logs generated from processing pipeline, for forensics
*/
6: optional list<string> logs = [],
/**
* A set of auto-generated taggings, such as a One-Word-Per-Line
* (OWLP) tokenization and sentence chunking with part-of-speech,
* lemmatization, and NER classification. The string name should be
* the same as the tagger_id and also corresponds to the key in
* sentences or sentence_blobs, which get generated by transforming
* a Tagging.raw_tagging into Sentence and Token instances
*
* Taggings are generated from 'clean_visible' so offsets (byte,
* char, line) refer to clean_visible and clean_html -- not raw.
*/
7: optional map<TaggerID, Tagging> taggings = {},
/**
* sets of annotations
*/
8: optional map<AnnotatorID, list<Label>> labels = {},
/**
* parsed Sentence objects generated by an NLP pipeline identified
* by the string name, which is a tagger_id that connects this
* Sentences instance to the Tagging struct from which it came
*/
9: optional map<TaggerID, list<Sentence>> sentences = {},
/**
* same as 'sentences' except the array of Sentence instances are
* serialized into a binary string that can be read by the Thrift's
* binary protocol. This allows lazy deserialization via an
* iterator -- one sentence at a time. This might be totally
* unnecessary, because at least some of the Thrift language
* implementations have lazy object construction, e.g. --gen
* py:dynamic,slots
*/
10: optional map<TaggerID, binary> sentence_blobs = {},
/**
* indication of which natural language is used in the text
*/
11: optional Language language,
/**
* List of relations discovered in clean_visible
*/
12: optional map<TaggerID, list<Relation>> relations = {},
/**
* List of attributes discovered in clean_visible
*/
13: optional map<TaggerID, list<Attribute>> attributes = {},
/**
* Map from mention_ids generated by a particular tagger to external
* identifier strings. This allows external systems to associate
* record IDs with individual mentions, or sets of mentions.
*/
14: optional map<TaggerID, map<MentionID, string>> external_ids = {},
/**
* Map from TaggerID to lists of Selectors discovered in clean_visible
*/
15: optional map<TaggerID, list<Selector>> selectors = {},
/**
* Map from TaggerID to Zones discovered in clean_visible, keyed by ZoneType
*/
16: optional map<TaggerID, map<ZoneType, Zone>> zones = {},
}
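/**
* Illustrative sketch (not part of the schema): walking one tagger's
* parse of a ContentItem, assuming generated Python bindings in
* `streamcorpus.ttypes`; `content_item` is a placeholder and
* 'my_tagger' is a hypothetical TaggerID.
*
*   from streamcorpus.ttypes import OffsetType
*
*   tagger_id = 'my_tagger'
*   for sentence in content_item.sentences.get(tagger_id, []):
*       for tok in sentence.tokens:
*           # token offsets refer to clean_visible, not raw
*           off = tok.offsets.get(OffsetType.BYTES)
*           print(tok.token, tok.entity_type, tok.mention_id,
*                 off.first if off else None)
*/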
/**
* Ratings are human-generated assertions about an entire document's
* utility for a particular topic or entity in a reference KB.
*/
struct Rating {
/**
* identifies the source of this Rating
*/
1: Annotator annotator,
/**
* identifies the information need assessed by annotator
*/
2: Target target,
/**
* numerical score assigned by annotator to "judge" or "rate" the
* utility of this StreamItem to addressing the target information
* need. The range and interpretation of relevance numbers depends
* on the annotator. relevance can represent a rank ordering or an
* enumeration such as -1=Garbage, 0=Neutral, 1=Useful, 2=Vital
*/
3: optional i16 relevance,
/**
* true|false indication of whether the document mentions the target
* entity. This is only partially correlated with relevance. For
* example, a document might mention the entity only in chrome text
* on the side such that it is a Garbage-rated text for that entity.
*/
4: optional bool contains_mention,
/**
* Save notes from Annotator about this Rating
*/
5: optional string comments,
/**
* Record strings that are "mentions" of the target in this text
*/
6: optional list<string> mentions,
/**
* General purpose flags. These flags can be used to mark documents
* as meeting an extensible set of criteria.
*/
7: optional list<FlagType> flags,
}
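/**
* Illustrative sketch (not part of the schema): a document-level Rating
* recorded against a StreamItem, assuming generated Python bindings in
* `streamcorpus.ttypes`; the target, relevance scale, and the
* `stream_item` variable are hypothetical.
*
*   from streamcorpus.ttypes import Annotator, Rating, Target
*
*   rating = Rating(
*       annotator=Annotator(annotator_id='nist-trec-kba-2012-assessors'),
*       target=Target(target_id='https://en.wikipedia.org/wiki/Nikola_Tesla'),
*       relevance=2,                 # e.g. 2=Vital
*       contains_mention=True,
*       mentions=['Tesla'],
*   )
*
*   # attach to the StreamItem's ratings map, keyed by AnnotatorID
*   stream_item.ratings.setdefault(
*       rating.annotator.annotator_id, []).append(rating)
*/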
/**
* SourceMetadata is a binary object with format determined by the key
* in StreamItem.source_metadata, which is often the same as
* StreamItem.source.
*
* For the kba-stream-corpus-2012, the SourceMetadata was always one
* of these schemas where 'news', 'social', 'linking' is the string
* found in StreamItem.source and the source_metadata map's key:
* - http://trec-kba.org/schemas/v1.0/news-metadata.json
* - http://trec-kba.org/schemas/v1.0/linking-metadata.json
* - http://trec-kba.org/schemas/v1.0/social-metadata.json
*
* Other keys in the source_metadata map can be:
*
* - http_headers
*/
typedef binary SourceMetadata
/**
* Versions of this protocol are enumerated so that when we expand,
* everybody can see which version a particular data file used.
*
* v0_1_0 refers to the kba.thrift definition, which was before
* Versions was included in the spec.
*/
enum Versions {
v0_2_0 = 0,
v0_3_0 = 1,
}
/**
* SystemID and DocIDorStreamID are used below in
* StreamItem.external_ids; these typedefs exist just to make this
* file more self-documenting.
*/
typedef string SystemID
typedef string DocIDorStreamID
/**
* This is the primary interface to the corpus data. It is called
* StreamItem rather than CorpusItem and has a required StreamTime
* attribute, because even for a static corpus, each document was
* captured at a particular time in Earth history and might have been
* different if captured earlier or later. All corpora are stream
* corpora, even if they were not explicitly created as such.
*
* stream_id is the unique identifier for documents in the corpus.
*
* This is similar to the StreamItem defined in kba.thrift for TREC
* KBA 2012, however it removes the 'title' and 'anchor' fields, which
* can now be represented in other_content. This means that code that
* was written to read messages from kba.thrift must be updated.
*/
struct StreamItem {
/**
* must provide a version number here
*/
1: Versions version = Versions.v0_3_0,
/**
* md5 hash of the abs_url
*/
2: string doc_id,
/**
* normalized form of the original_url, should be a valid URL
*/
3: optional binary abs_url,
/**
* scheme://hostname parsed from abs_url
*/
4: optional string schost,
/**
* string obtained from some source. Only present if it is not a valid
* URL, in which case abs_url was derived from original_url
*/
5: optional binary original_url,
/**
* string uniquely identifying this data set, should start with a
* year string, such as 'news' or 'social'
*/
6: optional string source,
/**
* primary content
*/
7: optional ContentItem body,
/**
* see above for explanation of the values that can appear in this
* dictionary of metadata info from the source. The string keys in
* this map should be short, descriptive, and free of whitespace.
*/
8: optional map<string, SourceMetadata> source_metadata = {},
/**
* stream_id is the actual unique identifier for a StreamItem. The
* format is:
*
* stream_id = '%d-%s' % (int(stream_time.epoch_ticks), doc_id)
*/
9: string stream_id,
/**
* earliest time that this content was known to exist. Usually,
* body.raw was also saved at the time of that first observation.
*/
10: StreamTime stream_time,
/**
* such as title, anchor, extracted, etc. When present, 'anchor' is
* a single anchor text of a URL pointing to this doc. Note that
* this does not have metadata like the URL of the page that
* contained this anchor. Such general link graph data may
* eventually motivate an extension to this thrift interface.
*/
11: optional map<string, ContentItem> other_content = {},
/**
* doc-level judgments relating entire StreamItem to a Target
*/
12: optional map<AnnotatorID, list<Rating>> ratings = {},
/**
* doc-level map connecting either doc_id or stream_id (or both) to
* external identifiers. This allows external systems to associate
* record IDs with individual doc_id or stream_id of this document.
* The keys in the second level map can be either doc_id or
* stream_id, or possibly other IDs in the future.
*/
14: optional map<SystemID, map<DocIDorStreamID, string>> external_ids = {},
}
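/**
* Illustrative sketch (not part of the schema): building a minimal
* StreamItem, with doc_id as the md5 of abs_url and stream_id in the
* '%d-%s' format described above; assumes generated Python bindings in
* `streamcorpus.ttypes`, and the URL and raw bytes are hypothetical.
*
*   import hashlib
*   from streamcorpus.ttypes import (
*       ContentItem, StreamItem, StreamTime, Versions)
*
*   abs_url = b'http://www.example.com/some/article.html'
*   doc_id = hashlib.md5(abs_url).hexdigest()
*   stream_time = StreamTime(epoch_ticks=1393632000.0,
*                            zulu_timestamp='2014-03-01T00:00:00.000000Z')
*   si = StreamItem(
*       version=Versions.v0_3_0,
*       doc_id=doc_id,
*       abs_url=abs_url,
*       stream_id='%d-%s' % (int(stream_time.epoch_ticks), doc_id),
*       stream_time=stream_time,
*       body=ContentItem(raw=b'<html>...</html>', encoding='UTF-8'),
*   )
*/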