From a979367ad667f4b9afab3b1927f1a1a73971cc99 Mon Sep 17 00:00:00 2001 From: Benjamin Piwowarski Date: Wed, 6 Mar 2024 15:09:33 +0100 Subject: [PATCH] Updates and bug fixes --- docs/source/api/conversation.rst | 13 ++-- docs/source/api/embeddings.rst | 6 ++ docs/source/api/index.rst | 3 + docs/source/api/ir.rst | 15 +++++ docs/source/api/nlp.rst | 4 ++ docs/source/api/recommendation.rst | 6 ++ docs/source/api/text.rst | 2 + requirements.txt | 2 +- src/datamaestro_text/data/__init__.py | 0 .../data/conversation/canard.py | 18 +----- .../data/conversation/orconvqa.py | 18 +----- src/datamaestro_text/data/ir/__init__.py | 19 +++--- src/datamaestro_text/data/ir/base.py | 62 +++---------------- src/datamaestro_text/data/ir/cord19.py | 12 +--- src/datamaestro_text/data/ir/csv.py | 13 ++-- src/datamaestro_text/data/ir/formats.py | 8 +-- src/datamaestro_text/datasets/irds/data.py | 51 +++++---------- src/datamaestro_text/interfaces/trec.py | 4 +- .../transforms/ir/__init__.py | 20 +++--- 19 files changed, 103 insertions(+), 173 deletions(-) create mode 100644 docs/source/api/embeddings.rst create mode 100644 docs/source/api/nlp.rst create mode 100644 docs/source/api/recommendation.rst create mode 100644 src/datamaestro_text/data/__init__.py diff --git a/docs/source/api/conversation.rst b/docs/source/api/conversation.rst index b096d5f..b679d2e 100644 --- a/docs/source/api/conversation.rst +++ b/docs/source/api/conversation.rst @@ -29,20 +29,19 @@ Data classes Contextual query reformulation ------------------------------ +.. autoxpmconfig:: datamaestro_text.data.conversation.base.ConversationDataset + .. autoclass:: ContextualizedRewrittenQuery :members: .. autoxpmconfig:: datamaestro_text.data.conversation.canard.CanardDataset :members: iter -.. autoclass:: OrConvQADatasetAnswer - :members: +.. autoxpmconfig:: datamaestro_text.data.conversation.orconvqa.OrConvQADataset + :members: iter -.. autoclass:: OrConvQADatasetHistoryEntry +.. autoclass:: datamaestro_text.data.conversation.orconvqa.OrConvQADatasetAnswer :members: -.. autoclass:: OrConvQADataset +.. autoclass:: datamaestro_text.data.conversation.orconvqa.OrConvQADatasetHistoryEntry :members: - -.. autoxpmconfig:: OrConvQADataset - :members: iter diff --git a/docs/source/api/embeddings.rst b/docs/source/api/embeddings.rst new file mode 100644 index 0000000..ccc1cd6 --- /dev/null +++ b/docs/source/api/embeddings.rst @@ -0,0 +1,6 @@ +Embeddings +========== + +.. autoxpmconfig:: datamaestro_text.data.embeddings.WordEmbeddings + +.. autoxpmconfig:: datamaestro_text.data.embeddings.WordEmbeddingsText diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst index 61b72a1..400f04f 100644 --- a/docs/source/api/index.rst +++ b/docs/source/api/index.rst @@ -6,3 +6,6 @@ Datamaestro Text API text ir conversation + embeddings + recommendation + nlp diff --git a/docs/source/api/ir.rst b/docs/source/api/ir.rst index d808b69..4f1bc03 100644 --- a/docs/source/api/ir.rst +++ b/docs/source/api/ir.rst @@ -20,10 +20,15 @@ Topics :members: iter, count .. autoxpmconfig:: datamaestro_text.data.ir.csv.Topics +.. autoxpmconfig:: datamaestro_text.data.ir.TopicsStore .. autoxpmconfig:: datamaestro_text.transforms.ir.TopicWrapper +Dataset-specific Topics +----------------------- + .. autoxpmconfig:: datamaestro_text.data.ir.trec.TrecTopics +.. autoxpmconfig:: datamaestro_text.data.ir.cord19.Topics Documents --------- @@ -33,6 +38,9 @@ Documents :members: iter_documents, iter_ids, documentcount .. autoxpmconfig:: datamaestro_text.data.ir.cord19.Documents .. autoxpmconfig:: datamaestro_text.data.ir.trec.TipsterCollection +.. autoxpmconfig:: datamaestro_text.data.ir.csv.Documents +.. autoxpmconfig:: datamaestro_text.data.ir.stores.OrConvQADocumentStore +.. autoxpmconfig:: datamaestro_text.datasets.irds.data.LZ4DocumentStore Assessments ----------- @@ -52,12 +60,19 @@ Runs .. autoxpmconfig:: datamaestro_text.data.ir.csv.AdhocRunWithText .. autoxpmconfig:: datamaestro_text.data.ir.trec.TrecAdhocRun + Results ------- +.. autoxpmconfig:: datamaestro_text.data.ir.AdhocResults .. autoxpmconfig:: datamaestro_text.data.ir.trec.TrecAdhocResults :members: get_results +Evaluation +---------- + +.. autoxpmconfig:: datamaestro_text.data.ir.Measure + Reranking --------- diff --git a/docs/source/api/nlp.rst b/docs/source/api/nlp.rst new file mode 100644 index 0000000..25c5925 --- /dev/null +++ b/docs/source/api/nlp.rst @@ -0,0 +1,4 @@ +NLP +=== + +.. autoxpmconfig:: datamaestro_text.data.tagging.CoNLL_U diff --git a/docs/source/api/recommendation.rst b/docs/source/api/recommendation.rst new file mode 100644 index 0000000..b54af6d --- /dev/null +++ b/docs/source/api/recommendation.rst @@ -0,0 +1,6 @@ +Recommendation +============== + + +.. autoxpmconfig:: datamaestro_text.data.recommendation.RatedItems +.. autoxpmconfig:: datamaestro_text.data.recommendation.Movielens diff --git a/docs/source/api/text.rst b/docs/source/api/text.rst index eed8b72..8a98323 100644 --- a/docs/source/api/text.rst +++ b/docs/source/api/text.rst @@ -2,3 +2,5 @@ Text API ======== .. autoxpmconfig:: datamaestro_text.data.text.TextFolder +.. autoxpmconfig:: datamaestro_text.data.text.TextFile +.. autoxpmconfig:: datamaestro_text.data.text.TrainingText diff --git a/requirements.txt b/requirements.txt index 38eedf9..fdb15d3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -datamaestro>=1.0.2 +datamaestro>=1.1.0 ir_datasets attrs diff --git a/src/datamaestro_text/data/__init__.py b/src/datamaestro_text/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/datamaestro_text/data/conversation/canard.py b/src/datamaestro_text/data/conversation/canard.py index 17f89d4..ed2116a 100644 --- a/src/datamaestro_text/data/conversation/canard.py +++ b/src/datamaestro_text/data/conversation/canard.py @@ -1,16 +1,10 @@ -from typing import Iterator, List, Optional +from typing import Iterator, List from attr import define import json from datamaestro.data import File -from datamaestro.record import recordtypes -from datamaestro_text.data.ir.base import GenericTopicRecord from .base import ( - AnswerEntry, ConversationTree, - RetrievedEntry, SingleConversationTree, - SimpleDecontextualizedItem, - AnswerConversationRecord, ) from . import ConversationDataset @@ -35,16 +29,6 @@ class CanardConversation: """Question number""" -@recordtypes(SimpleDecontextualizedItem) -class CanardTopicRecord(GenericTopicRecord): - pass - - -@recordtypes(AnswerEntry, RetrievedEntry) -class CanardAnswerRecord(AnswerConversationRecord): - pass - - class CanardDataset(ConversationDataset, File): """A dataset in the CANARD JSON format""" diff --git a/src/datamaestro_text/data/conversation/orconvqa.py b/src/datamaestro_text/data/conversation/orconvqa.py index a5af319..55c6d86 100644 --- a/src/datamaestro_text/data/conversation/orconvqa.py +++ b/src/datamaestro_text/data/conversation/orconvqa.py @@ -3,11 +3,9 @@ from attr import define import json from datamaestro.data import File -from datamaestro.record import recordtypes +from datamaestro.record import Record from datamaestro_text.data.ir.base import ( - TopicRecord, - GenericTopicRecord, IDItem, SimpleTextItem, ) @@ -65,16 +63,6 @@ class OrConvQADatasetEntry: """Relevance status for evidences""" -@recordtypes(SimpleDecontextualizedItem) -class OrConvQATopicRecord(GenericTopicRecord): - pass - - -@recordtypes(AnswerEntry, RetrievedEntry) -class OrConvQAAnswerRecord(AnswerConversationRecord): - pass - - class OrConvQADataset(ConversationDataset, File): def entries(self) -> Iterator[OrConvQADatasetEntry]: """Iterates over re-written query with their context""" @@ -114,14 +102,14 @@ def __iter__(self) -> Iterator[ConversationTree]: # Add to current history.append( - OrConvQATopicRecord( + Record( IDItem(query_no), SimpleTextItem(entry.query), SimpleDecontextualizedItem(entry.rewrite), ) ) history.append( - OrConvQAAnswerRecord( + Record( AnswerEntry(entry.answer.text), RetrievedEntry(entry.evidences, entry.retrieval_labels), ) diff --git a/src/datamaestro_text/data/ir/__init__.py b/src/datamaestro_text/data/ir/__init__.py index 365c12d..4584780 100644 --- a/src/datamaestro_text/data/ir/__init__.py +++ b/src/datamaestro_text/data/ir/__init__.py @@ -11,7 +11,7 @@ from datamaestro.data import Base from datamaestro_text.utils.files import auto_open from datamaestro_text.utils.iter import BatchIterator -from datamaestro.record import Record +from datamaestro.record import record_type, RecordType from .base import ( # noqa: F401 # Record items IDItem, @@ -21,13 +21,8 @@ DocumentRecord, SimpleTextItem, ScoredItem, - # Pre-defined usual records - GenericTopicRecord, - GenericDocumentRecord, - IDTopicRecord, - IDDocumentRecord, - SimpleTextTopicRecord, - SimpleTextDocumentRecord, + # Create records + create_record, # Other things AdhocAssessment, ) @@ -237,13 +232,13 @@ def count(self): @property @abstractmethod - def topic_recordtype(self) -> Type[Record]: + def topic_recordtype(self) -> RecordType: """The set of records for topics""" ... @property @abstractmethod - def document_recordtype(self) -> Type[Record]: + def document_recordtype(self) -> RecordType: """The class for documents""" ... @@ -281,12 +276,12 @@ def _topic(self): @cached_property def topic_recordtype(self) -> Type[TopicRecord]: """The class for topics""" - return IDTopicRecord if self.topic_ids else SimpleTextTopicRecord + return record_type(IDItem) if self.topic_ids else record_type(SimpleTextItem) @cached_property def document_recordtype(self) -> Type[DocumentRecord]: """The class for documents""" - return IDDocumentRecord if self.doc_ids else SimpleTextDocumentRecord + return record_type(IDItem) if self.doc_ids else record_type(SimpleTextItem) @define(kw_only=True) diff --git a/src/datamaestro_text/data/ir/base.py b/src/datamaestro_text/data/ir/base.py index 4c41988..b727019 100644 --- a/src/datamaestro_text/data/ir/base.py +++ b/src/datamaestro_text/data/ir/base.py @@ -1,25 +1,10 @@ from abc import ABC, abstractmethod from attrs import define from typing import List -from datamaestro.record import Record, Item, recordtypes +from datamaestro.record import Record, Item, record_type -class BaseRecord(Record): - @classmethod - def from_text(cls, text: str, *items: Item): - return cls(SimpleTextItem(text), *items) - - @classmethod - def from_id(cls, id: str, *items: Item): - return cls(IDItem(id), *items) - - -class TopicRecord(BaseRecord): - """Topic record""" - - -class DocumentRecord(BaseRecord): - """Document record""" +TopicRecord = DocumentRecord = Record @define() @@ -79,38 +64,11 @@ class AdhocAssessedTopic: """List of assessments for this topic""" -# --- Commonly used types - - -@recordtypes(IDItem) -class IDTopicRecord(TopicRecord): - pass - - -@recordtypes(IDItem) -class IDDocumentRecord(DocumentRecord): - pass - - -@recordtypes(SimpleTextItem) -class SimpleTextTopicRecord(TopicRecord): - pass - - -@recordtypes(SimpleTextItem) -class SimpleTextDocumentRecord(DocumentRecord): - pass - - -@recordtypes(IDItem, TextItem) -class GenericDocumentRecord(DocumentRecord): - @classmethod - def create(cls, id: str, text: str, *items: Item): - return cls(IDItem(id), SimpleTextItem(text), *items) - - -@recordtypes(IDItem, TextItem) -class GenericTopicRecord(TopicRecord): - @classmethod - def create(cls, id: str, text: str, *items: Item): - return cls(IDItem(id), SimpleTextItem(text), *items) +def create_record(*items: Item, id: str = None, text: str = None): + """Easy creation of a text/id item""" + extra_items = [] + if id is not None: + extra_items.append(IDItem(id)) + if text is not None: + extra_items.append(SimpleTextItem(text)) + return Record(*items, *extra_items) diff --git a/src/datamaestro_text/data/ir/cord19.py b/src/datamaestro_text/data/ir/cord19.py index b83f9f1..ef14c31 100644 --- a/src/datamaestro_text/data/ir/cord19.py +++ b/src/datamaestro_text/data/ir/cord19.py @@ -2,7 +2,7 @@ from typing import Iterator from datamaestro.data import File, documentation -from datamaestro.record import recordtypes +from datamaestro.record import Record from datamaestro_text.data.ir import Documents, TopicRecord, Topics, IDItem from datamaestro_text.data.ir.formats import ( DocumentWithTitle, @@ -11,12 +11,6 @@ ) from datamaestro.data.csv import Generic as GenericCSV import xml.etree.ElementTree as ET -from datamaestro_text.data.ir.base import GenericDocumentRecord - - -@recordtypes(DocumentWithTitle) -class CordDocumentRecord(GenericDocumentRecord): - pass class Topics(Topics, File): @@ -42,11 +36,11 @@ def topic_recordtype(self): class Documents(Documents, GenericCSV): @documentation - def iter(self) -> Iterator[CordDocumentRecord]: + def iter(self) -> Iterator[Record]: """Returns an iterator over adhoc documents""" with self.path.open("r") as fp: for row in DictReader(fp): - yield CordDocumentRecord( + yield Record( IDItem(row["cord_uid"]), DocumentWithTitle(row["abstract"], row["title"]), ) diff --git a/src/datamaestro_text/data/ir/csv.py b/src/datamaestro_text/data/ir/csv.py index 9280c0c..85af1d5 100644 --- a/src/datamaestro_text/data/ir/csv.py +++ b/src/datamaestro_text/data/ir/csv.py @@ -1,11 +1,12 @@ +from functools import cached_property from pathlib import Path from typing import Iterator, Tuple, Type from experimaestro import Param, Option, Constant, Meta from datamaestro.definitions import argument -from datamaestro.record import Record +from datamaestro.record import Record, RecordType import datamaestro_text.data.ir as ir -from datamaestro_text.data.ir.base import GenericTopicRecord, IDItem, SimpleTextItem +from datamaestro_text.data.ir.base import IDItem, SimpleTextItem from datamaestro_text.interfaces.plaintext import read_tsv @@ -23,14 +24,14 @@ class Topics(ir.Topics): def iter(self): return ( - GenericTopicRecord(IDItem(qid), SimpleTextItem(title)) + Record(IDItem(qid), SimpleTextItem(title)) for qid, title in read_tsv(self.path) ) - @property - def topic_recordtype(self) -> Type[Record]: + @cached_property + def topic_recordtype(self) -> RecordType: """The class for topics""" - return GenericTopicRecord + return RecordType(IDItem, SimpleTextItem) class Documents(ir.Documents): diff --git a/src/datamaestro_text/data/ir/formats.py b/src/datamaestro_text/data/ir/formats.py index ae7ac22..daac244 100644 --- a/src/datamaestro_text/data/ir/formats.py +++ b/src/datamaestro_text/data/ir/formats.py @@ -1,9 +1,9 @@ from functools import cached_property from typing import ClassVar, Tuple from attrs import define -from datamaestro.record import recordtypes +from datamaestro.record import record_type from ir_datasets.datasets.wapo import WapoDocMedia -from .base import TextItem, SimpleTextItem, IDTopicRecord +from .base import TextItem, SimpleTextItem, IDItem from ir_datasets.datasets.cord19 import Cord19FullTextSection @@ -174,6 +174,4 @@ class TrecTopic(SimpleTextItem): narrative: str -@recordtypes(TrecTopic) -class TrecTopicRecord(IDTopicRecord): - ... +TrecTopicRecord = record_type(IDItem, TrecTopic) diff --git a/src/datamaestro_text/datasets/irds/data.py b/src/datamaestro_text/datasets/irds/data.py index 8297d2d..dd32935 100644 --- a/src/datamaestro_text/datasets/irds/data.py +++ b/src/datamaestro_text/datasets/irds/data.py @@ -16,7 +16,7 @@ from experimaestro import Config, Param from experimaestro.compat import cached_property from experimaestro import Option -from datamaestro.record import recordtypes +from datamaestro.record import RecordType, record_type import datamaestro_text.data.ir as ir from datamaestro_text.data.ir.base import ( Record, @@ -26,8 +26,7 @@ AdhocAssessedTopic, SimpleAdhocAssessment, IDItem, - IDTopicRecord, - IDDocumentRecord, + create_record, ) import datamaestro_text.data.ir.formats as formats @@ -201,12 +200,7 @@ def document_int(self, ix): @cached_property def document_recordtype(self): - return DocumentRecord.from_types( - f"{self.converter.target_cls.__name__}Record", - IDItem, - self.converter.target_cls, - module=__name__, - ) + return record_type(IDItem, self.converter.target_cls) @cached_property def converter(self): @@ -380,13 +374,8 @@ def count(self): return self.dataset.queries_count() @cached_property - def topic_recordtype(self) -> Type[Record]: - return TopicRecord.from_types( - f"{self.handler.target_cls.__name__}Record", - IDItem, - self.handler.target_cls, - module=Topics.__class__.__module__, - ) + def topic_recordtype(self) -> RecordType: + return record_type(IDItem, self.handler.target_cls) @cached_property def handler(self): @@ -440,16 +429,6 @@ def iter(self) -> Iterator[ir.TopicRecord]: """Returns an iterator over topics""" return iter(self.records) - @recordtypes( - IDItem, SimpleTextItem, DecontextualizedDictItem, ConversationHistoryItem - ) - class Cast2020TopicRecord(TopicRecord): - ... - - @recordtypes(RetrievedEntry) - class Cast2020ResponseRecord(AnswerConversationRecord): - ... - class Cast2020TopicsHandler(CastTopicsHandler): @cached_property def records(self): @@ -471,7 +450,7 @@ def records(self): "auto": query.automatic_rewritten_utterance, }, ) - topic = Cast2020TopicRecord( + topic = Record( IDItem(query.query_id), SimpleTextItem(query.raw_utterance), decontextualized, @@ -492,9 +471,7 @@ def records(self): conversation.append(node) node = node.add( ConversationTreeNode( - Cast2020ResponseRecord( - RetrievedEntry(query.manual_canonical_result_id) - ) + Record(RetrievedEntry(query.manual_canonical_result_id)) ) ) conversation.append(node) @@ -527,21 +504,21 @@ class TrainingTriplets(ir.TrainingTriplets, IRDSId): CONVERTERS = { GenericDocPair: lambda qid, doc1_id, doc2_id: ( - IDTopicRecord.from_id(qid), - IDDocumentRecord.from_id(doc1_id), - IDDocumentRecord.from_id(doc2_id), + create_record(id=qid), + create_record(id=doc1_id), + create_record(id=doc2_id), ) } @cached_property - def topic_recordtype(self) -> Type[Record]: + def topic_recordtype(self) -> RecordType: """The set of records for topics""" - return TopicRecord.from_types("TopicIDRecord", IDItem, module=__name__) + return record_type(IDItem) @cached_property - def document_recordtype(self) -> Type[Record]: + def document_recordtype(self) -> RecordType: """The class for documents""" - return DocumentRecord.from_types("DocumentIDRecord", IDItem, module=__name__) + return record_type(IDItem) @cached_property def converter(self): diff --git a/src/datamaestro_text/interfaces/trec.py b/src/datamaestro_text/interfaces/trec.py index e5d9f85..87f95dc 100644 --- a/src/datamaestro_text/interfaces/trec.py +++ b/src/datamaestro_text/interfaces/trec.py @@ -2,9 +2,9 @@ from pathlib import Path from typing import Iterator, Optional import re -from datamaestro.record import recordtypes from datamaestro_text.data.ir.base import ( AdhocAssessedTopic, + TopicRecord, SimpleAdhocAssessment, IDItem, ) @@ -37,7 +37,7 @@ def cleanup(s: Optional[str]) -> str: return s.replace("\t", " ").strip() if s is not None else "" -def parse_query_format(file, xml_prefix=None) -> Iterator[TrecTopicRecord]: +def parse_query_format(file, xml_prefix=None) -> Iterator[TopicRecord]: """Parse TREC XML query format""" if xml_prefix is None: xml_prefix = "" diff --git a/src/datamaestro_text/transforms/ir/__init__.py b/src/datamaestro_text/transforms/ir/__init__.py index 960e49a..80006a3 100644 --- a/src/datamaestro_text/transforms/ir/__init__.py +++ b/src/datamaestro_text/transforms/ir/__init__.py @@ -5,7 +5,7 @@ from typing import Type from experimaestro import Config, Task, Param, Annotated, pathgenerator, Option, tqdm import numpy as np -from datamaestro.record import Record +from datamaestro.record import RecordType import datamaestro_text.data.ir as ir from datamaestro_text.utils.shuffle import shuffle @@ -30,7 +30,7 @@ class StoreTrainingTripletTopicAdapter(ir.TrainingTriplets): """Input data""" def __validate__(self): - assert self.data.topic_recordtype.has_type(ir.IDItem), ( + assert self.data.topic_recordtype.has(ir.IDItem), ( f"Topics {self.data.topic_recordtype}" f" have no ID: {self.data.topic_recordtype.itemtypes}" ) @@ -43,12 +43,12 @@ def count(self): return self.data.count() @property - def topic_recordtype(self) -> Type[Record]: + def topic_recordtype(self) -> RecordType: """The class for topics""" return self.store.topic_recordtype @property - def document_recordtype(self) -> Type[Record]: + def document_recordtype(self) -> RecordType: """The class for documents""" return self.data.document_recordtype @@ -89,12 +89,12 @@ def count(self): return self.data.count() @property - def topic_recordtype(self) -> Type[Record]: + def topic_recordtype(self) -> RecordType: """The class for topics""" return self.store.topic_recordtype @property - def document_recordtype(self) -> Type[Record]: + def document_recordtype(self) -> RecordType: """The class for documents""" return self.data.document_recordtype @@ -131,20 +131,20 @@ class ShuffledTrainingTripletsLines(Task): def __validate__(self): if self.topic_ids: - assert self.data.topic_recordtype.has_type( + assert self.data.topic_recordtype.has( ir.IDItem ), f"No topic ID in the source data ({self.data.topic_recordtype})" else: - assert self.data.topic_recordtype.has_type( + assert self.data.topic_recordtype.has( ir.TextItem ), f"No topic text in the source data ({self.data.topic_recordtype})" if self.doc_ids: - assert self.data.document_recordtype.has_type( + assert self.data.document_recordtype.has( ir.IDItem ), "No doc ID in the source data" else: - assert self.data.document_recordtype.has_type( + assert self.data.document_recordtype.has( ir.TextItem ), "No doc text in the source data"