Skip to content

Commit

Permalink
Updates and bug fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
bpiwowar committed Mar 6, 2024
1 parent 151fbdf commit a979367
Show file tree
Hide file tree
Showing 19 changed files with 103 additions and 173 deletions.
13 changes: 6 additions & 7 deletions docs/source/api/conversation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,20 +29,19 @@ Data classes
Contextual query reformulation
------------------------------

.. autoxpmconfig:: datamaestro_text.data.conversation.base.ConversationDataset

.. autoclass:: ContextualizedRewrittenQuery
:members:

.. autoxpmconfig:: datamaestro_text.data.conversation.canard.CanardDataset
:members: iter

.. autoclass:: OrConvQADatasetAnswer
:members:
.. autoxpmconfig:: datamaestro_text.data.conversation.orconvqa.OrConvQADataset
:members: iter

.. autoclass:: OrConvQADatasetHistoryEntry
.. autoclass:: datamaestro_text.data.conversation.orconvqa.OrConvQADatasetAnswer
:members:

.. autoclass:: OrConvQADataset
.. autoclass:: datamaestro_text.data.conversation.orconvqa.OrConvQADatasetHistoryEntry
:members:

.. autoxpmconfig:: OrConvQADataset
:members: iter
6 changes: 6 additions & 0 deletions docs/source/api/embeddings.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Embeddings
==========

.. autoxpmconfig:: datamaestro_text.data.embeddings.WordEmbeddings

.. autoxpmconfig:: datamaestro_text.data.embeddings.WordEmbeddingsText
3 changes: 3 additions & 0 deletions docs/source/api/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@ Datamaestro Text API
text
ir
conversation
embeddings
recommendation
nlp
15 changes: 15 additions & 0 deletions docs/source/api/ir.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,15 @@ Topics
:members: iter, count

.. autoxpmconfig:: datamaestro_text.data.ir.csv.Topics
.. autoxpmconfig:: datamaestro_text.data.ir.TopicsStore

.. autoxpmconfig:: datamaestro_text.transforms.ir.TopicWrapper

Dataset-specific Topics
-----------------------

.. autoxpmconfig:: datamaestro_text.data.ir.trec.TrecTopics
.. autoxpmconfig:: datamaestro_text.data.ir.cord19.Topics

Documents
---------
Expand All @@ -33,6 +38,9 @@ Documents
:members: iter_documents, iter_ids, documentcount
.. autoxpmconfig:: datamaestro_text.data.ir.cord19.Documents
.. autoxpmconfig:: datamaestro_text.data.ir.trec.TipsterCollection
.. autoxpmconfig:: datamaestro_text.data.ir.csv.Documents
.. autoxpmconfig:: datamaestro_text.data.ir.stores.OrConvQADocumentStore
.. autoxpmconfig:: datamaestro_text.datasets.irds.data.LZ4DocumentStore

Assessments
-----------
Expand All @@ -52,12 +60,19 @@ Runs
.. autoxpmconfig:: datamaestro_text.data.ir.csv.AdhocRunWithText
.. autoxpmconfig:: datamaestro_text.data.ir.trec.TrecAdhocRun


Results
-------

.. autoxpmconfig:: datamaestro_text.data.ir.AdhocResults
.. autoxpmconfig:: datamaestro_text.data.ir.trec.TrecAdhocResults
:members: get_results

Evaluation
----------

.. autoxpmconfig:: datamaestro_text.data.ir.Measure


Reranking
---------
Expand Down
4 changes: 4 additions & 0 deletions docs/source/api/nlp.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
NLP
===

.. autoxpmconfig:: datamaestro_text.data.tagging.CoNLL_U
6 changes: 6 additions & 0 deletions docs/source/api/recommendation.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Recommendation
==============


.. autoxpmconfig:: datamaestro_text.data.recommendation.RatedItems
.. autoxpmconfig:: datamaestro_text.data.recommendation.Movielens
2 changes: 2 additions & 0 deletions docs/source/api/text.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@ Text API
========

.. autoxpmconfig:: datamaestro_text.data.text.TextFolder
.. autoxpmconfig:: datamaestro_text.data.text.TextFile
.. autoxpmconfig:: datamaestro_text.data.text.TrainingText
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
datamaestro>=1.0.2
datamaestro>=1.1.0
ir_datasets
attrs
Empty file.
18 changes: 1 addition & 17 deletions src/datamaestro_text/data/conversation/canard.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,10 @@
from typing import Iterator, List, Optional
from typing import Iterator, List
from attr import define
import json
from datamaestro.data import File
from datamaestro.record import recordtypes
from datamaestro_text.data.ir.base import GenericTopicRecord
from .base import (
AnswerEntry,
ConversationTree,
RetrievedEntry,
SingleConversationTree,
SimpleDecontextualizedItem,
AnswerConversationRecord,
)
from . import ConversationDataset

Expand All @@ -35,16 +29,6 @@ class CanardConversation:
"""Question number"""


@recordtypes(SimpleDecontextualizedItem)
class CanardTopicRecord(GenericTopicRecord):
pass


@recordtypes(AnswerEntry, RetrievedEntry)
class CanardAnswerRecord(AnswerConversationRecord):
pass


class CanardDataset(ConversationDataset, File):
"""A dataset in the CANARD JSON format"""

Expand Down
18 changes: 3 additions & 15 deletions src/datamaestro_text/data/conversation/orconvqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,9 @@
from attr import define
import json
from datamaestro.data import File
from datamaestro.record import recordtypes
from datamaestro.record import Record

from datamaestro_text.data.ir.base import (
TopicRecord,
GenericTopicRecord,
IDItem,
SimpleTextItem,
)
Expand Down Expand Up @@ -65,16 +63,6 @@ class OrConvQADatasetEntry:
"""Relevance status for evidences"""


@recordtypes(SimpleDecontextualizedItem)
class OrConvQATopicRecord(GenericTopicRecord):
pass


@recordtypes(AnswerEntry, RetrievedEntry)
class OrConvQAAnswerRecord(AnswerConversationRecord):
pass


class OrConvQADataset(ConversationDataset, File):
def entries(self) -> Iterator[OrConvQADatasetEntry]:
"""Iterates over re-written query with their context"""
Expand Down Expand Up @@ -114,14 +102,14 @@ def __iter__(self) -> Iterator[ConversationTree]:

# Add to current
history.append(
OrConvQATopicRecord(
Record(
IDItem(query_no),
SimpleTextItem(entry.query),
SimpleDecontextualizedItem(entry.rewrite),
)
)
history.append(
OrConvQAAnswerRecord(
Record(
AnswerEntry(entry.answer.text),
RetrievedEntry(entry.evidences, entry.retrieval_labels),
)
Expand Down
19 changes: 7 additions & 12 deletions src/datamaestro_text/data/ir/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from datamaestro.data import Base
from datamaestro_text.utils.files import auto_open
from datamaestro_text.utils.iter import BatchIterator
from datamaestro.record import Record
from datamaestro.record import record_type, RecordType
from .base import ( # noqa: F401
# Record items
IDItem,
Expand All @@ -21,13 +21,8 @@
DocumentRecord,
SimpleTextItem,
ScoredItem,
# Pre-defined usual records
GenericTopicRecord,
GenericDocumentRecord,
IDTopicRecord,
IDDocumentRecord,
SimpleTextTopicRecord,
SimpleTextDocumentRecord,
# Create records
create_record,
# Other things
AdhocAssessment,
)
Expand Down Expand Up @@ -237,13 +232,13 @@ def count(self):

@property
@abstractmethod
def topic_recordtype(self) -> Type[Record]:
def topic_recordtype(self) -> RecordType:
"""The set of records for topics"""
...

@property
@abstractmethod
def document_recordtype(self) -> Type[Record]:
def document_recordtype(self) -> RecordType:
"""The class for documents"""
...

Expand Down Expand Up @@ -281,12 +276,12 @@ def _topic(self):
@cached_property
def topic_recordtype(self) -> Type[TopicRecord]:
"""The class for topics"""
return IDTopicRecord if self.topic_ids else SimpleTextTopicRecord
return record_type(IDItem) if self.topic_ids else record_type(SimpleTextItem)

@cached_property
def document_recordtype(self) -> Type[DocumentRecord]:
"""The class for documents"""
return IDDocumentRecord if self.doc_ids else SimpleTextDocumentRecord
return record_type(IDItem) if self.doc_ids else record_type(SimpleTextItem)


@define(kw_only=True)
Expand Down
62 changes: 10 additions & 52 deletions src/datamaestro_text/data/ir/base.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,10 @@
from abc import ABC, abstractmethod
from attrs import define
from typing import List
from datamaestro.record import Record, Item, recordtypes
from datamaestro.record import Record, Item, record_type


class BaseRecord(Record):
@classmethod
def from_text(cls, text: str, *items: Item):
return cls(SimpleTextItem(text), *items)

@classmethod
def from_id(cls, id: str, *items: Item):
return cls(IDItem(id), *items)


class TopicRecord(BaseRecord):
"""Topic record"""


class DocumentRecord(BaseRecord):
"""Document record"""
TopicRecord = DocumentRecord = Record


@define()
Expand Down Expand Up @@ -79,38 +64,11 @@ class AdhocAssessedTopic:
"""List of assessments for this topic"""


# --- Commonly used types


@recordtypes(IDItem)
class IDTopicRecord(TopicRecord):
pass


@recordtypes(IDItem)
class IDDocumentRecord(DocumentRecord):
pass


@recordtypes(SimpleTextItem)
class SimpleTextTopicRecord(TopicRecord):
pass


@recordtypes(SimpleTextItem)
class SimpleTextDocumentRecord(DocumentRecord):
pass


@recordtypes(IDItem, TextItem)
class GenericDocumentRecord(DocumentRecord):
@classmethod
def create(cls, id: str, text: str, *items: Item):
return cls(IDItem(id), SimpleTextItem(text), *items)


@recordtypes(IDItem, TextItem)
class GenericTopicRecord(TopicRecord):
@classmethod
def create(cls, id: str, text: str, *items: Item):
return cls(IDItem(id), SimpleTextItem(text), *items)
def create_record(*items: Item, id: str = None, text: str = None):
    """Build a ``Record`` from explicit items plus optional id/text shortcuts.

    The positional ``items`` are passed to ``Record`` first. When ``id`` is
    not ``None`` it is wrapped in an ``IDItem``; when ``text`` is not ``None``
    it is wrapped in a ``SimpleTextItem``. These wrapped items follow the
    positional ones, id before text.
    """
    # Pair each wrapper class with its raw value; only non-None values are
    # wrapped, and the tuple order keeps the id item ahead of the text item.
    shortcuts = ((IDItem, id), (SimpleTextItem, text))
    wrapped = [wrap(value) for wrap, value in shortcuts if value is not None]
    return Record(*items, *wrapped)
12 changes: 3 additions & 9 deletions src/datamaestro_text/data/ir/cord19.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import Iterator

from datamaestro.data import File, documentation
from datamaestro.record import recordtypes
from datamaestro.record import Record
from datamaestro_text.data.ir import Documents, TopicRecord, Topics, IDItem
from datamaestro_text.data.ir.formats import (
DocumentWithTitle,
Expand All @@ -11,12 +11,6 @@
)
from datamaestro.data.csv import Generic as GenericCSV
import xml.etree.ElementTree as ET
from datamaestro_text.data.ir.base import GenericDocumentRecord


@recordtypes(DocumentWithTitle)
class CordDocumentRecord(GenericDocumentRecord):
pass


class Topics(Topics, File):
Expand All @@ -42,11 +36,11 @@ def topic_recordtype(self):

class Documents(Documents, GenericCSV):
@documentation
def iter(self) -> Iterator[CordDocumentRecord]:
def iter(self) -> Iterator[Record]:
"""Returns an iterator over adhoc documents"""
with self.path.open("r") as fp:
for row in DictReader(fp):
yield CordDocumentRecord(
yield Record(
IDItem(row["cord_uid"]),
DocumentWithTitle(row["abstract"], row["title"]),
)
13 changes: 7 additions & 6 deletions src/datamaestro_text/data/ir/csv.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from functools import cached_property
from pathlib import Path
from typing import Iterator, Tuple, Type

from experimaestro import Param, Option, Constant, Meta
from datamaestro.definitions import argument
from datamaestro.record import Record
from datamaestro.record import Record, RecordType
import datamaestro_text.data.ir as ir
from datamaestro_text.data.ir.base import GenericTopicRecord, IDItem, SimpleTextItem
from datamaestro_text.data.ir.base import IDItem, SimpleTextItem
from datamaestro_text.interfaces.plaintext import read_tsv


Expand All @@ -23,14 +24,14 @@ class Topics(ir.Topics):

def iter(self):
return (
GenericTopicRecord(IDItem(qid), SimpleTextItem(title))
Record(IDItem(qid), SimpleTextItem(title))
for qid, title in read_tsv(self.path)
)

@property
def topic_recordtype(self) -> Type[Record]:
@cached_property
def topic_recordtype(self) -> RecordType:
"""The class for topics"""
return GenericTopicRecord
return RecordType(IDItem, SimpleTextItem)


class Documents(ir.Documents):
Expand Down
Loading

0 comments on commit a979367

Please sign in to comment.