In [1]:
#| default_exp sequencing

# Sequencer
The Sequencer abstract class defines the interface for implementations that identify the sequence of target items generated in raw response text. Example concrete implementations of this class can be based on compositions of Segmenter and Matcher strategies or on more holistic heuristic or statistical methods.

In [2]:
#| code-summary: specify the abstract class for implementing target item sequencing strategies

#| export
from abc import ABC, abstractmethod
from typing import List, Dict, Optional


class Sequencer(ABC):
    """
    Abstract base class for target item sequencing strategies.

    A sequencer is used as a callable, so a custom sequencer inherits from
    this class and overrides `__call__`.
    """

    @abstractmethod
    def __call__(
        self,
        response_transcript: str,
        target_items: List[str],
        target_context: str = '',
    ) -> Dict[str, object]:
        """
        Identifies the sequence of target items in a response transcript.

        Parameters:
            response_transcript (str): The raw response text to be sequenced.
            target_items (List[str]): The target items to look for in the response.
            target_context (str): Context of the target items, if applicable (default '').

        Returns:
            Dict[str, object]: Dictionary containing:
                - 'target_context': The string containing the context of the target items. (if applicable)
                - 'target_items': The list of target items
                - 'response_transcript': The input text
                - 'response_units': The list of response units, each a dictionary holding the unit's text and its (start, end) character span(s)
                - 'matches': a 2-D boolean array of shape (len(target_items), len(response_units)) containing True if the target item matches the response unit at the corresponding index.
        """


## Pipeline Sequencer
A concrete class that allows users to compose concrete Segmenter and Matcher classes to identify the sequence of target items generated in a trial. This class provides a flexible way to create and test various combinations of Segmenter and Matcher implementations.


In [3]:
#| code-summary: implement a pipeline sequencer that composes a segmenter and matcher to identify the sequence of target items generated in a response text

#| export
from response_sequencer.segmenting import Segmenter
from response_sequencer.matching import Matcher
import numpy as np

class PipelineSequencer(Sequencer):
    """
    Sequencer assembled from two interchangeable stages: a Segmenter that
    splits the response transcript into response units, and a Matcher that
    relates those units to the target items.
    """

    def __init__(self, segmenter: Segmenter, matcher: Matcher):
        """
        Stores the two pipeline stages.

        Parameters:
            segmenter (Segmenter): An instance of a concrete Segmenter implementation.
            matcher (Matcher): An instance of a concrete Matcher implementation.
        """
        self.segmenter, self.matcher = segmenter, matcher

    def __call__(
        self,
        response_transcript: str,
        target_items: List[str],
        target_context: str = '',
    ) -> Dict[str, object]:
        """
        Segments the transcript, matches the resulting units against the
        target items, and bundles inputs and outputs into one dictionary.

        Parameters:
            response_transcript (str): The raw response text to be segmented.
            target_items (List[str]): The target items to match against.
            target_context (str): Context of the target items, if applicable (default '').

        Returns:
            Dict[str, object]: Dictionary containing:
                - 'target_context': The `target_context` argument, unchanged.
                - 'target_items': The `target_items` argument, unchanged.
                - 'response_transcript': The `response_transcript` argument, unchanged.
                - 'response_units': The segmenter's output for the transcript.
                - 'matches': The matcher's output, returned as-is (expected to be
                  a list of lists representing a 2-D boolean array of shape
                  (len(target_items), len(response_units))).
        """
        units = self.segmenter(response_transcript)

        # The first four entries are known before matching; 'matches' is added
        # last so the key order stays the same as the documented result layout.
        result = dict(
            target_context=target_context,
            target_items=target_items,
            response_transcript=response_transcript,
            response_units=units,
        )
        result['matches'] = self.matcher(
            units, target_items, response_transcript, target_context)
        return result

## GPT-3 Sequencer
A concrete class that uses the GPT-3 API to identify the sequence of target items in a response text by prompting the model with the text and the list of target items. This class should provide a flexible way to create and test various GPT-3-based sequencing implementations.

In [4]:
#| code-summary: implement a GPT-3-based sequencer that identifies the sequence of target items generated in a response text

import openai

class GPT3Sequencer(Sequencer):
    """
    Concrete implementation of Sequencer that uses a GPT-3-based approach for
    identifying the sequence of target items generated in a response text.

    The class implements `__call__` (the abstract method declared by
    Sequencer) so that it can be instantiated; `sequence` remains available
    for callers that only want the ordered list of items.
    """

    def __init__(self, api_key: str, model: str = "text-davinci-002"):
        """
        Initializes the GPT3Sequencer with an API key and a GPT-3 model name.

        Parameters:
        ----------
        api_key : str
            The API key for accessing the OpenAI GPT-3 API.
        model : str, optional
            The GPT-3 model to be used for sequencing (default is "text-davinci-002").
        """
        self.api_key = api_key
        self.model = model
        # Note: this sets the key on the `openai` module itself, so it is shared
        # by every user of the module in this process.
        openai.api_key = self.api_key

    def __call__(
        self, response_transcript: str, target_items: List[str], target_context: str = '') -> Dict[str, object]:
        """
        Identifies the sequence of target items in the response transcript and
        returns it in the dictionary layout used by the other sequencers.

        The model only reports an ordered list of items, so this method maps
        that list onto the result layout as follows: every reported item
        becomes one response unit (with no character spans, since none are
        reported), and a target item matches a unit when the two strings are
        exactly equal.

        Parameters:
        ----------
        response_transcript : str
            The response text to be processed.
        target_items : List[str]
            A list of target items to identify in the response text.
        target_context : str, optional
            Passed through to the result unchanged; it is not included in the prompt.

        Returns:
        --------
        Dict[str, object]
            Dictionary containing:
                - 'target_context': The `target_context` argument, unchanged.
                - 'target_items': The `target_items` argument, unchanged.
                - 'response_transcript': The `response_transcript` argument, unchanged.
                - 'response_units': One {'text': item, 'spans': []} dictionary per reported item, in reported order.
                - 'matches': list of lists of booleans of shape (len(target_items), len(response_units)); True where the target item equals the unit text.
        """
        sequence = self.sequence(response_transcript, target_items)

        response_units = [{'text': item, 'spans': []} for item in sequence]
        matches = [
            [target == item for item in sequence]
            for target in target_items
        ]

        return {
            'target_context': target_context,
            'target_items': target_items,
            'response_transcript': response_transcript,
            'response_units': response_units,
            'matches': matches
        }

    def sequence(self, input_text: str, target_items: List[str]) -> List[str]:
        """
        Identifies the sequence of target items in the input text using the GPT-3 model.

        Parameters:
        ----------
        input_text : str
            The input text to be processed.
        target_items : List[str]
            A list of target items to identify in the input text.

        Returns:
        --------
        List[str]
            The pieces of the model's completion after splitting it on ", ",
            with empty pieces removed. An empty completion yields an empty list.
        """
        prompt = f"Given the text: '{input_text}', find the sequence of the following target items: {', '.join(target_items)}.\n"
        # NOTE(review): `openai.Completion.create(engine=...)` looks like the
        # pre-1.0 openai-python interface — confirm against the pinned version.
        response = openai.Completion.create(
            engine=self.model,
            prompt=prompt,
            max_tokens=50,
            n=1,
            stop=None,
            temperature=0.5,
        )

        result = response.choices[0].text.strip()
        # "".split(", ") returns [''], which would be reported as a spurious
        # item; drop empty pieces so an empty completion means "nothing found".
        # NOTE(review): target items that themselves contain ", " cannot be
        # recovered intact from a ", "-separated completion.
        return [item for item in result.split(", ") if item]


## Reference Sequencers

In [5]:
#| code-summary: implement a range of reference sequencers used for comparison against a baseline sequencer

#| export
from response_sequencer.segmenting import MultiSentenceFragmentsSegmenter, SentenceSegmenter
from response_sequencer.scoring import ContextualizedEmbeddingScorer, SentenceTransformerScorer
from response_sequencer.matching import MaximumScoreMatcher

class BaselineSequencer(Sequencer):
    """
    Reference sequencer: a PipelineSequencer that segments the response with
    SentenceSegmenter and matches with MaximumScoreMatcher over a
    SentenceTransformerScorer using the 'all-mpnet-base-v2' model.
    """

    def __init__(self):
        """Builds the underlying pipeline; takes no configuration."""
        model_name = 'all-mpnet-base-v2'
        segmenter = SentenceSegmenter()
        # Use the local defined above rather than repeating the literal, so the
        # model name is stated exactly once.
        scorer = SentenceTransformerScorer(model_name=model_name)
        matcher = MaximumScoreMatcher(scorer)
        self.sequencer = PipelineSequencer(segmenter, matcher)

    def __call__(
        self, response_transcript: str, target_items: List[str], target_context: str = ''
        ) -> Dict[str, object]:
        """
        Delegates to the wrapped PipelineSequencer and returns its result
        dictionary unchanged.

        Parameters:
            response_transcript (str): The raw response text to be sequenced.
            target_items (List[str]): The target items to look for in the response.
            target_context (str): Context of the target items, if applicable (default '').
        """
        return self.sequencer(response_transcript, target_items, target_context)

class BaselinePlusContextSequencer(Sequencer):
    """
    Reference sequencer: a PipelineSequencer that segments the response with
    SentenceSegmenter and matches with MaximumScoreMatcher over a
    ContextualizedEmbeddingScorer ('sentence-transformers/all-mpnet-base-v2',
    layer_depth=1).
    """

    def __init__(self):
        """Builds the underlying pipeline; takes no configuration."""
        # Arguments are evaluated left to right, so the segmenter is still
        # constructed before the scorer and the matcher.
        self.sequencer = PipelineSequencer(
            SentenceSegmenter(),
            MaximumScoreMatcher(
                ContextualizedEmbeddingScorer(
                    model_name='sentence-transformers/all-mpnet-base-v2',
                    layer_depth=1,
                )
            ),
        )

    def __call__(
        self,
        response_transcript: str,
        target_items: List[str],
        target_context: str = '',
    ) -> Dict[str, object]:
        """
        Delegates to the wrapped PipelineSequencer and returns its result
        dictionary unchanged.
        """
        pipeline = self.sequencer
        return pipeline(response_transcript, target_items, target_context)

class BaselinePlusFragmentsSequencer(Sequencer):
    """
    Reference sequencer: a PipelineSequencer that segments the response with
    MultiSentenceFragmentsSegmenter (max_sentences=2, min_tokens=2) and
    matches with MaximumScoreMatcher over a SentenceTransformerScorer using
    the 'all-mpnet-base-v2' model.
    """

    def __init__(self):
        """Builds the underlying pipeline; takes no configuration."""
        # Arguments are evaluated left to right, so the segmenter is still
        # constructed before the scorer and the matcher.
        self.sequencer = PipelineSequencer(
            MultiSentenceFragmentsSegmenter(max_sentences=2, min_tokens=2),
            MaximumScoreMatcher(
                SentenceTransformerScorer(model_name='all-mpnet-base-v2')
            ),
        )

    def __call__(
        self,
        response_transcript: str,
        target_items: List[str],
        target_context: str = '',
    ) -> Dict[str, object]:
        """
        Delegates to the wrapped PipelineSequencer and returns its result
        dictionary unchanged.
        """
        pipeline = self.sequencer
        return pipeline(response_transcript, target_items, target_context)

class BaselinePlusContextPlusFragmentsSequencer(Sequencer):
    """
    Reference sequencer: a PipelineSequencer that segments the response with
    MultiSentenceFragmentsSegmenter (max_sentences=2, min_tokens=2) and
    matches with MaximumScoreMatcher over a ContextualizedEmbeddingScorer
    ('sentence-transformers/all-mpnet-base-v2', layer_depth=1).
    """

    def __init__(self):
        """Builds the underlying pipeline; takes no configuration."""
        # Arguments are evaluated left to right, so the segmenter is still
        # constructed before the scorer and the matcher.
        self.sequencer = PipelineSequencer(
            MultiSentenceFragmentsSegmenter(max_sentences=2, min_tokens=2),
            MaximumScoreMatcher(
                ContextualizedEmbeddingScorer(
                    model_name='sentence-transformers/all-mpnet-base-v2',
                    layer_depth=1,
                )
            ),
        )

    def __call__(
        self,
        response_transcript: str,
        target_items: List[str],
        target_context: str = '',
    ) -> Dict[str, object]:
        """
        Delegates to the wrapped PipelineSequencer and returns its result
        dictionary unchanged.
        """
        pipeline = self.sequencer
        return pipeline(response_transcript, target_items, target_context)

In [6]:
#| code-summary: demonstrate the use of the baseline sequencer

from response_sequencer.datasets import SensesDataset
from response_sequencer.demo import flatten_matches
import os
import json

section_tag = 'base'  # unique identifier for this variation of notebook parameters
output_dir = '../../data/'

# Both dataset files live in `output_dir` and are prefixed with the section tag.
senses_path = os.path.join(output_dir, f'{section_tag}_senses.h5')
sense_pool_path = os.path.join(output_dir, f'{section_tag}_sense_pool.txt')
dataset = SensesDataset(senses_path, sense_pool_path)
sequencer = BaselineSequencer()

# Take the first trial and re-sequence its transcript against the target items
# that `flatten_matches` derives from the trial's stored matches.
trial_index = 0
trial = dataset[trial_index]

response_text = trial["response_transcript"]
matched_target_items, reference_units = flatten_matches(
    trial['target_items'],
    trial['response_units'],
    trial['matches'],
)
result = sequencer(response_text, matched_target_items)

# Round-trip through JSON so the displayed value is plain JSON-compatible data.
json.loads(json.dumps([result, result]))

[{'target_context': '',
  'target_items': ['(n) ace (one of four playing cards in a deck having a single pip on its face)',
   '(n) ace, adept, champion, sensation, maven, mavin, virtuoso, genius, hotshot, star, superstar, whiz, whizz, wizard, wiz (someone who is dazzlingly skilled in any field)',
   '(n) ace (a serve that the receiver is unable to reach)'],
  'response_transcript': "Ace, um, I think it's like a, something to do with tennis, I don't really know. Um, there's also, like, um, in a card game there's the ace, um, which is like generally, like the highest point card, God, I haven't played cards in so long. Um, an ace can be someone that's really good at something.",
  'response_units': [{'text': "Ace, um, I think it's like a, something to do with tennis, I don't really know.",
    'spans': [[0, 79]]},
   {'text': "Um, there's also, like, um, in a card game there's the ace, um, which is like generally, like the highest point card, God, I haven't played cards in so long.",
   

In [7]:
#| hide
# Run nbdev's export step (presumably writes the `#| export` cells to the
# module named by `default_exp` — see the nbdev documentation).
import nbdev; nbdev.nbdev_export()