# Base Pipeline

![](../assets/classification_pipeline.png)

## data preparation

First, split the data into train and test sets, then launch a client backed by the vector database.

In [1]:
import json

# Load the banking77 intent records. The context manager closes the file handle
# as soon as parsing is done instead of leaving it open until garbage collection.
with open('../data/records/banking77.json') as records_file:
    banking77 = json.load(records_file)

# Display the first record as the cell output.
banking77[0]

{'intent_id': 0,
 'intent_name': 'activate_my_card',
 'sample_utterances': ["Please help me with my card.  It won't activate.",
  'I tired but an unable to activate my card.',
  'I want to start using my card.',
  'How do I verify my new card?',
  "I tried activating my plug-in and it didn't piece of work"],
 'regexp_for_sampling': [],
 'regexp_as_rules': []}

In [2]:
import itertools as it
from sklearn.model_selection import train_test_split


def get_sample_utterances(dataset: list[dict]):
    """Flatten intent records into two parallel lists.

    Every item found under an intent's 'sample_utterances' key is paired with
    that intent's 'intent_id'. Records are visited in dataset order.

    Return: utterances, labels
    """
    flat_utterances = []
    flat_labels = []

    for record in dataset:
        samples = record['sample_utterances']
        flat_utterances.extend(samples)
        # one copy of the intent id per utterance keeps both lists aligned
        flat_labels.extend([record['intent_id']] * len(samples))

    return flat_utterances, flat_labels


def split_sample_utterances(dataset: list[dict], test_size=0.25, random_state=0):
    """
    Split all sample utterances of `dataset` into a train and a test part.

    The utterances and labels are flattened with `get_sample_utterances` and
    passed to `train_test_split` with shuffling enabled and `stratify=labels`.

    Args:
        dataset: intent records, as accepted by `get_sample_utterances`
        test_size: forwarded to `train_test_split` (default keeps the previous 0.25)
        random_state: forwarded to `train_test_split` (default keeps the previous 0)

    Return: utterances_train, utterances_test, labels_train, labels_test

    TODO: confirm that the stratified split always leaves every class in the test set
    """

    utterances, labels = get_sample_utterances(dataset)

    return train_test_split(
        utterances,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
        shuffle=True,
    )

In [3]:
# Unpacking order follows the "Return:" line of split_sample_utterances' docstring.
utterances_train, utterances_test, labels_train, labels_test = split_sample_utterances(banking77)
# Show the sizes of the two splits.
len(utterances_train), len(utterances_test)

(288, 97)

In [4]:
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction as EmbFunc
from chromadb import PersistentClient, ClientAPI


def create_collection(
    utterances: list[str],
    labels: list[int],
    client: ClientAPI,
    name="example_collection",
    embedder_name="Alibaba-NLP/gte-base-en-v1.5",
):
    """Store labelled utterances in a chroma collection and return it.

    Args:
        utterances: documents to add
        labels: intent id of each utterance; the distinct values must be
            exactly 0..n_classes-1 (checked with an assert)
        client: chroma client used to get or create the collection
        name: collection name
        embedder_name: model name handed to the sentence-transformer
            embedding function

    The number of classes is saved in the collection metadata under
    'n_classes'. Documents get the ids "0", "1", ... in input order and the
    metadata {'intent_id': label}.
    """
    distinct_labels = set(labels)
    n_classes = len(distinct_labels)
    assert set(range(n_classes)) == distinct_labels, "labels must be from [0,n_classes-1]"

    embedding_function = EmbFunc(model_name=embedder_name, trust_remote_code=True)
    collection = client.get_or_create_collection(
        name=name,
        embedding_function=embedding_function,
        metadata={'n_classes': n_classes},
    )

    document_ids = list(map(str, range(len(utterances))))
    document_metadatas = [{'intent_id': label} for label in labels]
    collection.add(
        documents=utterances,
        ids=document_ids,
        metadatas=document_metadatas,
    )

    return collection

In [5]:
client = PersistentClient(path='../data/chroma')

# Start from a clean collection, but only delete it when it is actually there:
# `delete_collection` raises for an unknown name, which would break this cell
# on a fresh `../data/chroma` directory (Restart & Run All).
# `list_collections()` returns Collection objects or plain names depending on
# the chromadb version, so both shapes are reduced to a name here.
existing_names = {getattr(entry, "name", entry) for entry in client.list_collections()}
if "example_collection" in existing_names:
    client.delete_collection("example_collection")

collection = create_collection(
    utterances_train,
    labels_train,
    client
)
# Number of stored documents, shown as the cell output.
collection.count()

  from tqdm.autonotebook import tqdm, trange


288

## RegExp

In [7]:
import json

# Load the dream intent records. The context manager closes the file handle
# as soon as parsing is done instead of leaving it open until garbage collection.
with open('../data/records/dream.json') as records_file:
    dream = json.load(records_file)

# Display the first record as the cell output.
dream[0]

{'intent_id': 0,
 'intent_name': 'what_are_you_talking_about',
 'sample_utterances': [],
 'regexp_for_sampling': ['(alexa ){0,1}what are ((you)|(we)) ((talking about)|(discussing))',
  '(alexa ){0,1}what ((you)|(we)) are (even ){0,1}((talking about)|(discussing))',
  '(alexa ){0,1}what does it mean',
  '(alexa ){0,1}pass that by me again',
  "(alexa ){0,1}i ((don't)|(didn't)|(do not)|(did not)) get it",
  '(alexa ){0,1}what it is about',
  '(alexa ){0,1}what is it about',
  'i lost common ground',
  '(alexa ){0,1}what (even ){0,1}is that',
  "(i ((did not get)|(don't understand)|(don't get)) ){0,1}what do you mean( alexa){0,1}",
  "(sorry, ){0,1}i ((don't)|(do not)|(didn't)|(did not)) ((understand)|(get))( ((what you mean)|(what are you talking about)))( alexa){0,1}",
  '((what you mean)|(what are you talking about))( alexa){0,1}',
  "i don't know what you just said"],
 'regexp_as_rules': ['(alexa ){0,1}are we having a communication problem',
  "(alexa ){0,1}i don't think you understan

In [8]:
import re

def regexp(utterance: str, intents_patterns: list[dict]):
    """Return the set of intent ids whose patterns match `utterance`.

    For every intent, the patterns listed under 'regexp_for_sampling' and
    'regexp_as_rules' are tried with `re.match`, so a pattern only has to
    match from the beginning of the utterance, not all of it.
    """
    return {
        intent['intent_id']
        for intent in intents_patterns
        for pattern in intent['regexp_for_sampling'] + intent['regexp_as_rules']
        if re.match(pattern, utterance) is not None
    }

In [9]:
# Rule-based detection on a sample utterance; the output is the set of matching intent ids.
regexp(utterance='what are you talking about', intents_patterns=dream)

{0, 5}

In [10]:
# Second probe utterance against the same dream patterns.
regexp(utterance='tell me something else', intents_patterns=dream)

{1, 6}

In [11]:
# Third probe: a short utterance against the same dream patterns.
regexp(utterance='kind of', intents_patterns=dream)

{6}

## Retrieval

In [6]:
from chromadb import Collection


def retrieval(utterance: str, collection: Collection, n_results: int = 4):
    """Query `collection` for the stored utterances closest to `utterance`.

    Args:
        utterance: text to look up
        collection: chroma collection to query; its embedding function is used
            to embed the utterance
        n_results: number of neighbours to request (default keeps the previous
            hard-coded value of 4)

    Returns:
        The result of `collection.query` (metadatas and documents included),
        extended with the key 'utterance_embedding' holding the embedding of
        the query utterance.
    """
    # The embedding is computed explicitly (rather than passing query_texts)
    # so that it can be returned to the caller alongside the neighbours.
    query_embeddings = collection._embedding_function([utterance])
    query_res = collection.query(
        query_embeddings=query_embeddings,
        n_results=n_results,
        include=["metadatas", "documents"]  # one can add "embeddings", "distances"
    )
    query_res['utterance_embedding'] = query_embeddings[0]
    return query_res

In [7]:
# Example query against the collection built from the train split.
utterance = 'i want a new card'
query_res = retrieval(utterance, collection)

In [8]:
# Display the raw query result; it also carries the full 'utterance_embedding'
# vector added by `retrieval`, so the output is long.
query_res

{'ids': [['40', '23', '240', '161']],
 'distances': None,
 'metadatas': [[{'intent_id': 39},
   {'intent_id': 39},
   {'intent_id': 43},
   {'intent_id': 12}]],
 'embeddings': None,
 'documents': [['I want some extra physical cards.',
   "I'd like to order an additional card",
   'Can I request a card?',
   'I need to get my card quickly']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents'],
 'utterance_embedding': [0.12841056287288666,
  -0.6709969639778137,
  -0.9106872081756592,
  -0.03350679576396942,
  -0.4920174479484558,
  -0.2790452837944031,
  -1.537368893623352,
  -0.498529851436615,
  -0.87599116563797,
  -1.1809755563735962,
  0.27483147382736206,
  -0.0233471542596817,
  1.3668469190597534,
  0.6037641167640686,
  -0.24536572396755219,
  1.1974239349365234,
  -0.5540984272956848,
  -0.2033131867647171,
  -0.5764469504356384,
  0.18888752162456512,
  -0.43474602699279785,
  -0.07547374814748764,
  0.6128082871437073,
  -1.3009823560714722,
  -0.507562398

## Scoring

modules:
- knn
- linear
- dnnc

In [39]:
import numpy as np


class AbstractScorer:
    """Common interface of the scoring modules.

    Subclasses implement `fit` and `predict_proba`; `predict_topk` is derived
    from `predict_proba` and shared by all of them.
    """

    # The annotation is quoted so that defining this class does not depend on
    # `Collection` having been imported by an earlier cell.
    def fit(self, collection: "Collection"):
        """Prepare the scorer from a collection; must be overridden."""
        raise NotImplementedError()

    def predict_proba(self, utterance: str):
        """Return a 1-D sequence of scores indexed by class; must be overridden."""
        raise NotImplementedError()

    def predict_topk(self, utterance, k=3):
        """Return the indices of the `k` best-scoring classes, best first.

        Args:
            utterance: text passed on to `predict_proba`
            k: number of classes to return; values larger than the number of
                scores are clamped, and 0 yields an empty array

        Raises:
            ValueError: if `k` is negative
        """
        if k < 0:
            raise ValueError("k must be non-negative")

        # asarray lets subclasses return plain lists as well as arrays
        scores = np.asarray(self.predict_proba(utterance))
        k = min(k, scores.shape[0])
        if k == 0:
            # `[-0:]` would select every element, so handle this case explicitly
            return np.array([], dtype=np.intp)

        # argpartition finds the k largest in linear time; only those k get sorted
        top_indices = np.argpartition(scores, kth=-k)[-k:]
        top_scores = scores[top_indices]
        return top_indices[np.argsort(top_scores)][::-1]

### knn

In [40]:
import numpy as np

class KNNScorer(AbstractScorer):
    """
    Nearest-neighbour scorer: a class score is the share of retrieved
    utterances that carry that class label.

    TODO:
    - add weighted knn?
    """
    def fit(self, collection: Collection):
        """Keep the collection and the class count stored in its metadata."""
        self._collection = collection
        self._n_classes = collection.metadata['n_classes']

    def predict_proba(self, utterance: str):
        """Return label frequencies among the utterances retrieved for `utterance`."""
        neighbour_metadatas = retrieval(utterance, self._collection)['metadatas'][0]
        neighbour_labels = [meta['intent_id'] for meta in neighbour_metadatas]

        # minlength pads the histogram so there is one entry per known class
        votes = np.bincount(neighbour_labels, minlength=self._n_classes)
        return votes / votes.sum()

In [43]:
# Build the kNN scorer on the train collection and score an example utterance.
knn_scorer = KNNScorer()
knn_scorer.fit(collection)
knn_scorer.predict_proba('i want a new card')

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.25, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.5 , 0.  , 0.  , 0.  , 0.25,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ])

In [44]:
# Class indices of the best-scoring intents (default k=3), best first.
knn_scorer.predict_topk('i want a new card')

array([39, 12, 43])

### linear

In [45]:
from sklearn.linear_model import LogisticRegressionCV


class LinearScorer(AbstractScorer):
    """
    Logistic-regression scorer trained on the embeddings stored in a collection.

    TODO:
    - implement different modes (incremental learning with SGD and simple learning with LogisticRegression)
    - ensure that embeddings of train set are not recalculated
    """

    def __init__(self, cv=3, n_jobs=8):
        """
        Args:
            cv: forwarded to `LogisticRegressionCV(cv=...)`
            n_jobs: forwarded to `LogisticRegressionCV(n_jobs=...)`

        The defaults reproduce the previously hard-coded values, so
        `LinearScorer()` behaves as before.
        """
        self._cv = cv
        self._n_jobs = n_jobs

    def fit(self, collection: Collection):
        """Train the classifier on the embeddings and intent ids read from `collection`."""
        dataset = collection.get(include=['embeddings', 'metadatas'])
        features = dataset['embeddings']
        labels = [dct['intent_id'] for dct in dataset['metadatas']]
        # NOTE(review): newer scikit-learn releases reportedly deprecate `multi_class`;
        # confirm against the pinned version before removing it.
        clf = LogisticRegressionCV(cv=self._cv, n_jobs=self._n_jobs, multi_class='multinomial')
        clf.fit(features, labels)

        self._clf = clf
        # kept because predict_proba embeds queries with the collection's embedding function
        self._collection = collection

    def predict_proba(self, utterance: str):
        """Embed `utterance` and return the classifier's probability vector for it."""
        features = self._collection._embedding_function([utterance])
        return self._clf.predict_proba(features)[0]

In [46]:
# Train the linear scorer on the embeddings stored in the train collection.
linear_scorer = LinearScorer()
linear_scorer.fit(collection)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [47]:
# Probability vector of the trained classifier for an example utterance.
linear_scorer.predict_proba('i want a new card')

array([5.36315606e-02, 2.06736865e-03, 1.07819996e-03, 2.84117638e-03,
       1.06543725e-03, 4.14270486e-04, 4.56166199e-04, 9.31523609e-04,
       8.16927696e-04, 1.04347418e-01, 4.14846962e-03, 1.07265188e-02,
       2.87261476e-02, 2.22940571e-02, 4.22492414e-02, 2.29241231e-03,
       3.62684145e-03, 1.13575060e-03, 3.57048793e-03, 5.06323550e-04,
       1.13312843e-03, 9.95915608e-03, 7.87386713e-04, 1.55948122e-02,
       1.01617718e-02, 1.36223739e-02, 7.43484887e-04, 1.21027345e-03,
       1.65229683e-03, 1.14075433e-02, 2.32802378e-02, 1.24439988e-03,
       8.63093249e-04, 3.74815527e-03, 1.91142796e-03, 5.09926445e-04,
       8.36122168e-04, 4.88069047e-03, 2.72668840e-03, 2.16004925e-01,
       5.72615173e-02, 1.98442906e-01, 7.95156098e-04, 5.20176928e-02,
       1.22214771e-03, 1.21927601e-03, 6.29880435e-04, 6.33465044e-03,
       3.60845091e-04, 1.24068092e-03, 8.97284111e-04, 1.52737685e-03,
       3.98661705e-03, 2.60228011e-03, 3.17184426e-03, 1.78225807e-03,
      

In [48]:
# Class indices of the best-scoring intents (default k=3), best first.
linear_scorer.predict_topk('i want a new card')

array([39, 41,  9])

### DNNC