# Base Pipeline

![](../assets/classification_pipeline.png)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../')

## Modules

### data preparation

First, one needs to split the data into train and test sets and then launch a client with the vector DB.

In [3]:
import json

# Read the banking77 intent records; the `with` block closes the file handle
# as soon as parsing is done instead of leaving it open for the kernel lifetime.
with open('../data/intent_records/banking77.json') as banking77_file:
    banking77 = json.load(banking77_file)
# Show the first record to illustrate the schema.
banking77[0]

{'intent_id': 0,
 'intent_name': 'activate_my_card',
 'sample_utterances': ["Please help me with my card.  It won't activate.",
  'I tired but an unable to activate my card.',
  'I want to start using my card.',
  'How do I verify my new card?',
  "I tried activating my plug-in and it didn't piece of work"],
 'regexp_for_sampling': [],
 'regexp_as_rules': []}

In [4]:
from src import DataHandler

data_handler = DataHandler(banking77)

In [5]:
# data_handler.delete_collection(db_name="example_collection")

### RegExp

In [6]:
import json

# Read the dream intent records; the `with` block closes the file handle
# as soon as parsing is done instead of leaving it open for the kernel lifetime.
with open('../data/intent_records/dream.json') as dream_file:
    dream = json.load(dream_file)
# Show the first record to illustrate the schema.
dream[0]

{'intent_id': 0,
 'intent_name': 'what_are_you_talking_about',
 'sample_utterances': [],
 'regexp_for_sampling': ['(alexa ){0,1}what are ((you)|(we)) ((talking about)|(discussing))',
  '(alexa ){0,1}what ((you)|(we)) are (even ){0,1}((talking about)|(discussing))',
  '(alexa ){0,1}what does it mean',
  '(alexa ){0,1}pass that by me again',
  "(alexa ){0,1}i ((don't)|(didn't)|(do not)|(did not)) get it",
  '(alexa ){0,1}what it is about',
  '(alexa ){0,1}what is it about',
  'i lost common ground',
  '(alexa ){0,1}what (even ){0,1}is that',
  "(i ((did not get)|(don't understand)|(don't get)) ){0,1}what do you mean( alexa){0,1}",
  "(sorry, ){0,1}i ((don't)|(do not)|(didn't)|(did not)) ((understand)|(get))( ((what you mean)|(what are you talking about)))( alexa){0,1}",
  '((what you mean)|(what are you talking about))( alexa){0,1}',
  "i don't know what you just said"],
 'regexp_as_rules': ['(alexa ){0,1}are we having a communication problem',
  "(alexa ){0,1}i don't think you understan

In [7]:
import re

def regexp(utterance: str, intents_patterns: list[dict]) -> set:
    """
    Collect the ids of all intents that have at least one pattern matching `utterance`.

    Arguments
    ---
    - `utterance`: text to test against the patterns
    - `intents_patterns`: intent records; each one must provide the keys `intent_id`, `regexp_for_sampling` and `regexp_as_rules` (the latter two are lists of regular expression strings)

    Return
    ---
    set of `intent_id` values of the matched intents (empty set if nothing matched)

    Note
    ---
    patterns are applied with `re.match`, i.e. they are anchored only at the beginning of the utterance, so a pattern matching just a prefix of the utterance still counts as a match
    """
    return {
        intent['intent_id']
        for intent in intents_patterns
        # `any` stops at the first matching pattern: the remaining patterns of
        # the same intent cannot change the result, so they are not evaluated
        if any(
            re.match(pattern, utterance) is not None
            for pattern in intent['regexp_for_sampling'] + intent['regexp_as_rules']
        )
    }

In [8]:
regexp(
    utterance='what are you talking about',
    intents_patterns=dream
)

{0, 5}

In [9]:
regexp(
    utterance='tell me something else',
    intents_patterns=dream
)

{1, 6}

In [10]:
regexp(
    utterance='kind of',
    intents_patterns=dream
)

{6}

### Retrieval

In [11]:
from src.modules import VectorDBModule

vectordb = VectorDBModule(model_name='infgrad/stella-base-en-v2', k=10)

  from tqdm.autonotebook import tqdm, trange


In [12]:
vectordb.fit(data_handler)

No sentence-transformers model found with name infgrad/stella-base-en-v2. Creating a new one with mean pooling.


### Scoring

modules:
- knn
- linear
- dnnc

In [13]:
from src.modules import KNNScorer

In [14]:
knn_scorer = KNNScorer(k=10)
knn_scorer.fit(data_handler)
knn_scorer.predict(['i want a new card', 'new card please'])

array([[0.1, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.1, 0. , 0. , 0.1,
        0. , 0.2, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.1,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0.2, 0.1, 0. , 0. , 0.1, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0.1, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.2, 0.1,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.1, 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0.2, 0.2, 0. , 0. , 0.1, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ]])

In [15]:
knn_scorer.predict_topk(['i want a new card', 'new card please'])

array([[14, 39,  0],
       [11, 40, 39]])

#### linear

In [16]:
from src.modules import LinearScorer

In [17]:
linear_scorer = LinearScorer()
linear_scorer.fit(data_handler)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [18]:
linear_scorer.predict(['i want a new card', 'new card please'])

array([[5.06857367e-02, 4.16150564e-03, 1.37571193e-03, 6.29149965e-03,
        2.06469180e-03, 9.78793881e-04, 8.17828404e-04, 1.14443451e-03,
        4.47680380e-03, 1.59703771e-01, 3.27302955e-02, 1.52204605e-02,
        2.71018459e-02, 1.92528275e-02, 8.30385490e-02, 7.42028009e-03,
        1.51428275e-03, 3.00459763e-03, 4.10515740e-03, 4.57421712e-04,
        1.82732687e-03, 1.00036981e-02, 2.43404716e-03, 1.92502069e-02,
        1.40963900e-02, 1.64224361e-02, 5.50657899e-04, 1.52087228e-03,
        4.12858346e-03, 1.20586539e-02, 2.19611457e-02, 8.55034389e-04,
        1.28547936e-03, 3.52925538e-03, 1.38407059e-03, 1.31016307e-03,
        1.24381276e-03, 1.26546543e-02, 4.28389326e-03, 1.16650568e-01,
        6.74566349e-02, 9.86701907e-02, 2.27468386e-03, 4.96292450e-02,
        1.36061695e-03, 7.51627122e-04, 9.67389846e-04, 2.57934135e-03,
        2.31915731e-04, 2.08830240e-03, 1.11843217e-03, 2.36497811e-03,
        5.91962170e-03, 2.67785402e-03, 2.77533925e-03, 6.642024

In [19]:
linear_scorer.predict_topk(['i want a new card', 'new card please'], k=5)

array([[ 9, 39, 41, 14, 40],
       [39, 40,  9, 41, 43]])

#### DNNC

In [20]:
from src.modules import DNNCScorer

In [21]:
dnnc_scorer = DNNCScorer(model_name="BAAI/bge-reranker-base", k=10)
dnnc_scorer.fit(data_handler)

In [22]:
dnnc_scorer.predict(['i want a new card', 'new card please'])

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.99912232, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [23]:
dnnc_scorer.predict_topk(['i want a new card', 'new card please'], k=1)

array([[43],
       [43]])

### Prediction

In [24]:
from src.modules import ThresholdModule

In [25]:
threshold_predictor = ThresholdModule(single_thresh=True)
threshold_predictor.fit(data_handler)

In [26]:
# Score the two example utterances with the linear scorer fitted earlier.
linear_scores = linear_scorer.predict(['i want a new card', 'new card please'])
# Print the shape of the score matrix as a sanity check.
print(linear_scores.shape)
# Pass the scores to the threshold predictor; its return value is the cell output.
threshold_predictor.predict(linear_scores)

(2, 77)


array([-1, -1])

## Metrics

### Retrieval

In [27]:
from src.metrics import retrieval_map

y_true = 1
y_pred = [2,1,1]

retrieval_map([y_true], [y_pred], k=3)

0.5833333333333333

In [28]:
print(retrieval_map.__doc__)


    Arguments
    ---
    - `query_labels`: for each query, this list contains its class labels
    - `candidates_labels`: for each query, these lists contain class labels of items ranked by a retrieval model (from most to least relevant)
    - `k`: the number of top items to consider for each query

    Return
    ---
    retrieval metric, averaged over all queries
    

    TODO:
    - implement multilabel case, where query_labels: list[list[int]], i.e. each query has multiple intents



In [29]:
from src.metrics import retrieval_hit_rate

query_labels = [1]
candidates_labels = [[1, 4, 5, 2]]
k = 2

retrieval_hit_rate(query_labels, candidates_labels, k)

1.0

In [30]:
from src.metrics import retrieval_precision

query_labels = [1]
candidates_labels = [[1, 1, 3, 4, 5]]
k = 3

retrieval_precision(query_labels, candidates_labels, k)

0.6666666666666666

In [31]:
from src.metrics import retrieval_ndcg

query_labels = [1]
candidates_labels = [[1, 2, 1, 2, 5]]
k = 3

retrieval_ndcg(query_labels, candidates_labels, k=k)

0.9197207891481876

In [32]:
from src.metrics import retrieval_mrr

query_labels = [1]
candidates_labels = [[2, 2, 1, 2, 5]]

retrieval_mrr(query_labels, candidates_labels)

0.3333333333333333

### Scoring

In [33]:
from src.metrics import scoring_neg_cross_entropy

scores = [[0.1, 0.6, 0.3]]
labels = [1]

scoring_neg_cross_entropy(labels, scores)

0.5108256237659907

In [34]:
from src.metrics import scoring_roc_auc

scores = [[0.1, 0.6, 0.3],[0.1, 0.6, 0.3],[0.1, 0.6, 0.3]]
labels = [1, 2, 0]

scoring_roc_auc(labels, scores)

0.5

### Prediction

In [35]:
from src.metrics import prediction_accuracy

y_true = [1,2,3]
y_pred = [1,1,3]

prediction_accuracy(y_true, y_pred)

0.6666666666666666

## Nodes

In [36]:
import yaml

# Read the example pipeline config; the `with` block closes the file handle
# as soon as parsing is done instead of leaving it open for the kernel lifetime.
with open('base_pipeline.assets/example-config.yaml') as config_file:
    pipeline_config = yaml.safe_load(config_file)
# Display the parsed config.
pipeline_config

{'nodes': [{'node_type': 'retrieval',
   'metric': 'retrieval_hit_rate',
   'modules': [{'module_type': 'vector_db',
     'k': [10],
     'model_name': ['avsolatorio/GIST-small-Embedding-v0',
      'infgrad/stella-base-en-v2']}]},
  {'node_type': 'scoring',
   'metric': 'scoring_roc_auc',
   'modules': [{'module_type': 'knn', 'k': [1, 3, 5, 10]},
    {'module_type': 'linear'},
    {'module_type': 'dnnc',
     'model_name': ['BAAI/bge-reranker-base',
      'cross-encoder/ms-marco-MiniLM-L-6-v2'],
     'k': [1, 3, 5, 10]}]},
  {'node_type': 'prediction',
   'metric': 'prediction_accuracy',
   'modules': [{'module_type': 'threshold', 'single_thresh': [True]}]}]}

In [37]:
from src.nodes import Node, RetrievalNode, ScoringNode, PredictionNode

# Lookup table: `node_type` value from the config -> node class implementing it.
available_nodes = {
    'retrieval': RetrievalNode,
    'scoring': ScoringNode,
    'prediction': PredictionNode
}

# Build and fit the nodes one by one, in the order they are listed in the config.
fitted_nodes = []
for node_config in pipeline_config['nodes']:
    # Resolve the class first, then instantiate it with the node's search space and metric.
    node_cls = available_nodes[node_config['node_type']]
    node: Node = node_cls(
        modules_search_spaces=node_config['modules'],
        metric=node_config['metric']
    )
    node.fit(data_handler)
    fitted_nodes.append(node)
    # Progress marker, printed once per fitted node.
    print('fitted!')

Add of existing embedding ID: 0
Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Add of existing embedding ID: 4
Add of existing embedding ID: 5
Add of existing embedding ID: 6
Add of existing embedding ID: 7
Add of existing embedding ID: 8
Add of existing embedding ID: 9
Add of existing embedding ID: 10
Add of existing embedding ID: 11
Add of existing embedding ID: 12
Add of existing embedding ID: 13
Add of existing embedding ID: 14
Add of existing embedding ID: 15
Add of existing embedding ID: 16
Add of existing embedding ID: 17
Add of existing embedding ID: 18
Add of existing embedding ID: 19
Add of existing embedding ID: 20
Add of existing embedding ID: 21
Add of existing embedding ID: 22
Add of existing embedding ID: 23
Add of existing embedding ID: 24
Add of existing embedding ID: 25
Add of existing embedding ID: 26
Add of existing embedding ID: 27
Add of existing embedding ID: 28
Add of existing embedding ID: 29
Add of existing embe

fitted!




fitted!
fitted!
