### **data**

In [1]:
%%capture
!pip install openpyxl

In [2]:
import pandas as pd

In [3]:
# Load the evaluation table. In the recorded output each row carries a service
# code ('Услуга'), a question ('Вопрос') and its answer ('Ответ').
# NOTE(review): the 'Unnamed: 0' column looks like an index that was saved into
# the spreadsheet — confirm before relying on it.
df = pd.read_excel('../service_question_answer.xlsx')
df

Unnamed: 0,Услуга,Вопрос,Ответ
0,УЧЕТ01,Какие документы необходимо предоставить для по...,Для постановки на воинский учет при переезде н...
1,УЧЕТ01,Можно ли подать документы на воинский учет чер...,"Нет, подача документов через доверенное лицо н..."
2,УЧЕТ01,"Каков срок, в течение которого должны быть вып...",Срок предоставления услуги по постановке на во...
3,УЧЕТ01,Какие документы выдаются в результате предоста...,В результате предоставления услуги могут быть ...
4,УЧЕТ01,"Нужно ли предоставлять копию паспорта, если в ...","Да, необходимо предоставить копию паспорта РФ ..."
...,...,...,...
415,СПРАВКА10,"Может ли вдова, вступившая в новый брак, прете...","Нет, вдова, вступившая в новый брак, не может ..."
416,СПРАВКА10,До какого возраста дети могут получать компенс...,"Дети в возрасте до 23 лет, обучающиеся в образ..."
417,СПРАВКА10,Какие документы подтверждают родство при обращ...,"Документы, подтверждающие родство, могут включ..."
418,СПРАВКА10,"Каковы причины, по которым может быть выдан от...",Отказ в выдаче справки может быть обусловлен о...


In [4]:
# NOTE: despite their names, `services` and `services_list` hold the QUESTION
# texts (column 'Вопрос'), not the service codes. Later cells pass
# `services_list` both to HybridSearch and to process_weights as the queries.
services = df['Вопрос']
services_list = services.tolist()
services

0      Какие документы необходимо предоставить для по...
1      Можно ли подать документы на воинский учет чер...
2      Каков срок, в течение которого должны быть вып...
3      Какие документы выдаются в результате предоста...
4      Нужно ли предоставлять копию паспорта, если в ...
                             ...                        
415    Может ли вдова, вступившая в новый брак, прете...
416    До какого возраста дети могут получать компенс...
417    Какие документы подтверждают родство при обращ...
418    Каковы причины, по которым может быть выдан от...
419    Можно ли подать заявление на получение компенс...
Name: Вопрос, Length: 420, dtype: object

In [5]:
# Relevant answers per row: a single-element list holding that row's service
# code, in the same row order as the questions taken from `df`.
ground_truth = list(map(lambda service_code: [service_code], df['Услуга']))
ground_truth[:5]

[['УЧЕТ01'], ['УЧЕТ01'], ['УЧЕТ01'], ['УЧЕТ01'], ['УЧЕТ01']]

### **imports, functions**

In [6]:
%%capture
!pip install -r ./hybrid_search_requirements.txt

In [7]:
from hybrid_search_module import *

In [8]:
# Local resources. These are absolute paths on the author's machine; adjust
# them when running elsewhere.
stopwords_path = '/home/darhanovev/stopwords/russian.txt'

# Model directories, one per embedding model compared in this notebook.
rubert_tiny_dir = "/home/darhanovev/hugging_face/rubert_tiny2"
rubert_base_path = "/home/darhanovev/hugging_face/rubert_base"
bi_encoder_dir = '/home/darhanovev/hugging_face/bi_encoder_russian_msmarco'
user_bge_dir = '/home/darhanovev/hugging_face/USER_bge_m3'

# Prefer the GPU, fall back to the CPU when CUDA is unavailable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"{device = }")
print(f"{torch.cuda.device_count() = }")
# torch.cuda.current_device() initialises CUDA and raises on a machine without
# it, which would defeat the CPU fallback above — only query it when available.
if torch.cuda.is_available():
    print(f"{torch.cuda.current_device() = }")

device = device(type='cuda')
torch.cuda.device_count() = 1
torch.cuda.current_device() = 0


In [9]:
from metrics_module import *

In [10]:
# Grid of (BM25 weight, embedding weight) pairs: the embedding share runs from
# 0 to 1 in 11 evenly spaced steps and the BM25 share is its complement. Both
# are rounded to two decimals to hide floating-point noise.
weights_range = list(
    map(
        lambda embedding_share: (round(1 - embedding_share, 2), round(embedding_share, 2)),
        np.linspace(0, 1, 11),
    )
)
weights_range

[(1.0, 0.0),
 (0.9, 0.1),
 (0.8, 0.2),
 (0.7, 0.3),
 (0.6, 0.4),
 (0.5, 0.5),
 (0.4, 0.6),
 (0.3, 0.7),
 (0.2, 0.8),
 (0.1, 0.9),
 (0.0, 1.0)]

In [11]:
def process_scores(scores):
    """Return element 0 of every entry in ``scores``, preserving order.

    Intended for the result of a single query: each entry must be indexable,
    and only its first field is kept (the identifier in the
    ``(identifier, score)`` pairs shown in this notebook).
    """
    identifiers = []
    for entry in scores:
        identifiers.append(entry[0])
    return identifiers


def process_searcher(searcher, weights_tuple, services_list, ground_truth, top):
    """Evaluate one searcher with one (BM25, embedding) weight pair.

    Every entry of ``services_list`` is sent to ``searcher.make_query`` with
    pooling enabled. The identifiers of the first ``top`` results are collected
    per query and handed to ``RetrieverMetrics`` together with ``ground_truth``.

    Args:
        searcher: object exposing
            ``make_query(query, top=..., pooling=..., weights=...)``.
        weights_tuple: ``(bm25_weight, embedding_weight)``.
        services_list: iterable of query strings.
        ground_truth: expected identifiers, one entry per query — presumably in
            the same order as ``services_list`` (TODO confirm in metrics_module).
        top: cutoff; the number of ranked results kept per query and the value
            passed to ``RetrieverMetrics``.

    Returns:
        ``[weights_tuple, metrics.report()]``.
    """
    # Keep the original retrieval depth of 10, but never retrieve fewer
    # candidates than the requested cutoff.
    retrieval_depth = max(10, top)

    result = []
    for query in services_list:
        # Built per call so that a make_query implementation that mutates its
        # argument cannot leak state from one query into the next.
        weights_cur = {'bm_25': weights_tuple[0], 'embedding': weights_tuple[1]}
        scores = searcher.make_query(query, top=retrieval_depth, pooling=True, weights=weights_cur)
        # Truncate to the requested cutoff. This used to be a hard-coded 5,
        # which silently ignored ``top`` and disagreed with the metrics cutoff.
        result.append(process_scores(scores[:top]))

    metrics = RetrieverMetrics(ground_truth, result, top)
    return [weights_tuple, metrics.report()]


def process_weights(searcher, weights, services_list, ground_truth, top):
    """Run ``process_searcher`` once per weight pair for a single searcher.

    Each outcome is printed as soon as it is available (handy for long sweeps),
    and all outcomes are returned in the order of ``weights``.
    """
    collected = []
    for pair in weights:
        # The local must stay named `scores`: the f-string below prints the
        # variable name as part of its output.
        scores = process_searcher(searcher, pair, services_list, ground_truth, top)
        print(f"{scores = }")
        collected.append(scores)
    return collected

### **ruBERT-tiny-2**

In [12]:
# Hybrid searcher for the ruBERT-tiny-2 model directory. Arguments: stop-word
# file, model directory, the question texts, the torch device and the full table.
# Presumably `services_list` is the corpus being indexed — confirm in hybrid_search_module.
searcher_1 = HybridSearch(stopwords_path, rubert_tiny_dir, services_list, device, df)

In [13]:
# Smoke test without pooling. In the recorded run the result is a list of
# (integer, score) pairs — presumably row indices into the corpus (TODO confirm).
scores = searcher_1.make_query('Дубликат пенсионного удостоверения от Военкомата', top=5, pooling=False)
scores

[(56, 0.9325843006372452),
 (52, 0.875095887356912),
 (51, 0.868788359934007),
 (54, 0.8453308522951954),
 (279, 0.4240027070045471)]

In [15]:
# Same query with pooling. Results become (service code, score) pairs, and the
# recorded run returned 7 entries for top=10, so a pooled result can be shorter than `top`.
scores = searcher_1.make_query('Дубликат пенсионного удостоверения от Военкомата', top=10, pooling=True)
scores

[('ДОКУМЕНТ02', 0.9325843006372452),
 ('ПЕНСИЯ02', 0.7278832277081815),
 ('ДОКУМЕНТ01', 0.4228782057762146),
 ('ДОКУМЕНТ03', 0.42273224890232086),
 ('ВЫПЛАТЫ13', 0.4224635511636734),
 ('ПЕНСИЯ03', 0.42092934250831604),
 ('ДОКУМЕНТ04', 0.2912494548148473)]

In [16]:
# Drop the scores and keep only the ranked service codes. This relies on the
# `scores` variable left behind by the previously executed query cell.
process_scores(scores)

['ДОКУМЕНТ02',
 'ПЕНСИЯ02',
 'ДОКУМЕНТ01',
 'ДОКУМЕНТ03',
 'ВЫПЛАТЫ13',
 'ПЕНСИЯ03',
 'ДОКУМЕНТ04']

In [18]:
%%time
# Sweep every BM25/embedding weight pair for the ruBERT-tiny-2 searcher at cutoff 5.
# Long-running: the recorded run used about 49 minutes of CPU time.
# NOTE(review): in the recorded output every pair with a non-zero embedding
# weight yields identical metrics — check how make_query applies `weights`.
result_1 = process_weights(searcher_1, weights_range, services_list, ground_truth, 5)

scores = [(1.0, 0.0), (0.42802645502645503, 0.9279541188738268, 0.9254365079365079)]
scores = [(0.9, 0.1), (0.42623809523809525, 0.9333680917622523, 0.9313888888888889)]
scores = [(0.8, 0.2), (0.42623809523809525, 0.9333680917622523, 0.9313888888888889)]
scores = [(0.7, 0.3), (0.42623809523809525, 0.9333680917622523, 0.9313888888888889)]
scores = [(0.6, 0.4), (0.42623809523809525, 0.9333680917622523, 0.9313888888888889)]
scores = [(0.5, 0.5), (0.42623809523809525, 0.9333680917622523, 0.9313888888888889)]
scores = [(0.4, 0.6), (0.42623809523809525, 0.9333680917622523, 0.9313888888888889)]
scores = [(0.3, 0.7), (0.42623809523809525, 0.9333680917622523, 0.9313888888888889)]
scores = [(0.2, 0.8), (0.42623809523809525, 0.9333680917622523, 0.9313888888888889)]
scores = [(0.1, 0.9), (0.42623809523809525, 0.9333680917622523, 0.9313888888888889)]
scores = [(0.0, 1.0), (0.42623809523809525, 0.9333680917622523, 0.9313888888888889)]
CPU times: user 48min 11s, sys: 57.1 s, total: 49min 8s
Wall time

In [28]:
# Tabulate the sweep: one row per weight pair, followed by its three metrics.
print("BM25\truBERT-tiny-2\tMAP@5\t\tnorm_MAP@5\tMRR@5")
for weights_pair, report in result_1:
    print(f"{weights_pair[0]}\t{weights_pair[1]}\t\t{report[0]:.4f}\t\t{report[1]:.4f}\t\t{report[2]:.4f}")

BM25	ruBERT-tiny-2	MAP@5		norm_MAP@5	MRR@5
1.0	0.0		0.4280		0.9280		0.9254
0.9	0.1		0.4262		0.9334		0.9314
0.8	0.2		0.4262		0.9334		0.9314
0.7	0.3		0.4262		0.9334		0.9314
0.6	0.4		0.4262		0.9334		0.9314
0.5	0.5		0.4262		0.9334		0.9314
0.4	0.6		0.4262		0.9334		0.9314
0.3	0.7		0.4262		0.9334		0.9314
0.2	0.8		0.4262		0.9334		0.9314
0.1	0.9		0.4262		0.9334		0.9314
0.0	1.0		0.4262		0.9334		0.9314


### **ruBERT-base**

In [19]:
# Hybrid searcher for the ruBERT-base model directory; same arguments as the
# other searchers apart from the model path. The recorded run warns that no
# sentence-transformers model was found there and that mean pooling is used.
searcher_2 = HybridSearch(stopwords_path, rubert_base_path, services_list, device, df)

No sentence-transformers model found with name /home/darhanovev/hugging_face/rubert_base. Creating a new one with mean pooling.
  return self.fget.__get__(instance, owner)()


In [20]:
# Smoke test with pooling. The recorded run returned 4 (service code, score)
# pairs for top=6, so a pooled result can be shorter than `top`.
scores = searcher_2.make_query('Дубликат пенсионного удостоверения от Военкомата', top=6, pooling=True)
scores

[('ДОКУМЕНТ02', 0.938690796494484),
 ('ДОКУМЕНТ03', 0.4413982778787613),
 ('ДОКУМЕНТ05', 0.42581577599048615),
 ('СПРАВКА07', 0.42529168725013733)]

In [21]:
%%time
# Sweep every BM25/embedding weight pair for the ruBERT-base searcher at cutoff 5.
# Long-running: the recorded run used about 40 minutes of CPU time.
# NOTE(review): in the recorded output every pair with a non-zero embedding
# weight yields identical metrics — check how make_query applies `weights`.
result_2 = process_weights(searcher_2, weights_range, services_list, ground_truth, 5)

scores = [(1.0, 0.0), (0.42802645502645503, 0.9279541188738268, 0.9254365079365079)]
scores = [(0.9, 0.1), (0.4257440476190476, 0.9312825860271116, 0.9290079365079366)]
scores = [(0.8, 0.2), (0.4257440476190476, 0.9312825860271116, 0.9290079365079366)]
scores = [(0.7, 0.3), (0.4257440476190476, 0.9312825860271116, 0.9290079365079366)]
scores = [(0.6, 0.4), (0.4257440476190476, 0.9312825860271116, 0.9290079365079366)]
scores = [(0.5, 0.5), (0.4257440476190476, 0.9312825860271116, 0.9290079365079366)]
scores = [(0.4, 0.6), (0.4257440476190476, 0.9312825860271116, 0.9290079365079366)]
scores = [(0.3, 0.7), (0.4257440476190476, 0.9312825860271116, 0.9290079365079366)]
scores = [(0.2, 0.8), (0.4257440476190476, 0.9312825860271116, 0.9290079365079366)]
scores = [(0.1, 0.9), (0.4257440476190476, 0.9312825860271116, 0.9290079365079366)]
scores = [(0.0, 1.0), (0.4257440476190476, 0.9312825860271116, 0.9290079365079366)]
CPU times: user 40min 22s, sys: 8.68 s, total: 40min 30s
Wall time: 40min 5

In [29]:
# Tabulate the sweep: one row per weight pair, followed by its three metrics.
print("BM25\truBERT-base\tMAP@5\t\tnorm_MAP@5\tMRR@5")
for weights_pair, report in result_2:
    print(f"{weights_pair[0]}\t{weights_pair[1]}\t\t{report[0]:.4f}\t\t{report[1]:.4f}\t\t{report[2]:.4f}")

BM25	ruBERT-base	MAP@5		norm_MAP@5	MRR@5
1.0	0.0		0.4280		0.9280		0.9254
0.9	0.1		0.4257		0.9313		0.9290
0.8	0.2		0.4257		0.9313		0.9290
0.7	0.3		0.4257		0.9313		0.9290
0.6	0.4		0.4257		0.9313		0.9290
0.5	0.5		0.4257		0.9313		0.9290
0.4	0.6		0.4257		0.9313		0.9290
0.3	0.7		0.4257		0.9313		0.9290
0.2	0.8		0.4257		0.9313		0.9290
0.1	0.9		0.4257		0.9313		0.9290
0.0	1.0		0.4257		0.9313		0.9290


### **bi-encoder**

In [31]:
# Hybrid searcher for the bi-encoder (russian msmarco) model directory; same
# arguments as the other searchers apart from the model path.
searcher_3 = HybridSearch(stopwords_path, bi_encoder_dir, services_list, device, df)

In [32]:
# Smoke test with pooling: the recorded run returned 5 (service code, score) pairs for top=5.
scores = searcher_3.make_query('Дубликат пенсионного удостоверения от Военкомата', top=5, pooling=True)
scores

[('ДОКУМЕНТ02', 0.5),
 ('ПЕНСИЯ05', 0.38593780994415283),
 ('СПРАВКА09', 0.38577376306056976),
 ('ВЫПЛАТЫ11', 0.3699050173163414),
 ('ПЕНСИЯ03', 0.3622478023171425)]

In [33]:
%%time
# Sweep every BM25/embedding weight pair for the bi-encoder searcher at cutoff 5.
# Long-running: the recorded run used about 42 minutes of CPU time.
# NOTE(review): in the recorded output every pair with a non-zero embedding
# weight yields identical metrics — check how make_query applies `weights`.
result_3 = process_weights(searcher_3, weights_range, services_list, ground_truth, 5)

scores = [(1.0, 0.0), (0.42802645502645503, 0.9279541188738268, 0.9254365079365079)]
scores = [(0.9, 0.1), (0.42684920634920637, 0.9333680917622523, 0.9313888888888889)]
scores = [(0.8, 0.2), (0.42684920634920637, 0.9333680917622523, 0.9313888888888889)]
scores = [(0.7, 0.3), (0.42684920634920637, 0.9333680917622523, 0.9313888888888889)]
scores = [(0.6, 0.4), (0.42684920634920637, 0.9333680917622523, 0.9313888888888889)]
scores = [(0.5, 0.5), (0.42684920634920637, 0.9333680917622523, 0.9313888888888889)]
scores = [(0.4, 0.6), (0.42684920634920637, 0.9333680917622523, 0.9313888888888889)]
scores = [(0.3, 0.7), (0.42684920634920637, 0.9333680917622523, 0.9313888888888889)]
scores = [(0.2, 0.8), (0.42684920634920637, 0.9333680917622523, 0.9313888888888889)]
scores = [(0.1, 0.9), (0.42684920634920637, 0.9333680917622523, 0.9313888888888889)]
scores = [(0.0, 1.0), (0.42684920634920637, 0.9333680917622523, 0.9313888888888889)]
CPU times: user 41min 53s, sys: 12.6 s, total: 42min 6s
Wall time

In [34]:
# Tabulate the sweep: one row per weight pair, followed by its three metrics.
print("BM25\tbi-encoder\tMAP@5\t\tnorm_MAP@5\tMRR@5")
for weights_pair, report in result_3:
    print(f"{weights_pair[0]}\t{weights_pair[1]}\t\t{report[0]:.4f}\t\t{report[1]:.4f}\t\t{report[2]:.4f}")

BM25	bi-encoder	MAP@5		norm_MAP@5	MRR@5
1.0	0.0		0.4280		0.9280		0.9254
0.9	0.1		0.4268		0.9334		0.9314
0.8	0.2		0.4268		0.9334		0.9314
0.7	0.3		0.4268		0.9334		0.9314
0.6	0.4		0.4268		0.9334		0.9314
0.5	0.5		0.4268		0.9334		0.9314
0.4	0.6		0.4268		0.9334		0.9314
0.3	0.7		0.4268		0.9334		0.9314
0.2	0.8		0.4268		0.9334		0.9314
0.1	0.9		0.4268		0.9334		0.9314
0.0	1.0		0.4268		0.9334		0.9314


### **USER-bge-m3**

In [35]:
# Hybrid searcher for the USER-bge-m3 model directory; same arguments as the
# other searchers apart from the model path.
searcher_4 = HybridSearch(stopwords_path, user_bge_dir, services_list, device, df)

In [36]:
# Smoke test with pooling. The recorded run returned 4 (service code, score)
# pairs for top=7, so a pooled result can be shorter than `top`.
scores = searcher_4.make_query('Дубликат пенсионного удостоверения от Военкомата', top=7, pooling=True)
scores

[('ДОКУМЕНТ02', 0.9363677054643631),
 ('ДОКУМЕНТ03', 0.4416535198688507),
 ('РЕКВИЗИТЫ01', 0.4162900298833847),
 ('ПЕНСИЯ02', 0.3038805207036344)]

In [37]:
%%time
# Sweep every BM25/embedding weight pair for the USER-bge-m3 searcher at cutoff 5.
# Long-running: the recorded run used about 39 minutes of CPU time.
# NOTE(review): in the recorded output every pair with a non-zero embedding
# weight yields identical metrics — check how make_query applies `weights`.
result_4 = process_weights(searcher_4, weights_range, services_list, ground_truth, 5)

scores = [(1.0, 0.0), (0.42802645502645503, 0.9279541188738268, 0.9254365079365079)]
scores = [(0.9, 0.1), (0.4265436507936508, 0.9333680917622522, 0.9313888888888889)]
scores = [(0.8, 0.2), (0.4265436507936508, 0.9333680917622522, 0.9313888888888889)]
scores = [(0.7, 0.3), (0.4265436507936508, 0.9333680917622522, 0.9313888888888889)]
scores = [(0.6, 0.4), (0.4265436507936508, 0.9333680917622522, 0.9313888888888889)]
scores = [(0.5, 0.5), (0.4265436507936508, 0.9333680917622522, 0.9313888888888889)]
scores = [(0.4, 0.6), (0.4265436507936508, 0.9333680917622522, 0.9313888888888889)]
scores = [(0.3, 0.7), (0.4265436507936508, 0.9333680917622522, 0.9313888888888889)]
scores = [(0.2, 0.8), (0.4265436507936508, 0.9333680917622522, 0.9313888888888889)]
scores = [(0.1, 0.9), (0.4265436507936508, 0.9333680917622522, 0.9313888888888889)]
scores = [(0.0, 1.0), (0.4265436507936508, 0.9333680917622522, 0.9313888888888889)]
CPU times: user 38min 39s, sys: 9.44 s, total: 38min 48s
Wall time: 39min 2

In [38]:
# Tabulate the sweep: one row per weight pair, followed by its three metrics.
print("BM25\tUSER-bge-m3\tMAP@5\t\tnorm_MAP@5\tMRR@5")
for weights_pair, report in result_4:
    print(f"{weights_pair[0]}\t{weights_pair[1]}\t\t{report[0]:.4f}\t\t{report[1]:.4f}\t\t{report[2]:.4f}")

BM25	USER-bge-m3	MAP@5		norm_MAP@5	MRR@5
1.0	0.0		0.4280		0.9280		0.9254
0.9	0.1		0.4265		0.9334		0.9314
0.8	0.2		0.4265		0.9334		0.9314
0.7	0.3		0.4265		0.9334		0.9314
0.6	0.4		0.4265		0.9334		0.9314
0.5	0.5		0.4265		0.9334		0.9314
0.4	0.6		0.4265		0.9334		0.9314
0.3	0.7		0.4265		0.9334		0.9314
0.2	0.8		0.4265		0.9334		0.9314
0.1	0.9		0.4265		0.9334		0.9314
0.0	1.0		0.4265		0.9334		0.9314
