In [1]:
from deepeval import evaluate
from deepeval.metrics import ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase



In [2]:
import os

In [3]:
from embedchain import App

In [4]:
from transformers import pipeline

In [5]:
def calculateContextRelevancy(actual_output="", retrieval_context=None, input="",
                              threshold=0.7, model="gpt-4"):
    """Measure contextual relevancy of a retrieval result with deepeval.

    Builds a ``ContextualRelevancyMetric`` and an ``LLMTestCase``, runs the
    metric on the test case, and prints the score, the reason and a blank
    line.

    Args:
        actual_output: Answer text produced by the system under evaluation.
        retrieval_context: List of retrieved context entries. ``None`` (the
            default) is replaced by a new empty list on every call.
        input: The query that was asked. The name shadows the builtin but is
            kept so that existing keyword callers continue to work.
        threshold: Forwarded to ``ContextualRelevancyMetric`` as ``threshold``.
        model: Forwarded to ``ContextualRelevancyMetric`` as ``model``.

    Returns:
        The tuple ``(test_case, metric)``; ``metric.score`` and
        ``metric.reason`` are read after ``metric.measure`` has run.
    """
    if retrieval_context is None:
        # A fresh list per call: a mutable default would be shared by every
        # test case created without an explicit context.
        retrieval_context = []

    metric = ContextualRelevancyMetric(
        threshold=threshold,
        model=model,
        include_reason=True,
    )
    # Keyword arguments keep this call independent of LLMTestCase's field order.
    test_case = LLMTestCase(
        input=input,
        actual_output=actual_output,
        retrieval_context=retrieval_context,
    )

    metric.measure(test_case)
    print(metric.score)
    print(metric.reason)
    print()

    return test_case, metric

In [6]:
def evaluate_in_bulk(test_case, metric):
    """Run deepeval's ``evaluate`` for one test case and one metric.

    Each argument is wrapped in a single-element list before being passed
    on. The result of ``evaluate`` is not returned.
    """
    cases = [test_case]
    metrics = [metric]
    evaluate(cases, metrics)

In [7]:
# Evaluation questions, written in Finnish. Every prompt ends with the
# instruction "(answer IN FINNISH)?". The fourth prompt is a multi-line
# string: its embedded newlines and leading spaces are part of the text.
inputs = [
    """Voiko luottamustehtävässä edelleen olla, jos muuttaa toiselle paikkakunnalle (answer IN FINNISH)?""",
    """Voidaanko turhaa irtaimistoa lahjoittaa yliopistolle ja kuka päättää irtaimiston lahjoittamisesta (answer IN FINNISH)?""",
    """Kuka päättää uusien virkojen perustamisesta (answer IN FINNISH)?""",
    """Mikäli toimipisteessä sattuu vesivahinko, jonka myötä toiminta
         tulee siirtää muualle, kuka päättää toimipisteen siirrosta, mikäli
         asiasta on jo tehty viranhaltijapäätös kahdelle kuukaudelle, mutta
         alkuperäisessä toimipisteessä olevien laajojen ongelmien vuoksi,
         toimintaa joudutaan edelleen jatkamaan toisessa toimipisteessä (answer IN FINNISH)?""",
    """Mikä on aluehallituksen rooli osakeyhtiön perustamiseen liittyvissä asioissa (answer IN FINNISH)?""",
]

In [8]:
# Directory that holds the chunk text files (note the trailing slash).
dir_path = "./chunks_hallinto/"

In [9]:
# Collect the paths of all .txt files found directly in dir_path.
# os.listdir returns entries in arbitrary order, so the result is sorted to
# keep the list identical between runs and machines.
file_names = sorted(
    os.path.join(dir_path, entry)
    for entry in os.listdir(dir_path)
    if entry.endswith('.txt')
)

In [10]:
# Show the collected chunk paths as the cell output.
file_names

['./chunks_hallinto/chunk.txt',
 './chunks_hallinto/chunk2.txt',
 './chunks_hallinto/chunk3.txt',
 './chunks_hallinto/chunk4.txt',
 './chunks_hallinto/chunk5.txt',
 './chunks_hallinto/chunk6.txt']

## Evaluate RAG + Vector DB

### Initialize embedchain

In [11]:
# Create the embedchain app and index every chunk file collected in
# file_names. Looping replaces six copy-pasted add() calls, so the index
# follows the contents of the chunk directory instead of a hard-coded list.
app = App()
for chunk_path in file_names:
    app.add(chunk_path, data_type="text_file")



'3e655cea63f45a0f404b6ba32ad6e35f'

In [12]:
# Collects one (test_case, metric) pair per question for the embedchain pipeline.
embedchain_results = []

#### Analyze the inputs

In [13]:

# Query embedchain with every evaluation question and score the citations it
# returns. The result list is rebuilt here so that re-running this cell does
# not append duplicate entries.
embedchain_results = []
for question in inputs:  # not `input`: that would shadow the builtin for the rest of the session
    answer, sources = app.query(question, citations=True)
    # The first element of each citation is passed on as retrieval context.
    source_list = [citation[0] for citation in sources]
    test_case_ragV, metric_ragV = calculateContextRelevancy(answer, source_list, question)
    embedchain_results.append((test_case_ragV, metric_ragV))

Output()

0.0
The score is 0.00 because none of the statements in the retrieval context address the question of whether one can continue in a position of trust if one moves to another municipality. All statements referenced matters not relevant to the question, such as 'the authority in personnel matters when no laws or administrative regulations are established' and 'the eligibility of certain individuals to be elected into the regional council', among others.



Output()

0.0
The score is 0.00 because the retrieval context does not contain any statements related to the input about donating unnecessary property to the university. All the statements quoted are irrelevant as they discuss topics such as roles and responsibilities of board members, eligibility rules for a regional council, and the composition of the council instead of the donation process.



Output()

0.14285714285714285
The score is 0.14 because the retrieval context mainly contains information about different aspects of job positions, such as changing the name, qualification requirements, refilling, and reorganization, which are irrelevant to the decision on the establishment of new positions. However, it does include two statements about which bodies decide on the establishment and abolition of certain positions, which are somewhat relevant to the input query.



Output()

0.5454545454545454
The score is 0.55 because while the retrieved context does mention 'Viran siirtämisestä toimialalta toiselle päättää hyvinvointialuejohtaja tai hänen määräämänsä viranhaltija' which is somewhat relevant to the input about decision-making process in the event of water damage, much of the context discusses unrelated topics such as 'authority in personnel matters', 'establishing and terminating positions', and 'changes in the employment relationship'.



Output()

0.07142857142857142
The score is 0.07 because the retrieval context mainly discusses the general duties, responsibilities and roles of the regional government, which are not specifically related to the establishment of a company. Only one relevant statement was found: 'Aluehallitus päättää osakeyhtiön perustamisesta'.



## Evaluate RagBERT

In [15]:
from RAGBERT import RagBERT

In [16]:
# Build a text-classification pipeline from the model identified by
# bert_model (a local directory or model name; presumably the fine-tuned
# chunk classifier, judging by its name).
bert_model = "distil_bert_fine_chunk_hallinto_classifier-v2"
classifier = pipeline(task="text-classification", model=bert_model)

Device set to use cuda:0


In [17]:
# Collects one (test_case, metric) pair per question for the RagBERT pipeline.
ragbert_results = []

In [18]:
# Pair each classifier label (LABEL_0 ... LABEL_5) with a chunk file name;
# the position of a file in the list below determines its label index.
chunk_map = [
    {'label': f'LABEL_{index}', 'file': file_name}
    for index, file_name in enumerate(
        ['chunk.txt', 'chunk2.txt', 'chunk3.txt', 'chunk4.txt', 'chunk5.txt', 'chunk6.txt']
    )
]

In [19]:
# Run every evaluation question through RagBERT and score the sources it
# returns. The result list is rebuilt here so that re-running this cell does
# not append duplicate entries.
# NOTE(review): the recorded output of this cell contains mojibake such as
# "Â§" and "henkilÃ¶", which suggests the chunk text is decoded with the
# wrong encoding somewhere inside RagBERT — confirm the files are read as UTF-8.
ragbert_results = []
for question in inputs:  # not `input`: that would shadow the builtin for the rest of the session
    resp, sources, query = RagBERT(classifier, topk=1, query=question, chunk_map=chunk_map, source="./chunks_hallinto")
    test_case_ragB, metric_ragB = calculateContextRelevancy(resp, sources, query)
    ragbert_results.append((test_case_ragB, metric_ragB))



[[{'label': 'LABEL_0', 'score': 0.7795916199684143}, {'label': 'LABEL_1', 'score': 0.03009897656738758}, {'label': 'LABEL_2', 'score': 0.09950163960456848}, {'label': 'LABEL_3', 'score': 0.03744695708155632}, {'label': 'LABEL_4', 'score': 0.03293641656637192}, {'label': 'LABEL_5', 'score': 0.020424384623765945}]]
LABEL_0 [{'label': 'LABEL_0', 'file': 'chunk.txt'}]
[{'label': 'LABEL_0', 'file': 'chunk.txt'}, {'label': 'LABEL_1', 'file': 'chunk2.txt'}, {'label': 'LABEL_2', 'file': 'chunk3.txt'}, {'label': 'LABEL_3', 'file': 'chunk4.txt'}, {'label': 'LABEL_4', 'file': 'chunk5.txt'}, {'label': 'LABEL_5', 'file': 'chunk6.txt'}]
final_query:  
    
        76 Â§
Yleinen vaalikelpoisuus
Vaalikelpoinen hyvinvointialueen luottamustoimeen on henkilÃ¶:

1)â€‚joka on kyseisen hyvinvointialueen asukas;

2)â€‚jolla on jollakin hyvinvointialueella Ã¤Ã¤nioikeus aluevaaleissa sinÃ¤ vuonna, jona valtuutetut valitaan tai vaali muuhun luottamustoimeen toimitetaan; ja

3)â€‚jota ei ole julistettu vajaavalt

Output()

1.0
The score is 1.00 because the retrieval context is directly relevant to the input, as evidenced by the statement 'Vaalikelpoinen hyvinvointialueen luottamustoimeen on henkilÃ¶: 1)â€‚joka on kyseisen hyvinvointialueen asukas.' and there are no stated reasons for irrelevancy.

[[{'label': 'LABEL_0', 'score': 0.02966916188597679}, {'label': 'LABEL_1', 'score': 0.006383607164025307}, {'label': 'LABEL_2', 'score': 0.9290233254432678}, {'label': 'LABEL_3', 'score': 0.012153920717537403}, {'label': 'LABEL_4', 'score': 0.011851023882627487}, {'label': 'LABEL_5', 'score': 0.01091888826340437}]]
LABEL_2 [{'label': 'LABEL_2', 'file': 'chunk3.txt'}]
[{'label': 'LABEL_0', 'file': 'chunk.txt'}, {'label': 'LABEL_1', 'file': 'chunk2.txt'}, {'label': 'LABEL_2', 'file': 'chunk3.txt'}, {'label': 'LABEL_3', 'file': 'chunk4.txt'}, {'label': 'LABEL_4', 'file': 'chunk5.txt'}, {'label': 'LABEL_5', 'file': 'chunk6.txt'}]
final_query:  
    
        50 Â§ Toimivalta henkilÃ¶stÃ¶asioissa
MikÃ¤li toimivallast



Output()

0.0
The score is 0.00 because all statements in the retrieval context are about various personnel and post related matters, and none of them are about donating unnecessary property to the university, as required by the input.

[[{'label': 'LABEL_0', 'score': 0.002337086945772171}, {'label': 'LABEL_1', 'score': 0.0011278341989964247}, {'label': 'LABEL_2', 'score': 0.9913187623023987}, {'label': 'LABEL_3', 'score': 0.0021168228704482317}, {'label': 'LABEL_4', 'score': 0.0013927258551120758}, {'label': 'LABEL_5', 'score': 0.0017067393055185676}]]
LABEL_2 [{'label': 'LABEL_2', 'file': 'chunk3.txt'}]
[{'label': 'LABEL_0', 'file': 'chunk.txt'}, {'label': 'LABEL_1', 'file': 'chunk2.txt'}, {'label': 'LABEL_2', 'file': 'chunk3.txt'}, {'label': 'LABEL_3', 'file': 'chunk4.txt'}, {'label': 'LABEL_4', 'file': 'chunk5.txt'}, {'label': 'LABEL_5', 'file': 'chunk6.txt'}]
final_query:  
    
        50 Â§ Toimivalta henkilÃ¶stÃ¶asioissa
MikÃ¤li toimivallasta henkilÃ¶stÃ¶asioissa ei ole sÃ¤Ã¤detty laissa



Output()

1.0
The score is 1.00 because there are no irrelevant statements listed and the relevant statements directly answer the question about who decides on the establishment of new positions, in Finnish.

[[{'label': 'LABEL_0', 'score': 0.05718198046088219}, {'label': 'LABEL_1', 'score': 0.013595126569271088}, {'label': 'LABEL_2', 'score': 0.7902374267578125}, {'label': 'LABEL_3', 'score': 0.0677618458867073}, {'label': 'LABEL_4', 'score': 0.04866572842001915}, {'label': 'LABEL_5', 'score': 0.022558031603693962}]]
LABEL_2 [{'label': 'LABEL_2', 'file': 'chunk3.txt'}]
[{'label': 'LABEL_0', 'file': 'chunk.txt'}, {'label': 'LABEL_1', 'file': 'chunk2.txt'}, {'label': 'LABEL_2', 'file': 'chunk3.txt'}, {'label': 'LABEL_3', 'file': 'chunk4.txt'}, {'label': 'LABEL_4', 'file': 'chunk5.txt'}, {'label': 'LABEL_5', 'file': 'chunk6.txt'}]
final_query:  
    
        50 Â§ Toimivalta henkilÃ¶stÃ¶asioissa
MikÃ¤li toimivallasta henkilÃ¶stÃ¶asioissa ei ole sÃ¤Ã¤detty laissa eikÃ¤ mÃ¤Ã¤rÃ¤tty hallintosÃ¤Ã¤nnÃ¶



Output()

0.4
The score is 0.40 because the retrieval context contains some relevant information about who decides on relocating operations ('Viran siirtÃ¤misestÃ¤ toimialalta toiselle pÃ¤Ã¤ttÃ¤Ã¤ hyvinvointialuejohtaja tai hÃ¤nen mÃ¤Ã¤rÃ¤Ã¤mÃ¤nsÃ¤ viranhaltija.', 'Kuitenkin mikÃ¤li aluevaltuusto tai aluehallitus tai pelastuslautakunta pÃ¤Ã¤ttÃ¤Ã¤ viran tÃ¤yttÃ¤misestÃ¤, aluevaltuusto ja vastaavasti aluehallitus ja pelastuslautakunta myÃ¶s pÃ¤Ã¤ttÃ¤Ã¤ viran siirtÃ¤misestÃ¤.'), but a significant proportion of the context is not relevant to the input, such as information about 'perustamisesta ja lakkauttamisesta sekÃ¤ virkanimikkeen muuttamisesta' and the statement 'Pelastuslautakunta pÃ¤Ã¤ttÃ¤Ã¤ pelastuslaitosta koskevat asiat 25 Â§:n mukaisesti'.

[[{'label': 'LABEL_0', 'score': 0.20487233996391296}, {'label': 'LABEL_1', 'score': 0.24692319333553314}, {'label': 'LABEL_2', 'score': 0.11118076741695404}, {'label': 'LABEL_3', 'score': 0.06688091158866882}, {'label': 'LABEL_4', 'score': 0.1877386271



Output()

0.4
The score is 0.40 because while the relevant statements do discuss a 'Toimialajohtaja' and their responsibilities within a regional government, the irrelevant statements have no connection to the role of regional government in the establishment of a company.



## Show Results

In [20]:
# Embedchain: print the score and then the reason of every stored metric
for _test_case, metric in embedchain_results:
    print(metric.score, metric.reason, sep="\n")

0.0
The score is 0.00 because none of the statements in the retrieval context address the question of whether one can continue in a position of trust if one moves to another municipality. All statements referenced matters not relevant to the question, such as 'the authority in personnel matters when no laws or administrative regulations are established' and 'the eligibility of certain individuals to be elected into the regional council', among others.
0.0
The score is 0.00 because the retrieval context does not contain any statements related to the input about donating unnecessary property to the university. All the statements quoted are irrelevant as they discuss topics such as roles and responsibilities of board members, eligibility rules for a regional council, and the composition of the council instead of the donation process.
0.14285714285714285
The score is 0.14 because the retrieval context mainly contains information about different aspects of job positions, such as changing th

In [21]:
# RagBERT: print the score and then the reason of every stored metric
for _test_case, metric in ragbert_results:
    print(metric.score, metric.reason, sep="\n")

1.0
The score is 1.00 because the retrieval context is directly relevant to the input, as evidenced by the statement 'Vaalikelpoinen hyvinvointialueen luottamustoimeen on henkilÃ¶: 1)â€‚joka on kyseisen hyvinvointialueen asukas.' and there are no stated reasons for irrelevancy.
0.0
The score is 0.00 because all statements in the retrieval context are about various personnel and post related matters, and none of them are about donating unnecessary property to the university, as required by the input.
1.0
The score is 1.00 because there are no irrelevant statements listed and the relevant statements directly answer the question about who decides on the establishment of new positions, in Finnish.
0.4
The score is 0.40 because the retrieval context contains some relevant information about who decides on relocating operations ('Viran siirtÃ¤misestÃ¤ toimialalta toiselle pÃ¤Ã¤ttÃ¤Ã¤ hyvinvointialuejohtaja tai hÃ¤nen mÃ¤Ã¤rÃ¤Ã¤mÃ¤nsÃ¤ viranhaltija.', 'Kuitenkin mikÃ¤li aluevaltuusto tai alueh

## Evaluate in Bulk

In [None]:
# Embedchain: pass each stored (test case, metric) pair to evaluate_in_bulk
for pair in embedchain_results:
    evaluate_in_bulk(*pair)

In [None]:
# RagBERT: pass each stored (test case, metric) pair to evaluate_in_bulk
for pair in ragbert_results:
    evaluate_in_bulk(*pair)