# Evaluation
For evaluation purpose, Chroma and ElasticSearch were selected. 

## ElasticSearch

```bash
docker run -it \
     --rm \
     --name elasticsearch \
     -p 9200:9200 \
     -p 9300:9300 \
     -e "discovery.type=single-node" \
     -e "xpack.security.enabled=false" \
     -e "ES_JAVA_OPTS=-Xms512m -Xmx512m" \
     docker.elastic.co/elasticsearch/elasticsearch:8.4.3
```

In [3]:
import pandas as pd
import json
from elasticsearch import Elasticsearch
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
from tqdm.auto import tqdm
load_dotenv(dotenv_path='../.envrc')

True

In [4]:
df = pd.read_csv('article_info.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head()

Unnamed: 0,source,page,text,id
0,https://papers.nips.cc/paper/4824-imagenet-cla...,0,ImageNet Classiﬁcation with Deep Convolutional...,0
1,https://papers.nips.cc/paper/4824-imagenet-cla...,0,over 15 million labeled high-resolution images...,1
2,https://papers.nips.cc/paper/4824-imagenet-cla...,1,"Despite the attractive qualities of CNNs, and ...",2
3,https://papers.nips.cc/paper/4824-imagenet-cla...,1,"1000 categories. In all, there are roughly 1.2...",3
4,https://papers.nips.cc/paper/4824-imagenet-cla...,1,ﬁve convolutional and three fully-connected. B...,4


In [5]:
es_client = Elasticsearch('http://localhost:9200') 

In [26]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "source": {"type": "text"},
            "page": {"type": "integer"},
            "id": {"type": "integer"} 
        }
    }
}

index_name = "article_info"

# es_client.indices.create(index=index_name, body=index_settings)

In [6]:
documents = df.to_dict(orient='records')

In [13]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/4305 [00:00<?, ?it/s]

## FAISS

In [32]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import DataFrameLoader

embedding_function = OpenAIEmbeddings()
index = faiss.IndexFlatL2(len(embedding_function.embed_query("hello world")))

vector_store = FAISS(
    embedding_function=embedding_function,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

df = pd.read_csv('article_info.csv').drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,source,page,text,id
0,https://papers.nips.cc/paper/4824-imagenet-cla...,0,ImageNet Classiﬁcation with Deep Convolutional...,0
1,https://papers.nips.cc/paper/4824-imagenet-cla...,0,over 15 million labeled high-resolution images...,1
2,https://papers.nips.cc/paper/4824-imagenet-cla...,1,"Despite the attractive qualities of CNNs, and ...",2
3,https://papers.nips.cc/paper/4824-imagenet-cla...,1,"1000 categories. In all, there are roughly 1.2...",3
4,https://papers.nips.cc/paper/4824-imagenet-cla...,1,ﬁve convolutional and three fully-connected. B...,4


In [33]:
loader = DataFrameLoader(df, page_content_column='text')
documents = loader.load()

In [34]:
ids = [doc.metadata['id'] for doc in documents]

In [35]:
vector_store.add_documents(documents=documents,ids=ids)

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


## Generate questions for evaluation

I took some random texts available and generated questions about them.

In [136]:
from openai import OpenAI
client = OpenAI()

In [137]:
dataset_template = """
You emulate a student who's taking deep learning class.
Based on the given text, formulate 3 questions this student might ask based on it. 
The record should contain the answer to the questions, and the questions should be complete and not too short.
Make questions specific to the deep learning details.
If possible, use as fewer words as possible from the record. 

The text:
{text}

Provide the output in parsable JSON without using code blocks:

['question': question,
'answer': answer]

""".strip()

In [138]:
def generate_questions(text_sample):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": dataset_template.format(text=text_sample)}]
    )
    
    return response.choices[0].message.content

In [20]:
print("SOURCE OF QUESTION:")
print(df.loc[0, 'text'][:100])
print("GENERATED QUESTIONS:")
print(generate_questions(df.loc[0, 'text']))

SOURCE OF QUESTION:
ImageNet Classiﬁcation with Deep Convolutional
Neural Networks
Alex Krizhevsky
University of Toronto
GENERATED QUESTIONS:


[
    {
        'question': 'What were the top-1 and top-5 error rates achieved by the deep convolutional neural network on the ImageNet test data?',
        'answer': 'The top-1 error rate was 37.5% and the top-5 error rate was 17.0%.'
    },
    {
        'question': 'How many parameters and neurons did the neural network contain, and what layers did it consist of?',
        'answer': 'The neural network contained 60 million parameters and 650,000 neurons, consisting of five convolutional layers, max-pooling layers, and three fully-connected layers.'
    },
    {
        'question': 'What regularization method was used in the fully-connected layers to reduce overfitting, and why was it considered effective?',
        'answer': 'A regularization method called “dropout” was used in the fully-connected layers, and it proved to be very effective in reducing overfitting.'
    }
]


In [126]:
import numpy as np

random_ids = np.random.randint(0, 4305, 200)

In [139]:
generated_questions = []
for i in random_ids:
    try:
        generated_questions.append({
            'id': df.loc[i, 'id'],
            'questions': json.loads(generate_questions(df.loc[i, 'text']))
        })
    except:
        continue

In [140]:
formatted_questions = []
for qst in generated_questions:
    for q in qst['questions']:
        formatted_questions.append({
            'id': qst['id'],
            'question': q['question'],
            'answer': q['answer']
        })

In [29]:
pd.DataFrame(formatted_questions).to_csv('ground_truth_data.csv')

## Retrieval Evaluation

In [147]:
import pandas as pd
df_ground_truth = pd.read_csv('ground_truth_data.csv')
df_ground_truth.drop('Unnamed: 0', axis=1, inplace=True)
ground_truth = df_ground_truth.to_dict(orient='records')

In [148]:
ground_truth[:5]

[{'id': 69,
  'question': 'What is the primary metric used for measuring recognition performance as mentioned in the text?',
  'answer': 'The primary metric used for measuring recognition performance is mean average precision (mAP) across classes.'},
 {'id': 69,
  'question': 'How is performance evaluated on the VOC-2007 and VOC-2012 validation sets?',
  'answer': 'Performance is evaluated by examining the validation sets of VOC-2007 and VOC-2012.'},
 {'id': 69,
  'question': 'What method is suggested for improving performance with image descriptors?',
  'answer': 'Aggregating image descriptors computed at multiple scales by averaging is suggested for improving performance.'},
 {'id': 4122,
  'question': 'What does Theorem 247 imply about the complexity of the problem involving two sequences of quadruples compared to the problem involving their respective sequences?',
  'answer': 'Theorem 247 states that, for some positive 5, the complexity of the problem involving the quadruples (än →

In [149]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

### Elastic Search evaluation

In [150]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                    }
                },
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [151]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['id']
    results = elastic_search(query=q['question'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/471 [00:00<?, ?it/s]

In [152]:
print("ELASTIC SEARCH")
print('hit rate', round(hit_rate(relevance_total), 3))
print('mrr', round(mrr(relevance_total), 3))

ELASTIC SEARCH
hit rate 0.798
mrr 0.647


### FAISS evaluation

In [153]:
def faiss_search(query):
    results = vector_store.similarity_search(query, k=5)
    
    result_docs = []

    for res in results:
        result_docs.append({
            'source': res.metadata['source'],
            'page': res.metadata['page'],
            'text': res.page_content,
            'id': res.metadata['id']
        })
    
    return result_docs

In [154]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['id']
    results = faiss_search(query=q['question'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/471 [00:00<?, ?it/s]

In [155]:
print("FAISS SEARCH")
print('hit rate', round(hit_rate(relevance_total), 3))
print('mrr', round(mrr(relevance_total), 3))

FAISS SEARCH
hit rate 0.682
mrr 0.536


### Finding best parameters for Elastic Search 
because FAISS gave lower results

In [156]:
test_ids = np.random.randint(0, 471, 350)
valid_ids = np.array([i for i in range(0, 132) if i not in test_ids])

print(valid_ids)

[130 192 345 132 181 426 177 404 468 348 383 113 359 374 254 220  58 234
 447 158 284 113 234 370 457 436 175 339 137 125 273 463  21 259  87  58
 466 338 173 100  53  20 305  68 185 411  25 358  61 309 172 296 242  86
 163 134 314 276 375 153 223  51 286  61 360  19 179  39  95 145 111 420
 311 386 107  75 439 432  95 190 203 211  20 398 172 159 256  47 393 404
  15 247 448 117 451  16 112 323 160 224 418 310 397 287 333 251 138 327
  38 412 157 133 452 254 146 378 422  41 306 188 318 456 340   6 407 309
 208 211 205 315 325 219 175  38 200 236  98 397 209  83 210 300 290 416
 371  13 319  14  58 417 409 318 317 162 160  72  69 387 408 351  31 465
 123 311 287 154 420  21 387 206 205 370 249 356  18 342  47 163  19 425
  56  84 314 133 404 370 336 225 425 203  65 447 108  47 440 442 246 267
  48 227 104  90 395 267 343 177 184 293  60 210 253 468 465 226  87 186
 415 194 289  30  71  84 194 118 258 140 439 165 262 135 430 189 435 199
 314 258 425 172 462 356  61  19 221  30  94  24 35

In [157]:
import numpy as np
test_ids = np.random.randint(0, 471, 350)
test_ids = [130 192 345 132 181 426 177 404 468 348 383 113 359 374 254 220  58 234
 447 158 284 113 234 370 457 436 175 339 137 125 273 463  21 259  87  58
 466 338 173 100  53  20 305  68 185 411  25 358  61 309 172 296 242  86
 163 134 314 276 375 153 223  51 286  61 360  19 179  39  95 145 111 420
 311 386 107  75 439 432  95 190 203 211  20 398 172 159 256  47 393 404
  15 247 448 117 451  16 112 323 160 224 418 310 397 287 333 251 138 327
  38 412 157 133 452 254 146 378 422  41 306 188 318 456 340   6 407 309
 208 211 205 315 325 219 175  38 200 236  98 397 209  83 210 300 290 416
 371  13 319  14  58 417 409 318 317 162 160  72  69 387 408 351  31 465
 123 311 287 154 420  21 387 206 205 370 249 356  18 342  47 163  19 425
  56  84 314 133 404 370 336 225 425 203  65 447 108  47 440 442 246 267
  48 227 104  90 395 267 343 177 184 293  60 210 253 468 465 226  87 186
 415 194 289  30  71  84 194 118 258 140 439 165 262 135 430 189 435 199
 314 258 425 172 462 356  61  19 221  30  94  24 359   5  62  70 287 279
 223 260   1 296 433  85 315 113 235  42 407 217 216 292 416  18 328 328
 130 138 458 104 448 388 469  94 299  26 293 162 249 224 350 312 141 458
 393   2   7 203 305 175 235 212 156 206 349   4 232 285 262  50 126 232
 176 356 304 244   7   5 117 347 160  52 150 225 387 202 171 366 344 284
 303 243 298 326 147 388  91  74 407 212 244 316 152 396 470 212 157 295
 410 361  57  73 230 407  24  48]
valid_ids = [i for i in range(0, 132) if i not in test_ids]
valid_ids = [  0   3   8   9  10  11  12  17  22  23  27  28  29  32  33  34  35  36
  37  40  43  44  45  46  49  54  55  59  63  64  66  67  76  77  78  79
  80  81  82  88  89  92  93  96  97  99 101 102 103 105 106 109 110 114
 115 116 119 120 121 122 124 127 128 129 131]

SyntaxError: invalid syntax. Perhaps you forgot a comma? (171016504.py, line 3)

In [40]:
df_test = df_ground_truth[df_ground_truth.index.isin(test_ids)]
df_valid = df_ground_truth[df_ground_truth.index.isin(valid_ids)]

In [43]:
gt_val = df_valid.to_dict(orient='records')

In [44]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [58]:
def elastic_search(query, boost=None):
    if boost is None:
        boost = {"query": query}
    else:
        boost = {"query": query,
                 "fields": [f"text^{boost}"]}
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": boost,
                },
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [45]:
evaluate(gt_val, lambda q: elastic_search(q['question']))

  0%|          | 0/55 [00:00<?, ?it/s]

{'hit_rate': 0.7636363636363637, 'mrr': 0.6672727272727272}

In [100]:
import numpy as np

import numpy as np

def optimize_parameter(iterations=1000, search_space=(1, 10)):

    best_x = None
    best_value = float('-inf')

    for i in range(iterations):
        x = np.random.uniform(search_space[0], search_space[1])

        value = objective(x)
        if value > best_value:
            best_value = value
            best_x = x

        if i % 100 == 0:
            print(f"Iteration {i}: x = {x:.4f}, f(x) = {value:.4f}")

    return best_x, best_value

In [101]:
param_ranges = {
    'text': (0, 3)
}

def objective(boost_param):
    def search_function(q):
        return elastic_search(q['question'], boost_param)
    
    results = evaluate(gt_val, search_function)
    return results['mrr']

In [103]:
optimize_parameter(iterations=20, search_space=(1, 15))

  0%|          | 0/55 [00:00<?, ?it/s]

Iteration 0: x = 7.0386, f(x) = 0.6673


  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

(7.038554341023638, 0.6672727272727272)

In [None]:
evaluate(gt_val, lambda q: elastic_search(q['question'], boost=))