In [1]:
import sys
import os
import json
import pandas as pd
sys.path.append("../src")
from InvertedIndex import InvertedIndex
from query_parser import load_queries_from_csv
from BooleanRetrieval import BooleanRetrieval
from analyzer import get_preprocessing_stats

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dominik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/dominik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/dominik/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/dominik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Exercise 1 

## a)
An inverted index is typically represented using a hash map where:

- the key is a term that occurs in the document collection
- the value is a list (the postings list) of the IDs of all documents that contain that term

## b)

In [2]:
# Build the inverted index once and persist it; on later runs reuse the
# persisted copy instead of re-indexing the whole collection.
# NOTE(review): the existence check assumes write_to_disk()/load_from_disk()
# use "../data/index.json" as their default location — confirm in InvertedIndex.
index = InvertedIndex()

if os.path.exists("../data/index.json"):
    index.load_from_disk()
    print("Loaded existing index.")
else:
    with open("../data/cranfield-data-list.json", "r", encoding="utf-8") as corpus_file:
        documents = json.load(corpus_file)

    for doc in documents:
        # Title, body and author are indexed together as a single text per
        # document (same concatenation order as the Exercise 3 cell).
        full_text = f"{doc['title']} {doc['body']} {doc['author']}"
        index.add_doc(doc["id"], full_text)

    index.write_to_disk()
    print("Created new index.")

Loaded existing index.


# Exercise 2

In [3]:
retrieval = BooleanRetrieval(index)
queries = load_queries_from_csv()

# For each of the first 20 queries, print the first three (doc_id, score)
# pairs returned by the search, i.e. the similarity measure sim(q, d).
for query_number, query in enumerate(queries[:20], start=1):
    print(f"\nQuery {query_number}: {query}")
    results = retrieval.search(query)
    for doc_id, score in results[:3]:
        print(f" - doc-id:  {doc_id} (score: {score:.2f})")


Query 1: what similarity laws must be obeyed when constructing aeroelastic models of heated high speed aircraft.
query tokens (11):  ['similar', 'law', 'must', 'obey', 'construct', 'aeroelast', 'model', 'heat', 'high', 'speed', 'aircraft']
 - doc-id:  576 (score: 0.45)
 - doc-id:  12 (score: 0.36)
 - doc-id:  51 (score: 0.36)

Query 2: what are the structural and aeroelastic problems associated with flight of high speed aircraft.
query tokens (8):  ['structur', 'aeroelast', 'problem', 'associ', 'flight', 'high', 'speed', 'aircraft']
 - doc-id:  12 (score: 0.50)
 - doc-id:  1089 (score: 0.50)
 - doc-id:  82 (score: 0.50)

Query 3: can a criterion be developed to show empirically the validity of flow solutions for chemically reacting gas mixtures based on the simplifying assumption of instantaneous local chemical equilibrium.
query tokens (18):  ['criterion', 'develop', 'show', 'empir', 'valid', 'flow', 'solut', 'chemic', 'react', 'ga', 'mixtur', 'base', 'simplifi', 'assumpt', 'instanta

# Exercise 3


In [None]:
with open("../data/cranfield-data-list.json", "r", encoding="utf-8") as corpus_file:
    documents = json.load(corpus_file)

# Long-format table: one record per (document, preprocessing option) pair.
rows = []
for doc in documents:
    doc_id = doc["id"]
    combined_text = f"{doc['title']} {doc['body']} {doc['author']}"
    rows.extend(
        {"doc_id": doc_id, "option": option, "count": value}
        for option, value in get_preprocessing_stats(combined_text).items()
    )

df = pd.DataFrame(rows)
display(df.head(20))


Unnamed: 0,doc_id,option,count
0,1,original,149
1,1,only-removed-stopwords,83
2,1,removed-stopwords+stemming,83
3,1,removed-stopwords+stemming+lemmatization,83
4,2,original,204
5,2,only-removed-stopwords,121
6,2,removed-stopwords+stemming,121
7,2,removed-stopwords+stemming+lemmatization,121
8,3,original,38
9,3,only-removed-stopwords,26
