In [116]:
# End-to-End IR System
# Elijah Hoedl
# Assistance by Copilot
import os
import json
import csv
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup
from flask import Flask, request, jsonify
import requests
from urllib.parse import urljoin, urlparse
from nltk.metrics import edit_distance
import shutil

In [117]:
# CRAWLER
class Crawler:
    """Depth-limited, same-host web crawler that saves raw HTML pages to disk.

    Pages are fetched depth-first starting at ``base_url`` and written to
    ``save_dir`` as ``page_<n>.html``, where ``n`` is the zero-based order in
    which the page was saved.

    Args:
        base_url: URL the crawl starts from; only links on the same network
            location (host[:port]) are followed.
        max_pages: Maximum number of pages to save.
        max_depth: Maximum link depth to follow (the start page is depth 0).
        save_dir: Folder that receives the saved pages. WARNING: any existing
            folder at this path is deleted when the crawler is constructed.
    """

    def __init__(self, base_url, max_pages=50, max_depth=2, save_dir='corpus'):
        self.base_url = base_url
        self.max_pages = max_pages
        self.max_depth = max_depth
        self.visited = set()
        self.save_dir = save_dir
        self.pages_saved = 0

        # Start from an empty corpus folder so pages from a previous crawl
        # never leak into the next index.
        if os.path.exists(save_dir):
            shutil.rmtree(save_dir)
        os.makedirs(save_dir, exist_ok=True)

    @staticmethod
    def _normalize(url):
        """Return ``url`` without its #fragment so anchors don't look like new pages."""
        return urlparse(url)._replace(fragment='').geturl()

    def crawl_site(self, url=None, depth=0):
        """Fetch ``url``, save it, and recursively follow its same-host links.

        Args:
            url: Page to fetch; defaults to ``base_url``.
            depth: Link depth of ``url`` relative to the start page.

        Failures while fetching, saving or parsing a page are reported with
        ``print`` and the page is skipped; they never abort the crawl.
        """
        if url is None:
            url = self.base_url
        if depth > self.max_depth or self.pages_saved >= self.max_pages:
            return
        url = self._normalize(url)
        if url in self.visited:
            return
        self.visited.add(url)

        try:
            print(f"Crawling {self.pages_saved + 1}/{self.max_pages}")
            r = requests.get(url, timeout=5)
            # Do not store 4xx/5xx error pages in the corpus.
            r.raise_for_status()
            content_type = r.headers.get('Content-Type', '')
            if content_type and 'html' not in content_type.lower():
                # Images, PDFs, etc. would only add noise to a text index.
                print(f"Skipping non-HTML resource {url}")
                return
            html = r.text
            # Save HTML
            filename = os.path.join(self.save_dir, f"page_{self.pages_saved}.html")
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(html)
            self.pages_saved += 1
            # Collect same-host links before recursing
            soup = BeautifulSoup(html, 'html.parser')
            base_host = urlparse(self.base_url).netloc
            candidates = (urljoin(url, tag['href']) for tag in soup.find_all('a', href=True))
            links = [link for link in candidates if urlparse(link).netloc == base_host]
        except Exception as e:
            print(f"Failed to crawl {url}: {e}")
            return

        for link in links:
            if self.pages_saved >= self.max_pages:
                break  # page budget spent; no point scanning remaining links
            self.crawl_site(link, depth + 1)

    def crawl(self):
        """Crawl starting from ``base_url``."""
        self.crawl_site(self.base_url)

In [118]:
# INDEXER

class Indexer:
    """Builds a TF-IDF index over the saved HTML corpus and writes it as JSON.

    Args:
        corpus_dir: Folder containing the ``*.html`` files to index.
        index_file: Path of the JSON file the index is written to.

    The JSON file holds three keys: ``doc_ids`` (file names, one per row),
    ``vocabulary`` (feature names, one per column) and ``tfidf_matrix``
    (the dense document-term matrix as nested lists).
    """

    def __init__(self, corpus_dir='corpus', index_file='index.json'):
        self.corpus_dir = corpus_dir
        self.index_file = index_file
        self.documents = []
        self.doc_ids = []

    def preprocess_html(self, html):
        """Strip markup from ``html`` and return lower-cased, whitespace-collapsed text."""
        soup = BeautifulSoup(html, 'html.parser')
        text = soup.get_text(separator=' ')
        text = re.sub(r'\s+', ' ', text)
        return text.lower()

    def build_index(self):
        """Read the corpus, fit TF-IDF and save the index to ``index_file``.

        Raises:
            ValueError: If ``corpus_dir`` contains no ``.html`` files.
        """
        # Build into fresh lists so calling build_index() twice does not
        # duplicate every document in the index.
        documents, doc_ids = [], []
        # sorted(): os.listdir order is arbitrary; sorting keeps row order
        # reproducible between runs.
        for filename in sorted(os.listdir(self.corpus_dir)):
            if not filename.endswith(".html"):
                continue
            path = os.path.join(self.corpus_dir, filename)
            with open(path, 'r', encoding='utf-8') as f:
                html = f.read()
            documents.append(self.preprocess_html(html))
            doc_ids.append(filename)
        self.documents = documents
        self.doc_ids = doc_ids

        if not documents:
            raise ValueError(
                f"No .html documents found in {self.corpus_dir!r}; nothing to index."
            )

        # Build TF-IDF
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(self.documents)

        # Save index as JSON
        index_data = {
            "doc_ids": self.doc_ids,
            "vocabulary": self.vectorizer.get_feature_names_out().tolist(),
            "tfidf_matrix": self.tfidf_matrix.toarray().tolist()
        }
        with open(self.index_file, 'w', encoding='utf-8') as f:
            json.dump(index_data, f)
        print(f"Index saved to {self.index_file}")

In [119]:
# QUERY PROCESSOR

class QueryProcessor:
    """Answers free-text queries against a saved TF-IDF index.

    Args:
        index_file: Path of the JSON index; it must provide the keys
            ``doc_ids``, ``vocabulary`` and ``tfidf_matrix``.

    Query terms missing from the vocabulary are replaced by the vocabulary
    term with the smallest edit distance before scoring.
    """

    # Same token definition TfidfVectorizer uses by default (runs of two or
    # more word characters), so queries are split the way documents were.
    _TOKEN_RE = re.compile(r"(?u)\b\w\w+\b")

    def __init__(self, index_file='index.json'):
        self.index_file = index_file
        self.load_index()

    def load_index(self):
        """Load document ids, vocabulary and the TF-IDF matrix from ``index_file``."""
        with open(self.index_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        self.doc_ids = data['doc_ids']
        # term -> column index in the TF-IDF matrix
        self.vocab = {term: i for i, term in enumerate(data['vocabulary'])}
        self.tfidf_matrix = np.array(data['tfidf_matrix'])

    def preprocess_query(self, query):
        """Return ``query`` lower-cased with each whitespace run collapsed to one space."""
        return re.sub(r'\s+', ' ', query.lower())

    def correct_spelling(self, word):
        """Return ``word`` if it is in the vocabulary, else the closest vocabulary term.

        "Closest" means smallest edit distance; on ties the term that comes
        first in the vocabulary wins. There is no distance cut-off, so any
        unknown word is replaced whenever the vocabulary is non-empty.
        """
        if word in self.vocab:
            return word
        min_distance = float("inf")
        closest_term = None
        for term in self.vocab:
            dist = edit_distance(word, term)
            if dist < min_distance:
                min_distance = dist
                closest_term = term
                if dist <= 1:
                    # word is not in the vocabulary, so distance 0 is
                    # impossible and nothing can beat distance 1.
                    break
        if closest_term:
            print(f"Did you mean: {closest_term}? Using corrected term.")
            return closest_term
        return word

    def query_to_vector(self, query):
        """Convert ``query`` to a 1 x |vocabulary| vector aligned with the index columns.

        The vector is binary: 1.0 for every (possibly spell-corrected) query
        term, 0.0 elsewhere. Punctuation and one-character tokens are dropped
        because the index vocabulary cannot contain them.
        """
        query_vec = np.zeros(len(self.vocab))
        for word in self._TOKEN_RE.findall(self.preprocess_query(query)):
            corrected = self.correct_spelling(word)
            if corrected in self.vocab:
                query_vec[self.vocab[corrected]] = 1
        return query_vec.reshape(1, -1)

    def search(self, query, top_k=5):
        """Return the ``top_k`` documents most similar to ``query``.

        Returns:
            A list of ``{"doc_id": str, "score": float}`` dicts ordered by
            descending cosine similarity; empty if the index has no documents.
        """
        if len(self.doc_ids) == 0:
            return []
        query_vec = self.query_to_vector(query)
        scores = cosine_similarity(query_vec, self.tfidf_matrix)[0]
        top_indices = np.argsort(scores)[::-1][:top_k]
        return [{"doc_id": self.doc_ids[i], "score": float(scores[i])} for i in top_indices]


def save_query_results_csv(queries, output_csv='query_results.csv', top_k=5,
                           index_file='index.json'):
    """Run each query against the index and write the ranked hits to a CSV file.

    Args:
        queries: Iterable of query strings.
        output_csv: Destination CSV path; overwritten if it exists.
        top_k: Number of results requested per query.
        index_file: Path of the JSON index to search.

    The CSV has a header row ``query,doc_id,score`` followed by one row per
    (query, result) pair, in ranking order.
    """
    qp = QueryProcessor(index_file=index_file)
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['query', 'doc_id', 'score'])
        for q in queries:
            writer.writerows(
                [q, hit['doc_id'], hit['score']] for hit in qp.search(q, top_k=top_k)
            )
    print(f"Query results saved to {output_csv}")

In [120]:
# Run the pipeline end to end: crawl -> index -> batch query.
# Order matters: the indexer reads the files the crawler saved, and the
# query step reads the index.json the indexer wrote.
site = "https://books.toscrape.com/"
# NOTE: constructing the Crawler deletes any existing 'corpus' folder (its default save_dir).
crawler = Crawler(site, max_pages=10, max_depth=2)
crawler.crawl()

# Uses the defaults: reads from 'corpus', writes 'index.json'.
indexer = Indexer()
indexer.build_index()

# "Histrorical" is misspelled (presumably on purpose, to exercise spelling
# correction); the cell output shows it being corrected to "historical".
queries = ["Histrorical"]
save_query_results_csv(queries)

Crawling 1/10
Crawling 2/10
Crawling 3/10
Crawling 4/10
Crawling 5/10
Crawling 6/10
Crawling 7/10
Crawling 8/10
Crawling 9/10
Crawling 10/10
Index saved to index.json
Did you mean: historical? Using corrected term.
Query results saved to query_results.csv


In [121]:
# Flask application exposing the query processor over HTTP.
app = Flask(__name__)
# Loaded once at start-up; every request searches the same in-memory index.
query_processor = QueryProcessor(index_file='index.json')

@app.route('/search', methods=['POST'])
def search_api():
    """Handle ``POST /search``.

    Expects a JSON object ``{"query": str, "top_k": int}``; both keys are
    optional (defaults: empty query, 5 results).

    Returns:
        200 with a JSON list of ``{"doc_id", "score"}`` objects, or 400 with
        ``{"error": ...}`` when the body is not a JSON object or a field has
        the wrong type.
    """
    # silent=True: a missing or malformed JSON body yields None instead of an
    # exception, so it can be reported as a JSON 400 response.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    query = data.get("query", "")
    top_k = data.get("top_k", 5)
    if not isinstance(query, str):
        return jsonify({"error": "'query' must be a string."}), 400
    # bool is a subclass of int, so reject it explicitly.
    if isinstance(top_k, bool) or not isinstance(top_k, int) or top_k < 0:
        return jsonify({"error": "'top_k' must be a non-negative integer."}), 400

    results = query_processor.search(query, top_k=top_k)
    return jsonify(results)


if __name__ == "__main__":
    app.run(port=5000, debug=False)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit


In [None]:
# Interactive client: read a query, POST it to the local search API, print
# the JSON reply, then ask whether to continue.
stop = False
while not stop:
    print("Enter Query:")
    query = input()
    try:
        response = requests.post(
            "http://localhost:5000/search",
            json={"query": query, "top_k": 3},
            timeout=30,  # never hang forever if the server is unresponsive
        )
        print(response.json())
    except (requests.RequestException, ValueError) as e:
        # Server unreachable, timed out, or reply was not JSON: report it and
        # keep the session alive instead of crashing the loop.
        print(f"Search request failed: {e}")

    # Keep asking until the answer is Y or N (case and padding are ignored).
    while True:
        print("Continue? (Y/N)")
        cont = input().strip().upper()
        if cont == "Y":
            break
        if cont == "N":
            stop = True
            print("Thanks for searching!")
            break
        print("Invalid input.")

Enter Query:


 histrorical


127.0.0.1 - - [04/Dec/2025 14:35:18] "POST /search HTTP/1.1" 200 -


Did you mean: historical? Using corrected term.
[{'doc_id': 'page_5.html', 'score': 0.08752523499162378}, {'doc_id': 'page_8.html', 'score': 0.05848660342984474}, {'doc_id': 'page_3.html', 'score': 0.05601379570814349}]
Continue? (Y/N)


 Y


Enter Query:


 historical


127.0.0.1 - - [04/Dec/2025 14:35:28] "POST /search HTTP/1.1" 200 -


[{'doc_id': 'page_5.html', 'score': 0.08752523499162378}, {'doc_id': 'page_8.html', 'score': 0.05848660342984474}, {'doc_id': 'page_3.html', 'score': 0.05601379570814349}]
Continue? (Y/N)


 Y


Enter Query:


 Philosophy


127.0.0.1 - - [04/Dec/2025 14:35:48] "POST /search HTTP/1.1" 200 -


[{'doc_id': 'page_8.html', 'score': 0.11697320685968948}, {'doc_id': 'page_3.html', 'score': 0.028006897854071747}, {'doc_id': 'page_7.html', 'score': 0.018997906134360466}]
Continue? (Y/N)


 Y


Enter Query:


 Hpilosohpy


127.0.0.1 - - [04/Dec/2025 14:36:05] "POST /search HTTP/1.1" 200 -


Did you mean: philosophy? Using corrected term.
[{'doc_id': 'page_8.html', 'score': 0.11697320685968948}, {'doc_id': 'page_3.html', 'score': 0.028006897854071747}, {'doc_id': 'page_7.html', 'score': 0.018997906134360466}]
Continue? (Y/N)


 WRONGINPUT


Invalid input.
Continue? (Y/N)


 Y


Enter Query:


 black


127.0.0.1 - - [04/Dec/2025 14:36:59] "POST /search HTTP/1.1" 200 -


[{'doc_id': 'page_9.html', 'score': 0.06281930262283093}, {'doc_id': 'page_2.html', 'score': 0.03324204103288293}, {'doc_id': 'page_1.html', 'score': 0.03285743060565798}]
Continue? (Y/N)
