In [2]:
import os
import string
from bs4 import BeautifulSoup
from collections import defaultdict
import random
import chardet

# Constants for tag weights
TAG_WEIGHTS = {
    "title": 6,
    "h1": 5, "h2": 5, "h3": 5,
    "a": 4,
    "i": 3, "b": 3,
    "body": 1
}

# Stop words list
STOP_WORDS = {"is", "the", "of", "and", "in", "on", "at", "for", "a", "an"}

# Dataset directory (replace with your actual dataset path)
DATASET_DIR = "E:\webkb"

# Initialize Index Tables
page_info_table = []  # Stores metadata for documents
word_info_table = defaultdict(list)  # Maps words to document IDs


# STAGE 1: Dataset Preparation and Preprocessing
def clean_and_tokenize(text):
    """Clean and tokenize text, removing stop words and special characters."""
    translator = str.maketrans('', '', string.punctuation)
    cleaned_text = text.translate(translator).lower()
    tokens = [word for word in cleaned_text.split() if word not in STOP_WORDS]
    return tokens


def process_document(file_path, doc_id):
    """Process a single HTML document."""
    with open(file_path, 'r', encoding='latin1') as file:  # Try 'latin1' instead of 'utf-8'
        content = file.read()
    soup = BeautifulSoup(content, 'html.parser')

    # Initialize metadata for the document
    page_metadata = {
        "ID": doc_id,
        "PageName": os.path.basename(file_path),
        "TotalWordCount": 0,
        "TotalTagWeight": 0
    }

    # Extract weighted content
    for tag, weight in TAG_WEIGHTS.items():
        for element in soup.find_all(tag):
            if element.string:
                tokens = clean_and_tokenize(element.string)
                page_metadata["TotalWordCount"] += len(tokens)
                page_metadata["TotalTagWeight"] += len(tokens) * weight
                for token in tokens:
                    word_info_table[token].append(doc_id)

    page_info_table.append(page_metadata)

def process_dataset(dataset_dir):
    """Process all documents in the dataset."""
    doc_id = 1
    for root, _, files in os.walk(dataset_dir):
        for file in files:
            if file.endswith(".html"):
                process_document(os.path.join(root, file), doc_id)
                doc_id += 1


# STAGE 2: Advanced Document Indexing Method (ADIM)
def build_index():
    """Build the index by processing the dataset."""
    process_dataset(DATASET_DIR)
    print(f"Processed {len(page_info_table)} documents.")
    print(f"Indexed {len(word_info_table)} unique words.")


# STAGE 3: Query Searching Algorithm (QSA)
def query_search(query):
    """Process a query and return relevant document IDs."""
    tokens = clean_and_tokenize(query)
    relevant_docs = set(word_info_table[tokens[0]])
    for token in tokens[1:]:
        relevant_docs.intersection_update(word_info_table[token])
    return relevant_docs


# STAGE 4: Evolutionary Algorithm (MGA and CA Integration)
def calculate_fitness(doc_id, query_tokens):
    """Calculate fitness of a document based on query relevance."""
    page = next(p for p in page_info_table if p["ID"] == doc_id)
    return page["TotalTagWeight"]


def genetic_algorithm(query_tokens, population_size=10, generations=15):
    """Run the genetic algorithm to rank relevant documents."""
    # Initial Population
    population = list(query_search(" ".join(query_tokens)))
    if len(population) > population_size:
        population = random.sample(population, population_size)

    for _ in range(generations):
        # Evaluate Fitness
        fitness_scores = {doc: calculate_fitness(doc, query_tokens) for doc in population}

        # Selection (Elitism)
        sorted_population = sorted(population, key=lambda doc: fitness_scores[doc], reverse=True)
        parents = sorted_population[:len(sorted_population) // 2]

        # Crossover
        offspring = []
        for i in range(0, len(parents), 2):
            if i + 1 < len(parents):
                offspring.append(parents[i])
                offspring.append(parents[i + 1])

        # Mutation
        for i in range(len(offspring)):
            if random.random() < 0.1:  # Mutation Probability
                offspring[i] = random.choice(population)

        # Next Generation
        population = parents + offspring

    # Return sorted final population
    return sorted(population, key=lambda doc: calculate_fitness(doc, query_tokens), reverse=True)


# Main Script
if __name__ == "__main__":
    print("Building index...")
    build_index()

    # Example Query
    print("\nEnter a query (e.g., 'computer networks'):")
    user_query = input().strip()
    query_tokens = clean_and_tokenize(user_query)

    # Query Search
    relevant_docs = query_search(user_query)
    print(f"\nRelevant Documents: {relevant_docs}")

    # Genetic Algorithm Ranking
    ranked_docs = genetic_algorithm(query_tokens)
    print(f"\nRanked Documents: {ranked_docs}")

    # Display top-ranked document
    if ranked_docs:
        top_doc_id = ranked_docs[0]
        top_doc = next(p for p in page_info_table if p["ID"] == top_doc_id)
        print(f"\nTop Document: {top_doc}")


Building index...
Processed 3948 documents.
Indexed 30764 unique words.

Enter a query (e.g., 'computer networks'):
computer networks

Relevant Documents: {1665, 1794, 259, 3330, 3079, 264, 777, 3208, 3463, 3472, 3090, 3606, 663, 3734, 1187, 2980, 2981, 3236, 1319, 3750, 3369, 683, 684, 1323, 1326, 2094, 3371, 2609, 3505, 2995, 2996, 2615, 2999, 3896, 698, 3002, 3657, 587, 1100, 1103, 208, 720, 3412, 469, 2901, 1367, 3158, 3416, 3418, 3671, 3420, 1373, 3296, 3424, 3936, 3431, 1128, 3690, 620, 3696, 3442, 1012, 2036, 1270, 2934, 3574, 3193, 3581, 1790}

Ranked Documents: [2980, 2980, 2980, 2980, 2980, 2980, 2980, 2980]

Top Document: {'ID': 2980, 'PageName': 'http_^^www.cs.wisc.edu^~shavlik^mlrg^publications.html', 'TotalWordCount': 813, 'TotalTagWeight': 2926}
