In [40]:
import requests
from bs4 import BeautifulSoup
import json
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import nltk
from collections import defaultdict
import math

# Κατέβασμα απαραίτητων δεδομένων από NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Georg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Georg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Georg\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [41]:
# Συνάρτηση προεπεξεργασίας κειμένου
def preprocess_text(content):
    tokens = word_tokenize(content)
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    filtered_tokens = [token for token in lemmatized_tokens if token.lower() not in stop_words]
    cleaned_tokens = [re.sub(r'[^\w\s]', '', token) for token in filtered_tokens]
    return " ".join(cleaned_tokens)

In [42]:
# Δημιουργία ανεστραμμένου ευρετηρίου
def build_inverted_index(data):
    inverted_index = defaultdict(list)
    for doc_id, document in enumerate(data):
        terms = set(document["processed_content"].split())
        for term in terms:
            inverted_index[term].append(doc_id)
    return inverted_index

In [43]:
# Υπολογισμός TF-IDF
def calculate_tfidf(query_tokens, all_data, inverted_index):
    N = len(all_data)
    scores = defaultdict(float)
    for term in query_tokens:
        doc_ids = inverted_index.get(term, [])
        df = len(doc_ids)
        if df > 0:
            idf = math.log(N / df)
            for doc_id in doc_ids:
                tf = all_data[doc_id]["processed_content"].split().count(term)
                scores[doc_id] += tf * idf
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)

In [44]:
# Boolean Retrieval
def process_query_boolean(query, inverted_index):
    query_tokens = word_tokenize(query.lower())
    results = set()
    current_set = set()
    operator = None
    for token in query_tokens:
        if token == "and":
            operator = "AND"
        elif token == "or":
            operator = "OR"
        elif token == "not":
            operator = "NOT"
        else:
            current_set = set(inverted_index.get(token, []))
            if operator == "AND":
                results &= current_set
            elif operator == "OR":
                results |= current_set
            elif operator == "NOT":
                results -= current_set
            else:
                results = current_set
    return list(results)

In [45]:
# Okapi BM25
def process_query_bm25(query, all_data, inverted_index, k1=1.5, b=0.75):
    query_tokens = word_tokenize(preprocess_text(query).lower())
    N = len(all_data)
    avg_doc_len = sum(len(doc["processed_content"].split()) for doc in all_data) / N
    scores = defaultdict(float)
    for term in query_tokens:
        doc_ids = inverted_index.get(term, [])
        df = len(doc_ids)
        if df > 0:
            idf = math.log((N - df + 0.5) / (df + 0.5) + 1)
            for doc_id in doc_ids:
                doc = all_data[doc_id]
                term_freq = doc["processed_content"].split().count(term)
                doc_len = len(doc["processed_content"].split())
                numerator = term_freq * (k1 + 1)
                denominator = term_freq + k1 * (1 - b + b * (doc_len / avg_doc_len))
                scores[doc_id] += idf * (numerator / denominator)
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)

In [46]:
# Vector Space Model (VSM)
def process_query_vsm(query, all_data, inverted_index):
    query_tokens = word_tokenize(preprocess_text(query).lower())
    N = len(all_data)
    scores = defaultdict(float)

    # Υπολογισμός IDF για τους όρους του ερωτήματος
    idf = {}
    for term in query_tokens:
        df = len(inverted_index.get(term, []))
        idf[term] = math.log(N / (1 + df)) if df > 0 else 0

    # Υπολογισμός βαθμολογιών για κάθε έγγραφο
    for term in query_tokens:
        for doc_id in inverted_index.get(term, []):
            tf = all_data[doc_id]["processed_content"].split().count(term)
            scores[doc_id] += tf * idf[term]

    # Κανονικοποίηση βαθμολογιών
    for doc_id in scores:
        doc_length = len(all_data[doc_id]["processed_content"].split())
        scores[doc_id] /= doc_length

    return sorted(scores.items(), key=lambda x: x[1], reverse=True)

In [47]:
# Συλλογή δεδομένων από URLs
urls = [
    "https://en.wikipedia.org/wiki/Web_scraping",
    "https://en.wikipedia.org/wiki/Data_mining",
    "https://en.wikipedia.org/wiki/Natural_language_processing",
    "https://en.wikipedia.org/wiki/Artificial_intelligence",
    "https://en.wikipedia.org/wiki/Savory_spinach_pie",
    "https://en.wikipedia.org/wiki/Chocolate",
    "https://en.wikipedia.org/wiki/Plumber",
    "https://en.wikipedia.org/wiki/University_of_West_Attica",
    "https://en.wikipedia.org/wiki/McDonald%27s",
    "https://en.wikipedia.org/wiki/Cancer",
    "https://en.wikipedia.org/wiki/Gastroenteritis",
    "https://en.wikipedia.org/wiki/Diabetes"
]

all_data = []
for url in urls:
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
            title = soup.find("h1", id="firstHeading").text
            paragraphs = soup.find_all("p")
            content = "\n".join([p.text for p in paragraphs if p.text.strip()])
            processed_content = preprocess_text(content)
            all_data.append({
                "title": title,
                "content": content,
                "processed_content": processed_content
            })
    except Exception as e:
        print(f"Error processing URL {url}: {e}")

In [48]:
# Αποθήκευση δεδομένων σε JSON
data_file_path = "collected_data.json"
with open(data_file_path, "w", encoding="utf-8") as json_file:
    json.dump(all_data, json_file, ensure_ascii=False, indent=4)
print(f"Τα δεδομένα αποθηκεύτηκαν στο {data_file_path}")

Τα δεδομένα αποθηκεύτηκαν στο collected_data.json


In [49]:
# Δημιουργία ανεστραμμένου ευρετηρίου
inverted_index = build_inverted_index(all_data)

In [50]:
# Αποθήκευση ανεστραμμένου ευρετηρίου σε JSON
index_file_path = "inverted_index.json"
with open(index_file_path, "w", encoding="utf-8") as json_file:
    json.dump(inverted_index, json_file, ensure_ascii=False, indent=4)
print(f"Το ανεστραμμένο ευρετήριο αποθηκεύτηκε στο {index_file_path}")

Το ανεστραμμένο ευρετήριο αποθηκεύτηκε στο inverted_index.json


In [51]:
# Αναζήτηση και εμφάνιση αποτελεσμάτων
query = "Doctor"
results = process_query_bm25(query, all_data, inverted_index)

print("Αποτελέσματα αναζήτησης:")
for doc_id, score in results:
    print(f"Title: {all_data[doc_id]['title']} - Score: {score:.2f}")

Αποτελέσματα αναζήτησης:
Title: Cancer - Score: 1.40


In [52]:

# Αναζήτηση και εμφάνιση αποτελεσμάτων
query = "University"  # Παράδειγμα ερώτημα Boolean
results = process_query_boolean(query, inverted_index)

print("Αποτελέσματα αναζήτησης:")
for doc_id in results:
    print(f"Title: {all_data[doc_id]['title']}")

Αποτελέσματα αναζήτησης:
Title: Diabetes
Title: Data mining
Title: Artificial intelligence
Title: University of West Attica


In [53]:
# Αναζήτηση και εμφάνιση αποτελεσμάτων
query = "Plant"  # Παράδειγμα ερώτημα για TF-IDF
query_tokens = word_tokenize(preprocess_text(query).lower())  # Επεξεργασία ερωτήματος
results = calculate_tfidf(query_tokens, all_data, inverted_index)

print("Αποτελέσματα αναζήτησης:")
for doc_id, score in results:
    print(f"Title: {all_data[doc_id]['title']} - Score: {score:.2f}")

Αποτελέσματα αναζήτησης:
Title: Artificial intelligence - Score: 8.79
Title: Chocolate - Score: 2.20
Title: McDonald's - Score: 1.10
Title: Gastroenteritis - Score: 1.10


In [54]:
# Εκτέλεση αναζήτησης με VSM
query = "Data mining"
results_vsm = process_query_vsm(query, all_data, inverted_index)

print("Αποτελέσματα αναζήτησης (VSM):")
for doc_id, score in results_vsm:
    print(f"Title: {all_data[doc_id]['title']} - Score: {score:.4f}")


Αποτελέσματα αναζήτησης (VSM):
Title: Data mining - Score: 0.0756
Title: Web scraping - Score: 0.0115
Title: Artificial intelligence - Score: 0.0036
Title: Natural language processing - Score: 0.0022
Title: Diabetes - Score: 0.0004
Title: Cancer - Score: 0.0003
