# Project - Search Engine

## Βήμα 1: Συλλογή Δεδομένων

### Η συνάρτηση Crawl

In [85]:
import requests
from bs4 import BeautifulSoup

def crawl_wikipedia(url, timeout=10):
    """Download one page and extract its title and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        A list holding one ``{'title': ..., 'content': ...}`` dict on
        success, or an empty list when the request fails or the server
        answers with a status other than 200 (the error is printed).
    """
    data = []
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as bad status codes instead
        # of aborting the crawl of the remaining articles.
        print(f"Error: {exc}")
        print(f"URL: {url}")
        return data

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(f"URL: {url}")
        return data

    soup = BeautifulSoup(response.text, 'html.parser')

    # The title is the text of the first <h1>; fall back to the URL when the
    # page has no <h1>, because soup.find() returns None in that case.
    heading = soup.find("h1")
    title = heading.text if heading is not None else url

    # The content is the text of every <p> element, one paragraph per line.
    paragraphs = [p.text for p in soup.find_all("p")]
    content = "\n".join(paragraphs)

    data.append({'title': title, 'content': content})
    return data

### Χρήση της συνάρτησης Crawl

In [86]:
# Names of the Wikipedia articles to collect
articles = ["Science", "Technology", "Engineering", "Mathematics"]

# Crawl each article in turn and gather all results in one flat list.
collected_data = []
for article in articles:
    url = f'https://en.wikipedia.org/wiki/{article}'
    collected_data += crawl_wikipedia(url)
# Optional step, currently disabled: limit each article to its first 500
# words, e.g. d['content'] = " ".join(d['content'].split()[:500]).

In [87]:
from pprint import pprint
def print_article(collected_data):
    """Print the number of collected articles and preview the first one.

    Args:
        collected_data: List of dicts with 'title' and 'content' keys.

    Only the first article is previewed: its title and its first 100
    whitespace-separated words. For an empty list only the count is
    printed, instead of failing with IndexError.
    """
    print("Number of articles collected:", len(collected_data))
    if not collected_data:
        return

    first = collected_data[0]
    print(f"1.Article Title: {first['title']}")
    print("  Content (first 100 words):")
    pprint(" ".join(first['content'].split()[:100]))
# Preview the first crawled article.
print_article(collected_data)

Number of articles collected: 4
1.Article Title: Science
  Content (first 100 words):
('Science is a systematic discipline that builds and organises knowledge in '
 'the form of testable hypotheses and predictions about the universe.[1][2] '
 'Modern science is typically divided into two or three major branches:[3] the '
 'natural sciences (e.g., physics, chemistry, and biology), which study the '
 'physical world; and the behavioural sciences (e.g., economics, psychology, '
 'and sociology), which study individuals and societies.[4][5] The formal '
 'sciences (e.g., logic, mathematics, and theoretical computer science), which '
 'study formal systems governed by axioms and rules,[6][7] are sometimes '
 'described as being sciences as well; however, they are often regarded as a '
 'separate field because they rely on deductive')


### Αποθήκευση σε JSON

In [88]:
import json
def save_json(data, filename):
    """Write ``data`` as indented UTF-8 JSON to ``Files/<filename>``.

    Args:
        data: Any JSON-serialisable object.
        filename: Name of the file to create inside the ``Files`` directory.

    The ``Files`` directory is created when it does not exist yet, so the
    first run does not fail with FileNotFoundError.
    """
    import os  # local import: only needed to create the output directory

    os.makedirs('Files', exist_ok=True)
    with open('Files/' + filename, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(data, f, ensure_ascii=False, indent=4)
        
# Save the collected data to a JSON file
save_json(collected_data, 'wiki_data.json')

### Αποθήκευση σε CSV 

In [89]:
import csv
def save_csv(data, filename):
    """Write ``data`` as a UTF-8 CSV file to ``Files/<filename>``.

    Args:
        data: Iterable of dicts with exactly the keys 'title' and 'content'.
        filename: Name of the file to create inside the ``Files`` directory.

    The ``Files`` directory is created when it does not exist yet, so the
    first run does not fail with FileNotFoundError.
    """
    import os  # local import: only needed to create the output directory

    os.makedirs('Files', exist_ok=True)
    # newline='' lets the csv module control line endings itself.
    with open('Files/' + filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['title', 'content'])
        writer.writeheader()
        writer.writerows(data)

# Save the collected data to a CSV file
save_csv(collected_data, 'wiki_data.csv')

## Βήμα 2: Προεπεξεργασία Κειμένου (Text Processing)

### Αφαίρεση πηγών (π.χ. [1])

In [90]:
from pprint import pprint
import re 
import pandas as pd

# Read the collected articles back from the CSV file as a list of dicts.
collected_data_df = pd.read_csv('Files/wiki_data.csv')
collected_data = collected_data_df.to_dict(orient='records')

# Remove numeric citation markers such as "[12]" from every article body.
for d in collected_data:
    d['content'] = re.compile(r"\[\d+\]").sub("", d['content'])

print_article(collected_data)

Number of articles collected: 4
1.Article Title: Science
  Content (first 100 words):
('Science is a systematic discipline that builds and organises knowledge in '
 'the form of testable hypotheses and predictions about the universe. Modern '
 'science is typically divided into two or three major branches: the natural '
 'sciences (e.g., physics, chemistry, and biology), which study the physical '
 'world; and the behavioural sciences (e.g., economics, psychology, and '
 'sociology), which study individuals and societies. The formal sciences '
 '(e.g., logic, mathematics, and theoretical computer science), which study '
 'formal systems governed by axioms and rules, are sometimes described as '
 'being sciences as well; however, they are often regarded as a separate field '
 'because they rely on deductive')


### Αφαίρεση σημείων στίξης

In [91]:
import string

# Delete every ASCII punctuation character from the article bodies; the
# translation table maps each of them to "removed".
punctuation_table = str.maketrans('', '', string.punctuation)
for d in collected_data:
    d['content'] = d['content'].translate(punctuation_table)
print_article(collected_data)

Number of articles collected: 4
1.Article Title: Science
  Content (first 100 words):
('Science is a systematic discipline that builds and organises knowledge in '
 'the form of testable hypotheses and predictions about the universe Modern '
 'science is typically divided into two or three major branches the natural '
 'sciences eg physics chemistry and biology which study the physical world and '
 'the behavioural sciences eg economics psychology and sociology which study '
 'individuals and societies The formal sciences eg logic mathematics and '
 'theoretical computer science which study formal systems governed by axioms '
 'and rules are sometimes described as being sciences as well however they are '
 'often regarded as a separate field because they rely on deductive')


### Tokenization and Stemming

In [92]:
from nltk.tokenize import word_tokenize
import nltk
from pprint import pprint
porter = nltk.PorterStemmer()

tokens = []
stemmed_data = []

# For each article: split the content into tokens, stem every token and keep
# the result next to the article title.
for d in collected_data:
    tokens = word_tokenize(d['content'])
    stemmed_tokens = list(map(porter.stem, tokens))
    stemmed_data.append({"title": d["title"], "stemmed_tokens": stemmed_tokens})

In [93]:
def print_tokens(data, tokens):
    """Print the title and the first 20 tokens of the first article.

    Args:
        data: List of dicts, each with a 'title' key and a token-list key.
        tokens: Name of the key in each dict that holds the token list
            (for example "stemmed_tokens").

    For an empty list a short notice is printed instead of failing with
    IndexError.
    """
    if not data:
        print("No articles to display.")
        return

    first = data[0]
    print(f"1. Article Title: {first['title']}")
    print("Tokens first 20 words: ")
    pprint(first[tokens][:20])
# Show the first stemmed tokens of the first article.
print_tokens(stemmed_data, "stemmed_tokens")

1. Article Title: Science
Tokens first 20 words: 
['scienc',
 'is',
 'a',
 'systemat',
 'disciplin',
 'that',
 'build',
 'and',
 'organis',
 'knowledg',
 'in',
 'the',
 'form',
 'of',
 'testabl',
 'hypothes',
 'and',
 'predict',
 'about',
 'the']


### Stop-word removal

In [94]:
import nltk
stopwords = nltk.corpus.stopwords.words('english')

# The tokens were stemmed before this step, so a stop word whose stem differs
# from its dictionary form (e.g. "this" -> "thi") would slip through a plain
# comparison. Filter against both the raw and the stemmed stop words, and use
# a set so each membership test is O(1) rather than a list scan.
_stopword_stemmer = nltk.PorterStemmer()
stopword_set = set(stopwords) | {_stopword_stemmer.stem(w) for w in stopwords}

cleaned_data = []          # token lists, used for the JSON export
cleaned_data_for_csv = []  # same tokens joined into one string, for the CSV export

for d in stemmed_data:
    filtered_tokens = [t for t in d['stemmed_tokens'] if t.lower() not in stopword_set]
    cleaned_data.append({
        "title": d["title"],
        "cleaned_tokens": filtered_tokens
    })
    cleaned_data_for_csv.append({
        "title": d["title"],
        "content": " ".join(filtered_tokens)
    })

print_tokens(cleaned_data, "cleaned_tokens")

1. Article Title: Science
Tokens first 20 words: 
['scienc',
 'systemat',
 'disciplin',
 'build',
 'organis',
 'knowledg',
 'form',
 'testabl',
 'hypothes',
 'predict',
 'univers',
 'modern',
 'scienc',
 'typic',
 'divid',
 'two',
 'three',
 'major',
 'branch',
 'natur']


### Αποθήκευση σε .json και .csv 

In [95]:
# Persist the cleaned tokens: token lists as JSON, joined text as CSV.
save_json(cleaned_data, 'wiki_data_cleaned.json')
save_csv(cleaned_data_for_csv, 'wiki_data_cleaned.csv')

## Βήμα 3: Ευρετήριο (Indexing)

In [98]:
import json
import pandas as pd

from collections import Counter

# The file is written as UTF-8 with ensure_ascii=False, so read it back with
# the same encoding instead of the platform default.
with open('Files/wiki_data_cleaned.json', 'r', encoding='utf-8') as file:
    wiki_data = json.load(file)

# Map each article title to its term -> occurrence-count dictionary.
corpus = {}
for i, entry in enumerate(wiki_data):
    title = entry.get("title", f"sent{i}")
    tokens = entry.get("cleaned_tokens", [])
    # Counter tallies all tokens in one pass and keeps first-occurrence order,
    # replacing a tokens.count() call per token (quadratic in article length).
    corpus[title] = dict(Counter(tokens))

# One row per article, one column per term; terms missing from an article get 0.
df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T
print("First 15 columns:")
df.iloc[:, :15]

First 15 columns:


Unnamed: 0,engin,practic,use,natur,scienc,mathemat,design,process,solv,technic,problem,increas,effici,product,improv
Engineering,229,16,40,5,25,10,47,14,3,7,10,5,3,21,5
Mathematics,2,5,64,13,28,242,2,2,14,3,25,2,0,0,0
Science,5,10,30,46,131,14,2,10,4,3,7,5,1,3,8
Technology,13,8,45,9,9,0,3,5,0,1,1,13,0,7,7


### Αποθήκευση σε .json και .csv 

In [99]:
# Persist the term-count table as CSV and JSON; the JSON file is loaded
# again later in this file as `inverted_index`.
df.to_csv('Files/wiki_data_inverted_index.csv')
df.to_json('Files/wiki_data_inverted_index.json', indent=4)

## Βήμα 4: Μηχανή αναζήτησης (Search Engine)

### Επεξεργασία ερωτήματος (Query Processing)

In [100]:
# Search engine functions
def boolean_query(query, index):
    """Run a Boolean search (AND, OR, NOT) over the index.

    Args:
        query: Whitespace-separated terms and operators, e.g.
            ``"term1 AND term2"`` or ``"term1 AND NOT term2"``. Operators are
            recognised case-insensitively. Terms are looked up exactly as
            typed (no lower-casing or stemming is applied here).
        index: Mapping from term to its postings: either a
            ``{document: count}`` dict or any iterable of documents.

    Returns:
        The set of matching documents. If the query contains AND, the result
        is the intersection of all operand sets; otherwise it is their union.
        ``NOT term`` stands for every document of the index that does not
        contain ``term``. An empty query returns an empty set.
    """
    def _postings(term):
        # Documents that actually contain `term`.
        if term not in index:
            return set()
        entry = index[term]
        if isinstance(entry, dict):
            # Count tables list every document, including those with count 0;
            # only positive counts are real matches.
            return {doc for doc, count in entry.items() if count}
        return set(entry)

    def _all_documents():
        # Every document mentioned anywhere in the index (needed for NOT).
        documents = set()
        for entry in index.values():
            documents.update(entry)
        return documents

    terms = query.split()
    use_intersection = any(term.upper() == "AND" for term in terms)

    result_sets = []
    negate_next = False
    universe = None
    for term in terms:
        operator = term.upper()
        if operator in ("AND", "OR"):
            continue
        if operator == "NOT":
            negate_next = not negate_next
            continue

        documents = _postings(term)
        if negate_next:
            if universe is None:
                universe = _all_documents()
            documents = universe - documents
            negate_next = False
        result_sets.append(documents)

    if not result_sets:
        # set.union() / set.intersection() need at least one operand.
        return set()
    if use_intersection:
        return set.intersection(*result_sets)
    return set.union(*result_sets)

### Κατάταξη αποτελεσμάτων (Ranking)

#### TF-IDF (Term Frequency-Inverse Document Frequency)

In [111]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem import PorterStemmer

# Preprocess the input query (tokenize and stem)
def preprocess_query(query):
    """Lower-case, tokenize and Porter-stem a free-text query.

    Returns the stemmed tokens as a list, in query order.
    """
    stem = PorterStemmer().stem
    return [stem(word) for word in nltk.word_tokenize(query.lower())]

# Function to calculate TF-IDF
def calculate_tfidf(query, inverted_index):
    """Rank the indexed documents against a query by TF-IDF cosine similarity.

    Args:
        query: Free-text query; it is tokenized and stemmed with
            ``preprocess_query`` before matching.
        inverted_index: Mapping ``term -> {document name: occurrence count}``.

    Returns:
        A list of ``(score, document name)`` tuples sorted by descending
        score, one per document. An empty index yields an empty list.
    """
    if not inverted_index:
        return []

    # Preprocess query
    query_tokens = preprocess_query(query)

    # The document names are the keys of any term's count table.
    document_names = list(next(iter(inverted_index.values())).keys())

    # Rebuild one text per document from the index. Each term is repeated as
    # many times as it occurs, so the vectorizer sees the real term
    # frequencies rather than a 0/1 presence flag.
    corpus = []
    for doc in document_names:
        words = []
        for term, docs in inverted_index.items():
            count = docs.get(doc, 0)
            if count > 0:
                words.extend([term] * int(count))
        corpus.append(" ".join(words))

    # Apply TF-IDF using TfidfVectorizer
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)

    # Project the query into the same TF-IDF space as the documents.
    query_vector = vectorizer.transform([" ".join(query_tokens)])

    # Calculate cosine similarity between query and documents
    cosine_similarities = cosine_similarity(query_vector, X).flatten()

    # Rank documents based on cosine similarity
    ranked_docs = sorted(zip(cosine_similarities, document_names), reverse=True)
    return ranked_docs

## OTHER

## Βήμα 5. Αξιολόγηση συστήματος:

### Αξιολόγηση συστήματος με Precision, Recall, F1, MAP

In [20]:
def evaluate_system(queries, relevant_docs, retrieval_function):
    """Evaluate a retrieval function with mean Precision, Recall, F1 and MAP.

    Args:
        queries: Sequence of queries.
        relevant_docs: For each query (same order), the collection of
            documents considered relevant.
        retrieval_function: Callable invoked as
            ``retrieval_function(query, inverted_index, articles)``; it must
            return an iterable of ``(document, score)`` pairs, best first.

    Returns:
        Dict with the keys "Precision", "Recall", "F1" and "MAP", each the
        mean over all queries. All four are 0 when there are no queries.

    NOTE(review): ``inverted_index`` and ``articles`` are read from module
    globals, not passed in. ``calculate_tfidf`` in this file takes two
    arguments and yields ``(score, document)`` tuples, so it cannot be passed
    here unchanged - confirm which retrieval function is intended.
    """
    def _mean(values):
        # Mean that tolerates an empty list (no queries evaluated).
        return sum(values) / len(values) if values else 0

    precision_list = []
    recall_list = []
    f1_list = []
    average_precision_list = []

    for query, relevant in zip(queries, relevant_docs):
        results = retrieval_function(query, inverted_index, articles)
        retrieved_docs = [doc for doc, _ in results]
        relevant_set = set(relevant)  # built once per query for fast lookups

        true_positives = len(set(retrieved_docs) & relevant_set)
        precision = true_positives / len(retrieved_docs) if retrieved_docs else 0
        recall = true_positives / len(relevant) if relevant else 0
        f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0

        # Average Precision (AP): sum precision@rank at every rank that holds
        # a relevant document, then divide by the number of relevant documents.
        cumulative_precision = 0
        found_relevant = set()  # distinct relevant documents seen so far
        for rank, doc in enumerate(retrieved_docs, start=1):
            if doc in relevant_set:
                found_relevant.add(doc)
                cumulative_precision += len(found_relevant) / rank
        average_precision = cumulative_precision / len(relevant) if relevant else 0

        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)
        average_precision_list.append(average_precision)

    return {
        "Precision": _mean(precision_list),
        "Recall": _mean(recall_list),
        "F1": _mean(f1_list),
        "MAP": _mean(average_precision_list)
    }

## Διεπαφή Χρήστη (User Interface)

### Διάβασμα άρθρων και ευρετηρίου

In [102]:
import json
# Queries entered during the session and the documents returned for them.
user_queries, user_relevant_docs = [], []

# Cleaned articles (title plus token list per article)
with open("Files/wiki_data_cleaned.json", mode="r", encoding="utf-8") as f:
    articles = json.loads(f.read())

# Term-count index stored as JSON
with open("Files/wiki_data_inverted_index.json", mode="r", encoding="utf-8") as f:
    inverted_index = json.loads(f.read())

In [108]:
def user_interface():
    """Run the interactive console menu of the search engine.

    The menu is shown repeatedly until the user picks option 5. Option 1 runs
    a Boolean query and records the query and its results in the module-level
    ``user_queries`` / ``user_relevant_docs`` lists. Option 2 ranks the
    documents with TF-IDF and prints those with a positive score. Options 3
    and 4 are listed in the menu but have no implementation in this file, so
    they report that they are not available yet. Surrounding whitespace in
    the menu choice is ignored.
    """
    print("Καλώς ήρθατε στη μηχανή αναζήτησης!\n")
    while True:
        print("1. Boolean Αναζήτηση")
        print("2. Αναζήτηση με κατάταξη TF-IDF")
        print("3. Αναζήτηση με κατάταξη BM25")
        print("4. Αξιολόγηση Συστήματος")
        print("5. Έξοδος")

        choice = input("Επιλέξτε επιλογή:").strip()

        if choice == "1":
            query = input("Εισάγετε το Boolean ερώτημά σας (π.χ. term1 AND term2): ")
            # A blank query has no terms to look up; treat it as "no results".
            results = boolean_query(query, inverted_index) if query.strip() else set()
            if results:
                print("Αποτελέσματα:")
                for res in results:
                    print(res)
                user_queries.append(query)
                user_relevant_docs.append(list(results))
            else:
                print("Δεν βρέθηκαν αποτελέσματα.")
        elif choice == "2":
            query = input("Εισάγετε το ερώτημά σας: ")
            print("---Αποτελέσματα TF-IDF---")
            print("Query given:", query)
            ranked_docs = calculate_tfidf(query, inverted_index)
            # The ranking contains every document; only a positive similarity
            # is an actual match, so the "no results" branch stays reachable.
            matches = [(score, doc) for score, doc in ranked_docs if score > 0]
            if matches:
                for score, doc in matches:
                    print(f"Document: {doc}, Score: {score:.4f}")
            else:
                print("Δεν βρέθηκαν αποτελέσματα.")
        elif choice in ("3", "4"):
            # Advertised in the menu but not implemented in this file.
            print("Η επιλογή αυτή δεν είναι ακόμη διαθέσιμη.")
        elif choice == "5":
            print("Ευχαριστούμε που χρησιμοποιήσατε τη μηχανή αναζήτησης!")
            break
        else:
            print("Μη έγκυρη επιλογή. Δοκιμάστε ξανά.")


### Διεπαφή χρήστη

In [None]:
# Start the interactive console menu.
user_interface()