# Project - Search Engine

## Βήμα 1: Συλλογή Δεδομένων

### H συνάρτηση Crawl

In [3]:
import requests
from bs4 import BeautifulSoup

def crawl_wikipedia(url):
    data = []
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        # Εξαγωγή τίτλου και κειμένου
        title = soup.find("h1").text # βρίσκει h1 html tag (header1) και επιστρέφει το κείμενο του
        paragraphs = [p.text for p in soup.find_all("p")] # βρίσκει όλα τα p html tags (paragraph) και επιστρέφει το κείμενο τους
        content = "\n".join(paragraphs) # διαχωριστής των παραγράφων το σύμβολο " | "
        data.append({'title': title, 'content': content}) # προσθήκη τίτλου και περιεχομένου στη λίστα data
    else: 
        print(f"Error: {response.status_code}")
        print(f"URL: {url}")
    
    return data

### Χρηση της συνάρτησης Crawl

In [4]:
# Λίστα άρθρων για συλλογή 
articles = [
    "Science",
    "Technology",
    "Engineering",
    "Mathematics",
    "Artificial_intelligence",
    "Machine_learning",
    "Deep_learning",
    "Data_science",
    "Computer_science",
    "Programming_language",
    "Software_engineering",
    "Operating_system",
    "Computer_network",
    "Internet",
]
collected_data = []

for article in articles:
    url = f'https://en.wikipedia.org/wiki/{article}'
    collected_data.extend(crawl_wikipedia(url))
    # for d in collected_data:
    #     words = d['content'].split()
    #     d['content'] = " ".join(words[:500]) # each article is limited to 500 words

In [5]:
from pprint import pprint
def print_article(collected_data):
    print("Number of articles collected:", len(collected_data))
    print(f"1.Article Title: {collected_data[0]['title']}")
    print("  Content (first 100 words):")
    pprint(" ".join(collected_data[0]['content'].split()[:100]))
print_article(collected_data)

Number of articles collected: 14
1.Article Title: Science
  Content (first 100 words):
('Science is a systematic discipline that builds and organises knowledge in '
 'the form of testable hypotheses and predictions about the universe.[1][2] '
 'Modern science is typically divided into two or three major branches:[3] the '
 'natural sciences (e.g., physics, chemistry, and biology), which study the '
 'physical world; and the behavioural sciences (e.g., economics, psychology, '
 'and sociology), which study individuals and societies.[4][5] The formal '
 'sciences (e.g., logic, mathematics, and theoretical computer science), which '
 'study formal systems governed by axioms and rules,[6][7] are sometimes '
 'described as being sciences as well; however, they are often regarded as a '
 'separate field because they rely on deductive')


### Αποθήκευση σε JSON

In [6]:
import json
def save_json(data, filename):
    with open('Files/' + filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
        
# Αποθήκευση δεδομένων σε json αρχείο
save_json(collected_data, 'wiki_data.json')

### Αποθήκευση σε CSV 

In [7]:
import csv
def save_csv(data, filename):
    with open('Files/' + filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['title', 'content'])
        writer.writeheader()
        writer.writerows(data)

# Αποθήκευση δεδομένων σε csv αρχείο
save_csv(collected_data, 'wiki_data.csv')

## Βήμα 2: Προεπεξεργασία Κειμένου (Text Processing)

### Αφαιρεση πηγών (π.χ. [1])

In [8]:
from pprint import pprint
import re 
import pandas as pd

# Load collected data from CSV
collected_data_df = pd.read_csv('Files/wiki_data.csv')
collected_data = collected_data_df.to_dict(orient='records')
for d in collected_data:
    d['content'] = re.sub(r"\[\d+\]", "", d['content']) # regex για αντικατάσταση πηγών με κενό

print_article(collected_data)

Number of articles collected: 14
1.Article Title: Science
  Content (first 100 words):
('Science is a systematic discipline that builds and organises knowledge in '
 'the form of testable hypotheses and predictions about the universe. Modern '
 'science is typically divided into two or three major branches: the natural '
 'sciences (e.g., physics, chemistry, and biology), which study the physical '
 'world; and the behavioural sciences (e.g., economics, psychology, and '
 'sociology), which study individuals and societies. The formal sciences '
 '(e.g., logic, mathematics, and theoretical computer science), which study '
 'formal systems governed by axioms and rules, are sometimes described as '
 'being sciences as well; however, they are often regarded as a separate field '
 'because they rely on deductive')


### Αφαίρεση σημείων στίξης

In [9]:
import string

for punct in string.punctuation:
    for d in collected_data:
        d['content'] = d['content'].replace(punct, '') # αφαίρεση σημείων στίξης
print_article(collected_data)

Number of articles collected: 14
1.Article Title: Science
  Content (first 100 words):
('Science is a systematic discipline that builds and organises knowledge in '
 'the form of testable hypotheses and predictions about the universe Modern '
 'science is typically divided into two or three major branches the natural '
 'sciences eg physics chemistry and biology which study the physical world and '
 'the behavioural sciences eg economics psychology and sociology which study '
 'individuals and societies The formal sciences eg logic mathematics and '
 'theoretical computer science which study formal systems governed by axioms '
 'and rules are sometimes described as being sciences as well however they are '
 'often regarded as a separate field because they rely on deductive')


### Tokenization and Stemming

In [10]:
from nltk.tokenize import word_tokenize
import nltk
from pprint import pprint
porter = nltk.PorterStemmer()

tokens = []
stemmed_data = []

for d in collected_data: # For each article
    tokens = word_tokenize(d['content'])  # Tokenize content 
    stemmed_tokens = [porter.stem(t) for t in tokens]  # Stem each token
    stemmed_data.append({
        "title": d["title"],
        "stemmed_tokens": stemmed_tokens
    })

In [11]:
def print_tokens(data, tokens):
    print(f"1. Article Title: {data[0]['title']}")
    print("Tokens first 20 words: ")
    pprint(data[0][tokens][:20])
print_tokens(stemmed_data, "stemmed_tokens")

1. Article Title: Science
Tokens first 20 words: 
['scienc',
 'is',
 'a',
 'systemat',
 'disciplin',
 'that',
 'build',
 'and',
 'organis',
 'knowledg',
 'in',
 'the',
 'form',
 'of',
 'testabl',
 'hypothes',
 'and',
 'predict',
 'about',
 'the']


### Stop-word removal

In [12]:
import nltk
stopwords = nltk.corpus.stopwords.words('english')

cleaned_data = []
cleaned_data_for_csv = []

for d in stemmed_data: # or lemmed_data (one of the two) - Επιλογή μεταξύ stemming και lemmatization??
    filtered_tokens = [t for t in d['stemmed_tokens'] if t.lower() not in stopwords]
    cleaned_data.append({
        "title": d["title"],
        "cleaned_tokens": filtered_tokens
    })
    cleaned_data_for_csv.append({
        "title": d["title"],
        "content": " ".join(filtered_tokens)
    })
    
print_tokens(cleaned_data, "cleaned_tokens")

1. Article Title: Science
Tokens first 20 words: 
['scienc',
 'systemat',
 'disciplin',
 'build',
 'organis',
 'knowledg',
 'form',
 'testabl',
 'hypothes',
 'predict',
 'univers',
 'modern',
 'scienc',
 'typic',
 'divid',
 'two',
 'three',
 'major',
 'branch',
 'natur']


### Αποθήκευση σε .json και .csv 

In [13]:
save_json(cleaned_data, 'wiki_data_cleaned.json')
save_csv(cleaned_data_for_csv, 'wiki_data_cleaned.csv')

## Βήμα 3: Ευρετήριο (Indexing)

In [14]:
import json
import pandas as pd

with open('Files/wiki_data_cleaned.json', 'r', encoding='utf-8') as file:
    wiki_data = json.load(file)

corpus = {}
for i, entry in enumerate(wiki_data):
    title = entry.get("title", f"sent{i}") 
    tokens = entry.get("cleaned_tokens", [])
    corpus[title] = {token: tokens.count(token) for token in tokens}

df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T
print("First 15 columns:")
df.iloc[:, :15]

First 15 columns:


Unnamed: 0,artifici,intellig,ai,broadest,sens,exhibit,machin,particularli,comput,system,field,research,scienc,develop,studi
Artificial intelligence,41,73,221,1,4,3,59,5,41,34,22,53,6,36,11
Computer network,0,0,0,0,0,1,2,0,48,17,1,2,1,3,1
Computer science,6,8,2,0,0,0,13,0,166,28,15,10,70,13,22
Data science,0,0,0,0,0,0,6,0,16,4,18,3,56,4,0
Deep learning,13,4,8,0,1,0,25,4,35,38,8,19,4,19,1
Engineering,3,1,0,0,1,2,28,0,13,21,15,6,25,40,12
Internet,0,1,0,0,0,1,0,0,43,26,5,23,5,19,11
Machine learning,28,13,23,0,0,1,119,3,37,40,24,20,3,7,11
Mathematics,0,0,0,0,1,0,0,4,23,13,11,4,28,23,47
Operating system,0,0,0,0,0,0,8,0,48,166,0,1,0,14,0


### Αποθήκευση σε .json και .csv 

In [15]:
df.to_csv('Files/wiki_data_inverted_index.csv')
df.to_json('Files/wiki_data_inverted_index.json', indent=4)

## Βήμα 4: Μηχανή αναζήτησης (Search Engine)

### Επεξεργασία ερωτήματος (Query Processing)

In [16]:
import nltk
from nltk.stem import PorterStemmer

# Λειτουργίες μηχανής αναζήτησης
def boolean_query(query, index):
    """Boolean (AND, OR, NOT)"""
    terms = query.split()
    result_sets = []

    for term in terms:
        if term in index:
            stemmer = PorterStemmer()
            term = stemmer.stem(term)
            result_sets.append(set(index[term]))
        elif term.upper() == "AND":
            continue
        elif term.upper() == "OR":
            continue
        else:
            result_sets.append(set())

    result = set.intersection(*result_sets) if "AND" in terms else set.union(*result_sets)
    return result

### Κατάταξη αποτελεσμάτων (Ranking)

#### TF-IDF (Term Frequency-Inverse Document Frequency)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem import PorterStemmer

# Preprocess the input query (tokenize and stem)
def preprocess_query(query):
    stemmer = PorterStemmer()
    tokens = nltk.word_tokenize(query.lower())  # Tokenize and lowercased
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

# Function to calculate TF-IDF
def calculate_tfidf(query, inverted_index):
    # Preprocess query
    query_tokens = preprocess_query(query)

    # Build the corpus from the inverted index
    corpus = []
    document_names = list(next(iter(inverted_index.values())).keys())  # Extract document names

    # Create the document-term matrix
    for doc in document_names:
        doc_str = " ".join(
            [term for term, docs in inverted_index.items() if doc in docs and docs[doc] > 0]
        )
        corpus.append(doc_str)

    # Apply TF-IDF using TfidfVectorizer
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)

    # Query TF-IDF transformation
    query_vector = vectorizer.transform([" ".join(query_tokens)])

    # Calculate cosine similarity between query and documents
    cosine_similarities = cosine_similarity(query_vector, X).flatten()

    # Rank documents based on cosine similarity
    ranked_docs = sorted(zip(cosine_similarities, document_names), reverse=True)
    return ranked_docs

###  Vector Space Model (VSM)

In [33]:
def vector_space_model(query, inverted_index):
    # Step 1: Preprocess the query
    preprocessed_query = preprocess_query(query)
    processed_query = " ".join(preprocessed_query)

    # Step 2: Prepare the document-term matrix
    documents = list(inverted_index.values())
    doc_names = list(documents[0].keys())  # Extract document names from the first term

    # Create a list of document strings (terms concatenated)
    doc_texts = [
        " ".join(
            [term] * inverted_index[term].get(doc, 0)
            for term in inverted_index
        )
        for doc in doc_names
    ]

    # Step 3: Compute TF-IDF representation
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(doc_texts)

    # Step 4: Transform the query into the same vector space
    query_vector = vectorizer.transform([processed_query])

    # Step 5: Compute cosine similarity between the query and the documents
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Step 6: Rank the documents by similarity scores
    ranked_docs = sorted(zip(doc_names, cosine_similarities), key=lambda x: x[1], reverse=True)

    return ranked_docs


### Okapi BM25 (Probabilistic retrieval model)

## Βήμα 5. Αξιολόγηση συστήματος:

### Αξιολόγηση συστήματος με Precision, Recall, F1, MAP

## Διεπαφή Χρήστη (User Interface)

### Διάβασμα άρθρων και ευρετηρίου

In [34]:
import json
user_queries = []
user_relevant_docs = []

# Load articles
with open("Files/wiki_data_cleaned.json", "r", encoding="utf-8") as f:
    articles = json.load(f)

# Load inverted index
with open("Files/wiki_data_inverted_index.json", "r", encoding="utf-8") as f:
    inverted_index = json.load(f)

### Διεπαφή χρήστη

In [35]:
def user_interface():
    print("\n")
    while True:
        questions = [
            "----------Search Engine Menu----------",
            "1. Boolean search",
            "2. TF-IDF Ranking",
            "4. Vector Space Model Ranking",
            "5. Okapi BM25"
            "6. System Evaluation",
            "7. Exit",
        ]
        print("\n".join(questions))
        choice = input("Choose:")

        if choice == "1":
            query = input("Input Boolean query (e.g. term1 AND term2): ")
            results = boolean_query(query, inverted_index)
            if not results:
                print("No results found.")
                break

            print("Αποτελέσματα:")
            for res in results:
                print(res)
            user_queries.append(query)
            user_relevant_docs.append(list(results))
        elif choice == "2":
            query = input("Input query:")
            print("----------Results TF-IDF----------")
            print("Query given:", query)
            ranked_docs = calculate_tfidf(query, inverted_index)
            if not ranked_docs:
                print("No results found.")
                break

            for score, doc in ranked_docs:
                    print(f"Document: {doc}, Score: {score:.4f}")
            user_queries.append(query)
            user_relevant_docs.append([doc for _, doc in ranked_docs])  
        elif choice == "3":
            query = input("Input query:")
            print("----------Results Vector Space Model----------")
            print("Query given:", query)
            ranked_docs = vector_space_model(query, inverted_index)
            if not ranked_docs:
                print("No results found.")
                break

            for doc, score in ranked_docs:
                print(f"Document: {doc}, Score: {score:.4f}")
            user_queries.append(query)
            user_relevant_docs.append([doc for doc, _ in ranked_docs])      
        elif choice == "7" or choice == "": 
            break


In [36]:
user_interface()



----------Search Engine Menu----------
1. Boolean search
2. TF-IDF Ranking
4. Vector Space Model Ranking
5. Okapi BM256. System Evaluation
7. Exit
----------Results Vector Space Model----------
Query given: artificial intelligence


TypeError: sequence item 0: expected str instance, list found