# Project - Search Engine

## Βήμα 1: Συλλογή Δεδομένων

### H συνάρτηση Crawl

In [94]:
import requests
from bs4 import BeautifulSoup

def crawl_wikipedia(url):
    """Download one page and extract its title and paragraph text.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list with a single ``{'title': ..., 'content': ...}`` dict when the
        server answers with HTTP 200; an empty list otherwise (the status
        code is printed in that case).
    """
    data = []
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract title and text.
        title = soup.find("h1").text  # text of the first <h1> element
        paragraphs = [p.text for p in soup.find_all("p")]  # text of every <p> element
        content = "\n".join(paragraphs)  # paragraphs are separated by a newline
        data.append({'title': title, 'content': content})
    else:
        # status_code is an int; format it instead of concatenating it to a
        # str, which raised TypeError and hid the real HTTP error.
        print(f'Error:{response.status_code}')

    return data

### Χρήση της συνάρτησης Crawl

In [95]:
# Article names to collect; each is appended to the Wikipedia base URL.
articles = ["Science", "Technology", "Engineering", "Computer", "Math", "Robotics", "Machine Learning",
            "History", "Art", "Philosophy", "Literature", "Psychology", "Politics", 
            "Economics", "Sociology"]
collected_data = []

for article in articles:
    url = 'https://en.wikipedia.org/wiki/' + article
    # Truncate only the article(s) just crawled. Re-walking the whole of
    # collected_data on every iteration repeated work already done and
    # produced the same result.
    for d in crawl_wikipedia(url):
        words = d['content'].split()
        d['content'] = " ".join(words[:1000])  # each article is limited to 1000 words
        collected_data.append(d)

In [96]:
from pprint import pprint
def print_article(collected_data):
    """Print how many articles were collected and preview the first one.

    Args:
        collected_data: List of dicts, each with string values under the
            'title' and 'content' keys.

    When the list is empty only the count line is printed.
    """
    print("Number of articles collected:", len(collected_data))
    if not collected_data:
        # Nothing to preview; indexing [0] would raise IndexError.
        return

    first = collected_data[0]
    print(f"1.Article Title: {first['title']}")
    print("  Content (first 100 words):")
    # pprint wraps the long preview string across several lines.
    pprint(" ".join(first['content'].split()[:100]))
# Show the article count and a preview of the first collected article.
print_article(collected_data)

Number of articles collected: 15
1.Article Title: Science
  Content (first 100 words):
('Science is a systematic discipline that builds and organises knowledge in '
 'the form of testable hypotheses and predictions about the universe.[1][2] '
 'Modern science is typically divided into two or three major branches:[3] the '
 'natural sciences (e.g., physics, chemistry, and biology), which study the '
 'physical world; and the behavioural sciences (e.g., economics, psychology, '
 'and sociology), which study individuals and societies.[4][5] The formal '
 'sciences (e.g., logic, mathematics, and theoretical computer science), which '
 'study formal systems governed by axioms and rules,[6][7] are sometimes '
 'described as being sciences as well; however, they are often regarded as a '
 'separate field because they rely on deductive')


### Αποθήκευση σε JSON

In [97]:
import json
def save_json(data, filename):
    """Write *data* as indented UTF-8 JSON to ``Files/<filename>``.

    Args:
        data: JSON-serializable object to store.
        filename: Name of the target file inside the ``Files/`` directory.
    """
    import os  # local import keeps this definition self-contained

    # Create the output directory on first use; without it open() fails with
    # FileNotFoundError on a fresh working directory.
    os.makedirs('Files', exist_ok=True)
    with open('Files/' + filename, 'w', encoding='utf-8') as f:
        # ensure_ascii=False writes non-ASCII characters as-is rather than
        # as \uXXXX escapes.
        json.dump(data, f, ensure_ascii=False, indent=4)
        
# Save the collected data to a JSON file.
save_json(collected_data, 'wiki_data.json')

### Αποθήκευση σε CSV 

In [99]:
import csv
def save_csv(data, filename):
    """Write article rows to ``Files/<filename>`` as a UTF-8 CSV file.

    Args:
        data: Iterable of dicts with the keys 'title' and 'content'.
        filename: Name of the target file inside the ``Files/`` directory.
    """
    import os  # local import keeps this definition self-contained

    # Create the output directory on first use; without it open() fails with
    # FileNotFoundError on a fresh working directory.
    os.makedirs('Files', exist_ok=True)
    # newline='' lets the csv module control line endings itself.
    with open('Files/' + filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['title', 'content'])
        writer.writeheader()
        writer.writerows(data)

# Save the collected data to a CSV file.
save_csv(collected_data, 'wiki_data.csv')

## Βήμα 2: Προεπεξεργασία Κειμένου (Text Processing)

### Αφαίρεση πηγών (π.χ. [1])

In [100]:
from pprint import pprint
import re 
# Drop numeric citation markers such as "[1]" from every article's text.
citation_marker = re.compile(r"\[\d+\]")
for d in collected_data:
    d['content'] = citation_marker.sub("", d['content'])

print_article(collected_data)

Number of articles collected: 15
1.Article Title: Science
  Content (first 100 words):
('Science is a systematic discipline that builds and organises knowledge in '
 'the form of testable hypotheses and predictions about the universe. Modern '
 'science is typically divided into two or three major branches: the natural '
 'sciences (e.g., physics, chemistry, and biology), which study the physical '
 'world; and the behavioural sciences (e.g., economics, psychology, and '
 'sociology), which study individuals and societies. The formal sciences '
 '(e.g., logic, mathematics, and theoretical computer science), which study '
 'formal systems governed by axioms and rules, are sometimes described as '
 'being sciences as well; however, they are often regarded as a separate field '
 'because they rely on deductive')


### Αφαίρεση σημείων στίξης

In [101]:
import string

# Delete every character listed in string.punctuation in one pass per
# article. The translation table maps each of those characters to None,
# which gives the same text as calling .replace() once per character.
punctuation_table = str.maketrans('', '', string.punctuation)
for d in collected_data:
    d['content'] = d['content'].translate(punctuation_table)
print_article(collected_data)

Number of articles collected: 15
1.Article Title: Science
  Content (first 100 words):
('Science is a systematic discipline that builds and organises knowledge in '
 'the form of testable hypotheses and predictions about the universe Modern '
 'science is typically divided into two or three major branches the natural '
 'sciences eg physics chemistry and biology which study the physical world and '
 'the behavioural sciences eg economics psychology and sociology which study '
 'individuals and societies The formal sciences eg logic mathematics and '
 'theoretical computer science which study formal systems governed by axioms '
 'and rules are sometimes described as being sciences as well however they are '
 'often regarded as a separate field because they rely on deductive')


### Tokenization and Stemming

In [102]:
from nltk.tokenize import word_tokenize
import nltk
from pprint import pprint
porter = nltk.PorterStemmer()

tokens = []
stemmed_data = []

# Tokenize each article's text and reduce every token to its Porter stem,
# keeping the article title alongside the resulting token list.
for d in collected_data:
    tokens = word_tokenize(d['content'])
    stemmed_tokens = list(map(porter.stem, tokens))
    stemmed_data.append(dict(title=d["title"], stemmed_tokens=stemmed_tokens))

In [103]:
def print_tokens(data, tokens):
    """Print the first entry's title and its first 20 tokens.

    Args:
        data: Non-empty list of dicts, each holding a 'title' and a token list.
        tokens: Key under which the token list is stored in each dict.
    """
    first_entry = data[0]
    print("1. Article Title: {}".format(first_entry['title']))
    print("Tokens first 20 words: ")
    preview = first_entry[tokens][:20]
    pprint(preview)
# Preview the stemmed tokens of the first article.
print_tokens(stemmed_data, "stemmed_tokens")

1. Article Title: Science
Tokens first 20 words: 
['scienc',
 'is',
 'a',
 'systemat',
 'disciplin',
 'that',
 'build',
 'and',
 'organis',
 'knowledg',
 'in',
 'the',
 'form',
 'of',
 'testabl',
 'hypothes',
 'and',
 'predict',
 'about',
 'the']


### Stop-word removal

In [104]:
import nltk
stopwords = nltk.corpus.stopwords.words('english')

# The tokens were Porter-stemmed before this step, so stop words whose stem
# differs from the word itself (e.g. "this" -> "thi", "was" -> "wa") would
# slip past a comparison with the raw list. Match against both the raw and
# the stemmed forms, and use a set so each membership test is a hash lookup
# rather than a scan of the list.
stopword_set = set(stopwords) | {porter.stem(w) for w in stopwords}

cleaned_data = []
cleaned_data_for_csv = []

# TODO: the choice between stemmed_data and lemmatized data is still open;
# this loop currently consumes the stemmed tokens.
for d in stemmed_data:
    filtered_tokens = [t for t in d['stemmed_tokens'] if t.lower() not in stopword_set]
    # Token-list form, used for the JSON export.
    cleaned_data.append({
        "title": d["title"],
        "cleaned_tokens": filtered_tokens
    })
    # Space-joined form with the 'title'/'content' columns save_csv expects.
    cleaned_data_for_csv.append({
        "title": d["title"],
        "content": " ".join(filtered_tokens)
    })
    
print_tokens(cleaned_data, "cleaned_tokens")

1. Article Title: Science
Tokens first 20 words: 
['scienc',
 'systemat',
 'disciplin',
 'build',
 'organis',
 'knowledg',
 'form',
 'testabl',
 'hypothes',
 'predict',
 'univers',
 'modern',
 'scienc',
 'typic',
 'divid',
 'two',
 'three',
 'major',
 'branch',
 'natur']


### Αποθήκευση σε .json και .csv 

In [105]:
# Persist the cleaned tokens: token lists as JSON, space-joined text as CSV.
save_json(cleaned_data, 'wiki_data_cleaned.json')
save_csv(cleaned_data_for_csv, 'wiki_data_cleaned.csv')

## Βήμα 3: Ευρετήριο (Indexing)

In [106]:
import json
import pandas as pd

from collections import Counter

# The file was written as UTF-8 with ensure_ascii=False, so read it back with
# the same encoding instead of the platform default.
with open('Files/wiki_data_cleaned.json', 'r', encoding='utf-8') as file:
    wiki_data = json.load(file)

# Per-article term counts: {title: {token: count}}.
corpus = {}
for i, entry in enumerate(wiki_data):
    title = entry.get("title", f"sent{i}")  # positional id when the title is missing
    tokens = entry.get("cleaned_tokens", [])
    # Counter tallies the tokens in one pass and keeps first-seen key order,
    # so the dict matches what the per-token .count() comprehension produced.
    corpus[title] = dict(Counter(tokens))

# Missing counts become 0; after the transpose, rows are article titles and
# columns are tokens.
df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T
print("First 15 columns:")
df.iloc[:, :15]

First 15 columns:


Unnamed: 0,art,describ,divers,rang,cultur,activ,center,around,work,util,creativ,imagin,talent,expect,evok
Art,54,2,1,1,3,2,1,1,6,1,10,1,1,1,1
Computer,0,0,0,2,0,0,0,1,1,0,0,0,0,0,0
Economics,0,4,1,1,0,0,0,0,1,0,0,0,0,1,0
Engineering,0,1,0,1,0,0,0,3,1,1,1,0,0,0,0
History,1,2,1,0,6,1,0,0,1,0,0,0,0,0,0
Literature,4,1,0,0,4,0,0,0,5,0,1,0,0,0,0
Machine learning,0,2,0,0,0,0,0,1,2,0,0,0,0,0,0
Mathematics,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Philosophy,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
Politics,1,1,1,2,0,1,0,0,2,0,0,0,0,0,0


### Αποθήκευση σε .json και .csv 

In [21]:
# Persist the term-count table in both CSV and JSON form.
# NOTE(review): df holds per-article token counts (a document-term table);
# confirm the search code expects that layout under the "inverted_index" name.
df.to_csv('Files/wiki_data_inverted_index.csv')
df.to_json('Files/wiki_data_inverted_index.json', indent=4)

## Βήμα 4: Μηχανή αναζήτησης (Search Engine)

### Επεξεργασία ερωτήματος (Query Processing)

### Κατάταξη αποτελεσμάτων (Ranking)

## Βήμα 5: Αξιολόγηση συστήματος

### Αξιολόγηση συστήματος με Precision, Recall, F1, MAP

## Διεπαφή Χρήστη (User Interface)

In [3]:
import json
# Queries entered through the interface and, per query, the documents returned.
user_queries, user_relevant_docs = [], []

# Cleaned articles saved by the preprocessing step.
with open("Files/wiki_data_cleaned.json", "r", encoding="utf-8") as f:
    articles = json.loads(f.read())

# Index saved by the indexing step.
with open("Files/wiki_data_inverted_index.json", "r", encoding="utf-8") as f:
    inverted_index = json.loads(f.read())

In [None]:
def user_interface():
    print("Καλώς ήρθατε στη μηχανή αναζήτησης!\n")
    while True:
        print("1. Boolean Αναζήτηση")
        print("2. Αναζήτηση με κατάταξη TF-IDF")
        print("3. Αναζήτηση με κατάταξη BM25")
        print("4. Αξιολόγηση Συστήματος")
        print("5. Έξοδος")

        choice = input("Επιλέξτε επιλογή: ")

        if choice == "1":
            query = input("Εισάγετε το Boolean ερώτημά σας (π.χ. term1 AND term2): ")
            results = boolean_query(query, inverted_index)
            if results:
                print("Αποτελέσματα:")
                for res in results:
                    print(res)
                user_queries.append(query)
                user_relevant_docs.append(list(results))
            else:
                print("Δεν βρέθηκαν αποτελέσματα.")
