In [None]:
# import sys
import json
import uuid
from tqdm.notebook import tqdm
import datetime 
from openpyxl import Workbook
import os

# path is broken on my machine, so I leave this here for myself :)
# sys.path.append('/Users/veronicasmilga/Desktop/Tübingen/MSE/Project_MSE/')

from db.DocumentEntry import DocumentEntry
from db.DocumentRepository import DocumentRepository
from data_retrieval.Crawler import Crawler

In [None]:
# df: settings for this notebook. If you only want to test without persisting anything, set both booleans to False.
# Both options require a working Docker installation.
SAVE_TO_DATABASE = True # If True, saves the crawled documents to the PostgreSQL database; the Excel backup is written either way.
OVERWRITE_DUMP = True # If True, overwrites the current "./db/dump.sql" with the results from this notebook. Only checked when SAVE_TO_DATABASE is True; it also deletes all previously stored documents before the crawl.

In [None]:
# The frontier lives in a separate JSON file and is handed to the Crawler later in this notebook.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying on the
# platform default encoding (which is not UTF-8 on every OS).
with open("../frontier.json", "r", encoding="utf-8") as file:
    frontier = json.load(file)

If `SAVE_TO_DATABASE` is set to True, Docker has to be started before running the experiment.

In [None]:
# Start the database container by running docker compose as a shell command from Python.
# os.system returns the shell's exit status; 0 means the last command succeeded.
if SAVE_TO_DATABASE:
    docker_status = os.system("""
    docker compose down;
    docker compose up -d --build db;
    """)
    print(docker_status)
    # Fail right here instead of letting a missing database surface later as a
    # confusing connection error in the middle of the crawl.
    if docker_status != 0:
        raise RuntimeError(
            f"'docker compose up' failed (status {docker_status}); "
            "is Docker installed and running?"
        )

In [None]:
# Database setup: only needed when the crawl results are persisted.
if SAVE_TO_DATABASE:
    documentRepository = DocumentRepository()
    # When the dump is going to be overwritten, remove the old documents first
    # so that the database holds only the results of this run.
    if OVERWRITE_DUMP:
        documentRepository.deleteAllDocuments()

# Excel backup (always created, in case something goes wrong with the database).
# The header order must match the order of the values appended per crawled page.
headers = [
    "id",
    "url",
    "title",
    "headings",
    "page_text",
    "keywords",
    "accessed_timestamp",
    "internal_links",
    "external_links",
]
wb = Workbook()
ws = wb.active
ws.title = "Crawled Data"
ws.append(headers)

If you set `SAVE_TO_DATABASE = True` and did not see the message
`SC: Connected to the db. Now you can go and build the best search engine around!`, just **run the cell above again**.

Otherwise, continue :)

In [None]:
# Timestamp of this run; used to give the Excel backup a unique file name.
now = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# Crawl limits handed to the Crawler (see data_retrieval.Crawler for their exact semantics).
max_pages = 5000
max_steps_per_domain = 10
timeout = 10

# Make sure the backup folder exists: wb.save() fails if the directory is missing.
os.makedirs("./data", exist_ok=True)
# The path does not change during the run, so build it once instead of on every iteration.
backup_path = f"./data/crawled_data_backup_{now}.xlsx"

# Crawler is an iterator now, to handle info generated on-the-fly and save it immediately
crawler = Crawler(frontier, max_pages, max_steps_per_domain, timeout)

for scraped_webpage_info in tqdm(crawler, total=max_pages):
    # One id per crawled page, shared by the Excel row and the database entry,
    # so that a backup row can be matched to its database record.
    document_id = uuid.uuid4()

    # save one crawled page to Excel file (list-like fields are stored as their string form)
    row = [
        str(document_id),
        scraped_webpage_info["url"],
        scraped_webpage_info["title"],
        str(scraped_webpage_info["headings"]),
        scraped_webpage_info["page_text"],
        str(scraped_webpage_info["keywords"]),
        scraped_webpage_info["accessed_timestamp"],
        str(scraped_webpage_info["internal_links"]),
        str(scraped_webpage_info["external_links"])
    ]
    ws.append(row)
    # Saved after every page on purpose: the backup stays usable if the crawl crashes.
    wb.save(backup_path)

    if SAVE_TO_DATABASE:
        # save one crawled page to database
        document = DocumentEntry(
            url=scraped_webpage_info["url"],
            title=scraped_webpage_info["title"],
            headings=scraped_webpage_info["headings"],
            page_text=scraped_webpage_info["page_text"],
            keywords=scraped_webpage_info["keywords"],
            accessed_timestamp=scraped_webpage_info["accessed_timestamp"],
            internal_links=scraped_webpage_info["internal_links"],
            external_links=scraped_webpage_info["external_links"],
            id=document_id
            )
        documentRepository.saveDocument(document)

After the experiment, you can persist your changes and read your results back from the database.
The next cell overwrites the dump (if `OVERWRITE_DUMP` is True) and loads all documents from the database.

In [None]:
# Overwrite the dump only when both switches are on.
# NOTE(review): the method name "overwrite_dumb" looks like a typo for "dump" - confirm in DocumentRepository.
if SAVE_TO_DATABASE and OVERWRITE_DUMP:
    documentRepository.overwrite_dumb()

# Read everything back from the database and report how many documents it holds.
if SAVE_TO_DATABASE:
    allDocuments = documentRepository.loadAllDocuments()
    print(len(allDocuments))

In [None]:
# df: stop the docker containers again, but only if this notebook was using them
if SAVE_TO_DATABASE:
    shutdown_status = os.system("""
    docker compose down;
    """)
    # os.system returns the shell's exit status; 0 means success
    print(shutdown_status)

Otherwise, if you just want to take a look at the data, go to `exp/data` and find the Excel file whose timestamp corresponds to the time you ran the Crawler :)

In [None]:
# Debug helper: look up the container id for the "project_mse-db" image.
# documentRepository only exists when SAVE_TO_DATABASE is True, so guard the lookup to
# avoid a NameError on a fresh run with the database disabled. A conditional expression
# keeps this the cell's last expression, so the result is still displayed as cell output.
str(documentRepository._get_container_id_by_image("project_mse-db")) if SAVE_TO_DATABASE else None