In [None]:
! pip install -U jupyter ipywidgets # for tqdm to function properly
! pip install openpyxl docker # for saving stuff to Excel files
! pip install fast_langdetect
! pip uninstall nltk -y
! pip install -U spacy
! python -m spacy download en_core_web_sm
! pip install -U nltk

# Prepare for crawling

NB: you need Docker to be up and running on your machine for this notebook to work.

In [None]:
# import sys
import os
import json
import uuid
from tqdm.notebook import tqdm
import datetime 
from openpyxl import Workbook
import docker
import ssl
# otherwise some pages won't be crawled
# NOTE(review): this replaces the ssl module's default HTTPS context factory with the
# unverified one, i.e. TLS certificates are not checked by code that relies on that
# default. The assignment is module-level, so it applies to the whole kernel session,
# not only to the crawler. Presumably acceptable for crawling public pages - confirm.
ssl._create_default_https_context = ssl._create_unverified_context

# # path is broken on my machine, so I leave this here for myself :)
# sys.path.append('/Users/veronicasmilga/project_mse')

from db.DocumentEntry import DocumentEntry
from db.DocumentRepository import DocumentRepository
from data_retrieval.Crawler import Crawler

In [16]:
# Settings for this notebook.
# If you only want to test and do not want to persist anything, set both flags to False.

# True  -> the crawled documents are saved to the PostgreSQL database (requires Docker).
# False -> nothing is saved to the database.
SAVE_TO_DATABASE = True

# True -> the current "./db/dump.sql" is overwritten with the results from this notebook
# (requires Docker).
OVERWRITE_DUMP = True

In [None]:
# Connect to the database by running docker compose from Python.
# The commands are executed one by one: when they are chained with `;` in a single shell
# string, os.system only reports the status of the last command (`sleep 5`), which hides
# a failed `docker compose up`.
if SAVE_TO_DATABASE:
    os.system("docker compose down")
    compose_up_status = os.system("docker compose up -d --build db")
    print(compose_up_status)
    if compose_up_status != 0:
        print("WARNING: `docker compose up` returned a non-zero status - the database may not be running.")
    # short pause kept from the original cell, presumably to let the database finish starting
    os.system("sleep 5")

In [7]:
# frontier now is in a separate file
# The encoding is given explicitly so that non-ASCII characters in the file are read
# the same way on every platform instead of depending on the locale default.
with open("../frontier.json", "r", encoding="utf-8") as file:
    frontier = json.load(file)

NB: for the database to function properly, please first go to `exp/001_Flat_db_example_connection.ipynb` and complete the steps from there. If you don't want to save documents to the database, set `SAVE_TO_DATABASE = False` in the settings cell above; the code after the _"# save one crawled page to database"_ comment is then skipped, so there is no need to comment it out.

In [None]:
# Set up the database connection (only when persisting is enabled).
# deleteAllDocuments() is called right away - judging by its name this clears whatever
# was stored by previous runs, so do not run this cell if you want to keep that data.
if SAVE_TO_DATABASE:
    documentRepository = DocumentRepository()
    documentRepository.deleteAllDocuments()

# Set up the Excel backup (in case something goes wrong with the database):
# one sheet with a header row, one row per crawled page is appended later.
wb = Workbook()
ws = wb.active
ws.title = "Crawled Data"
headers = [
    "id",
    "url",
    "title",
    "headings",
    "raw_html",
    "page_text",
    "keywords",
    "accessed_timestamp",
    "internal_links",
    "external_links",
]
ws.append(headers)

# Crawling for the first time

Start crawling from this cell if you have no checkpoint information and want to start from scratch.

NB: I silenced the logs by default, now we only see error output from exceptions. To turn detailed logs back on for debug please initialise the Crawler with verbose=True.

In [None]:
now = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
# NB: this datetime will be used in the name of your Excel crawled_data_backup_file
# and json current_state_backup_file
print(f"START DATETIME: {now}")

# every backup / checkpoint file below is written into ./data, so make sure it exists
os.makedirs("data", exist_ok=True)

max_pages = 10000
# only applies to websites fully about Tuebingen, i.e. ["tuebingen_focused_pages"] in frontier
max_steps_per_domain_prioritised = 5000
# applies to websites NOT about Tuebingen with just one or two pages about Tuebingen,
# i.e. ["general_pages"] in frontier and most children links
max_steps_per_domain_general = 100
timeout = 10

# Crawler is an iterator now, to handle info generated on-the-fly and save it immediately
crawler = Crawler(
    frontier,
    max_pages,
    max_steps_per_domain_general,
    max_steps_per_domain_prioritised,
    timeout,
    # uncomment if you want to see all the logs
    # verbose=True
    )

for (
    # crawled info from page
    scraped_webpage_info,
    # this and further - state info to be saved to checkpoint file
    # from which crawler can be initialised later if our crawling process breaks at some point
    to_visit_prioritised, # Tübingen-related sites and their children
    to_visit, # general sites / unknown topic and their children
    visited_domains, # domains that should NOT be visited anymore because of reaching max_steps_per_domain
    visited, # links that were visited already
    domain_steps, # dict of steps made for each domain
    extra_links, # links that were extracted but belong to a visited domain and will not be visited on this iteration
    page_hashes
    ) in tqdm(crawler, total=max_pages):
    # one id per page, shared by the Excel row and the database entry, so that a row in
    # the backup file can be matched to its database record
    document_id = uuid.uuid4()

    # save one crawled page to excel file
    try:
        row = [
            str(document_id),
            scraped_webpage_info["url"],
            scraped_webpage_info["title"],
            str(scraped_webpage_info["headings"]),
            str(scraped_webpage_info["raw_html"]),
            scraped_webpage_info["page_text"],
            str(scraped_webpage_info["keywords"]),
            scraped_webpage_info["accessed_timestamp"],
            str(scraped_webpage_info["internal_links"]),
            str(scraped_webpage_info["external_links"])
        ]
        ws.append(row)
        # the workbook is saved after every page so that nothing is lost if crawling breaks
        wb.save(f"./data/crawled_data_backup_{now}.xlsx")
    except Exception as excel_error:
        # if something went wrong with Excel, try to save to json instead to preserve info
        try:
            print(f"Faced error {excel_error} while trying to save page info to Excel. Saving to backup json file instead.")
            json_filename = f"data/crawled_data_backup_{now}.json"
            if os.path.exists(json_filename):
                with open(json_filename, "r") as file:
                    backup_file_content = json.load(file)
                backup_file_content.append(scraped_webpage_info)
            else:
                backup_file_content = [scraped_webpage_info]

            with open(json_filename, "w") as f:
                json.dump(backup_file_content, f, indent=4)
        except Exception as json_error:
            # report the second failure as well instead of silently dropping it
            print(f"All attempts to save data failed ({json_error}). Skipping webpage {scraped_webpage_info['url']}.")

    if SAVE_TO_DATABASE:
        try:
            # save one crawled page to database
            document = DocumentEntry(
                url=scraped_webpage_info["url"],
                title=scraped_webpage_info["title"],
                headings=scraped_webpage_info["headings"],
                page_text=scraped_webpage_info["page_text"],
                keywords=scraped_webpage_info["keywords"],
                accessed_timestamp=scraped_webpage_info["accessed_timestamp"],
                internal_links=scraped_webpage_info["internal_links"],
                external_links=scraped_webpage_info["external_links"],
                id=document_id
                )
            documentRepository.saveDocument(document)

        except Exception as e:
            print(f"""Failed to save {scraped_webpage_info["url"]} to database: {e}. Skipping the page.""")

    # save crawling state info on every step
    # later crawler can be initialised from this file
    crawling_state = {
        "to_visit_prioritised": list(to_visit_prioritised),
        "to_visit": list(to_visit),
        "visited_domains": list(visited_domains),
        "visited": list(visited),
        "domain_steps": domain_steps,
        "extra_links": extra_links,
        "page_hashes": page_hashes
    }

    json_filename = f"data/current_state_backup_file_{now}.json"
    # write to a temporary file and swap it in afterwards, so that an interruption in the
    # middle of writing cannot leave a truncated checkpoint file behind
    tmp_filename = json_filename + ".tmp"
    with open(tmp_filename, "w") as f:
        json.dump(crawling_state, f, indent=4)
    os.replace(tmp_filename, json_filename)

Check if all is good with the database. The number of documents in it should be roughly the same as the number of rows in the Excel backup file.

In [None]:
# Only query the database when it was actually set up: with SAVE_TO_DATABASE = False
# `documentRepository` is never created and the lookup would raise a NameError.
allDocuments = documentRepository.loadAllDocuments() if SAVE_TO_DATABASE else []
len(allDocuments)

Last (very important) step -- saving everything we have crawled into the dump file.

In [None]:
# The dump is only written when the database is in use AND overwriting was requested.
if SAVE_TO_DATABASE and OVERWRITE_DUMP:
    documentRepository.overwrite_dump()

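Now you can find your crawled data in the `db/dump.sql` file. Alternatively, if something went wrong, you may retrieve the lost data from the Excel backup file named `exp/data/crawled_data_backup_{crawling_date_and_time}.xlsx` (or from the `.json` file with the same name, which is written when saving to Excel fails), where crawling_date_and_time is the point at which you started crawling. The crawler state needed to resume crawling is stored separately in `exp/data/current_state_backup_file_{crawling_date_and_time}.json`.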

# Crawling from checkpoint

Start crawling from this cell if you have checkpoint information and want to initialise your crawler from a given state (to not crawl the links you crawled already and preserve the info about to_visit queue, visited_domains list, etc.).

Look for the checkpoint json file in `exp/data/current_state_backup_file_{crawling_date_and_time}.json`, where crawling_date_and_time is the point at which you started crawling previously and from which you want to resume crawling.

In [None]:
from collections import deque

# provide the path to the checkpoint file here !!!
CHECKPOINT_FILE = "data/current_state_backup_file_2024-07-21_01-02-44.json"

# The repository is normally created in the initialisation cell further up, but that cell
# also calls deleteAllDocuments(). Creating the connection here when it is missing lets
# this cell run on a fresh kernel without going through that cell.
if SAVE_TO_DATABASE and "documentRepository" not in globals():
    documentRepository = DocumentRepository()

# fresh Excel backup for this (resumed) crawling session
wb = Workbook()
ws = wb.active
ws.title = "Crawled Data"
headers = ["id", "url", "title", "headings", "raw_html", "page_text", "keywords", "accessed_timestamp", "internal_links", "external_links"]
ws.append(headers)

# NB: this datetime is used in the names of the new backup and checkpoint files
now = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
print(f"START DATETIME: {now}")

# every backup / checkpoint file below is written into ./data, so make sure it exists
os.makedirs("data", exist_ok=True)

with open(CHECKPOINT_FILE, "r") as f:
    crawling_state = json.load(f)

with open("../frontier.json", "r") as file:
    frontier = json.load(file)

max_pages = 10000
# only applies to websites fully about Tuebingen, i.e. ["tuebingen_focused_pages"] in frontier
max_steps_per_domain_prioritised = 5000
# applies to websites NOT about Tuebingen with just one or two pages about Tuebingen,
# i.e. ["general_pages"] in frontier and most children links
max_steps_per_domain_general = 100
timeout = 10

# Restore the crawler state from the checkpoint. The checkpoint stores the queues and the
# visited collections as plain lists, so they are converted back to deques / sets here.
to_visit = deque(crawling_state["to_visit"])
to_visit_prioritised = deque(crawling_state["to_visit_prioritised"])
visited = set(crawling_state["visited"])
visited_domains = set(crawling_state["visited_domains"])
domain_steps = crawling_state["domain_steps"]
extra_links = crawling_state["extra_links"]
# page_hashes must come from the checkpoint as well; previously it was only defined if the
# "crawling for the first time" loop had already been run in the same kernel
page_hashes = crawling_state["page_hashes"]

crawler_1 = Crawler(
    frontier,
    max_pages,
    max_steps_per_domain_general,
    max_steps_per_domain_prioritised,
    timeout,
    visited=visited,
    to_visit=to_visit,
    to_visit_prioritised=to_visit_prioritised,
    visited_domains=visited_domains,
    domain_steps=domain_steps,
    extra_links=extra_links,
    page_hashes=page_hashes,
    # uncomment if you want to see all the logs
    # verbose=True
    )

for (
    # crawled info from page
    scraped_webpage_info,
    # this and further - state info to be saved to the new checkpoint file
    to_visit_prioritised,
    to_visit,
    visited_domains,
    visited,
    domain_steps,
    extra_links,
    page_hashes
    ) in tqdm(crawler_1, total=max_pages):
    # one id per page, shared by the Excel row and the database entry
    # (named document_id so that the builtin `id` is not shadowed)
    document_id = uuid.uuid4()

    # save one crawled page to excel file
    try:
        row = [
            str(document_id),
            scraped_webpage_info["url"],
            scraped_webpage_info["title"],
            str(scraped_webpage_info["headings"]),
            str(scraped_webpage_info["raw_html"]),
            scraped_webpage_info["page_text"],
            str(scraped_webpage_info["keywords"]),
            scraped_webpage_info["accessed_timestamp"],
            str(scraped_webpage_info["internal_links"]),
            str(scraped_webpage_info["external_links"])
        ]
        ws.append(row)
        # the workbook is saved after every page so that nothing is lost if crawling breaks
        wb.save(f"./data/crawled_data_backup_{now}.xlsx")
    except Exception as excel_error:
        # if something went wrong with Excel, try to save to json instead to preserve info
        try:
            print(f"Faced error {excel_error} while trying to save page info to Excel. Saving to backup json file instead.")
            json_filename = f"data/crawled_data_backup_{now}.json"
            if os.path.exists(json_filename):
                with open(json_filename, "r") as file:
                    backup_file_content = json.load(file)
                backup_file_content.append(scraped_webpage_info)
            else:
                backup_file_content = [scraped_webpage_info]

            with open(json_filename, "w") as f:
                json.dump(backup_file_content, f, indent=4)
        except Exception as json_error:
            # report the second failure as well instead of silently dropping it
            print(f"All attempts to save data failed ({json_error}). Skipping webpage {scraped_webpage_info['url']}.")

    if SAVE_TO_DATABASE:
        try:
            # save one crawled page to database
            document = DocumentEntry(
                url=scraped_webpage_info["url"],
                title=scraped_webpage_info["title"],
                headings=scraped_webpage_info["headings"],
                page_text=scraped_webpage_info["page_text"],
                keywords=scraped_webpage_info["keywords"],
                accessed_timestamp=scraped_webpage_info["accessed_timestamp"],
                internal_links=scraped_webpage_info["internal_links"],
                external_links=scraped_webpage_info["external_links"],
                id=document_id
                )
            documentRepository.saveDocument(document)

        except Exception as e:
            print(f"""Failed to save {scraped_webpage_info["url"]} to database: {e}. Skipping the page.""")

    # save crawling state info on every step
    # later crawler can be initialised from this file
    crawling_state = {
        "to_visit_prioritised": list(to_visit_prioritised),
        "to_visit": list(to_visit),
        "visited_domains": list(visited_domains),
        "visited": list(visited),
        "domain_steps": domain_steps,
        "extra_links": extra_links,
        "page_hashes": page_hashes
    }

    json_filename = f"data/current_state_backup_file_{now}.json"
    # write to a temporary file and swap it in afterwards, so that an interruption in the
    # middle of writing cannot leave a truncated checkpoint file behind
    tmp_filename = json_filename + ".tmp"
    with open(tmp_filename, "w") as f:
        json.dump(crawling_state, f, indent=4)
    os.replace(tmp_filename, json_filename)

Check if all is good with the database. The number of documents in it should be roughly the same as the number of rows in the Excel backup file.

In [None]:
# Only query the database when it was actually set up: with SAVE_TO_DATABASE = False
# `documentRepository` is never created and the lookup would raise a NameError.
allDocuments = documentRepository.loadAllDocuments() if SAVE_TO_DATABASE else []
len(allDocuments)

Last (very important) step -- saving everything we have crawled into the dump file.

In [None]:
# The dump is only written when the database is in use AND overwriting was requested.
if SAVE_TO_DATABASE and OVERWRITE_DUMP:
    documentRepository.overwrite_dump()

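Now you can find your crawled data in the `db/dump.sql` file. Alternatively, if something went wrong, you may retrieve the lost data from the Excel backup file named `exp/data/crawled_data_backup_{crawling_date_and_time}.xlsx` (or from the `.json` file with the same name, which is written when saving to Excel fails), where crawling_date_and_time is the point at which you started crawling. The crawler state needed to resume crawling is stored separately in `exp/data/current_state_backup_file_{crawling_date_and_time}.json`.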