In [None]:
# sorry for installs here, I'll move them to poetry later
! pip install -U jupyter ipywidgets # for tqdm to function properly
! pip install openpyxl # for saving stuff to Excel files

In [3]:
# import sys
import os
import json
import uuid
from tqdm.notebook import tqdm
import datetime 
from openpyxl import Workbook

# path is broken on my machine, so I leave this here for myself :)
# sys.path.append('/Users/veronicasmilga/Desktop/Tübingen/MSE/Project_MSE/')

# from db.DocumentEntry import DocumentEntry
# from db.DocumentRepository import DocumentRepository
from data_retrieval.Crawler import Crawler

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package stopwords to /Users/veronicasmilga/Des
[nltk_data]     ktop/Tübingen/MSE/Project_MSE/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/veronicasmilga/Desktop
[nltk_data]     /Tübingen/MSE/Project_MSE/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# The frontier now lives in a separate JSON file one directory above the notebook.
# The encoding is passed explicitly: without it `open` uses the platform's locale
# encoding, which can garble or reject non-ASCII content on some machines.
with open("../frontier.json", "r", encoding="utf-8") as file:
    frontier = json.load(file)

NB: for the database to function properly, please first go to `exp/001_Flat_db_example_connection.ipynb` and complete the steps described there. If you don't want to save documents to the database, just comment out the code after the _"# save one crawled page to database"_ comment.

Right now I am only saving to Excel files, but we can put the saved info from the Excel table to the database at any point.

In [5]:
# # initialising the database
# documentRepository = DocumentRepository()

# Column names of the Excel sheet; they become its first row.
headers = [
    "id",
    "url",
    "title",
    "headings",
    "raw_html",
    "page_text",
    "keywords",
    "accessed_timestamp",
    "internal_links",
    "external_links",
]

# Fresh workbook used as the Excel backup (if sth goes wrong with the database).
wb = Workbook()
ws = wb.active
ws.title = "Crawled Data"
ws.append(headers)

Start crawling from this cell if you have no checkpoint information and want to start from scratch.

NB: I silenced the logs by default, so now we only see error output from exceptions. To turn detailed logs back on for debugging, please initialise the Crawler with `verbose=True`.

In [None]:
now = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
# NB: this datetime will be used in the name of your Excel crawled_data_backup_file
# and json current_state_backup_file
print(f"START DATETIME: {now}")

# All files written by this cell go to data/ (relative to the current working
# directory). Create it up front, otherwise a missing folder makes every save fail.
os.makedirs("data", exist_ok=True)

max_pages = 10000
# only applies to websites fully about Tuebingen, i.e. ["tuebingen_focused_pages"] in frontier
max_steps_per_domain_prioritised = 3000
# applies to websites NOT about Tuebingen with just one or two pages about Tuebingen,
# i.e. ["general_pages"] in frontier and most children links
max_steps_per_domain_general = 5
timeout = 10

# Crawler is an iterator now, to handle info generated on-the-fly and save it immediately
crawler = Crawler(
    frontier,
    max_pages,
    max_steps_per_domain_general,
    max_steps_per_domain_prioritised,
    timeout,
    # uncomment if you want to see all the logs
    # verbose=True
    )

for (
    # crawled info from page
    scraped_webpage_info,
    # this and further - state info to be saved to checkpoint file
    # from which crawler can be initialised later if our crawling process
    # breaks at some point
    to_visit_prioritised, # Tübingen-related sites and their children
    to_visit, # general sites / unknown topic and their children
    visited_domains, # domains that should NOT be visited anymore because of reaching max_steps_per_domain
    visited, # links that were visited already
    domain_steps, # dict of steps made for each domain
    extra_links # links that were extracted but belong to a visited domain and will not be visited on this iteration
    ) in tqdm(crawler, total=max_pages):
    # save one crawled page to excel file
    row = [
        str(uuid.uuid4()),
        scraped_webpage_info["url"],
        scraped_webpage_info["title"],
        str(scraped_webpage_info["headings"]),
        str(scraped_webpage_info["raw_html"]),
        scraped_webpage_info["page_text"],
        str(scraped_webpage_info["keywords"]),
        scraped_webpage_info["accessed_timestamp"],
        str(scraped_webpage_info["internal_links"]),
        str(scraped_webpage_info["external_links"])
    ]
    try:
        ws.append(row)
        # the whole workbook is re-saved after every page so that a crash loses at most one page
        wb.save(f"./data/crawled_data_backup_{now}.xlsx")
    except Exception as excel_error:
        # if something went wrong with Excel, try to save to json instead to preserve info
        print(f"Faced error {excel_error} while trying to save page info to Excel. Saving to backup json file instead.")
        try:
            backup_json_filename = f"data/crawled_data_backup_{now}.json"
            if os.path.exists(backup_json_filename):
                with open(backup_json_filename, "r") as file:
                    backup_file_content = json.load(file)
            else:
                backup_file_content = []
            backup_file_content.append(scraped_webpage_info)

            # The page info holds values json cannot encode natively (the raw HTML
            # shows up as a bytes repr b'...' in the saved table), so anything
            # unknown is stored as its str() form, like in the Excel row above.
            # Serialise BEFORE opening the file for writing: a failure while
            # encoding must not truncate the backup collected so far.
            serialised_backup = json.dumps(backup_file_content, indent=4, default=str)
            with open(backup_json_filename, "w") as f:
                f.write(serialised_backup)
        except Exception as json_error:
            # best effort: report why the fallback failed too and carry on crawling
            print(
                f"All attempts to save data failed. Skipping webpage {scraped_webpage_info['url']}. "
                f"Backup json error: {json_error}"
            )

    # # save one crawled page to database
    # document = DocumentEntry(
    #     url=scraped_webpage_info["url"],
    #     title=scraped_webpage_info["title"],
    #     headings=scraped_webpage_info["headings"],
    #     page_text=scraped_webpage_info["page_text"],
    #     keywords=scraped_webpage_info["keywords"],
    #     accessed_timestamp=scraped_webpage_info["accessed_timestamp"],
    #     internal_links=scraped_webpage_info["internal_links"],
    #     external_links=scraped_webpage_info["external_links"],
    #     id=uuid.uuid4()
    #     )
    # documentRepository.saveDocument(document)

    # save crawling state info on every step
    # later crawler can be initialised from this file
    crawling_state = {
        "to_visit_prioritised": list(to_visit_prioritised),
        "to_visit": list(to_visit),
        "visited_domains": list(visited_domains),
        "visited": list(visited),
        "domain_steps": domain_steps,
        "extra_links": extra_links
    }

    json_filename = f"data/current_state_backup_file_{now}.json"
    # Write to a temporary file and swap it in afterwards, so that a crash in the
    # middle of writing cannot leave a truncated (unusable) checkpoint behind.
    tmp_json_filename = f"{json_filename}.tmp"
    with open(tmp_json_filename, "w") as f:
        json.dump(crawling_state, f, indent=4)
    os.replace(tmp_json_filename, json_filename)

    print(f"Saved checkpoint info to {json_filename}.")

Start crawling from this cell if you have checkpoint information and want to initialise your crawler from a given state (so that links you have already crawled are not crawled again and the info about the to_visit queue, visited_domains list, etc. is preserved). NB: this time your crawled data will be saved to a new Excel file, so you will have to merge the old one and the new one manually.

NB: right now the state of the crawler (according to what I have crawled already) can be found in `exp/data/current_state_backup_file_2024-07-08_09-57-39.json`.

In [None]:
from collections import deque

# fresh workbook: pages crawled in this run are saved to a new Excel file
wb = Workbook()
ws = wb.active
ws.title = "Crawled Data"
headers = ["id", "url", "title", "headings", "raw_html", "page_text", "keywords", "accessed_timestamp", "internal_links", "external_links"]
ws.append(headers)

now = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
print(f"START DATETIME: {now}")

# All files written by this cell go to data/ (relative to the current working
# directory). Create it up front, otherwise a missing folder makes every save fail.
os.makedirs("data", exist_ok=True)

# provide the path to the checkpoint file here !!!
checkpoint_path = "data/current_state_backup_file_2024-07-08_09-57-39.json"
with open(checkpoint_path, "r", encoding="utf-8") as f:
    crawling_state = json.load(f)

max_pages = 10000
max_steps_per_domain_prioritised = 3000 # only applies to websites fully about Tuebingen
# NOTE(review): the from-scratch cell uses 5 here - confirm that 3 is intended when resuming
max_steps_per_domain_general = 3 # only applies to websites NOT about Tuebingen
timeout = 10

to_visit_list = crawling_state["to_visit"]
to_visit_prioritised_list = crawling_state["to_visit_prioritised"]
visited_list = crawling_state["visited"]
visited_domains_list = crawling_state["visited_domains"]
domain_steps = crawling_state["domain_steps"]
extra_links = crawling_state["extra_links"]

# the checkpoint stores plain lists; turn them back into deques / sets before
# handing them to the Crawler
to_visit = deque(to_visit_list)
to_visit_prioritised = deque(to_visit_prioritised_list)
visited = set(visited_list)
visited_domains = set(visited_domains_list)

crawler_1 = Crawler(
    frontier,
    max_pages,
    max_steps_per_domain_general,
    max_steps_per_domain_prioritised,
    timeout,
    visited=visited,
    to_visit=to_visit,
    to_visit_prioritised=to_visit_prioritised,
    visited_domains=visited_domains,
    domain_steps=domain_steps,
    extra_links=extra_links,
    # uncomment if you want to see all the logs
    # verbose=True
    )

# Crawler is an iterator now, to handle info generated on-the-fly and save it immediately
for (
    scraped_webpage_info,
    to_visit_prioritised,
    to_visit,
    visited_domains,
    visited,
    domain_steps,
    extra_links
    ) in tqdm(crawler_1, total=max_pages):
    # save one crawled page to excel file
    row = [
        str(uuid.uuid4()),
        scraped_webpage_info["url"],
        scraped_webpage_info["title"],
        str(scraped_webpage_info["headings"]),
        str(scraped_webpage_info["raw_html"]),
        scraped_webpage_info["page_text"],
        str(scraped_webpage_info["keywords"]),
        scraped_webpage_info["accessed_timestamp"],
        str(scraped_webpage_info["internal_links"]),
        str(scraped_webpage_info["external_links"])
    ]
    try:
        ws.append(row)
        # the whole workbook is re-saved after every page so that a crash loses at most one page
        wb.save(f"./data/crawled_data_backup_{now}.xlsx")
    except Exception as excel_error:
        # if something went wrong with Excel, try to save to json instead to preserve info
        print(f"Faced error {excel_error} while trying to save page info to Excel. Saving to backup json file instead.")
        try:
            backup_json_filename = f"data/crawled_data_backup_{now}.json"
            if os.path.exists(backup_json_filename):
                with open(backup_json_filename, "r") as file:
                    backup_file_content = json.load(file)
            else:
                backup_file_content = []
            backup_file_content.append(scraped_webpage_info)

            # The page info holds values json cannot encode natively (the raw HTML
            # shows up as a bytes repr b'...' in the saved table), so anything
            # unknown is stored as its str() form, like in the Excel row above.
            # Serialise BEFORE opening the file for writing: a failure while
            # encoding must not truncate the backup collected so far.
            serialised_backup = json.dumps(backup_file_content, indent=4, default=str)
            with open(backup_json_filename, "w") as f:
                f.write(serialised_backup)
        except Exception as json_error:
            # best effort: report why the fallback failed too and carry on crawling
            print(
                f"All attempts to save data failed. Skipping webpage {scraped_webpage_info['url']}. "
                f"Backup json error: {json_error}"
            )

    # # save one crawled page to database
    # document = DocumentEntry(
    #     url=scraped_webpage_info["url"],
    #     title=scraped_webpage_info["title"],
    #     headings=scraped_webpage_info["headings"],
    #     page_text=scraped_webpage_info["page_text"],
    #     keywords=scraped_webpage_info["keywords"],
    #     accessed_timestamp=scraped_webpage_info["accessed_timestamp"],
    #     internal_links=scraped_webpage_info["internal_links"],
    #     external_links=scraped_webpage_info["external_links"],
    #     id=uuid.uuid4()
    #     )
    # documentRepository.saveDocument(document)

    # save crawling state info on every step
    # later crawler can be initialised from this file
    crawling_state = {
        "to_visit_prioritised": list(to_visit_prioritised),
        "to_visit": list(to_visit),
        "visited_domains": list(visited_domains),
        "visited": list(visited),
        "domain_steps": domain_steps,
        "extra_links": extra_links
    }

    json_filename = f"data/current_state_backup_file_{now}.json"
    # Write to a temporary file and swap it in afterwards, so that a crash in the
    # middle of writing cannot leave a truncated (unusable) checkpoint behind.
    tmp_json_filename = f"{json_filename}.tmp"
    with open(tmp_json_filename, "w") as f:
        json.dump(crawling_state, f, indent=4)
    os.replace(tmp_json_filename, json_filename)


If you want to access documents in the database:

In [None]:
# allDocuments = documentRepository.loadAllDocuments()

Otherwise, if you just want to take a look at the data, go to `exp/data` and find the Excel file with timestamp corresponding to the time you ran the Crawler :)

Right now, the most recent file is `data/crawled_data_backup_2024-07-08_09-57-39_CONCAT.xlsx`.

In [11]:
import pandas as pd

# Read the Excel file with the crawled data; its first column is used as the index.
crawled_data_file = 'data/crawled_data_backup_2024-07-08_09-57-39_CONCAT.xlsx'
df = pd.read_excel(crawled_data_file, index_col=0)

In [13]:
# preview the first ten rows of the crawled data
df.head(n=10)

Unnamed: 0_level_0,url,title,headings,raw_html,page_text,keywords,accessed_timestamp,internal_links,external_links
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
59971bc0-c414-428d-828c-249487b49a27,https://uni-tuebingen.de/en/,Home | University of Tübingen,['Studying at the University of Tübingen: Degr...,"b'<!DOCTYPE html>\n<html dir=""ltr"" lang=""en-GB...",Home | University of Tübingen Skip to main nav...,"['podcasts', 'fördermöglichkeiten', 'professor...",2024-07-08 04:01:29.810,"['https://uni-tuebingen.de/en/uni-a-z/', 'http...",['https://alma.uni-tuebingen.de/alma/pages/cs/...
96b9d348-0d2f-4b32-b0ea-50eefc613512,https://www.tuebingen.mpg.de/en,Max-Planck-Campus Tübingen | Max Planck Campus...,['Cutting-Edge Research at the Frontiers of Kn...,"b'<!DOCTYPE html>\n<html prefix=""og: http://og...",Max-Planck-Campus Tübingen | Max Planck Campus...,"['neuroscience', 'nobel', 'biochemistry', 'sci...",2024-07-08 04:01:32.900,"['https://www.tuebingen.mpg.de/en', 'https://w...","['https://cyber-valley.de/cyber-valley-days', ..."
4633587a-9ce4-44b0-96d3-4e6fadedf75e,https://www.tuebingen.de/en/,Welcome to Tübingen - City of Tuebingen,"['Welcome to Tübingen', 'July 8th, 2024', 'Pro...","b'<!DOCTYPE html>\n<html lang=""en""><head><meta...",Welcome to Tübingen - City of Tuebingen Immigr...,"['universitätsstadt', 'july', 'webcam', 'cosmo...",2024-07-08 04:01:34.906,"['https://www.tuebingen.de/en', 'https://www.t...",['http://tuebingen-info.de/index.php?id=727&Fs...
ff838549-bd02-4361-8b33-0389b761d1c9,https://tuebingenresearchcampus.com/,Home | Tübingen Research Campus,"['Research needs a future', 'Excellence Strate...","b'<!DOCTYPE html>\n<html lang=""en-US"" xml:lang...",Home | Tübingen Research Campus info@tuebingen...,"['scientists', 'leibniz', 'universities', 'can...",2024-07-08 04:01:38.323,"['https://tuebingenresearchcampus.com/', 'http...",['https://www.facebook.com/sharer.php?u=https%...
5ebe1b04-f797-4ba7-9a90-a8c3cf75994e,https://www.hih-tuebingen.de/en/,Home : Hertie-Institut für klinische Hirnforsc...,"['Home', ""Laying the foundations for tomorrow'...","b'<!DOCTYPE html>\n<html lang=""en"">\n<head>\n\...",Home : Hertie-Institut für klinische Hirnforsc...,"['genomics', 'oncology', 'cancer', 'neuroimmun...",2024-07-08 04:01:41.819,"['https://www.hih-tuebingen.de/', 'https://www...","['https://x.com/HIHTuebingen', 'https://de.lin..."
9570c667-5ba9-48ac-ab9e-9f3511fd304b,https://www.bccn-tuebingen.de/,Home - BCCN Tübingen,['Bernstein Center for Computational Neuroscie...,"b'<!DOCTYPE html>\n<html lang=""en"">\n<head pre...","Home - BCCN Tübingen Research CRC 1233 ""Robust...","['neuroscience', 'scientists', 'neurobiology',...",2024-07-08 04:01:44.298,"['https://www.bccn-tuebingen.de/', 'https://ww...","['http://www.tuebingen.mpg.de/', 'http://www.u..."
9341b185-a0c7-4d2f-99d9-76fe22c622f4,https://www.medizin.uni-tuebingen.de/en-de/,Startseite | Universitätsklinikum Tübingen,['University Hospital and\nFaculty of Medicine...,"b'<!doctype html>\n<html class=""no-js"" lang=""d...",Startseite | Universitätsklinikum Tübingen Sea...,"['datenschutzerklärung', 'hospitalization', 'd...",2024-07-08 04:01:48.447,"['https://www.medizin.uni-tuebingen.de/en-de',...","['https://twitter.com/uktuebingen', 'https://w..."
16a034ad-5b7c-4379-887a-2e9336037a50,https://www.my-stuwe.de/en/,Homepage - Studierendenwerk Tübingen-Hohenheim,"['News', 'Relocation coming soon', 'Rent adjus...","b'<!DOCTYPE html>\n<html lang=""en-US"" prefix=""...",Homepage - Studierendenwerk Tübingen-Hohenheim...,"['podcast', 'gastronomy', 'homepage', 'cafeter...",2024-07-08 04:01:50.939,"['https://www.my-stuwe.de/en/refectory/', 'htt...","['https://www.my-stuwe.de/en/opening-hours/', ..."
bda31f57-a36e-4084-88b9-9ac66a59d554,https://kunsthalle-tuebingen.de/en/,Kunsthalle Tübingen,"['Kunsthalle is closed today.', 'ART TREASURES...","b'<!DOCTYPE html>\r\n<html lang=""en-US"" class=...",Kunsthalle Tübingen Kunsthalle Tübingen Kunsth...,"['wednesday', 'saturday', 'kunstvermittlerin',...",2024-07-08 04:01:56.045,"['https://kunsthalle-tuebingen.de/', 'https://...","['https://kunsthalle-tuebingen.de/en/', 'https..."
7f2fb287-7ae9-4c67-9dd0-67c32341a146,https://www.neurochirurgie-tuebingen.de/en/,Welcome / Uniklinik Tübingen Neurochirurgie,"['Hinweis!', 'Willkomen bei der Neurochirurgie...","b'\n\n<!DOCTYPE html>\n\n<html lang=""de"">\n<!-...",Welcome / Uniklinik Tübingen Neurochirurgie Hi...,"['neurosurgery', 'tübingen', 'clinic', 'neuroc...",2024-07-08 04:01:58.375,['https://www.neurochirurgie-tuebingen.de/impr...,"['http://www.enable-javascript.com/de/', 'http..."
