From 661fdb525986ab5d4a6a5cd06b4c3fccee27553a Mon Sep 17 00:00:00 2001
From: mattnowzari
Date: Tue, 18 Feb 2025 14:22:38 -0500
Subject: [PATCH 1/6] initial commit for elastic crawler migration notebook

---
 migration/crawler_migration.ipynb | 195 ++++++++++++++++++++++++++++++
 1 file changed, 195 insertions(+)
 create mode 100644 migration/crawler_migration.ipynb

diff --git a/migration/crawler_migration.ipynb b/migration/crawler_migration.ipynb
new file mode 100644
index 00000000..94793c3e
--- /dev/null
+++ b/migration/crawler_migration.ipynb
@@ -0,0 +1,195 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "89b4646f-6a71-44e0-97b9-846319bf0162",
+ "metadata": {},
+ "source": [
+ "## Hello, future Elastic Open Crawler user!\n",
+ "This notebook is designed to help you painlessly migrate your Elastic Crawler configurations to Open Crawler-friendly YAML!\n",
+ "\n",
+ "We recommend running each cell individually in a sequential fashion, as each cell is dependent on previous cells having been run."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f8b41584-1cce-440e-b3af-e8ae0cb1312c",
+ "metadata": {},
+ "source": [
+ "_If you are running this notebook inside Google Colab, or have not installed elasticsearch in your local environment yet, please run the following cell to make sure the Python `elasticsearch` client is installed._"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8bc65371-58ea-4be9-a319-2f7ed9713145",
+ "metadata": {},
+ "source": [
+ "### Setup\n",
+ "First, let's start by making sure `elasticsearch` and other required dependencies are installed and imported by running the following cell:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "da411d2f-9aff-46af-845a-5fe9be19ea3c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Requirement already satisfied: elasticsearch in /Users/mattnowzari/repos/python/mn_venv/lib/python3.12/site-packages (8.17.1)\n",
+ "Requirement already satisfied: elastic-transport<9,>=8.15.1 in /Users/mattnowzari/repos/python/mn_venv/lib/python3.12/site-packages (from elasticsearch) (8.17.0)\n",
+ "Requirement already satisfied: urllib3<3,>=1.26.2 in /Users/mattnowzari/repos/python/mn_venv/lib/python3.12/site-packages (from elastic-transport<9,>=8.15.1->elasticsearch) (2.3.0)\n",
+ "Requirement already satisfied: certifi in /Users/mattnowzari/repos/python/mn_venv/lib/python3.12/site-packages (from elastic-transport<9,>=8.15.1->elasticsearch) (2024.12.14)\n"
+ ]
+ }
+ ],
+ "source": [
+ "!pip install elasticsearch\n",
+ "\n",
+ "from getpass import getpass\n",
+ "from elasticsearch import Elasticsearch\n",
+ "import json"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f4131f88-9895-4c0e-8b0a-6ec7b3b45653",
+ "metadata": {},
+ "source": [
+ "We are going to need a few things from your Elasticsearch deployment before we can migrate your configurations:\n",
+ "- Your **Elasticsearch Cloud ID**\n",
+ "- An **API key**\n",
+ "\n",
+ "To find the Cloud ID for your deployment, go to https://cloud.elastic.co/deployments and select your deployment.\n",
+ "You can create a new API key from the Stack Management -> API keys menu in Kibana. Be sure to copy or write down your key and keep it somewhere safe; it is displayed only once, at creation time.\n",
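+ "\n",
+ "If you would rather mint the API key programmatically, the sketch below is one way to do it with the Python client. It assumes you already have a client authenticated some other way (for example, with a username and password); the key name and expiration are illustrative placeholders, not requirements:\n",
+ "\n",
+ "```python\n",
+ "# Hedged sketch: create an API key using an already-authenticated client.\n",
+ "from elasticsearch import Elasticsearch\n",
+ "\n",
+ "bootstrap_client = Elasticsearch(\n",
+ "    cloud_id='<your-cloud-id>',\n",
+ "    basic_auth=('elastic', '<your-password>'),  # any user allowed to manage API keys\n",
+ ")\n",
+ "resp = bootstrap_client.security.create_api_key(\n",
+ "    name='crawler-migration-key',  # illustrative name\n",
+ "    expiration='7d',  # illustrative lifetime\n",
+ ")\n",
+ "print(resp['encoded'])  # this is the value to paste at the prompt below\n",
+ "```"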
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "08e6e3d2-62d3-4890-a6be-41fe0a931ef6", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Elastic Cloud ID: ········\n", + "Elastic Api Key: ········\n" + ] + } + ], + "source": [ + "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", + "API_KEY = getpass(\"Elastic Api Key: \")" + ] + }, + { + "cell_type": "markdown", + "id": "4993104f-ebb6-4715-b758-1fa262a224f3", + "metadata": {}, + "source": [ + "Great! Now let's try connecting to your Elasticsearch instance." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f3ada2cb-b00f-4b1d-be09-57b2ccf25c7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'You Know, for Search'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "es_client = Elasticsearch(\n", + " cloud_id=ELASTIC_CLOUD_ID,\n", + " api_key=API_KEY,\n", + ")\n", + "\n", + "# ping ES to make sure we have positive connection\n", + "es_client.info()['tagline']" + ] + }, + { + "cell_type": "markdown", + "id": "85f99942-58ae-437d-a72b-70b8d1f4432c", + "metadata": {}, + "source": [ + "Hopefully you received our tagline 'You Know, for Search'. If so, we are connected and ready to go!\n", + "\n", + "If not, please double-check your Cloud ID and API key that you provided above. " + ] + }, + { + "cell_type": "markdown", + "id": "a55236e7-19dc-4f4c-92b9-d10848dd6af9", + "metadata": {}, + "source": [ + "#### Step 1: Grabbing basic configurations\n", + "\n", + "The first order of business is to establish what Crawlers you have, and their basic configuration details.\n", + "This migration notebook will attempt to pull configurations for every distinct Crawler you have in your Elasticsearch instance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0a698b05-e939-42a5-aa31-51b1b1883e6f", + "metadata": {}, + "outputs": [], + "source": [ + "# define an intermediate data structure\n", + "inflight_configuration_data = {}\n", + "\n", + "crawler_configurations = es_client.search(\n", + " index=\".ent-search-actastic-crawler2_extraction_rules\",\n", + ")\n", + "\n", + "for configuration in crawler_configurations[\"hits\"][\"hits\"]:\n", + " source = configuration['_source']\n", + " conf_map = {} # this will be the entire config hashmap for a single Crawler\n", + " output_index = configuration[\"_index\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ee89b6a-00fe-4048-a6d6-a90fdbaaceed", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 4eaa6e2ba077b69be4071e22beb0fb31639736f1 Mon Sep 17 00:00:00 2001 From: mattnowzari Date: Fri, 21 Feb 2025 09:48:35 -0500 Subject: [PATCH 2/6] Working notebook, WIP --- .../crawler_migration-checkpoint.ipynb | 746 ++++++++++++++++++ ...ler_migration_exploration-checkpoint.ipynb | 466 +++++++++++ migration/crawler_migration.ipynb | 583 +++++++++++++- migration/crawler_migration_exploration.ipynb | 564 +++++++++++++ 4 files changed, 2343 insertions(+), 16 deletions(-) create mode 100644 migration/.ipynb_checkpoints/crawler_migration-checkpoint.ipynb create mode 100644 migration/.ipynb_checkpoints/crawler_migration_exploration-checkpoint.ipynb create mode 100644 migration/crawler_migration_exploration.ipynb diff --git a/migration/.ipynb_checkpoints/crawler_migration-checkpoint.ipynb b/migration/.ipynb_checkpoints/crawler_migration-checkpoint.ipynb new file mode 100644 index 00000000..b74898d7 --- /dev/null +++ b/migration/.ipynb_checkpoints/crawler_migration-checkpoint.ipynb @@ -0,0 +1,746 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "89b4646f-6a71-44e0-97b9-846319bf0162", + "metadata": {}, + "source": [ + "## Hello, future Elastic Open Crawler user!\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)]()\n", + "\n", + "This notebook is designed to help you migrate your Elastic Crawler configurations to Open Crawler-friendly YAML!\n", + "\n", + "We recommend running each cell individually in a sequential fashion, as each cell is dependent on previous cells having been run." 
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f8b41584-1cce-440e-b3af-e8ae0cb1312c",
+ "metadata": {},
+ "source": [
+ "_If you are running this notebook inside Google Colab, or have not installed elasticsearch in your local environment yet, please run the following cell to make sure the Python `elasticsearch` client is installed._"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8bc65371-58ea-4be9-a319-2f7ed9713145",
+ "metadata": {},
+ "source": [
+ "### Setup\n",
+ "First, let's start by making sure `elasticsearch` and other required dependencies are installed and imported by running the following cell:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 510,
+ "id": "da411d2f-9aff-46af-845a-5fe9be19ea3c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Requirement already satisfied: elasticsearch in /Users/mattnowzari/repos/python/mn_venv/lib/python3.12/site-packages (8.17.1)\n",
+ "Requirement already satisfied: elastic-transport<9,>=8.15.1 in /Users/mattnowzari/repos/python/mn_venv/lib/python3.12/site-packages (from elasticsearch) (8.17.0)\n",
+ "Requirement already satisfied: urllib3<3,>=1.26.2 in /Users/mattnowzari/repos/python/mn_venv/lib/python3.12/site-packages (from elastic-transport<9,>=8.15.1->elasticsearch) (2.3.0)\n",
+ "Requirement already satisfied: certifi in /Users/mattnowzari/repos/python/mn_venv/lib/python3.12/site-packages (from elastic-transport<9,>=8.15.1->elasticsearch) (2024.12.14)\n"
+ ]
+ }
+ ],
+ "source": [
+ "!pip install elasticsearch\n",
+ "\n",
+ "from getpass import getpass\n",
+ "from elasticsearch import Elasticsearch\n",
+ "\n",
+ "import os\n",
+ "import json\n",
+ "import yaml\n",
+ "import pprint\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f4131f88-9895-4c0e-8b0a-6ec7b3b45653",
+ "metadata": {},
+ "source": [
+ "We are going to need a few things from your Elasticsearch deployment before we can migrate your configurations:\n",
+ "- Your **Elasticsearch Cloud ID**\n",
+ "- An **API key**\n",
+ "\n",
+ "To find the Cloud ID for your deployment, go to https://cloud.elastic.co/deployments and select your deployment.\n",
+ "You can create a new API key from the Stack Management -> API keys menu in Kibana. Be sure to copy or write down your key and keep it somewhere safe; it is displayed only once, at creation time."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 511,
+ "id": "08e6e3d2-62d3-4890-a6be-41fe0a931ef6",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdin",
+ "output_type": "stream",
+ "text": [
+ "Elastic Cloud ID: ········\n",
+ "Elastic Api Key: ········\n"
+ ]
+ }
+ ],
+ "source": [
+ "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n",
+ "API_KEY = getpass(\"Elastic Api Key: \")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4993104f-ebb6-4715-b758-1fa262a224f3",
+ "metadata": {},
+ "source": [
+ "Great! Now let's try connecting to your Elasticsearch instance.\n",
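+ "\n",
+ "The next cell takes the happy path. If the connection fails, a slightly more defensive variant along these lines (using the exception types the Python client raises; the wording of the messages is ours, not the library's) can make the failure mode clearer:\n",
+ "\n",
+ "```python\n",
+ "from elasticsearch import Elasticsearch\n",
+ "from elasticsearch.exceptions import ApiError, AuthenticationException\n",
+ "\n",
+ "try:\n",
+ "    es_client = Elasticsearch(cloud_id=ELASTIC_CLOUD_ID, api_key=API_KEY)\n",
+ "    print(es_client.info()['tagline'])\n",
+ "except AuthenticationException:\n",
+ "    print('Authentication failed - double-check your Cloud ID and API key.')\n",
+ "except ApiError as err:\n",
+ "    print('Elasticsearch returned an error:', err)\n",
+ "```"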
+ ] + }, + { + "cell_type": "code", + "execution_count": 512, + "id": "f3ada2cb-b00f-4b1d-be09-57b2ccf25c7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'You Know, for Search'" + ] + }, + "execution_count": 512, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "es_client = Elasticsearch(\n", + " cloud_id=ELASTIC_CLOUD_ID,\n", + " api_key=API_KEY,\n", + ")\n", + "\n", + "# ping ES to make sure we have positive connection\n", + "es_client.info()['tagline']" + ] + }, + { + "cell_type": "markdown", + "id": "85f99942-58ae-437d-a72b-70b8d1f4432c", + "metadata": {}, + "source": [ + "Hopefully you received our tagline 'You Know, for Search'. If so, we are connected and ready to go!\n", + "\n", + "If not, please double-check your Cloud ID and API key that you provided above. " + ] + }, + { + "cell_type": "markdown", + "id": "a55236e7-19dc-4f4c-92b9-d10848dd6af9", + "metadata": {}, + "source": [ + "### Step 1: Acquire Basic Configurations\n", + "\n", + "The first order of business is to establish what Crawlers you have and their basic configuration details.\n", + "This migration notebook will attempt to pull configurations for every distinct Crawler you have in your Elasticsearch instance." + ] + }, + { + "cell_type": "code", + "execution_count": 669, + "id": "0a698b05-e939-42a5-aa31-51b1b1883e6f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1. search-search-crawler-fully-loaded-8.18\n", + " Crawler ID is 67b74f16204956a3ce9fd0a4\n", + "\n", + "2. search-daggerfall-unity-website-crawler-8.18\n", + " Crawler ID is 67b74f84204956efce9fd0b7\n", + "\n", + "3. search-migration-crawler\n", + " Crawler ID is 67b7509b2049567f859fd0d4\n", + "\n", + "4. search-basic\n", + " Crawler ID is 67b75aeb20495617d59fd0ea\n", + "\n" + ] + } + ], + "source": [ + " # in-memory data structure that maintains current state of the configs we've pulled\n", + "inflight_configuration_data = {}\n", + "\n", + "crawler_configurations = es_client.search(\n", + " index=\".ent-search-actastic-crawler2_configurations_v2\",\n", + ")\n", + "\n", + "crawler_counter = 1\n", + "for configuration in crawler_configurations[\"hits\"][\"hits\"]:\n", + " source = configuration[\"_source\"]\n", + "\n", + " # extract values\n", + " crawler_oid = source[\"id\"]\n", + " output_index = source[\"index_name\"]\n", + "\n", + " print (f\"{crawler_counter}. 
{output_index}\")\n",
+ "    print (f\" Crawler ID is {crawler_oid}\\n\")\n",
+ "    crawler_counter += 1\n",
+ "\n",
+ "    crawl_schedule = [] # either no schedule or a specific schedule - determined in Step 4\n",
+ "    if source[\"use_connector_schedule\"] is False and source[\"crawl_schedule\"]: # an interval schedule is being used\n",
+ "        crawl_schedule = source[\"crawl_schedule\"] # this will be transformed in Step 4\n",
+ "\n",
+ "    # populate a temporary hashmap\n",
+ "    temp_conf_map = {\n",
+ "        \"output_index\": output_index,\n",
+ "        \"schedule\": crawl_schedule\n",
+ "    }\n",
+ "    # pre-populate some necessary fields in preparation for upcoming steps\n",
+ "    temp_conf_map[\"domains_temp\"] = {}\n",
+ "    temp_conf_map[\"output_sink\"] = \"elasticsearch\"\n",
+ "    temp_conf_map[\"full_html_extraction_enabled\"] = False\n",
+ "    temp_conf_map[\"elasticsearch\"] = {\n",
+ "        \"host\": \"\",\n",
+ "        \"port\": \"\",\n",
+ "        \"api_key\": \"\",\n",
+ "        # \"username\": \"\",\n",
+ "        # \"password\": \"\",\n",
+ "    }\n",
+ "    # populate the in-memory data structure\n",
+ "    inflight_configuration_data[crawler_oid] = temp_conf_map\n",
+ "\n",
+ "# pprint.pprint(inflight_configuration_data) # REMOVE BEFORE FLIGHT"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "34f5e024-688c-4ffb-a16f-35f5171ba7a8",
+ "metadata": {},
+ "source": [
+ "**Before continuing, please verify in the output above that the correct number of Crawlers was found!**\n",
+ "\n",
+ "Now that we have some basic data about your Crawlers, let's use this information to get more configuration values!"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2b9e2da7-853c-40bd-9ee1-02c4d92b3b43",
+ "metadata": {},
+ "source": [
+ "### Step 2: URLs, Sitemaps, and Crawl Rules\n",
+ "\n",
+ "In this cell, we will need to query Elasticsearch for information about each Crawler's domain URLs, seed URLs, sitemaps, and crawling rules."
+ ]
+ },
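+ {
+ "cell_type": "markdown",
+ "id": "3c1a2b4d-5e6f-4a7b-8c9d-0e1f2a3b4c5d",
+ "metadata": {},
+ "source": [
+ "For orientation, here is roughly what one domain document looks like in `.ent-search-actastic-crawler2_domains`, next to the Open Crawler-style entry the next cell derives from it. The values are illustrative (they mirror a domain explored elsewhere in this PR), and the input is trimmed to the fields we actually read:\n",
+ "\n",
+ "```python\n",
+ "# Trimmed example of an Elastic Crawler domain document (illustrative values)\n",
+ "elastic_domain_doc = {\n",
+ "    'configuration_oid': '67a3857117490c2adf48274d',\n",
+ "    'id': '67a3858717490ccc74482755',\n",
+ "    'name': 'https://matt-nowzari.myportfolio.com',\n",
+ "    'seed_urls': [{'url': 'https://matt-nowzari.myportfolio.com/'}],\n",
+ "    'sitemaps': [],\n",
+ "    'crawl_rules': [{'policy': 'deny', 'rule': 'begins', 'pattern': '/The'}],\n",
+ "}\n",
+ "\n",
+ "# ...and the flattened entry the next cell builds from it. Note that the\n",
+ "# Elastic Crawler field 'rule' is renamed to 'type' along the way.\n",
+ "open_crawler_domain = {\n",
+ "    'url': 'https://matt-nowzari.myportfolio.com',\n",
+ "    'seed_urls': ['https://matt-nowzari.myportfolio.com/'],\n",
+ "    'crawl_rules': [{'policy': 'deny', 'type': 'begins', 'pattern': '/The'}],\n",
+ "}\n",
+ "```"
+ ]
+ },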
+ {
+ "cell_type": "code",
+ "execution_count": 670,
+ "id": "e1c64c3d-c8d7-4236-9ed9-c9b1cb5e7972",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1.) Crawler ID 67b74f16204956a3ce9fd0a4\n",
+ " Domain https://www.speedhunters.com found!\n",
+ " Seed URLs found: ['https://www.speedhunters.com/2025/01/the-mystery-of-the-hks-zero-r/', 'https://www.speedhunters.com/2025/02/daniel-arsham-eroded-porsche-911/', 'https://www.speedhunters.com/2025/02/5-plus-7-equals-v12-a-custom-bmw-super-saloon/']\n",
+ " Sitemap URLs found: ['https://www.speedhunters.com/post_tag-sitemap2.xml']\n",
+ "2.) Crawler ID 67b74f84204956efce9fd0b7\n",
+ " Domain https://www.dfworkshop.net found!\n",
+ " Seed URLs found: ['https://www.dfworkshop.net/']\n",
+ " Crawl rules found: [{'policy': 'allow', 'type': 'begins', 'pattern': '/word'}, {'policy': 'deny', 'type': 'contains', 'pattern': 'DOS'}]\n",
+ " Domain https://www.speedhunters.com found!\n",
+ " Seed URLs found: ['https://www.speedhunters.com/']\n",
+ " Crawl rules found: [{'policy': 'deny', 'type': 'begins', 'pattern': '/BMW'}]\n",
+ "3.) Crawler ID 67b7509b2049567f859fd0d4\n",
+ " Domain https://justinjackson.ca found!\n",
+ " Seed URLs found: ['https://justinjackson.ca/']\n",
+ " Domain https://matt-nowzari.myportfolio.com found!\n",
+ " Seed URLs found: ['https://matt-nowzari.myportfolio.com/']\n",
+ " Crawl rules found: [{'policy': 'deny', 'type': 'begins', 'pattern': '/The'}]\n",
+ "4.) Crawler ID 67b75aeb20495617d59fd0ea\n",
+ " Domain https://www.elastic.co found!\n",
+ " Seed URLs found: ['https://www.elastic.co/']\n"
+ ]
+ }
+ ],
+ "source": [
+ "crawler_ids_to_query = inflight_configuration_data.keys()\n",
+ "\n",
+ "crawler_counter = 1\n",
+ "for crawler_oid in crawler_ids_to_query:\n",
+ "    # query ES to get the crawler's domain configurations\n",
+ "    crawler_domains = es_client.search(\n",
+ "        index=\".ent-search-actastic-crawler2_domains\",\n",
+ "        query={\"match\": {\"configuration_oid\": crawler_oid}},\n",
+ "        _source=[\"name\",\n",
+ "            \"configuration_oid\",\n",
+ "            \"id\",\n",
+ "            \"sitemaps\",\n",
+ "            \"crawl_rules\",\n",
+ "            \"seed_urls\",\n",
+ "            \"auth\"]\n",
+ "    )\n",
+ "    print (f\"{crawler_counter}.) Crawler ID {crawler_oid}\")\n",
+ "    crawler_counter += 1\n",
+ "\n",
+ "    # for each domain the Crawler has, grab its config values\n",
+ "    # and update the in-memory data structure\n",
+ "    for domain_info in crawler_domains[\"hits\"][\"hits\"]:\n",
+ "        source = domain_info[\"_source\"]\n",
+ "\n",
+ "        # extract values\n",
+ "        domain_oid = str(source[\"id\"])\n",
+ "        domain_url = source[\"name\"]\n",
+ "        seed_urls = source[\"seed_urls\"]\n",
+ "        sitemap_urls = source[\"sitemaps\"]\n",
+ "        crawl_rules = source[\"crawl_rules\"]\n",
+ "\n",
+ "        print (f\" Domain {domain_url} found!\")\n",
+ "\n",
+ "        # transform seed, sitemap, and crawl rules into arrays\n",
+ "        seed_urls_list = []\n",
+ "        for seed_obj in seed_urls:\n",
+ "            seed_urls_list.append(seed_obj[\"url\"])\n",
+ "\n",
+ "        sitemap_urls_list = []\n",
+ "        for sitemap_obj in sitemap_urls:\n",
+ "            sitemap_urls_list.append(sitemap_obj[\"url\"])\n",
+ "\n",
+ "        crawl_rules_list = []\n",
+ "        for crawl_rules_obj in crawl_rules:\n",
+ "            crawl_rules_list.append({\n",
+ "                \"policy\" : crawl_rules_obj[\"policy\"],\n",
+ "                \"type\": crawl_rules_obj[\"rule\"],\n",
+ "                \"pattern\": crawl_rules_obj[\"pattern\"]\n",
+ "            })\n",
+ "\n",
+ "        # populate a temporary hashmap\n",
+ "        temp_domain_conf = {\"url\": domain_url}\n",
+ "        if seed_urls_list:\n",
+ "            temp_domain_conf[\"seed_urls\"] = seed_urls_list\n",
+ "            print (f\" Seed URLs found: {seed_urls_list}\")\n",
+ "        if sitemap_urls_list:\n",
+ "            temp_domain_conf[\"sitemap_urls\"] = sitemap_urls_list\n",
+ "            print (f\" Sitemap URLs found: {sitemap_urls_list}\")\n",
+ "        if crawl_rules_list:\n",
+ "            temp_domain_conf[\"crawl_rules\"] = crawl_rules_list\n",
+ "            print (f\" Crawl rules found: {crawl_rules_list}\")\n",
+ "\n",
+ "        # populate the in-memory data structure\n",
+ "        inflight_configuration_data[crawler_oid][\"domains_temp\"][domain_oid] = temp_domain_conf\n",
+ "\n",
+ "# pprint.pprint(inflight_configuration_data) # REMOVE BEFORE FLIGHT"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "575c00ac-7c84-465e-83d7-aa51f8e5310d",
+ "metadata": {},
+ "source": [
+ "### Step 3: Extracting the Extraction Rules\n",
+ "\n",
+ "In the following cell, we will be acquiring any extraction rules you may have set in your Elastic Crawlers.\n",
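+ "\n",
+ "To sanity-check the translation, note how the field names map over: `value_type: fixed` becomes `action: set`, `value_type: extracted` becomes `action: extract`, `multiple_objects_handling` becomes `join_as`, and `source_type` becomes `source`. For example (values mirror a sample rule explored elsewhere in this PR):\n",
+ "\n",
+ "```python\n",
+ "# One rule as stored by Elastic Crawler (illustrative, trimmed)...\n",
+ "elastic_rule = {\n",
+ "    'field_name': 'test_field',\n",
+ "    'selector': '/html/body/a/@title',\n",
+ "    'multiple_objects_handling': 'string',\n",
+ "    'source_type': 'html',\n",
+ "    'content_from': {'value_type': 'fixed', 'value': 'some_rando_value'},\n",
+ "}\n",
+ "\n",
+ "# ...and its Open Crawler-style counterpart, as built by the next cell.\n",
+ "open_crawler_rule = {\n",
+ "    'action': 'set',\n",
+ "    'field_name': 'test_field',\n",
+ "    'selector': '/html/body/a/@title',\n",
+ "    'join_as': 'string',\n",
+ "    'value': 'some_rando_value',\n",
+ "    'source': 'html',\n",
+ "}\n",
+ "```"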
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 671,
+ "id": "61a7df7a-72ad-4330-a30c-da319befd55c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "extraction_rules = es_client.search(\n",
+ "    index=\".ent-search-actastic-crawler2_extraction_rules\",\n",
+ "    _source=[\"configuration_oid\", \"domain_oid\", \"rules\", \"url_filters\"]\n",
+ ")\n",
+ "\n",
+ "for exr_rule in extraction_rules[\"hits\"][\"hits\"]:\n",
+ "    source = exr_rule[\"_source\"]\n",
+ "\n",
+ "    config_oid = source[\"configuration_oid\"]\n",
+ "    domain_oid = source[\"domain_oid\"]\n",
+ "\n",
+ "    all_rules = source[\"rules\"]\n",
+ "    all_url_filters = source[\"url_filters\"]\n",
+ "\n",
+ "    # extract url filters\n",
+ "    url_filters = []\n",
+ "    if all_url_filters:\n",
+ "        url_filters = [{\n",
+ "            \"type\": all_url_filters[0][\"filter\"],\n",
+ "            \"pattern\": all_url_filters[0][\"pattern\"],\n",
+ "        }]\n",
+ "\n",
+ "    # extract rulesets\n",
+ "    action_translation_map = {\n",
+ "        \"fixed\": \"set\",\n",
+ "        \"extracted\": \"extract\",\n",
+ "    }\n",
+ "\n",
+ "    ruleset = {}\n",
+ "    if all_rules:\n",
+ "        ruleset = [{\n",
+ "            \"action\": action_translation_map[all_rules[0][\"content_from\"][\"value_type\"]],\n",
+ "            \"field_name\": all_rules[0][\"field_name\"],\n",
+ "            \"selector\": all_rules[0][\"selector\"],\n",
+ "            \"join_as\": all_rules[0][\"multiple_objects_handling\"],\n",
+ "            \"value\": all_rules[0][\"content_from\"][\"value\"],\n",
+ "            \"source\": all_rules[0][\"source_type\"],\n",
+ "        }]\n",
+ "\n",
+ "    # populate the in-memory data structure\n",
+ "    temp_extraction_rulesets = [{\n",
+ "        \"url_filters\": url_filters,\n",
+ "        \"rules\": ruleset,\n",
+ "    }]\n",
+ "    inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid][\"extraction_rulesets\"] = temp_extraction_rulesets\n",
+ "\n",
+ "# pprint.pprint(inflight_configuration_data) # REMOVE BEFORE FLIGHT"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "538fb054-1399-4b88-bd1e-fef116491421",
+ "metadata": {},
+ "source": [
+ "### Step 4: Schedules\n",
+ "\n",
+ "In the upcoming cell, we will be gathering any schedules your Crawlers have set."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 672,
+ "id": "d880e081-f960-41c7-921e-26896f248eab",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def generate_cron_expression(interval_values: dict) -> str:\n",
+ "    return interval_values # TODO TODO this ** might not be needed? **\n",
+ "\n",
+ "# ---------------------------\n",
+ "\n",
+ "for crawler_oid, crawler_config in inflight_configuration_data.items():\n",
+ "    output_index = crawler_config[\"output_index\"]\n",
+ "\n",
+ "    existing_schedule_value = crawler_config[\"schedule\"]\n",
+ "\n",
+ "    if not existing_schedule_value:\n",
+ "        # query ES to get this Crawler's specific time schedule\n",
+ "        schedules_result = es_client.search(\n",
+ "            index=\".elastic-connectors-v1\",\n",
+ "            query={\"match\": {\"index_name\": output_index}},\n",
+ "            _source=[\"index_name\", \"scheduling\"]\n",
+ "        )\n",
+ "        # update schedule field with cron expression if specific time scheduling is enabled\n",
+ "        if schedules_result[\"hits\"][\"hits\"][0][\"_source\"][\"scheduling\"][\"full\"][\"enabled\"]:\n",
+ "            specific_time_schedule = schedules_result[\"hits\"][\"hits\"][0][\"_source\"][\"scheduling\"][\"full\"][\"interval\"]\n",
+ "            crawler_config[\"schedule\"] = specific_time_schedule\n",
+ "    elif isinstance(existing_schedule_value[0], dict):\n",
+ "        crawler_config[\"schedule\"] = generate_cron_expression(existing_schedule_value)\n",
+ "\n",
+ "# pprint.pprint(inflight_configuration_data) # REMOVE BEFORE FLIGHT "
+ ]
+ },
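+ {
+ "cell_type": "markdown",
+ "id": "9f8e7d6c-5b4a-4c3d-8e2f-1a0b9c8d7e6f",
+ "metadata": {},
+ "source": [
+ "A note on the `generate_cron_expression` stub above: it currently returns its input unchanged, and the TODO suggests it may be dropped. Two details are worth knowing if you do implement it: the value actually passed in is a *list* of interval dicts (e.g. `[{'unit': 'hour', 'frequency': 24}]`), despite the `dict` annotation, and the connector schedules already stored in Elasticsearch use Quartz-style cron strings like `0 30 8 * * ?`. Under the assumption that the same Quartz-style strings are acceptable here, one possible sketch is:\n",
+ "\n",
+ "```python\n",
+ "# Hedged sketch only - this unit-to-cron mapping is an assumption, not a spec.\n",
+ "def generate_cron_expression(interval_values: list) -> str:\n",
+ "    unit = interval_values[0]['unit']\n",
+ "    frequency = interval_values[0]['frequency']\n",
+ "    if unit == 'minute':\n",
+ "        return f'0 */{frequency} * * * ?'\n",
+ "    if unit == 'hour':\n",
+ "        return f'0 0 */{frequency} * * ?'\n",
+ "    if unit == 'day':\n",
+ "        return f'0 0 0 */{frequency} * ?'\n",
+ "    raise ValueError(f'unhandled schedule unit: {unit}')\n",
+ "```"
+ ]
+ },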
+ {
+ "cell_type": "markdown",
+ "id": "b1586df2-283d-435f-9b08-ba9fad3a7e0a",
+ "metadata": {},
+ "source": [
+ "### Step 5: Creating the Open Crawler YAML configuration files\n",
+ "\n",
+ "In this final step, we will be creating the actual YAML files you need to get up and running with Open Crawler!\n",
+ "\n",
+ "The upcoming cell performs some final transformations to the in-memory data structure that is keeping track of your configurations."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 673,
+ "id": "dd70f102-33ee-4106-8861-0aa0f9a223a1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Final transform of the in-memory data structure to a form we can dump to YAML\n",
+ "# for each crawler, collect all of its domain configurations into a list\n",
+ "for crawler_config in inflight_configuration_data.values():\n",
+ "    all_crawler_domains = []\n",
+ "\n",
+ "    for domain_config in crawler_config[\"domains_temp\"].values():\n",
+ "        all_crawler_domains.append(domain_config)\n",
+ "    # create a new key called \"domains\" that points to a list of domain configs only - no domain_oid values as keys\n",
+ "    crawler_config[\"domains\"] = all_crawler_domains\n",
+ "    # delete the temporary domain key\n",
+ "    del crawler_config[\"domains_temp\"]\n",
+ "\n",
+ "# pprint.pprint(inflight_configuration_data) # REMOVE BEFORE FLIGHT "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e611a486-e12f-4951-ab95-ca54241a7a06",
+ "metadata": {},
+ "source": [
+ "#### **Wait! 
Before we continue on to creating our YAML files, we're going to need your input on a few things.**\n",
+ "\n",
+ "In the next cell, please enter the following details about the _Elasticsearch instance you will be using with Open Crawler_:\n",
+ "- The Elasticsearch endpoint URL\n",
+ "- The port number of your Elasticsearch endpoint\n",
+ "- An API key"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 660,
+ "id": "213880cc-cbf3-40d9-8c7d-6fcf6428c16b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdin",
+ "output_type": "stream",
+ "text": [
+ "Elasticsearch endpoint URL: https://4911ebad5ed44d149fe8ddad4a4b3751.us-west2.gcp.elastic-cloud.com\n",
+ "The Elasticsearch endpoint's port number: 443\n",
+ "Elasticsearch API key: ········\n"
+ ]
+ }
+ ],
+ "source": [
+ "ENDPOINT = input(\"Elasticsearch endpoint URL: \")\n",
+ "PORT = input(\"The Elasticsearch endpoint's port number: \")\n",
+ "API_KEY = getpass(\"Elasticsearch API key: \")\n",
+ "\n",
+ "# set the above values in each Crawler's configuration\n",
+ "for crawler_config in inflight_configuration_data.values():\n",
+ "    crawler_config[\"elasticsearch\"][\"host\"] = ENDPOINT\n",
+ "    crawler_config[\"elasticsearch\"][\"port\"] = int(PORT)\n",
+ "    crawler_config[\"elasticsearch\"][\"api_key\"] = API_KEY"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "67dfc7c6-429e-42f0-ab08-2c84d72945cb",
+ "metadata": {},
+ "source": [
+ "#### **This is the final step! You have two options here:**\n",
+ "\n",
+ "- The \"Write to YAML\" cell will create _n_ number of YAML files, one for each Crawler you have.\n",
+ "- The \"Print to output\" cell will print each Crawler's configuration YAML in the Notebook, so you can copy-paste them into your Open Crawler YAML files manually.\n",
+ "\n",
+ "Feel free to run both! You can run Option 2 first to see the output before running Option 1 to save the configs into YAML files.\n",
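+ "\n",
+ "After running Option 1, a quick round-trip check like the sketch below (using only the `yaml` module already imported above) can confirm that each file parses and carries the top-level keys this notebook populates. The set of keys checked is ours, not an official Open Crawler schema:\n",
+ "\n",
+ "```python\n",
+ "# Hedged sanity check: re-load each generated YAML file and look for the\n",
+ "# top-level keys the earlier steps populated.\n",
+ "for crawler_config in inflight_configuration_data.values():\n",
+ "    file_name = crawler_config['output_index'] + '-config.yml'\n",
+ "    with open(file_name) as file:\n",
+ "        loaded = yaml.safe_load(file)\n",
+ "    missing = {'domains', 'output_sink', 'elasticsearch'} - set(loaded)\n",
+ "    print(file_name, '-> OK' if not missing else '-> missing: ' + str(missing))\n",
+ "```"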
+ ] + }, + { + "cell_type": "markdown", + "id": "7ca5ad33-364c-4d13-88fc-db19052363d5", + "metadata": {}, + "source": [ + "#### Option 1: Write to YAML file" + ] + }, + { + "cell_type": "code", + "execution_count": 661, + "id": "6adc53db-d781-4b72-a5f3-441364f354b8", + "metadata": {}, + "outputs": [], + "source": [ + "# Dump each Crawler's configuration into its own YAML file\n", + "for crawler_config in inflight_configuration_data.values():\n", + " base_dir = os.getcwd()\n", + " file_name = f\"{crawler_config['output_index']}-config.yml\" # autogen a custom filename\n", + " output_path = os.path.join(base_dir, file_name)\n", + "\n", + " if os.path.exists(base_dir):\n", + " with open(output_path, 'w') as file:\n", + " yaml.safe_dump(\n", + " crawler_config,\n", + " file,\n", + " sort_keys=False\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "35c56a2b-4acd-47f5-90e3-9dd39fa4383f", + "metadata": {}, + "source": [ + "#### Option 2: Print to output" + ] + }, + { + "cell_type": "code", + "execution_count": 674, + "id": "525aabb8-0537-4ba6-8109-109490dddafe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "YAML config => search-search-crawler-fully-loaded-8.18-config.yml\n", + "--------\n", + "output_index: search-search-crawler-fully-loaded-8.18\n", + "schedule: []\n", + "output_sink: elasticsearch\n", + "full_html_extraction_enabled: false\n", + "elasticsearch:\n", + " host: ''\n", + " port: ''\n", + " api_key: ''\n", + "domains:\n", + "- url: https://www.speedhunters.com\n", + " seed_urls:\n", + " - https://www.speedhunters.com/2025/01/the-mystery-of-the-hks-zero-r/\n", + " - https://www.speedhunters.com/2025/02/daniel-arsham-eroded-porsche-911/\n", + " - https://www.speedhunters.com/2025/02/5-plus-7-equals-v12-a-custom-bmw-super-saloon/\n", + " sitemap_urls:\n", + " - https://www.speedhunters.com/post_tag-sitemap2.xml\n", + "\n", + "--------------------------------------------------------------------------------\n", + "YAML config => search-daggerfall-unity-website-crawler-8.18-config.yml\n", + "--------\n", + "output_index: search-daggerfall-unity-website-crawler-8.18\n", + "schedule: 0 30 8 * * ?\n", + "output_sink: elasticsearch\n", + "full_html_extraction_enabled: false\n", + "elasticsearch:\n", + " host: ''\n", + " port: ''\n", + " api_key: ''\n", + "domains:\n", + "- url: https://www.dfworkshop.net\n", + " seed_urls:\n", + " - https://www.dfworkshop.net/\n", + " crawl_rules:\n", + " - policy: allow\n", + " type: begins\n", + " pattern: /word\n", + " - policy: deny\n", + " type: contains\n", + " pattern: DOS\n", + " extraction_rulesets:\n", + " - url_filters:\n", + " - type: begins\n", + " pattern: /elderscrolls/*\n", + " rules:\n", + " - action: set\n", + " field_name: elder_field\n", + " selector: /elderscrolls/*\n", + " join_as: string\n", + " value: ping\n", + " source: url\n", + "- url: https://www.speedhunters.com\n", + " seed_urls:\n", + " - https://www.speedhunters.com/\n", + " crawl_rules:\n", + " - policy: deny\n", + " type: begins\n", + " pattern: /BMW\n", + "\n", + "--------------------------------------------------------------------------------\n", + "YAML config => search-migration-crawler-config.yml\n", + "--------\n", + "output_index: search-migration-crawler\n", + "schedule: []\n", + "output_sink: elasticsearch\n", + "full_html_extraction_enabled: false\n", + "elasticsearch:\n", + " host: ''\n", + " port: ''\n", + " api_key: ''\n", + "domains:\n", + "- url: https://justinjackson.ca\n", + " seed_urls:\n", + " 
- https://justinjackson.ca/\n", + "- url: https://matt-nowzari.myportfolio.com\n", + " seed_urls:\n", + " - https://matt-nowzari.myportfolio.com/\n", + " crawl_rules:\n", + " - policy: deny\n", + " type: begins\n", + " pattern: /The\n", + " extraction_rulesets:\n", + " - url_filters: []\n", + " rules:\n", + " - action: set\n", + " field_name: test_field\n", + " selector: /html/body/a/@title\n", + " join_as: string\n", + " value: some_rando_value\n", + " source: html\n", + "\n", + "--------------------------------------------------------------------------------\n", + "YAML config => search-basic-config.yml\n", + "--------\n", + "output_index: search-basic\n", + "schedule:\n", + "- unit: hour\n", + " frequency: 24\n", + "output_sink: elasticsearch\n", + "full_html_extraction_enabled: false\n", + "elasticsearch:\n", + " host: ''\n", + " port: ''\n", + " api_key: ''\n", + "domains:\n", + "- url: https://www.elastic.co\n", + " seed_urls:\n", + " - https://www.elastic.co/\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "for crawler_config in inflight_configuration_data.values():\n", + " yaml_out = yaml.safe_dump(\n", + " crawler_config,\n", + " sort_keys=False\n", + " )\n", + " \n", + " print (f\"YAML config => {crawler_config['output_index']}-config.yml\\n--------\")\n", + " print (yaml_out)\n", + " print (\"--------------------------------------------------------------------------------\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55888204-f823-48cd-bca4-a7663e0fe56a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/migration/.ipynb_checkpoints/crawler_migration_exploration-checkpoint.ipynb b/migration/.ipynb_checkpoints/crawler_migration_exploration-checkpoint.ipynb new file mode 100644 index 00000000..a62478bb --- /dev/null +++ b/migration/.ipynb_checkpoints/crawler_migration_exploration-checkpoint.ipynb @@ -0,0 +1,466 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "4f198cd5-cc9c-4080-8dd4-425628b05d4d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is the start of the notebook\n", + "Hopefully we can learn a thing or two, before doing a thing or two.\n" + ] + } + ], + "source": [ + "print (\"This is the start of the notebook\")\n", + "print (\"Hopefully we can learn a thing or two, before doing a thing or two.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1914cbf9-a18b-4b1d-9ea1-5bc04e23ceff", + "metadata": {}, + "outputs": [], + "source": [ + "from elasticsearch import Elasticsearch\n", + "import json\n", + "\n", + "endpoint = \"https://5a5b8a5cdd84464dae4c7c7ae8a59562.us-east1.gcp.elastic-cloud.com:443\"\n", + "api_key = \"aTN4MUdwVUJLTFFTSmFFWjBlTFM6dmU0ZXJnTjdUaUs5dXhIUU1fd0xiZw==\"\n", + "\n", + "es_client = Elasticsearch(\n", + " endpoint,\n", + " api_key=api_key,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "711bb339-bbc8-4112-a392-dde01f5e5729", + "metadata": {}, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "The index [search-migration_crawler] was found!\n" + ] + } + ], + "source": [ + "index_name = \"search-migration_crawler\"\n", + "if not es_client.indices.exists(index=index_name):\n", + " print (\"Eek! The index does not exist!\")\n", + "else:\n", + " print (f\"The index [{index_name}] was found!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a14009b2-2d34-465e-b43d-a274f01fbff0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Let's see if we can get _all_ indices in this ES instance related to crawler?\n", + "\n", + "We got the list as a JSON dictionary! We received 20 indices.\n", + "\n", + " 4986743 docs -> .ds-logs-elastic_crawler-default-2025.02.05-000001\n", + " 0 docs -> .ent-search-actastic-app_search_crawler_content_metadata\n", + " 0 docs -> .ent-search-actastic-app_search_crawler_content_metadata-content_hash-engine_oid-unique-constraint\n", + " 0 docs -> .ent-search-actastic-app_search_crawler_content_url_metadata\n", + " 3 docs -> .ent-search-actastic-crawler2_configurations_v2\n", + " 3 docs -> .ent-search-actastic-crawler2_configurations_v2-index_name-unique-constraint\n", + " 9385 docs -> .ent-search-actastic-crawler2_content_metadata\n", + " 9385 docs -> .ent-search-actastic-crawler2_content_metadata-configuration_oid-content_hash-unique-constraint\n", + " 9532 docs -> .ent-search-actastic-crawler2_content_url_metadata\n", + " 332 docs -> .ent-search-actastic-crawler2_crawl_requests_v2\n", + " 4 docs -> .ent-search-actastic-crawler2_domains\n", + " 4 docs -> .ent-search-actastic-crawler2_domains-configuration_oid-name-unique-constraint\n", + " 2 docs -> .ent-search-actastic-crawler2_extraction_rules\n", + " 0 docs -> .ent-search-actastic-crawler2_process_crawls\n", + " 651 docs -> .ent-search-actastic-crawler2_robots_txts\n", + " 0 docs -> .ent-search-actastic-crawler_crawl_requests_v7\n", + " 0 docs -> .ent-search-actastic-crawler_domains_v6\n", + " 0 docs -> .ent-search-actastic-crawler_domains_v6-engine_oid-name-unique-constraint\n", + " 0 docs -> .ent-search-actastic-crawler_process_crawls\n", + " 0 docs -> .ent-search-actastic-crawler_robots_txts_v3\n", + "\n", + "There are 20 healthy indices, 0 sick indices and 0 unhealthy indices.\n", + "11 indices have docs, and 9 indices do not.\n" + ] + } + ], + "source": [ + "print (\"Let's see if we can get _all_ indices in this ES instance related to crawler?\\n\")\n", + "\n", + "json_response = es_client.cat.indices(\n", + " index=\".*crawler*\", # take note of the . before *crawler* - this tells the API to query 'hidden' indices as well\n", + " s=\"index\",\n", + " format=\"json\"\n", + ")\n", + "\n", + "print (f\"We got the list as a JSON dictionary! 
We received {len(json_response)} indices.\\n\")\n", + "\n", + "health_histogram = {\n", + " \"green\": 0,\n", + " \"yellow\": 0,\n", + " \"red\": 0,\n", + "}\n", + "\n", + "indices_with_docs = {\n", + " \"with_docs\": 0,\n", + " \"without_docs\": 0,\n", + "}\n", + "\n", + "index_names = [] # save the index names to run through all of them at some point in the future\n", + "\n", + "for item in json_response.body: # Note that calling .body on the response will get us a List of dictionaries:\n", + " health_status = item[\"health\"]\n", + " health_histogram[health_status] += 1\n", + "\n", + " if int(item['docs.count']) > 0:\n", + " indices_with_docs['with_docs'] += 1\n", + " index_names.append(item['index'])\n", + " else:\n", + " indices_with_docs['without_docs'] += 1\n", + "\n", + " print (f\" {item['docs.count']} docs -> {item['index']}\")\n", + "\n", + "print (f\"\\nThere are {health_histogram['green']} healthy indices, {health_histogram['yellow']} sick indices \\\n", + "and {health_histogram['red']} unhealthy indices.\")\n", + "\n", + "print (f\"{indices_with_docs['with_docs']} indices have docs, and {indices_with_docs['without_docs']} \\\n", + "indices do not.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "db96133f-bb1b-4f40-b02f-4812a585964d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Let's grab the extraction rules for our crawler.\n", + "The extraction rules are defined in the index .ent-search-actastic-crawler2_extraction_rules\n", + "\n", + "{'took': 0, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 2, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': '.ent-search-actastic-crawler2_extraction_rules', '_id': '67a393cc17490cc0d24853ef', '_score': 1.0, '_source': {'id': '67a393cc17490cc0d24853ef', 'created_at': '2025-02-05T16:37:32Z', 'updated_at': '2025-02-05T16:37:32Z', 'description': 'extraction_rule_generic', 'domain_oid': '67a3858717490ccc74482755', 'configuration_oid': '67a3857117490c2adf48274d', 'url_filters': [], 'rules': [{'content_from': {'value_type': 'fixed', 'value': '\"some_rando_value\"'}, 'field_name': 'test_field', 'multiple_objects_handling': 'string', 'selector': '/html/body/a/@title', 'source_type': 'html'}], 'edited_by': '1432693181'}}, {'_index': '.ent-search-actastic-crawler2_extraction_rules', '_id': '67adfbd0cf0332091f8af6e2', '_score': 1.0, '_source': {'id': '67adfbd0cf0332091f8af6e2', 'created_at': '2025-02-13T14:04:00Z', 'updated_at': '2025-02-13T14:04:00Z', 'description': 'df_ex_rule', 'domain_oid': '67adf64bcf0332cb308aef02', 'configuration_oid': '67adf643cf03320a318aeefa', 'url_filters': [], 'rules': [{'content_from': {'value_type': 'extracted', 'value': ''}, 'field_name': 'df_xtraction_rule', 'multiple_objects_handling': 'string', 'selector': '/html/body/a/@title', 'source_type': 'html'}], 'edited_by': '1432693181'}}]}}\n", + "\n", + "Index: .ent-search-actastic-crawler2_extraction_rules\n", + "[{'content_from': {'value_type': 'fixed', 'value': '\"some_rando_value\"'}, 'field_name': 'test_field', 'multiple_objects_handling': 'string', 'selector': '/html/body/a/@title', 'source_type': 'html'}]\n", + "Description: extraction_rule_generic\n", + "Configuration OID: 67a3857117490c2adf48274d\n", + "Domain OID: 67a3858717490ccc74482755\n", + "Value type: fixed\n", + "Value: \"some_rando_value\"\n", + "Field name: test_field\n", + "Multiple Objects Handling: string\n", + "Selector: /html/body/a/@title\n", 
+ "Source type: html\n", + "\n", + "Index: .ent-search-actastic-crawler2_extraction_rules\n", + "[{'content_from': {'value_type': 'extracted', 'value': ''}, 'field_name': 'df_xtraction_rule', 'multiple_objects_handling': 'string', 'selector': '/html/body/a/@title', 'source_type': 'html'}]\n", + "Description: df_ex_rule\n", + "Configuration OID: 67adf643cf03320a318aeefa\n", + "Domain OID: 67adf64bcf0332cb308aef02\n", + "Value type: extracted\n", + "Value: \n", + "Field name: df_xtraction_rule\n", + "Multiple Objects Handling: string\n", + "Selector: /html/body/a/@title\n", + "Source type: html\n", + "\n" + ] + } + ], + "source": [ + "print (\"Let's grab the extraction rules for our crawler.\")\n", + "print (\"The extraction rules are defined in the index .ent-search-actastic-crawler2_extraction_rules\\n\")\n", + "\n", + "ex_r = es_client.search(\n", + " index=\".ent-search-actastic-crawler2_extraction_rules\",\n", + " # _source=\"rules\"\n", + ")\n", + "\n", + "print (ex_r)\n", + "print ()\n", + "\n", + "for i in ex_r[\"hits\"][\"hits\"]:\n", + " print (f\"Index: {i['_index']}\")\n", + " rules = i[\"_source\"][\"rules\"]\n", + " print (rules)\n", + " for rule in rules:\n", + " print (f\"Description: {i['_source']['description']}\")\n", + " print (f\"Configuration OID: {i['_source']['configuration_oid']}\") # <-- I wonder if we can use these to\n", + " print (f\"Domain OID: {i['_source']['domain_oid']}\") # <-- match specific crawlers\n", + " \n", + " print (f\"Value type: {rule['content_from']['value_type']}\") # <-- this maps to 'action'\n", + " print (f\"Value: {rule['content_from']['value']}\") # <--\n", + " print (f\"Field name: {rule['field_name']}\") # <--\n", + " print (f\"Multiple Objects Handling: {rule['multiple_objects_handling']}\") # <-- this maps to \"join_as\"\n", + " print (f\"Selector: {rule['selector']}\") # <--\n", + " print (f\"Source type: {rule['source_type']}\\n\") # <--" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "8c0b6c8d-52f5-4325-92ca-8b7976c69abf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Let's try grabbing the configurations docs in .ent-search-actastic-crawler2_configurations_v2\n", + "{'took': 1, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 3, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': '.ent-search-actastic-crawler2_configurations_v2', '_id': '67a3857117490c2adf48274d', '_score': 1.0, '_source': {'id': '67a3857117490c2adf48274d', 'created_at': '2025-02-05T15:36:17Z', 'updated_at': '2025-02-05T16:34:04Z', 'index_name': 'search-migration_crawler', 'crawl_schedule': [{'unit': 'hour', 'frequency': 1}], 'use_connector_schedule': False}}, {'_index': '.ent-search-actastic-crawler2_configurations_v2', '_id': '67adf643cf03320a318aeefa', '_score': 1.0, '_source': {'id': '67adf643cf03320a318aeefa', 'created_at': '2025-02-13T13:40:19Z', 'updated_at': '2025-02-13T14:29:16Z', 'index_name': 'search-daggerfall-unity-website-crawler', 'crawl_schedule': [{'unit': 'hour', 'frequency': 24}], 'use_connector_schedule': True}}, {'_index': '.ent-search-actastic-crawler2_configurations_v2', '_id': '67abc13fcf0332544683f928', '_score': 1.0, '_source': {'id': '67abc13fcf0332544683f928', 'created_at': '2025-02-11T21:29:35Z', 'updated_at': '2025-02-14T16:58:50Z', 'index_name': 'search-crawler-fully-loaded', 'crawl_schedule': None, 'use_connector_schedule': False}}]}}\n", + "\n", + "Inside this index, we can find the 
following values:\n", + "\n", + "Index: .ent-search-actastic-crawler2_configurations_v2\n", + "Configuration ID: 67a3857117490c2adf48274d\n", + "Index name: search-migration_crawler\n", + "Crawl schedule: [{'unit': 'hour', 'frequency': 1}]\n", + "Use crawl schedule?: False\n", + "\n", + "Index: .ent-search-actastic-crawler2_configurations_v2\n", + "Configuration ID: 67adf643cf03320a318aeefa\n", + "Index name: search-daggerfall-unity-website-crawler\n", + "Crawl schedule: [{'unit': 'hour', 'frequency': 24}]\n", + "Use crawl schedule?: True\n", + "\n", + "Index: .ent-search-actastic-crawler2_configurations_v2\n", + "Configuration ID: 67abc13fcf0332544683f928\n", + "Index name: search-crawler-fully-loaded\n", + "Crawl schedule: None\n", + "Use crawl schedule?: False\n", + "\n" + ] + } + ], + "source": [ + "print (\"Let's try grabbing the configurations docs in .ent-search-actastic-crawler2_configurations_v2\")\n", + "\n", + "config_r = es_client.search(\n", + " index=\".ent-search-actastic-crawler2_configurations_v2\",\n", + ")\n", + "\n", + "print (config_r)\n", + "print ()\n", + "\n", + "print (\"Inside this index, we can find the following values:\\n\")\n", + "for i in config_r[\"hits\"][\"hits\"]:\n", + " source = i[\"_source\"]\n", + " print (f\"Index: {i['_index']}\")\n", + " print (f\"Configuration ID: {source['id']}\") # <--\n", + " print (f\"Index name: {source['index_name']}\") # <--\n", + " print (f\"Crawl schedule: {source['crawl_schedule']}\")\n", + " print (f\"Use crawl schedule?: {source['use_connector_schedule']}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "85adaae5-38d1-4410-b38c-3dd827d3170c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".ent-search-actastic-crawler2_domains\n", + "\n", + "{'took': 1, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 4, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': '.ent-search-actastic-crawler2_domains', '_id': '67abc186cf0332829183f93b', '_score': 1.0, '_source': {'id': '67abc186cf0332829183f93b', 'configuration_oid': '67abc13fcf0332544683f928', 'name': 'https://www.speedhunters.com', 'crawl_rules': [], 'seed_urls': [{'created_at': '2025-02-11T21:31:04Z', 'id': '67abc198cf0332d5ae83f93d', 'url': 'https://www.speedhunters.com/2025/01/the-mystery-of-the-hks-zero-r/'}], 'sitemaps': [{'created_at': '2025-02-14T16:58:47Z', 'id': '67af7647cf0332fdd191f64d', 'url': 'https://www.speedhunters.com/post_tag-sitemap2.xml'}], 'auth': 'akh4My83Myt3Y2NkbnJHUll2RFVlZz09LS1wbjU5TEZmNVdxUDZCK0lSa2dwRlhRPT0=--21413c9faa1309012319bdf4132ef5f4d0add9fb'}}, {'_index': '.ent-search-actastic-crawler2_domains', '_id': '67adf64bcf0332cb308aef02', '_score': 1.0, '_source': {'id': '67adf64bcf0332cb308aef02', 'configuration_oid': '67adf643cf03320a318aeefa', 'name': 'https://www.dfworkshop.net', 'crawl_rules': [{'pattern': 'DOS', 'created_at': '2025-02-13T13:43:27Z', 'rule': 'contains', 'id': '67adf6ffcf03320a318aef0a', 'order': 0, 'policy': 'deny'}], 'seed_urls': [{'created_at': '2025-02-13T13:40:26Z', 'id': '67adf64acf0332cb308aef01', 'url': 'https://www.dfworkshop.net/'}], 'sitemaps': [], 'auth': 'V001b0R1a2xCQmsyYloyQUYvTzJJQT09LS1md284bXJjOGhyRllMeGR3RlVpYU13PT0=--34940590d810db53c8dd783f58244db6c74dad20'}}, {'_index': '.ent-search-actastic-crawler2_domains', '_id': '67a3867317490c4bca482807', '_score': 1.0, '_source': {'id': '67a3867317490c4bca482807', 'configuration_oid': '67a3857117490c2adf48274d', 
'name': 'https://justinjackson.ca', 'crawl_rules': [], 'seed_urls': [{'url': 'https://justinjackson.ca/words.html', 'id': '67a3867317490c4bca482806', 'created_at': '2025-02-05T15:40:35Z'}], 'sitemaps': [], 'auth': 'SWpyRVpYdUp0WHlCL1hHOCtXaEhaQT09LS1Ub3ZpV1dpL2FONGpod0ZSeUFYV1RBPT0=--c83fbe57eeffa5429b450de0ffe1fe4aade85e43'}}, {'_index': '.ent-search-actastic-crawler2_domains', '_id': '67a3858717490ccc74482755', '_score': 1.0, '_source': {'id': '67a3858717490ccc74482755', 'configuration_oid': '67a3857117490c2adf48274d', 'name': 'https://matt-nowzari.myportfolio.com', 'crawl_rules': [{'pattern': '/The', 'created_at': '2025-02-05T16:33:59Z', 'rule': 'begins', 'id': '67a392f717490ccb5b4853ec', 'order': 0, 'policy': 'deny'}], 'seed_urls': [{'url': 'https://matt-nowzari.myportfolio.com/', 'id': '67a3858717490ccc74482754', 'created_at': '2025-02-05T15:36:39Z'}], 'sitemaps': [], 'auth': 'U1J4d0xBTW1EN2Ryb05iNXoxRmF1UT09LS1kMDhmMEZWMys0RlJDd2hzWmZMNWx3PT0=--2f4cae86febac77fb8e1fb0133d6520029716a37'}}]}}\n", + "\n", + "Name: https://www.speedhunters.com\n", + "Configuration OID: 67abc13fcf0332544683f928\n", + "Domain ID: 67abc186cf0332829183f93b\n", + "Sitemaps : [{'created_at': '2025-02-14T16:58:47Z', 'id': '67af7647cf0332fdd191f64d', 'url': 'https://www.speedhunters.com/post_tag-sitemap2.xml'}]\n", + "Crawl rules: []\n", + "Seed URLs: [{'created_at': '2025-02-11T21:31:04Z', 'id': '67abc198cf0332d5ae83f93d', 'url': 'https://www.speedhunters.com/2025/01/the-mystery-of-the-hks-zero-r/'}]\n", + "Auth (decode and split on the ':' delimiter): akh4My83Myt3Y2NkbnJHUll2RFVlZz09LS1wbjU5TEZmNVdxUDZCK0lSa2dwRlhRPT0=--21413c9faa1309012319bdf4132ef5f4d0add9fb\n", + "\n", + "Name: https://www.dfworkshop.net\n", + "Configuration OID: 67adf643cf03320a318aeefa\n", + "Domain ID: 67adf64bcf0332cb308aef02\n", + "Sitemaps : []\n", + "Crawl rules: [{'pattern': 'DOS', 'created_at': '2025-02-13T13:43:27Z', 'rule': 'contains', 'id': '67adf6ffcf03320a318aef0a', 'order': 0, 'policy': 'deny'}]\n", + "Seed URLs: [{'created_at': '2025-02-13T13:40:26Z', 'id': '67adf64acf0332cb308aef01', 'url': 'https://www.dfworkshop.net/'}]\n", + "Auth (decode and split on the ':' delimiter): V001b0R1a2xCQmsyYloyQUYvTzJJQT09LS1md284bXJjOGhyRllMeGR3RlVpYU13PT0=--34940590d810db53c8dd783f58244db6c74dad20\n", + "\n", + "Name: https://justinjackson.ca\n", + "Configuration OID: 67a3857117490c2adf48274d\n", + "Domain ID: 67a3867317490c4bca482807\n", + "Sitemaps : []\n", + "Crawl rules: []\n", + "Seed URLs: [{'url': 'https://justinjackson.ca/words.html', 'id': '67a3867317490c4bca482806', 'created_at': '2025-02-05T15:40:35Z'}]\n", + "Auth (decode and split on the ':' delimiter): SWpyRVpYdUp0WHlCL1hHOCtXaEhaQT09LS1Ub3ZpV1dpL2FONGpod0ZSeUFYV1RBPT0=--c83fbe57eeffa5429b450de0ffe1fe4aade85e43\n", + "\n", + "Name: https://matt-nowzari.myportfolio.com\n", + "Configuration OID: 67a3857117490c2adf48274d\n", + "Domain ID: 67a3858717490ccc74482755\n", + "Sitemaps : []\n", + "Crawl rules: [{'pattern': '/The', 'created_at': '2025-02-05T16:33:59Z', 'rule': 'begins', 'id': '67a392f717490ccb5b4853ec', 'order': 0, 'policy': 'deny'}]\n", + "Seed URLs: [{'url': 'https://matt-nowzari.myportfolio.com/', 'id': '67a3858717490ccc74482754', 'created_at': '2025-02-05T15:36:39Z'}]\n", + "Auth (decode and split on the ':' delimiter): U1J4d0xBTW1EN2Ryb05iNXoxRmF1UT09LS1kMDhmMEZWMys0RlJDd2hzWmZMNWx3PT0=--2f4cae86febac77fb8e1fb0133d6520029716a37\n", + "\n" + ] + } + ], + "source": [ + "print (\".ent-search-actastic-crawler2_domains\\n\")\n", + "\n", + "domains_r = 
es_client.search(\n", + " index=\".ent-search-actastic-crawler2_domains\",\n", + " _source=[\"name\",\n", + " \"configuration_oid\",\n", + " \"id\",\n", + " \"sitemaps\",\n", + " \"crawl_rules\",\n", + " \"seed_urls\",\n", + " \"auth\"]\n", + ")\n", + "\n", + "print (domains_r)\n", + "print ()\n", + "\n", + "for i in domains_r[\"hits\"][\"hits\"]:\n", + " source = i[\"_source\"]\n", + " print (f\"Name: {source['name']}\") # <--\n", + " \n", + " print (f\"Configuration OID: {source['configuration_oid']}\") # <--\n", + " print (f\"Domain ID: {source['id']}\") # <--\n", + " \n", + " print (f\"Sitemaps : {source['sitemaps']}\") # <--\n", + " print (f\"Crawl rules: {source['crawl_rules']}\") # <--\n", + " print (f\"Seed URLs: {source['seed_urls']}\") # <--\n", + " \n", + " print (f\"Auth (decode and split on the ':' delimiter): {source['auth']}\\n\") # <--" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "eef6570b-26d0-4d98-a279-cf55fd9d31a5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['67adf64bcf0332cb308aef02', '67abc186cf0332829183f93b', '67a3867317490c4bca482807', '67a3858717490ccc74482755']\n", + "['67adf643cf03320a318aeefa', '67abc13fcf0332544683f928', '67a3857117490c2adf48274d']\n" + ] + } + ], + "source": [ + "all_domain_ids = []\n", + "all_config_ids = []\n", + "for i in domains_r[\"hits\"][\"hits\"]:\n", + " source = i[\"_source\"]\n", + " \n", + " all_domain_ids.append(source['id'])\n", + "\n", + " if source['configuration_oid'] not in all_config_ids:\n", + " all_config_ids.append(source['configuration_oid'])\n", + "\n", + "print (all_domain_ids)\n", + "print (all_config_ids)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "fbddf119-e73c-4e97-9759-3f2f34b8000f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'A': [{'a': 1}],\n", + " 'extraction_rulesets': [{'rules': {'action': '',\n", + " 'field_name': '',\n", + " 'join_as': '',\n", + " 'selector': '',\n", + " 'source': '',\n", + " 'value': ''},\n", + " 'url_filters': {'pattern': '', 'type': ''}}]}\n", + "/Users/mattnowzari/repos/search_and_transform/crawler/migration/my-output-conf.yml\n" + ] + } + ], + "source": [ + "import yaml\n", + "import os\n", + "import pprint as pp\n", + "\n", + "##### You may change the value of file_name #####\n", + "file_name = \"my-output-conf.yml\"\n", + "#################################################\n", + "\n", + "data = {\n", + " 'A':[{'a':1}],\n", + " 'extraction_rulesets':[{\n", + " \"url_filters\": {\"type\": \"\", \"pattern\": \"\"},\n", + " \"rules\": {\"action\": \"\", \"field_name\": \"\", \"selector\": \"\", \"join_as\": \"\", \"value\": \"\", \"source\": \"\"}\n", + " }\n", + " ]\n", + " \n", + "}\n", + "# pp.pprint(data)\n", + "\n", + "base_dir = os.getcwd()\n", + "\n", + "output_path = os.path.join(base_dir, file_name)\n", + "print (output_path)\n", + "\n", + "if os.path.exists(base_dir):\n", + " with open(output_path, 'w') as file:\n", + " yaml.dump(data, file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d525c194-643b-4f97-a59b-f0fa17ba3bfd", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": 
"python", + "pygments_lexer": "ipython3", + "version": "3.12.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/migration/crawler_migration.ipynb b/migration/crawler_migration.ipynb index 94793c3e..b74898d7 100644 --- a/migration/crawler_migration.ipynb +++ b/migration/crawler_migration.ipynb @@ -6,7 +6,9 @@ "metadata": {}, "source": [ "## Hello, future Elastic Open Crawler user!\n", - "This notebook is designed to help you painlessly migrate your Elastic Crawler configurations to Open Crawler-friendly YAML!\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)]()\n", + "\n", + "This notebook is designed to help you migrate your Elastic Crawler configurations to Open Crawler-friendly YAML!\n", "\n", "We recommend running each cell individually in a sequential fashion, as each cell is dependent on previous cells having been run." ] @@ -30,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 510, "id": "da411d2f-9aff-46af-845a-5fe9be19ea3c", "metadata": {}, "outputs": [ @@ -50,7 +52,11 @@ "\n", "from getpass import getpass\n", "from elasticsearch import Elasticsearch\n", - "import json" + "\n", + "import os\n", + "import json\n", + "import yaml\n", + "import pprint\n" ] }, { @@ -68,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 511, "id": "08e6e3d2-62d3-4890-a6be-41fe0a931ef6", "metadata": {}, "outputs": [ @@ -96,7 +102,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 512, "id": "f3ada2cb-b00f-4b1d-be09-57b2ccf25c7c", "metadata": {}, "outputs": [ @@ -106,7 +112,7 @@ "'You Know, for Search'" ] }, - "execution_count": 7, + "execution_count": 512, "metadata": {}, "output_type": "execute_result" } @@ -136,36 +142,581 @@ "id": "a55236e7-19dc-4f4c-92b9-d10848dd6af9", "metadata": {}, "source": [ - "#### Step 1: Grabbing basic configurations\n", + "### Step 1: Acquire Basic Configurations\n", "\n", - "The first order of business is to establish what Crawlers you have, and their basic configuration details.\n", + "The first order of business is to establish what Crawlers you have and their basic configuration details.\n", "This migration notebook will attempt to pull configurations for every distinct Crawler you have in your Elasticsearch instance." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 669, "id": "0a698b05-e939-42a5-aa31-51b1b1883e6f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1. search-search-crawler-fully-loaded-8.18\n", + " Crawler ID is 67b74f16204956a3ce9fd0a4\n", + "\n", + "2. search-daggerfall-unity-website-crawler-8.18\n", + " Crawler ID is 67b74f84204956efce9fd0b7\n", + "\n", + "3. search-migration-crawler\n", + " Crawler ID is 67b7509b2049567f859fd0d4\n", + "\n", + "4. 
search-basic\n", + " Crawler ID is 67b75aeb20495617d59fd0ea\n", + "\n" + ] + } + ], "source": [ - "# define an intermediate data structure\n", + " # in-memory data structure that maintains current state of the configs we've pulled\n", "inflight_configuration_data = {}\n", "\n", "crawler_configurations = es_client.search(\n", - " index=\".ent-search-actastic-crawler2_extraction_rules\",\n", + " index=\".ent-search-actastic-crawler2_configurations_v2\",\n", ")\n", "\n", + "crawler_counter = 1\n", "for configuration in crawler_configurations[\"hits\"][\"hits\"]:\n", - " source = configuration['_source']\n", - " conf_map = {} # this will be the entire config hashmap for a single Crawler\n", - " output_index = configuration[\"_index\"]" + " source = configuration[\"_source\"]\n", + "\n", + " # extract values\n", + " crawler_oid = source[\"id\"]\n", + " output_index = source[\"index_name\"]\n", + "\n", + " print (f\"{crawler_counter}. {output_index}\")\n", + " print (f\" Crawler ID is {crawler_oid}\\n\")\n", + " crawler_counter += 1\n", + "\n", + " crawl_schedule = [] # either no schedule or a specific schedule - determined in Step 4\n", + " if source[\"use_connector_schedule\"] == False and source[\"crawl_schedule\"]: # an interval schedule is being used\n", + " crawl_schedule = source[\"crawl_schedule\"] # this will be transformed in Step 4\n", + "\n", + " # populate a temporary hashmap\n", + " temp_conf_map = {\n", + " \"output_index\": output_index,\n", + " \"schedule\": crawl_schedule\n", + " }\n", + " # pre-populate some necessary fields in preparation for upcoming steps\n", + " temp_conf_map[\"domains_temp\"] = {}\n", + " temp_conf_map[\"output_sink\"] = \"elasticsearch\"\n", + " temp_conf_map[\"full_html_extraction_enabled\"] = False\n", + " temp_conf_map[\"elasticsearch\"] = {\n", + " \"host\": \"\",\n", + " \"port\": \"\",\n", + " \"api_key\": \"\",\n", + " # \"username\": \"\",\n", + " # \"password\": \"\",\n", + " }\n", + " # populate the in-memory data structure\n", + " inflight_configuration_data[crawler_oid] = temp_conf_map\n", + "\n", + "# pprint.pprint(inflight_configuration_data) # REMOVE BEFORE FLIGHT" + ] + }, + { + "cell_type": "markdown", + "id": "34f5e024-688c-4ffb-a16f-35f5171ba7a8", + "metadata": {}, + "source": [ + "**Before continuing, please verify in the output above that the correct number of Crawlers was found!**\n", + "\n", + "Now that we have some basic data about your Crawlers, let's use this information to get more configuration values!" + ] + }, + { + "cell_type": "markdown", + "id": "2b9e2da7-853c-40bd-9ee1-02c4d92b3b43", + "metadata": {}, + "source": [ + "### Step 2: URLs, Sitemaps, and Crawl Rules\n", + "\n", + "In this cell, we will need to query Elasticsearch for information about each Crawler's domain URLs, seed URLs, sitemaps, and crawling rules." + ] + }, + { + "cell_type": "code", + "execution_count": 670, + "id": "e1c64c3d-c8d7-4236-9ed9-c9b1cb5e7972", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.) Crawler ID 67b74f16204956a3ce9fd0a4\n", + " Domain https://www.speedhunters.com found!\n", + " Seed URls found: ['https://www.speedhunters.com/2025/01/the-mystery-of-the-hks-zero-r/', 'https://www.speedhunters.com/2025/02/daniel-arsham-eroded-porsche-911/', 'https://www.speedhunters.com/2025/02/5-plus-7-equals-v12-a-custom-bmw-super-saloon/']\n", + " Sitemap URLs found: ['https://www.speedhunters.com/post_tag-sitemap2.xml']\n", + "2.) 
Crawler ID 67b74f84204956efce9fd0b7\n",
+ "    Domain https://www.dfworkshop.net found!\n",
+ "    Seed URls found: ['https://www.dfworkshop.net/']\n",
+ "    Crawl rules found: [{'policy': 'allow', 'type': 'begins', 'pattern': '/word'}, {'policy': 'deny', 'type': 'contains', 'pattern': 'DOS'}]\n",
+ "    Domain https://www.speedhunters.com found!\n",
+ "    Seed URls found: ['https://www.speedhunters.com/']\n",
+ "    Crawl rules found: [{'policy': 'deny', 'type': 'begins', 'pattern': '/BMW'}]\n",
+ "3.) Crawler ID 67b7509b2049567f859fd0d4\n",
+ "    Domain https://justinjackson.ca found!\n",
+ "    Seed URls found: ['https://justinjackson.ca/']\n",
+ "    Domain https://matt-nowzari.myportfolio.com found!\n",
+ "    Seed URls found: ['https://matt-nowzari.myportfolio.com/']\n",
+ "    Crawl rules found: [{'policy': 'deny', 'type': 'begins', 'pattern': '/The'}]\n",
+ "4.) Crawler ID 67b75aeb20495617d59fd0ea\n",
+ "    Domain https://www.elastic.co found!\n",
+ "    Seed URls found: ['https://www.elastic.co/']\n"
+ ]
+ }
+ ],
+ "source": [
+ "crawler_ids_to_query = inflight_configuration_data.keys()\n",
+ "\n",
+ "crawler_counter = 1\n",
+ "for crawler_oid in crawler_ids_to_query:\n",
+ "    # query ES to get the crawler's domain configurations\n",
+ "    crawler_domains = es_client.search(\n",
+ "        index=\".ent-search-actastic-crawler2_domains\",\n",
+ "        query={\"match\": {\"configuration_oid\": crawler_oid}},\n",
+ "        _source=[\"name\",\n",
+ "                 \"configuration_oid\",\n",
+ "                 \"id\",\n",
+ "                 \"sitemaps\",\n",
+ "                 \"crawl_rules\",\n",
+ "                 \"seed_urls\",\n",
+ "                 \"auth\"]\n",
+ "    )\n",
+ "    print (f\"{crawler_counter}.) Crawler ID {crawler_oid}\")\n",
+ "    crawler_counter += 1\n",
+ "    \n",
+ "    # for each domain the Crawler has, grab its config values\n",
+ "    # and update the in-memory data structure\n",
+ "    for domain_info in crawler_domains[\"hits\"][\"hits\"]:\n",
+ "        source = domain_info[\"_source\"]\n",
+ "\n",
+ "        # extract values\n",
+ "        domain_oid = str(source[\"id\"])\n",
+ "        domain_url = source[\"name\"]\n",
+ "        seed_urls = source[\"seed_urls\"]\n",
+ "        sitemap_urls = source[\"sitemaps\"]\n",
+ "        crawl_rules = source[\"crawl_rules\"]\n",
+ "\n",
+ "        print (f\"    Domain {domain_url} found!\")\n",
+ "        \n",
+ "        # transform seed, sitemap, and crawl rules into arrays\n",
+ "        seed_urls_list = []\n",
+ "        for seed_obj in seed_urls:\n",
+ "            seed_urls_list.append(seed_obj[\"url\"])\n",
+ "\n",
+ "        sitemap_urls_list = []\n",
+ "        for sitemap_obj in sitemap_urls:\n",
+ "            sitemap_urls_list.append(sitemap_obj[\"url\"])\n",
+ "\n",
+ "        crawl_rules_list = []\n",
+ "        for crawl_rules_obj in crawl_rules:\n",
+ "            crawl_rules_list.append({\n",
+ "                \"policy\": crawl_rules_obj[\"policy\"],\n",
+ "                \"type\": crawl_rules_obj[\"rule\"],\n",
+ "                \"pattern\": crawl_rules_obj[\"pattern\"]\n",
+ "            })\n",
+ "\n",
+ "        # populate a temporary hashmap\n",
+ "        temp_domain_conf = {\"url\": domain_url}\n",
+ "        if seed_urls_list:\n",
+ "            temp_domain_conf[\"seed_urls\"] = seed_urls_list\n",
+ "            print (f\"    Seed URLs found: {seed_urls_list}\")\n",
+ "        if sitemap_urls_list:\n",
+ "            temp_domain_conf[\"sitemap_urls\"] = sitemap_urls_list\n",
+ "            print (f\"    Sitemap URLs found: {sitemap_urls_list}\")\n",
+ "        if crawl_rules_list:\n",
+ "            temp_domain_conf[\"crawl_rules\"] = crawl_rules_list\n",
+ "            print (f\"    Crawl rules found: {crawl_rules_list}\")\n",
+ "        \n",
+ "        # populate the in-memory data structure\n",
+ "        inflight_configuration_data[crawler_oid][\"domains_temp\"][domain_oid] = temp_domain_conf\n",
+ "\n",
+ "# pprint.pprint(inflight_configuration_data) # REMOVE BEFORE FLIGHT"
+ ]
+ },
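+ {
+ "cell_type": "markdown",
+ "id": "ed17a51d-0a1b-4c2d-8e3f-000000000001",
+ "metadata": {},
+ "source": [
+ "_A note on result sizes: `es_client.search` returns at most 10 hits by default, so a deployment with many Crawlers or domains could be silently truncated. A minimal variant of the domain query above is sketched below; the `size` value is an assumption, so set it comfortably above your real domain count (the same applies to the configuration query in Step 1)._\n",
+ "\n",
+ "```python\n",
+ "# Sketch only: a drop-in variant of the domain query inside the loop above.\n",
+ "# Assumes the notebook's es_client and the loop variable crawler_oid.\n",
+ "crawler_domains = es_client.search(\n",
+ "    index=\".ent-search-actastic-crawler2_domains\",\n",
+ "    query={\"match\": {\"configuration_oid\": crawler_oid}},\n",
+ "    size=1000,  # assumption: well above the number of domains per Crawler\n",
+ "    _source=[\"name\", \"configuration_oid\", \"id\", \"sitemaps\",\n",
+ "             \"crawl_rules\", \"seed_urls\", \"auth\"],\n",
+ ")\n",
+ "```"
+ ]
+ },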
FLIGHT" + ] + }, + { + "cell_type": "markdown", + "id": "575c00ac-7c84-465e-83d7-aa51f8e5310d", + "metadata": {}, + "source": [ + "### Step 3: Extracting the Extraction Rules\n", + "\n", + "In the following cell, we will be acquiring any extraction rules you may have set in your Elastic Crawlers." + ] + }, + { + "cell_type": "code", + "execution_count": 671, + "id": "61a7df7a-72ad-4330-a30c-da319befd55c", + "metadata": {}, + "outputs": [], + "source": [ + "extraction_rules = es_client.search(\n", + " index=\".ent-search-actastic-crawler2_extraction_rules\",\n", + " _source=[\"configuration_oid\", \"domain_oid\", \"rules\", \"url_filters\"]\n", + ")\n", + "\n", + "for exr_rule in extraction_rules[\"hits\"][\"hits\"]:\n", + " source = exr_rule[\"_source\"]\n", + "\n", + " config_oid = source[\"configuration_oid\"]\n", + " domain_oid = source[\"domain_oid\"]\n", + " \n", + " all_rules = source[\"rules\"]\n", + " all_url_filters = source[\"url_filters\"]\n", + "\n", + " # extract url filters\n", + " url_filters = []\n", + " if all_url_filters:\n", + " url_filters = [{\n", + " \"type\": all_url_filters[0][\"filter\"],\n", + " \"pattern\": all_url_filters[0][\"pattern\"],\n", + " }]\n", + "\n", + " # extract rulesets\n", + " action_translation_map = {\n", + " \"fixed\": \"set\",\n", + " \"extracted\": \"extract\",\n", + " }\n", + " \n", + " ruleset = {}\n", + " if all_rules:\n", + " ruleset = [{\n", + " \"action\": action_translation_map[all_rules[0][\"content_from\"][\"value_type\"]],\n", + " \"field_name\": all_rules[0][\"field_name\"],\n", + " \"selector\": all_rules[0][\"selector\"],\n", + " \"join_as\": all_rules[0][\"multiple_objects_handling\"],\n", + " \"value\": all_rules[0][\"content_from\"][\"value\"],\n", + " \"source\": all_rules[0][\"source_type\"],\n", + " }]\n", + "\n", + " # populate the in-memory data structure\n", + " temp_extraction_rulesets = [{\n", + " \"url_filters\": url_filters,\n", + " \"rules\": ruleset,\n", + " }]\n", + " inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid][\"extraction_rulesets\"] = temp_extraction_rulesets\n", + "\n", + "# pprint.pprint(inflight_configuration_data) # REMOVE BEFORE FLIGHT" + ] + }, + { + "cell_type": "markdown", + "id": "538fb054-1399-4b88-bd1e-fef116491421", + "metadata": {}, + "source": [ + "### Step 4: Schedules\n", + "\n", + "In the upcoming cell, we will be gathing any schedules your Crawlers have set." + ] + }, + { + "cell_type": "code", + "execution_count": 672, + "id": "d880e081-f960-41c7-921e-26896f248eab", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_cron_expression(interval_values: dict) -> str:\n", + " return interval_values # TODO TODO this ** might not be needed? 
+ {
+ "cell_type": "markdown",
+ "id": "538fb054-1399-4b88-bd1e-fef116491421",
+ "metadata": {},
+ "source": [
+ "### Step 4: Schedules\n",
+ "\n",
+ "In the upcoming cell, we will be gathering any schedules your Crawlers have set."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 672,
+ "id": "d880e081-f960-41c7-921e-26896f248eab",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def generate_cron_expression(interval_values: list) -> str:\n",
+ "    # TODO: interval schedules are passed through unchanged for now -\n",
+ "    # see the sketch just after Step 5's transform cell for one possible cron conversion\n",
+ "    return interval_values\n",
+ "\n",
+ "# ---------------------------\n",
+ "\n",
+ "for crawler_oid, crawler_config in inflight_configuration_data.items():\n",
+ "    output_index = crawler_config[\"output_index\"]\n",
+ "    \n",
+ "    existing_schedule_value = crawler_config[\"schedule\"]\n",
+ "\n",
+ "    if not existing_schedule_value:\n",
+ "        # query ES to get this Crawler's specific time schedule\n",
+ "        schedules_result = es_client.search(\n",
+ "            index=\".elastic-connectors-v1\",\n",
+ "            query={\"match\": {\"index_name\": output_index}},\n",
+ "            _source=[\"index_name\", \"scheduling\"]\n",
+ "        )\n",
+ "        # update schedule field with cron expression if specific time scheduling is enabled\n",
+ "        if schedules_result[\"hits\"][\"hits\"][0][\"_source\"][\"scheduling\"][\"full\"][\"enabled\"]:\n",
+ "            specific_time_schedule = schedules_result[\"hits\"][\"hits\"][0][\"_source\"][\"scheduling\"][\"full\"][\"interval\"]\n",
+ "            crawler_config[\"schedule\"] = specific_time_schedule\n",
+ "    elif isinstance(existing_schedule_value[0], dict):\n",
+ "        crawler_config[\"schedule\"] = generate_cron_expression(existing_schedule_value)\n",
+ "    \n",
+ "# pprint.pprint(inflight_configuration_data) # REMOVE BEFORE FLIGHT"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b1586df2-283d-435f-9b08-ba9fad3a7e0a",
+ "metadata": {},
+ "source": [
+ "### Step 5: Creating the Open Crawler YAML configuration files\n",
+ "\n",
+ "In this final step, we will be creating the actual YAML files you need to get up and running with Open Crawler!\n",
+ "\n",
+ "The upcoming cell performs some final transformations to the in-memory data structure that is keeping track of your configurations."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 673,
+ "id": "dd70f102-33ee-4106-8861-0aa0f9a223a1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Final transform of the in-memory data structure to a form we can dump to YAML:\n",
+ "# for each crawler, collect all of its domain configurations into a list\n",
+ "for crawler_config in inflight_configuration_data.values():\n",
+ "    all_crawler_domains = []\n",
+ "    \n",
+ "    for domain_config in crawler_config[\"domains_temp\"].values():\n",
+ "        all_crawler_domains.append(domain_config)\n",
+ "    # create a new key called \"domains\" that points to a list of domain configs only - no domain_oid values as keys\n",
+ "    crawler_config[\"domains\"] = all_crawler_domains\n",
+ "    # delete the temporary domain key\n",
+ "    del crawler_config[\"domains_temp\"]\n",
+ "\n",
+ "# pprint.pprint(inflight_configuration_data) # REMOVE BEFORE FLIGHT"
+ ]
+ },
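+ {
+ "cell_type": "markdown",
+ "id": "ed17a51d-0a1b-4c2d-8e3f-000000000003",
+ "metadata": {},
+ "source": [
+ "_`generate_cron_expression` in Step 4 currently passes interval schedules through unchanged. If your Open Crawler deployment expects a cron string (like the `0 30 8 * * ?` value visible in the sample output further down), one possible conversion is sketched below. It is an assumption rather than an official mapping: it produces Quartz-style six-field expressions and only handles whole minute/hour/day intervals._\n",
+ "\n",
+ "```python\n",
+ "# Hypothetical conversion, not part of the original notebook:\n",
+ "# [{'unit': 'hour', 'frequency': 24}]  ->  '0 0 */24 * * ?'\n",
+ "def generate_cron_expression(interval_values: list) -> str:\n",
+ "    unit = interval_values[0][\"unit\"]\n",
+ "    frequency = interval_values[0][\"frequency\"]\n",
+ "    templates = {\n",
+ "        \"minute\": f\"0 */{frequency} * * * ?\",\n",
+ "        \"hour\": f\"0 0 */{frequency} * * ?\",\n",
+ "        \"day\": f\"0 0 0 */{frequency} * ?\",\n",
+ "    }\n",
+ "    return templates[unit]\n",
+ "```"
+ ]
+ },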
+ {
+ "cell_type": "markdown",
+ "id": "e611a486-e12f-4951-ab95-ca54241a7a06",
+ "metadata": {},
+ "source": [
+ "#### **Wait! Before we continue on to creating our YAML files, we're going to need your input on a few things.**\n",
+ "\n",
+ "In the following cell, please enter these details about the _Elasticsearch instance you will be using with Open Crawler_:\n",
+ "- The Elasticsearch endpoint URL\n",
+ "- The port number of your Elasticsearch endpoint\n",
+ "- An API key"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 660,
+ "id": "213880cc-cbf3-40d9-8c7d-6fcf6428c16b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdin",
+ "output_type": "stream",
+ "text": [
+ "Elasticsearch endpoint URL: https://4911ebad5ed44d149fe8ddad4a4b3751.us-west2.gcp.elastic-cloud.com\n",
+ "The Elasticsearch endpoint's port number: 443\n",
+ "Elasticsearch API key: ········\n"
+ ]
+ }
+ ],
+ "source": [
+ "ENDPOINT = input(\"Elasticsearch endpoint URL: \")\n",
+ "PORT = input(\"The Elasticsearch endpoint's port number: \")\n",
+ "API_KEY = getpass(\"Elasticsearch API key: \")\n",
+ "\n",
+ "# set the above values in each Crawler's configuration\n",
+ "for crawler_config in inflight_configuration_data.values():\n",
+ "    crawler_config[\"elasticsearch\"][\"host\"] = ENDPOINT\n",
+ "    crawler_config[\"elasticsearch\"][\"port\"] = int(PORT)\n",
+ "    crawler_config[\"elasticsearch\"][\"api_key\"] = API_KEY"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "67dfc7c6-429e-42f0-ab08-2c84d72945cb",
+ "metadata": {},
+ "source": [
+ "#### **This is the final step! You have two options here:**\n",
+ "\n",
+ "- The \"Write to YAML\" cell will create _n_ YAML files, one for each Crawler you have.\n",
+ "- The \"Print to output\" cell will print each Crawler's configuration YAML in the Notebook, so you can copy-paste them into your Open Crawler YAML files manually.\n",
+ "\n",
+ "Feel free to run both! You can run Option 2 first to see the output before running Option 1 to save the configs into YAML files.\n",
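+ "\n",
+ "_Optionally, you can sanity-check the assembled configurations before writing anything. The sketch below is not part of the original notebook, and the required-key set is an assumption based on the fields populated in the earlier steps._\n",
+ "\n",
+ "```python\n",
+ "# Sketch only: verify each config carries the keys assembled above.\n",
+ "required_keys = {\"output_index\", \"schedule\", \"output_sink\",\n",
+ "                 \"full_html_extraction_enabled\", \"elasticsearch\", \"domains\"}\n",
+ "for crawler_config in inflight_configuration_data.values():\n",
+ "    missing = required_keys - crawler_config.keys()\n",
+ "    assert not missing, f\"{crawler_config['output_index']} is missing {missing}\"\n",
+ "```"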
+ ] + }, + { + "cell_type": "markdown", + "id": "7ca5ad33-364c-4d13-88fc-db19052363d5", + "metadata": {}, + "source": [ + "#### Option 1: Write to YAML file" + ] + }, + { + "cell_type": "code", + "execution_count": 661, + "id": "6adc53db-d781-4b72-a5f3-441364f354b8", + "metadata": {}, + "outputs": [], + "source": [ + "# Dump each Crawler's configuration into its own YAML file\n", + "for crawler_config in inflight_configuration_data.values():\n", + " base_dir = os.getcwd()\n", + " file_name = f\"{crawler_config['output_index']}-config.yml\" # autogen a custom filename\n", + " output_path = os.path.join(base_dir, file_name)\n", + "\n", + " if os.path.exists(base_dir):\n", + " with open(output_path, 'w') as file:\n", + " yaml.safe_dump(\n", + " crawler_config,\n", + " file,\n", + " sort_keys=False\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "35c56a2b-4acd-47f5-90e3-9dd39fa4383f", + "metadata": {}, + "source": [ + "#### Option 2: Print to output" + ] + }, + { + "cell_type": "code", + "execution_count": 674, + "id": "525aabb8-0537-4ba6-8109-109490dddafe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "YAML config => search-search-crawler-fully-loaded-8.18-config.yml\n", + "--------\n", + "output_index: search-search-crawler-fully-loaded-8.18\n", + "schedule: []\n", + "output_sink: elasticsearch\n", + "full_html_extraction_enabled: false\n", + "elasticsearch:\n", + " host: ''\n", + " port: ''\n", + " api_key: ''\n", + "domains:\n", + "- url: https://www.speedhunters.com\n", + " seed_urls:\n", + " - https://www.speedhunters.com/2025/01/the-mystery-of-the-hks-zero-r/\n", + " - https://www.speedhunters.com/2025/02/daniel-arsham-eroded-porsche-911/\n", + " - https://www.speedhunters.com/2025/02/5-plus-7-equals-v12-a-custom-bmw-super-saloon/\n", + " sitemap_urls:\n", + " - https://www.speedhunters.com/post_tag-sitemap2.xml\n", + "\n", + "--------------------------------------------------------------------------------\n", + "YAML config => search-daggerfall-unity-website-crawler-8.18-config.yml\n", + "--------\n", + "output_index: search-daggerfall-unity-website-crawler-8.18\n", + "schedule: 0 30 8 * * ?\n", + "output_sink: elasticsearch\n", + "full_html_extraction_enabled: false\n", + "elasticsearch:\n", + " host: ''\n", + " port: ''\n", + " api_key: ''\n", + "domains:\n", + "- url: https://www.dfworkshop.net\n", + " seed_urls:\n", + " - https://www.dfworkshop.net/\n", + " crawl_rules:\n", + " - policy: allow\n", + " type: begins\n", + " pattern: /word\n", + " - policy: deny\n", + " type: contains\n", + " pattern: DOS\n", + " extraction_rulesets:\n", + " - url_filters:\n", + " - type: begins\n", + " pattern: /elderscrolls/*\n", + " rules:\n", + " - action: set\n", + " field_name: elder_field\n", + " selector: /elderscrolls/*\n", + " join_as: string\n", + " value: ping\n", + " source: url\n", + "- url: https://www.speedhunters.com\n", + " seed_urls:\n", + " - https://www.speedhunters.com/\n", + " crawl_rules:\n", + " - policy: deny\n", + " type: begins\n", + " pattern: /BMW\n", + "\n", + "--------------------------------------------------------------------------------\n", + "YAML config => search-migration-crawler-config.yml\n", + "--------\n", + "output_index: search-migration-crawler\n", + "schedule: []\n", + "output_sink: elasticsearch\n", + "full_html_extraction_enabled: false\n", + "elasticsearch:\n", + " host: ''\n", + " port: ''\n", + " api_key: ''\n", + "domains:\n", + "- url: https://justinjackson.ca\n", + " seed_urls:\n", + " 
- https://justinjackson.ca/\n", + "- url: https://matt-nowzari.myportfolio.com\n", + " seed_urls:\n", + " - https://matt-nowzari.myportfolio.com/\n", + " crawl_rules:\n", + " - policy: deny\n", + " type: begins\n", + " pattern: /The\n", + " extraction_rulesets:\n", + " - url_filters: []\n", + " rules:\n", + " - action: set\n", + " field_name: test_field\n", + " selector: /html/body/a/@title\n", + " join_as: string\n", + " value: some_rando_value\n", + " source: html\n", + "\n", + "--------------------------------------------------------------------------------\n", + "YAML config => search-basic-config.yml\n", + "--------\n", + "output_index: search-basic\n", + "schedule:\n", + "- unit: hour\n", + " frequency: 24\n", + "output_sink: elasticsearch\n", + "full_html_extraction_enabled: false\n", + "elasticsearch:\n", + " host: ''\n", + " port: ''\n", + " api_key: ''\n", + "domains:\n", + "- url: https://www.elastic.co\n", + " seed_urls:\n", + " - https://www.elastic.co/\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "for crawler_config in inflight_configuration_data.values():\n", + " yaml_out = yaml.safe_dump(\n", + " crawler_config,\n", + " sort_keys=False\n", + " )\n", + " \n", + " print (f\"YAML config => {crawler_config['output_index']}-config.yml\\n--------\")\n", + " print (yaml_out)\n", + " print (\"--------------------------------------------------------------------------------\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "8ee89b6a-00fe-4048-a6d6-a90fdbaaceed", + "id": "55888204-f823-48cd-bca4-a7663e0fe56a", "metadata": {}, "outputs": [], "source": [] diff --git a/migration/crawler_migration_exploration.ipynb b/migration/crawler_migration_exploration.ipynb new file mode 100644 index 00000000..42a3922b --- /dev/null +++ b/migration/crawler_migration_exploration.ipynb @@ -0,0 +1,564 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "4f198cd5-cc9c-4080-8dd4-425628b05d4d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is the start of the notebook\n", + "Hopefully we can learn a thing or two, before doing a thing or two.\n" + ] + } + ], + "source": [ + "print (\"This is the start of the notebook\")\n", + "print (\"Hopefully we can learn a thing or two, before doing a thing or two.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1914cbf9-a18b-4b1d-9ea1-5bc04e23ceff", + "metadata": {}, + "outputs": [], + "source": [ + "from elasticsearch import Elasticsearch\n", + "import json\n", + "\n", + "endpoint = \"https://5a5b8a5cdd84464dae4c7c7ae8a59562.us-east1.gcp.elastic-cloud.com:443\"\n", + "api_key = \"aTN4MUdwVUJLTFFTSmFFWjBlTFM6dmU0ZXJnTjdUaUs5dXhIUU1fd0xiZw==\"\n", + "\n", + "es_client = Elasticsearch(\n", + " endpoint,\n", + " api_key=api_key,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "711bb339-bbc8-4112-a392-dde01f5e5729", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The index [search-migration_crawler] was found!\n" + ] + } + ], + "source": [ + "index_name = \"search-migration_crawler\"\n", + "if not es_client.indices.exists(index=index_name):\n", + " print (\"Eek! 
The index does not exist!\")\n", + "else:\n", + " print (f\"The index [{index_name}] was found!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a14009b2-2d34-465e-b43d-a274f01fbff0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Let's see if we can get _all_ indices in this ES instance related to crawler?\n", + "\n", + "We got the list as a JSON dictionary! We received 20 indices.\n", + "\n", + " 4986743 docs -> .ds-logs-elastic_crawler-default-2025.02.05-000001\n", + " 0 docs -> .ent-search-actastic-app_search_crawler_content_metadata\n", + " 0 docs -> .ent-search-actastic-app_search_crawler_content_metadata-content_hash-engine_oid-unique-constraint\n", + " 0 docs -> .ent-search-actastic-app_search_crawler_content_url_metadata\n", + " 3 docs -> .ent-search-actastic-crawler2_configurations_v2\n", + " 3 docs -> .ent-search-actastic-crawler2_configurations_v2-index_name-unique-constraint\n", + " 9385 docs -> .ent-search-actastic-crawler2_content_metadata\n", + " 9385 docs -> .ent-search-actastic-crawler2_content_metadata-configuration_oid-content_hash-unique-constraint\n", + " 9532 docs -> .ent-search-actastic-crawler2_content_url_metadata\n", + " 332 docs -> .ent-search-actastic-crawler2_crawl_requests_v2\n", + " 4 docs -> .ent-search-actastic-crawler2_domains\n", + " 4 docs -> .ent-search-actastic-crawler2_domains-configuration_oid-name-unique-constraint\n", + " 2 docs -> .ent-search-actastic-crawler2_extraction_rules\n", + " 0 docs -> .ent-search-actastic-crawler2_process_crawls\n", + " 651 docs -> .ent-search-actastic-crawler2_robots_txts\n", + " 0 docs -> .ent-search-actastic-crawler_crawl_requests_v7\n", + " 0 docs -> .ent-search-actastic-crawler_domains_v6\n", + " 0 docs -> .ent-search-actastic-crawler_domains_v6-engine_oid-name-unique-constraint\n", + " 0 docs -> .ent-search-actastic-crawler_process_crawls\n", + " 0 docs -> .ent-search-actastic-crawler_robots_txts_v3\n", + "\n", + "There are 20 healthy indices, 0 sick indices and 0 unhealthy indices.\n", + "11 indices have docs, and 9 indices do not.\n" + ] + } + ], + "source": [ + "print (\"Let's see if we can get _all_ indices in this ES instance related to crawler?\\n\")\n", + "\n", + "json_response = es_client.cat.indices(\n", + " index=\".*crawler*\", # take note of the . before *crawler* - this tells the API to query 'hidden' indices as well\n", + " s=\"index\",\n", + " format=\"json\"\n", + ")\n", + "\n", + "print (f\"We got the list as a JSON dictionary! 
We received {len(json_response)} indices.\\n\")\n", + "\n", + "health_histogram = {\n", + " \"green\": 0,\n", + " \"yellow\": 0,\n", + " \"red\": 0,\n", + "}\n", + "\n", + "indices_with_docs = {\n", + " \"with_docs\": 0,\n", + " \"without_docs\": 0,\n", + "}\n", + "\n", + "index_names = [] # save the index names to run through all of them at some point in the future\n", + "\n", + "for item in json_response.body: # Note that calling .body on the response will get us a List of dictionaries:\n", + " health_status = item[\"health\"]\n", + " health_histogram[health_status] += 1\n", + "\n", + " if int(item['docs.count']) > 0:\n", + " indices_with_docs['with_docs'] += 1\n", + " index_names.append(item['index'])\n", + " else:\n", + " indices_with_docs['without_docs'] += 1\n", + "\n", + " print (f\" {item['docs.count']} docs -> {item['index']}\")\n", + "\n", + "print (f\"\\nThere are {health_histogram['green']} healthy indices, {health_histogram['yellow']} sick indices \\\n", + "and {health_histogram['red']} unhealthy indices.\")\n", + "\n", + "print (f\"{indices_with_docs['with_docs']} indices have docs, and {indices_with_docs['without_docs']} \\\n", + "indices do not.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "db96133f-bb1b-4f40-b02f-4812a585964d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Let's grab the extraction rules for our crawler.\n", + "The extraction rules are defined in the index .ent-search-actastic-crawler2_extraction_rules\n", + "\n", + "{'hits': [{'_id': '67a393cc17490cc0d24853ef',\n", + " '_index': '.ent-search-actastic-crawler2_extraction_rules',\n", + " '_score': 1.0,\n", + " '_source': {'configuration_oid': '67a3857117490c2adf48274d',\n", + " 'created_at': '2025-02-05T16:37:32Z',\n", + " 'description': 'extraction_rule_generic',\n", + " 'domain_oid': '67a3858717490ccc74482755',\n", + " 'edited_by': '1432693181',\n", + " 'id': '67a393cc17490cc0d24853ef',\n", + " 'rules': [{'content_from': {'value': '\"some_rando_value\"',\n", + " 'value_type': 'fixed'},\n", + " 'field_name': 'test_field',\n", + " 'multiple_objects_handling': 'string',\n", + " 'selector': '/html/body/a/@title',\n", + " 'source_type': 'html'}],\n", + " 'updated_at': '2025-02-05T16:37:32Z',\n", + " 'url_filters': []}},\n", + " {'_id': '67adfbd0cf0332091f8af6e2',\n", + " '_index': '.ent-search-actastic-crawler2_extraction_rules',\n", + " '_score': 1.0,\n", + " '_source': {'configuration_oid': '67adf643cf03320a318aeefa',\n", + " 'created_at': '2025-02-13T14:04:00Z',\n", + " 'description': 'df_ex_rule',\n", + " 'domain_oid': '67adf64bcf0332cb308aef02',\n", + " 'edited_by': '1432693181',\n", + " 'id': '67adfbd0cf0332091f8af6e2',\n", + " 'rules': [{'content_from': {'value': '',\n", + " 'value_type': 'extracted'},\n", + " 'field_name': 'df_xtraction_rule',\n", + " 'multiple_objects_handling': 'string',\n", + " 'selector': '/html/body/a/@title',\n", + " 'source_type': 'html'}],\n", + " 'updated_at': '2025-02-13T14:04:00Z',\n", + " 'url_filters': []}},\n", + " {'_id': '67b5e3a2cf0332ddecbf47e4',\n", + " '_index': '.ent-search-actastic-crawler2_extraction_rules',\n", + " '_score': 1.0,\n", + " '_source': {'configuration_oid': '67adf643cf03320a318aeefa',\n", + " 'created_at': '2025-02-19T13:58:58Z',\n", + " 'description': 'new_url_extraction_rule',\n", + " 'domain_oid': '67adf64bcf0332cb308aef02',\n", + " 'edited_by': '1432693181',\n", + " 'id': '67b5e3a2cf0332ddecbf47e4',\n", + " 'rules': [{'content_from': {'value': '',\n", + " 
'value_type': 'extracted'},\n", + " 'field_name': 'df_new_url_extractor',\n", + " 'multiple_objects_handling': 'string',\n", + " 'selector': '/df_url/*',\n", + " 'source_type': 'url'}],\n", + " 'updated_at': '2025-02-19T13:58:58Z',\n", + " 'url_filters': [{'filter': 'begins',\n", + " 'pattern': '/df/*'}]}},\n", + " {'_id': '67b5e822cf0332c6c4bf738c',\n", + " '_index': '.ent-search-actastic-crawler2_extraction_rules',\n", + " '_score': 1.0,\n", + " '_source': {'configuration_oid': '67adf643cf03320a318aeefa',\n", + " 'created_at': '2025-02-19T14:18:10Z',\n", + " 'description': 'yet_another_rule',\n", + " 'domain_oid': '67adf64bcf0332cb308aef02',\n", + " 'edited_by': '1432693181',\n", + " 'id': '67b5e822cf0332c6c4bf738c',\n", + " 'rules': [{'content_from': {'value': 'ping',\n", + " 'value_type': 'fixed'},\n", + " 'field_name': 'elder_field',\n", + " 'multiple_objects_handling': 'string',\n", + " 'selector': '/elderscrolls/*',\n", + " 'source_type': 'url'}],\n", + " 'updated_at': '2025-02-19T14:18:10Z',\n", + " 'url_filters': [{'filter': 'regex',\n", + " 'pattern': '/elderscrolls/*'}]}}],\n", + " 'max_score': 1.0,\n", + " 'total': {'relation': 'eq', 'value': 4}}\n", + "\n", + "Index: .ent-search-actastic-crawler2_extraction_rules\n", + "[{'content_from': {'value_type': 'fixed', 'value': '\"some_rando_value\"'}, 'field_name': 'test_field', 'multiple_objects_handling': 'string', 'selector': '/html/body/a/@title', 'source_type': 'html'}]\n", + "Description: extraction_rule_generic\n", + "Configuration OID: 67a3857117490c2adf48274d\n", + "Domain OID: 67a3858717490ccc74482755\n", + "Value type: fixed\n", + "Value: \"some_rando_value\"\n", + "Field name: test_field\n", + "Multiple Objects Handling: string\n", + "Selector: /html/body/a/@title\n", + "Source type: html\n", + "\n", + "Index: .ent-search-actastic-crawler2_extraction_rules\n", + "[{'content_from': {'value_type': 'extracted', 'value': ''}, 'field_name': 'df_xtraction_rule', 'multiple_objects_handling': 'string', 'selector': '/html/body/a/@title', 'source_type': 'html'}]\n", + "Description: df_ex_rule\n", + "Configuration OID: 67adf643cf03320a318aeefa\n", + "Domain OID: 67adf64bcf0332cb308aef02\n", + "Value type: extracted\n", + "Value: \n", + "Field name: df_xtraction_rule\n", + "Multiple Objects Handling: string\n", + "Selector: /html/body/a/@title\n", + "Source type: html\n", + "\n", + "Index: .ent-search-actastic-crawler2_extraction_rules\n", + "[{'content_from': {'value_type': 'extracted', 'value': ''}, 'field_name': 'df_new_url_extractor', 'multiple_objects_handling': 'string', 'selector': '/df_url/*', 'source_type': 'url'}]\n", + "Description: new_url_extraction_rule\n", + "Configuration OID: 67adf643cf03320a318aeefa\n", + "Domain OID: 67adf64bcf0332cb308aef02\n", + "Value type: extracted\n", + "Value: \n", + "Field name: df_new_url_extractor\n", + "Multiple Objects Handling: string\n", + "Selector: /df_url/*\n", + "Source type: url\n", + "\n", + "Index: .ent-search-actastic-crawler2_extraction_rules\n", + "[{'content_from': {'value_type': 'fixed', 'value': 'ping'}, 'field_name': 'elder_field', 'multiple_objects_handling': 'string', 'selector': '/elderscrolls/*', 'source_type': 'url'}]\n", + "Description: yet_another_rule\n", + "Configuration OID: 67adf643cf03320a318aeefa\n", + "Domain OID: 67adf64bcf0332cb308aef02\n", + "Value type: fixed\n", + "Value: ping\n", + "Field name: elder_field\n", + "Multiple Objects Handling: string\n", + "Selector: /elderscrolls/*\n", + "Source type: url\n", + "\n" + ] + } + ], + "source": [ + "print 
(\"Let's grab the extraction rules for our crawler.\")\n", + "print (\"The extraction rules are defined in the index .ent-search-actastic-crawler2_extraction_rules\\n\")\n", + "\n", + "ex_r = es_client.search(\n", + " index=\".ent-search-actastic-crawler2_extraction_rules\",\n", + " # _source=\"rules\"\n", + ")\n", + "import pprint as pp\n", + "\n", + "pp.pprint (dict(ex_r[\"hits\"]))\n", + "print ()\n", + "\n", + "for i in ex_r[\"hits\"][\"hits\"]:\n", + " print (f\"Index: {i['_index']}\")\n", + " rules = i[\"_source\"][\"rules\"]\n", + " print (rules)\n", + " for rule in rules:\n", + " print (f\"Description: {i['_source']['description']}\")\n", + " print (f\"Configuration OID: {i['_source']['configuration_oid']}\") # <-- I wonder if we can use these to\n", + " print (f\"Domain OID: {i['_source']['domain_oid']}\") # <-- match specific crawlers\n", + " \n", + " print (f\"Value type: {rule['content_from']['value_type']}\") # <-- this maps to 'action'\n", + " print (f\"Value: {rule['content_from']['value']}\") # <--\n", + " print (f\"Field name: {rule['field_name']}\") # <--\n", + " print (f\"Multiple Objects Handling: {rule['multiple_objects_handling']}\") # <-- this maps to \"join_as\"\n", + " print (f\"Selector: {rule['selector']}\") # <--\n", + " print (f\"Source type: {rule['source_type']}\\n\") # <--" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "8c0b6c8d-52f5-4325-92ca-8b7976c69abf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Let's try grabbing the configurations docs in .ent-search-actastic-crawler2_configurations_v2\n", + "{'took': 1, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 3, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': '.ent-search-actastic-crawler2_configurations_v2', '_id': '67a3857117490c2adf48274d', '_score': 1.0, '_source': {'id': '67a3857117490c2adf48274d', 'created_at': '2025-02-05T15:36:17Z', 'updated_at': '2025-02-05T16:34:04Z', 'index_name': 'search-migration_crawler', 'crawl_schedule': [{'unit': 'hour', 'frequency': 1}], 'use_connector_schedule': False}}, {'_index': '.ent-search-actastic-crawler2_configurations_v2', '_id': '67adf643cf03320a318aeefa', '_score': 1.0, '_source': {'id': '67adf643cf03320a318aeefa', 'created_at': '2025-02-13T13:40:19Z', 'updated_at': '2025-02-13T14:29:16Z', 'index_name': 'search-daggerfall-unity-website-crawler', 'crawl_schedule': [{'unit': 'hour', 'frequency': 24}], 'use_connector_schedule': True}}, {'_index': '.ent-search-actastic-crawler2_configurations_v2', '_id': '67abc13fcf0332544683f928', '_score': 1.0, '_source': {'id': '67abc13fcf0332544683f928', 'created_at': '2025-02-11T21:29:35Z', 'updated_at': '2025-02-14T16:58:50Z', 'index_name': 'search-crawler-fully-loaded', 'crawl_schedule': None, 'use_connector_schedule': False}}]}}\n", + "\n", + "Inside this index, we can find the following values:\n", + "\n", + "Index: .ent-search-actastic-crawler2_configurations_v2\n", + "Configuration ID: 67a3857117490c2adf48274d\n", + "Index name: search-migration_crawler\n", + "Crawl schedule: [{'unit': 'hour', 'frequency': 1}]\n", + "Use crawl schedule?: False\n", + "\n", + "Index: .ent-search-actastic-crawler2_configurations_v2\n", + "Configuration ID: 67adf643cf03320a318aeefa\n", + "Index name: search-daggerfall-unity-website-crawler\n", + "Crawl schedule: [{'unit': 'hour', 'frequency': 24}]\n", + "Use crawl schedule?: True\n", + "\n", + "Index: 
.ent-search-actastic-crawler2_configurations_v2\n", + "Configuration ID: 67abc13fcf0332544683f928\n", + "Index name: search-crawler-fully-loaded\n", + "Crawl schedule: None\n", + "Use crawl schedule?: False\n", + "\n" + ] + } + ], + "source": [ + "print (\"Let's try grabbing the configurations docs in .ent-search-actastic-crawler2_configurations_v2\")\n", + "\n", + "config_r = es_client.search(\n", + " index=\".ent-search-actastic-crawler2_configurations_v2\",\n", + ")\n", + "\n", + "print (config_r)\n", + "print ()\n", + "\n", + "print (\"Inside this index, we can find the following values:\\n\")\n", + "for i in config_r[\"hits\"][\"hits\"]:\n", + " source = i[\"_source\"]\n", + " print (f\"Index: {i['_index']}\")\n", + " print (f\"Configuration ID: {source['id']}\") # <--\n", + " print (f\"Index name: {source['index_name']}\") # <--\n", + " print (f\"Crawl schedule: {source['crawl_schedule']}\")\n", + " print (f\"Use crawl schedule?: {source['use_connector_schedule']}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "85adaae5-38d1-4410-b38c-3dd827d3170c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".ent-search-actastic-crawler2_domains\n", + "\n", + "{'took': 1, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 4, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': '.ent-search-actastic-crawler2_domains', '_id': '67abc186cf0332829183f93b', '_score': 1.0, '_source': {'id': '67abc186cf0332829183f93b', 'configuration_oid': '67abc13fcf0332544683f928', 'name': 'https://www.speedhunters.com', 'crawl_rules': [], 'seed_urls': [{'created_at': '2025-02-11T21:31:04Z', 'id': '67abc198cf0332d5ae83f93d', 'url': 'https://www.speedhunters.com/2025/01/the-mystery-of-the-hks-zero-r/'}], 'sitemaps': [{'created_at': '2025-02-14T16:58:47Z', 'id': '67af7647cf0332fdd191f64d', 'url': 'https://www.speedhunters.com/post_tag-sitemap2.xml'}], 'auth': 'akh4My83Myt3Y2NkbnJHUll2RFVlZz09LS1wbjU5TEZmNVdxUDZCK0lSa2dwRlhRPT0=--21413c9faa1309012319bdf4132ef5f4d0add9fb'}}, {'_index': '.ent-search-actastic-crawler2_domains', '_id': '67adf64bcf0332cb308aef02', '_score': 1.0, '_source': {'id': '67adf64bcf0332cb308aef02', 'configuration_oid': '67adf643cf03320a318aeefa', 'name': 'https://www.dfworkshop.net', 'crawl_rules': [{'pattern': 'DOS', 'created_at': '2025-02-13T13:43:27Z', 'rule': 'contains', 'id': '67adf6ffcf03320a318aef0a', 'order': 0, 'policy': 'deny'}], 'seed_urls': [{'created_at': '2025-02-13T13:40:26Z', 'id': '67adf64acf0332cb308aef01', 'url': 'https://www.dfworkshop.net/'}], 'sitemaps': [], 'auth': 'V001b0R1a2xCQmsyYloyQUYvTzJJQT09LS1md284bXJjOGhyRllMeGR3RlVpYU13PT0=--34940590d810db53c8dd783f58244db6c74dad20'}}, {'_index': '.ent-search-actastic-crawler2_domains', '_id': '67a3867317490c4bca482807', '_score': 1.0, '_source': {'id': '67a3867317490c4bca482807', 'configuration_oid': '67a3857117490c2adf48274d', 'name': 'https://justinjackson.ca', 'crawl_rules': [], 'seed_urls': [{'url': 'https://justinjackson.ca/words.html', 'id': '67a3867317490c4bca482806', 'created_at': '2025-02-05T15:40:35Z'}], 'sitemaps': [], 'auth': 'SWpyRVpYdUp0WHlCL1hHOCtXaEhaQT09LS1Ub3ZpV1dpL2FONGpod0ZSeUFYV1RBPT0=--c83fbe57eeffa5429b450de0ffe1fe4aade85e43'}}, {'_index': '.ent-search-actastic-crawler2_domains', '_id': '67a3858717490ccc74482755', '_score': 1.0, '_source': {'id': '67a3858717490ccc74482755', 'configuration_oid': '67a3857117490c2adf48274d', 'name': 
'https://matt-nowzari.myportfolio.com', 'crawl_rules': [{'pattern': '/The', 'created_at': '2025-02-05T16:33:59Z', 'rule': 'begins', 'id': '67a392f717490ccb5b4853ec', 'order': 0, 'policy': 'deny'}], 'seed_urls': [{'url': 'https://matt-nowzari.myportfolio.com/', 'id': '67a3858717490ccc74482754', 'created_at': '2025-02-05T15:36:39Z'}], 'sitemaps': [], 'auth': 'U1J4d0xBTW1EN2Ryb05iNXoxRmF1UT09LS1kMDhmMEZWMys0RlJDd2hzWmZMNWx3PT0=--2f4cae86febac77fb8e1fb0133d6520029716a37'}}]}}\n", + "\n", + "Name: https://www.speedhunters.com\n", + "Configuration OID: 67abc13fcf0332544683f928\n", + "Domain ID: 67abc186cf0332829183f93b\n", + "Sitemaps : [{'created_at': '2025-02-14T16:58:47Z', 'id': '67af7647cf0332fdd191f64d', 'url': 'https://www.speedhunters.com/post_tag-sitemap2.xml'}]\n", + "Crawl rules: []\n", + "Seed URLs: [{'created_at': '2025-02-11T21:31:04Z', 'id': '67abc198cf0332d5ae83f93d', 'url': 'https://www.speedhunters.com/2025/01/the-mystery-of-the-hks-zero-r/'}]\n", + "Auth (decode and split on the ':' delimiter): akh4My83Myt3Y2NkbnJHUll2RFVlZz09LS1wbjU5TEZmNVdxUDZCK0lSa2dwRlhRPT0=--21413c9faa1309012319bdf4132ef5f4d0add9fb\n", + "\n", + "Name: https://www.dfworkshop.net\n", + "Configuration OID: 67adf643cf03320a318aeefa\n", + "Domain ID: 67adf64bcf0332cb308aef02\n", + "Sitemaps : []\n", + "Crawl rules: [{'pattern': 'DOS', 'created_at': '2025-02-13T13:43:27Z', 'rule': 'contains', 'id': '67adf6ffcf03320a318aef0a', 'order': 0, 'policy': 'deny'}]\n", + "Seed URLs: [{'created_at': '2025-02-13T13:40:26Z', 'id': '67adf64acf0332cb308aef01', 'url': 'https://www.dfworkshop.net/'}]\n", + "Auth (decode and split on the ':' delimiter): V001b0R1a2xCQmsyYloyQUYvTzJJQT09LS1md284bXJjOGhyRllMeGR3RlVpYU13PT0=--34940590d810db53c8dd783f58244db6c74dad20\n", + "\n", + "Name: https://justinjackson.ca\n", + "Configuration OID: 67a3857117490c2adf48274d\n", + "Domain ID: 67a3867317490c4bca482807\n", + "Sitemaps : []\n", + "Crawl rules: []\n", + "Seed URLs: [{'url': 'https://justinjackson.ca/words.html', 'id': '67a3867317490c4bca482806', 'created_at': '2025-02-05T15:40:35Z'}]\n", + "Auth (decode and split on the ':' delimiter): SWpyRVpYdUp0WHlCL1hHOCtXaEhaQT09LS1Ub3ZpV1dpL2FONGpod0ZSeUFYV1RBPT0=--c83fbe57eeffa5429b450de0ffe1fe4aade85e43\n", + "\n", + "Name: https://matt-nowzari.myportfolio.com\n", + "Configuration OID: 67a3857117490c2adf48274d\n", + "Domain ID: 67a3858717490ccc74482755\n", + "Sitemaps : []\n", + "Crawl rules: [{'pattern': '/The', 'created_at': '2025-02-05T16:33:59Z', 'rule': 'begins', 'id': '67a392f717490ccb5b4853ec', 'order': 0, 'policy': 'deny'}]\n", + "Seed URLs: [{'url': 'https://matt-nowzari.myportfolio.com/', 'id': '67a3858717490ccc74482754', 'created_at': '2025-02-05T15:36:39Z'}]\n", + "Auth (decode and split on the ':' delimiter): U1J4d0xBTW1EN2Ryb05iNXoxRmF1UT09LS1kMDhmMEZWMys0RlJDd2hzWmZMNWx3PT0=--2f4cae86febac77fb8e1fb0133d6520029716a37\n", + "\n" + ] + } + ], + "source": [ + "print (\".ent-search-actastic-crawler2_domains\\n\")\n", + "\n", + "domains_r = es_client.search(\n", + " index=\".ent-search-actastic-crawler2_domains\",\n", + " _source=[\"name\",\n", + " \"configuration_oid\",\n", + " \"id\",\n", + " \"sitemaps\",\n", + " \"crawl_rules\",\n", + " \"seed_urls\",\n", + " \"auth\"]\n", + ")\n", + "\n", + "print (domains_r)\n", + "print ()\n", + "\n", + "for i in domains_r[\"hits\"][\"hits\"]:\n", + " source = i[\"_source\"]\n", + " print (f\"Name: {source['name']}\") # <--\n", + " \n", + " print (f\"Configuration OID: {source['configuration_oid']}\") # <--\n", + " print (f\"Domain ID: 
{source['id']}\") # <--\n", + " \n", + " print (f\"Sitemaps : {source['sitemaps']}\") # <--\n", + " print (f\"Crawl rules: {source['crawl_rules']}\") # <--\n", + " print (f\"Seed URLs: {source['seed_urls']}\") # <--\n", + " \n", + " print (f\"Auth (decode and split on the ':' delimiter): {source['auth']}\\n\") # <--" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "eef6570b-26d0-4d98-a279-cf55fd9d31a5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['67adf64bcf0332cb308aef02', '67abc186cf0332829183f93b', '67a3867317490c4bca482807', '67a3858717490ccc74482755']\n", + "['67adf643cf03320a318aeefa', '67abc13fcf0332544683f928', '67a3857117490c2adf48274d']\n" + ] + } + ], + "source": [ + "all_domain_ids = []\n", + "all_config_ids = []\n", + "for i in domains_r[\"hits\"][\"hits\"]:\n", + " source = i[\"_source\"]\n", + " \n", + " all_domain_ids.append(source['id'])\n", + "\n", + " if source['configuration_oid'] not in all_config_ids:\n", + " all_config_ids.append(source['configuration_oid'])\n", + "\n", + "print (all_domain_ids)\n", + "print (all_config_ids)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "fbddf119-e73c-4e97-9759-3f2f34b8000f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/mattnowzari/repos/search_and_transform/crawler/migration/my-output-conf.yml\n" + ] + } + ], + "source": [ + "import yaml\n", + "import os\n", + "import pprint as pp\n", + "\n", + "##### You may change the value of file_name #####\n", + "file_name = \"my-output-conf.yml\"\n", + "#################################################\n", + "\n", + "data = {\n", + " \"url\": \"\",\n", + " \"seed_urls\": [\"seed1\", \"seed2\", \"seed3\"],\n", + " \"sitemap_urls\": [\"sitemap1\", \"sitemap2\"],\n", + " 'extraction_rulesets':[{\n", + " \"url_filters\": [{\n", + " \"type\": \"begins\",\n", + " \"pattern\": \"/cool/pattern/*\"\n", + " }],\n", + " \"rules\": [{\n", + " \"action\": \"extract\",\n", + " \"field_name\": \"author\",\n", + " \"selector\": \".author\",\n", + " \"join_as\": \"array\",\n", + " \"value\": \"yes\",\n", + " \"source\": \"html\"\n", + " }]\n", + " }]\n", + "}\n", + "# pp.pprint(data)\n", + "\n", + "base_dir = os.getcwd()\n", + "\n", + "output_path = os.path.join(base_dir, file_name)\n", + "print (output_path)\n", + "\n", + "if os.path.exists(base_dir):\n", + " with open(output_path, 'w') as file:\n", + " yaml.dump(data, file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d525c194-643b-4f97-a59b-f0fa17ba3bfd", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 4c3c7d02be2d190356a076647f35c44987d31a12 Mon Sep 17 00:00:00 2001 From: mattnowzari Date: Fri, 21 Feb 2025 09:56:49 -0500 Subject: [PATCH 3/6] cleared output of all cells --- .../crawler_migration-checkpoint.ipynb | 241 ++---------------- migration/crawler_migration.ipynb | 241 ++---------------- 2 files changed, 36 insertions(+), 446 deletions(-) diff --git a/migration/.ipynb_checkpoints/crawler_migration-checkpoint.ipynb 
b/migration/.ipynb_checkpoints/crawler_migration-checkpoint.ipynb index b74898d7..b8799165 100644 --- a/migration/.ipynb_checkpoints/crawler_migration-checkpoint.ipynb +++ b/migration/.ipynb_checkpoints/crawler_migration-checkpoint.ipynb @@ -32,21 +32,10 @@ }, { "cell_type": "code", - "execution_count": 510, + "execution_count": null, "id": "da411d2f-9aff-46af-845a-5fe9be19ea3c", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: elasticsearch in /Users/mattnowzari/repos/python/mn_venv/lib/python3.12/site-packages (8.17.1)\n", - "Requirement already satisfied: elastic-transport<9,>=8.15.1 in /Users/mattnowzari/repos/python/mn_venv/lib/python3.12/site-packages (from elasticsearch) (8.17.0)\n", - "Requirement already satisfied: urllib3<3,>=1.26.2 in /Users/mattnowzari/repos/python/mn_venv/lib/python3.12/site-packages (from elastic-transport<9,>=8.15.1->elasticsearch) (2.3.0)\n", - "Requirement already satisfied: certifi in /Users/mattnowzari/repos/python/mn_venv/lib/python3.12/site-packages (from elastic-transport<9,>=8.15.1->elasticsearch) (2024.12.14)\n" - ] - } - ], + "outputs": [], "source": [ "!pip install elasticsearch\n", "\n", @@ -74,19 +63,10 @@ }, { "cell_type": "code", - "execution_count": 511, + "execution_count": null, "id": "08e6e3d2-62d3-4890-a6be-41fe0a931ef6", "metadata": {}, - "outputs": [ - { - "name": "stdin", - "output_type": "stream", - "text": [ - "Elastic Cloud ID: ········\n", - "Elastic Api Key: ········\n" - ] - } - ], + "outputs": [], "source": [ "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", "API_KEY = getpass(\"Elastic Api Key: \")" @@ -102,21 +82,10 @@ }, { "cell_type": "code", - "execution_count": 512, + "execution_count": null, "id": "f3ada2cb-b00f-4b1d-be09-57b2ccf25c7c", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'You Know, for Search'" - ] - }, - "execution_count": 512, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "es_client = Elasticsearch(\n", " cloud_id=ELASTIC_CLOUD_ID,\n", @@ -150,29 +119,10 @@ }, { "cell_type": "code", - "execution_count": 669, + "execution_count": null, "id": "0a698b05-e939-42a5-aa31-51b1b1883e6f", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1. search-search-crawler-fully-loaded-8.18\n", - " Crawler ID is 67b74f16204956a3ce9fd0a4\n", - "\n", - "2. search-daggerfall-unity-website-crawler-8.18\n", - " Crawler ID is 67b74f84204956efce9fd0b7\n", - "\n", - "3. search-migration-crawler\n", - " Crawler ID is 67b7509b2049567f859fd0d4\n", - "\n", - "4. search-basic\n", - " Crawler ID is 67b75aeb20495617d59fd0ea\n", - "\n" - ] - } - ], + "outputs": [], "source": [ " # in-memory data structure that maintains current state of the configs we've pulled\n", "inflight_configuration_data = {}\n", @@ -241,37 +191,10 @@ }, { "cell_type": "code", - "execution_count": 670, + "execution_count": null, "id": "e1c64c3d-c8d7-4236-9ed9-c9b1cb5e7972", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1.) 
Crawler ID 67b74f16204956a3ce9fd0a4\n", - " Domain https://www.speedhunters.com found!\n", - " Seed URls found: ['https://www.speedhunters.com/2025/01/the-mystery-of-the-hks-zero-r/', 'https://www.speedhunters.com/2025/02/daniel-arsham-eroded-porsche-911/', 'https://www.speedhunters.com/2025/02/5-plus-7-equals-v12-a-custom-bmw-super-saloon/']\n", - " Sitemap URLs found: ['https://www.speedhunters.com/post_tag-sitemap2.xml']\n", - "2.) Crawler ID 67b74f84204956efce9fd0b7\n", - " Domain https://www.dfworkshop.net found!\n", - " Seed URls found: ['https://www.dfworkshop.net/']\n", - " Crawl rules found: [{'policy': 'allow', 'type': 'begins', 'pattern': '/word'}, {'policy': 'deny', 'type': 'contains', 'pattern': 'DOS'}]\n", - " Domain https://www.speedhunters.com found!\n", - " Seed URls found: ['https://www.speedhunters.com/']\n", - " Crawl rules found: [{'policy': 'deny', 'type': 'begins', 'pattern': '/BMW'}]\n", - "3.) Crawler ID 67b7509b2049567f859fd0d4\n", - " Domain https://justinjackson.ca found!\n", - " Seed URls found: ['https://justinjackson.ca/']\n", - " Domain https://matt-nowzari.myportfolio.com found!\n", - " Seed URls found: ['https://matt-nowzari.myportfolio.com/']\n", - " Crawl rules found: [{'policy': 'deny', 'type': 'begins', 'pattern': '/The'}]\n", - "4.) Crawler ID 67b75aeb20495617d59fd0ea\n", - " Domain https://www.elastic.co found!\n", - " Seed URls found: ['https://www.elastic.co/']\n" - ] - } - ], + "outputs": [], "source": [ "crawler_ids_to_query = inflight_configuration_data.keys()\n", "\n", @@ -353,7 +276,7 @@ }, { "cell_type": "code", - "execution_count": 671, + "execution_count": null, "id": "61a7df7a-72ad-4330-a30c-da319befd55c", "metadata": {}, "outputs": [], @@ -419,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": 672, + "execution_count": null, "id": "d880e081-f960-41c7-921e-26896f248eab", "metadata": {}, "outputs": [], @@ -465,7 +388,7 @@ }, { "cell_type": "code", - "execution_count": 673, + "execution_count": null, "id": "dd70f102-33ee-4106-8861-0aa0f9a223a1", "metadata": {}, "outputs": [], @@ -500,20 +423,10 @@ }, { "cell_type": "code", - "execution_count": 660, + "execution_count": null, "id": "213880cc-cbf3-40d9-8c7d-6fcf6428c16b", "metadata": {}, - "outputs": [ - { - "name": "stdin", - "output_type": "stream", - "text": [ - "Elasticsearch endpoint URL: https://4911ebad5ed44d149fe8ddad4a4b3751.us-west2.gcp.elastic-cloud.com\n", - "The Elasticsearch endpoint's port number: 443\n", - "Elasticsearch API key: ········\n" - ] - } - ], + "outputs": [], "source": [ "ENDPOINT = input(\"Elasticsearch endpoint URL: \")\n", "PORT = input(\"The Elasticsearch endpoint's port number: \")\n", @@ -549,7 +462,7 @@ }, { "cell_type": "code", - "execution_count": 661, + "execution_count": null, "id": "6adc53db-d781-4b72-a5f3-441364f354b8", "metadata": {}, "outputs": [], @@ -579,128 +492,10 @@ }, { "cell_type": "code", - "execution_count": 674, + "execution_count": null, "id": "525aabb8-0537-4ba6-8109-109490dddafe", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "YAML config => search-search-crawler-fully-loaded-8.18-config.yml\n", - "--------\n", - "output_index: search-search-crawler-fully-loaded-8.18\n", - "schedule: []\n", - "output_sink: elasticsearch\n", - "full_html_extraction_enabled: false\n", - "elasticsearch:\n", - " host: ''\n", - " port: ''\n", - " api_key: ''\n", - "domains:\n", - "- url: https://www.speedhunters.com\n", - " seed_urls:\n", - " - 
https://www.speedhunters.com/2025/01/the-mystery-of-the-hks-zero-r/\n", - " - https://www.speedhunters.com/2025/02/daniel-arsham-eroded-porsche-911/\n", - " - https://www.speedhunters.com/2025/02/5-plus-7-equals-v12-a-custom-bmw-super-saloon/\n", - " sitemap_urls:\n", - " - https://www.speedhunters.com/post_tag-sitemap2.xml\n", - "\n", - "--------------------------------------------------------------------------------\n", - "YAML config => search-daggerfall-unity-website-crawler-8.18-config.yml\n", - "--------\n", - "output_index: search-daggerfall-unity-website-crawler-8.18\n", - "schedule: 0 30 8 * * ?\n", - "output_sink: elasticsearch\n", - "full_html_extraction_enabled: false\n", - "elasticsearch:\n", - " host: ''\n", - " port: ''\n", - " api_key: ''\n", - "domains:\n", - "- url: https://www.dfworkshop.net\n", - " seed_urls:\n", - " - https://www.dfworkshop.net/\n", - " crawl_rules:\n", - " - policy: allow\n", - " type: begins\n", - " pattern: /word\n", - " - policy: deny\n", - " type: contains\n", - " pattern: DOS\n", - " extraction_rulesets:\n", - " - url_filters:\n", - " - type: begins\n", - " pattern: /elderscrolls/*\n", - " rules:\n", - " - action: set\n", - " field_name: elder_field\n", - " selector: /elderscrolls/*\n", - " join_as: string\n", - " value: ping\n", - " source: url\n", - "- url: https://www.speedhunters.com\n", - " seed_urls:\n", - " - https://www.speedhunters.com/\n", - " crawl_rules:\n", - " - policy: deny\n", - " type: begins\n", - " pattern: /BMW\n", - "\n", - "--------------------------------------------------------------------------------\n", - "YAML config => search-migration-crawler-config.yml\n", - "--------\n", - "output_index: search-migration-crawler\n", - "schedule: []\n", - "output_sink: elasticsearch\n", - "full_html_extraction_enabled: false\n", - "elasticsearch:\n", - " host: ''\n", - " port: ''\n", - " api_key: ''\n", - "domains:\n", - "- url: https://justinjackson.ca\n", - " seed_urls:\n", - " - https://justinjackson.ca/\n", - "- url: https://matt-nowzari.myportfolio.com\n", - " seed_urls:\n", - " - https://matt-nowzari.myportfolio.com/\n", - " crawl_rules:\n", - " - policy: deny\n", - " type: begins\n", - " pattern: /The\n", - " extraction_rulesets:\n", - " - url_filters: []\n", - " rules:\n", - " - action: set\n", - " field_name: test_field\n", - " selector: /html/body/a/@title\n", - " join_as: string\n", - " value: some_rando_value\n", - " source: html\n", - "\n", - "--------------------------------------------------------------------------------\n", - "YAML config => search-basic-config.yml\n", - "--------\n", - "output_index: search-basic\n", - "schedule:\n", - "- unit: hour\n", - " frequency: 24\n", - "output_sink: elasticsearch\n", - "full_html_extraction_enabled: false\n", - "elasticsearch:\n", - " host: ''\n", - " port: ''\n", - " api_key: ''\n", - "domains:\n", - "- url: https://www.elastic.co\n", - " seed_urls:\n", - " - https://www.elastic.co/\n", - "\n", - "--------------------------------------------------------------------------------\n" - ] - } - ], + "outputs": [], "source": [ "for crawler_config in inflight_configuration_data.values():\n", " yaml_out = yaml.safe_dump(\n", diff --git a/migration/crawler_migration.ipynb b/migration/crawler_migration.ipynb index b74898d7..b8799165 100644 --- a/migration/crawler_migration.ipynb +++ b/migration/crawler_migration.ipynb @@ -32,21 +32,10 @@ }, { "cell_type": "code", - "execution_count": 510, + "execution_count": null, "id": "da411d2f-9aff-46af-845a-5fe9be19ea3c", "metadata": {}, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: elasticsearch in /Users/mattnowzari/repos/python/mn_venv/lib/python3.12/site-packages (8.17.1)\n", - "Requirement already satisfied: elastic-transport<9,>=8.15.1 in /Users/mattnowzari/repos/python/mn_venv/lib/python3.12/site-packages (from elasticsearch) (8.17.0)\n", - "Requirement already satisfied: urllib3<3,>=1.26.2 in /Users/mattnowzari/repos/python/mn_venv/lib/python3.12/site-packages (from elastic-transport<9,>=8.15.1->elasticsearch) (2.3.0)\n", - "Requirement already satisfied: certifi in /Users/mattnowzari/repos/python/mn_venv/lib/python3.12/site-packages (from elastic-transport<9,>=8.15.1->elasticsearch) (2024.12.14)\n" - ] - } - ], + "outputs": [], "source": [ "!pip install elasticsearch\n", "\n", @@ -74,19 +63,10 @@ }, { "cell_type": "code", - "execution_count": 511, + "execution_count": null, "id": "08e6e3d2-62d3-4890-a6be-41fe0a931ef6", "metadata": {}, - "outputs": [ - { - "name": "stdin", - "output_type": "stream", - "text": [ - "Elastic Cloud ID: ········\n", - "Elastic Api Key: ········\n" - ] - } - ], + "outputs": [], "source": [ "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", "API_KEY = getpass(\"Elastic Api Key: \")" @@ -102,21 +82,10 @@ }, { "cell_type": "code", - "execution_count": 512, + "execution_count": null, "id": "f3ada2cb-b00f-4b1d-be09-57b2ccf25c7c", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'You Know, for Search'" - ] - }, - "execution_count": 512, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "es_client = Elasticsearch(\n", " cloud_id=ELASTIC_CLOUD_ID,\n", @@ -150,29 +119,10 @@ }, { "cell_type": "code", - "execution_count": 669, + "execution_count": null, "id": "0a698b05-e939-42a5-aa31-51b1b1883e6f", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1. search-search-crawler-fully-loaded-8.18\n", - " Crawler ID is 67b74f16204956a3ce9fd0a4\n", - "\n", - "2. search-daggerfall-unity-website-crawler-8.18\n", - " Crawler ID is 67b74f84204956efce9fd0b7\n", - "\n", - "3. search-migration-crawler\n", - " Crawler ID is 67b7509b2049567f859fd0d4\n", - "\n", - "4. search-basic\n", - " Crawler ID is 67b75aeb20495617d59fd0ea\n", - "\n" - ] - } - ], + "outputs": [], "source": [ " # in-memory data structure that maintains current state of the configs we've pulled\n", "inflight_configuration_data = {}\n", @@ -241,37 +191,10 @@ }, { "cell_type": "code", - "execution_count": 670, + "execution_count": null, "id": "e1c64c3d-c8d7-4236-9ed9-c9b1cb5e7972", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1.) Crawler ID 67b74f16204956a3ce9fd0a4\n", - " Domain https://www.speedhunters.com found!\n", - " Seed URls found: ['https://www.speedhunters.com/2025/01/the-mystery-of-the-hks-zero-r/', 'https://www.speedhunters.com/2025/02/daniel-arsham-eroded-porsche-911/', 'https://www.speedhunters.com/2025/02/5-plus-7-equals-v12-a-custom-bmw-super-saloon/']\n", - " Sitemap URLs found: ['https://www.speedhunters.com/post_tag-sitemap2.xml']\n", - "2.) 
Crawler ID 67b74f84204956efce9fd0b7\n", - " Domain https://www.dfworkshop.net found!\n", - " Seed URls found: ['https://www.dfworkshop.net/']\n", - " Crawl rules found: [{'policy': 'allow', 'type': 'begins', 'pattern': '/word'}, {'policy': 'deny', 'type': 'contains', 'pattern': 'DOS'}]\n", - " Domain https://www.speedhunters.com found!\n", - " Seed URls found: ['https://www.speedhunters.com/']\n", - " Crawl rules found: [{'policy': 'deny', 'type': 'begins', 'pattern': '/BMW'}]\n", - "3.) Crawler ID 67b7509b2049567f859fd0d4\n", - " Domain https://justinjackson.ca found!\n", - " Seed URls found: ['https://justinjackson.ca/']\n", - " Domain https://matt-nowzari.myportfolio.com found!\n", - " Seed URls found: ['https://matt-nowzari.myportfolio.com/']\n", - " Crawl rules found: [{'policy': 'deny', 'type': 'begins', 'pattern': '/The'}]\n", - "4.) Crawler ID 67b75aeb20495617d59fd0ea\n", - " Domain https://www.elastic.co found!\n", - " Seed URls found: ['https://www.elastic.co/']\n" - ] - } - ], + "outputs": [], "source": [ "crawler_ids_to_query = inflight_configuration_data.keys()\n", "\n", @@ -353,7 +276,7 @@ }, { "cell_type": "code", - "execution_count": 671, + "execution_count": null, "id": "61a7df7a-72ad-4330-a30c-da319befd55c", "metadata": {}, "outputs": [], @@ -419,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": 672, + "execution_count": null, "id": "d880e081-f960-41c7-921e-26896f248eab", "metadata": {}, "outputs": [], @@ -465,7 +388,7 @@ }, { "cell_type": "code", - "execution_count": 673, + "execution_count": null, "id": "dd70f102-33ee-4106-8861-0aa0f9a223a1", "metadata": {}, "outputs": [], @@ -500,20 +423,10 @@ }, { "cell_type": "code", - "execution_count": 660, + "execution_count": null, "id": "213880cc-cbf3-40d9-8c7d-6fcf6428c16b", "metadata": {}, - "outputs": [ - { - "name": "stdin", - "output_type": "stream", - "text": [ - "Elasticsearch endpoint URL: https://4911ebad5ed44d149fe8ddad4a4b3751.us-west2.gcp.elastic-cloud.com\n", - "The Elasticsearch endpoint's port number: 443\n", - "Elasticsearch API key: ········\n" - ] - } - ], + "outputs": [], "source": [ "ENDPOINT = input(\"Elasticsearch endpoint URL: \")\n", "PORT = input(\"The Elasticsearch endpoint's port number: \")\n", @@ -549,7 +462,7 @@ }, { "cell_type": "code", - "execution_count": 661, + "execution_count": null, "id": "6adc53db-d781-4b72-a5f3-441364f354b8", "metadata": {}, "outputs": [], @@ -579,128 +492,10 @@ }, { "cell_type": "code", - "execution_count": 674, + "execution_count": null, "id": "525aabb8-0537-4ba6-8109-109490dddafe", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "YAML config => search-search-crawler-fully-loaded-8.18-config.yml\n", - "--------\n", - "output_index: search-search-crawler-fully-loaded-8.18\n", - "schedule: []\n", - "output_sink: elasticsearch\n", - "full_html_extraction_enabled: false\n", - "elasticsearch:\n", - " host: ''\n", - " port: ''\n", - " api_key: ''\n", - "domains:\n", - "- url: https://www.speedhunters.com\n", - " seed_urls:\n", - " - https://www.speedhunters.com/2025/01/the-mystery-of-the-hks-zero-r/\n", - " - https://www.speedhunters.com/2025/02/daniel-arsham-eroded-porsche-911/\n", - " - https://www.speedhunters.com/2025/02/5-plus-7-equals-v12-a-custom-bmw-super-saloon/\n", - " sitemap_urls:\n", - " - https://www.speedhunters.com/post_tag-sitemap2.xml\n", - "\n", - "--------------------------------------------------------------------------------\n", - "YAML config => 
search-daggerfall-unity-website-crawler-8.18-config.yml\n", - "--------\n", - "output_index: search-daggerfall-unity-website-crawler-8.18\n", - "schedule: 0 30 8 * * ?\n", - "output_sink: elasticsearch\n", - "full_html_extraction_enabled: false\n", - "elasticsearch:\n", - " host: ''\n", - " port: ''\n", - " api_key: ''\n", - "domains:\n", - "- url: https://www.dfworkshop.net\n", - " seed_urls:\n", - " - https://www.dfworkshop.net/\n", - " crawl_rules:\n", - " - policy: allow\n", - " type: begins\n", - " pattern: /word\n", - " - policy: deny\n", - " type: contains\n", - " pattern: DOS\n", - " extraction_rulesets:\n", - " - url_filters:\n", - " - type: begins\n", - " pattern: /elderscrolls/*\n", - " rules:\n", - " - action: set\n", - " field_name: elder_field\n", - " selector: /elderscrolls/*\n", - " join_as: string\n", - " value: ping\n", - " source: url\n", - "- url: https://www.speedhunters.com\n", - " seed_urls:\n", - " - https://www.speedhunters.com/\n", - " crawl_rules:\n", - " - policy: deny\n", - " type: begins\n", - " pattern: /BMW\n", - "\n", - "--------------------------------------------------------------------------------\n", - "YAML config => search-migration-crawler-config.yml\n", - "--------\n", - "output_index: search-migration-crawler\n", - "schedule: []\n", - "output_sink: elasticsearch\n", - "full_html_extraction_enabled: false\n", - "elasticsearch:\n", - " host: ''\n", - " port: ''\n", - " api_key: ''\n", - "domains:\n", - "- url: https://justinjackson.ca\n", - " seed_urls:\n", - " - https://justinjackson.ca/\n", - "- url: https://matt-nowzari.myportfolio.com\n", - " seed_urls:\n", - " - https://matt-nowzari.myportfolio.com/\n", - " crawl_rules:\n", - " - policy: deny\n", - " type: begins\n", - " pattern: /The\n", - " extraction_rulesets:\n", - " - url_filters: []\n", - " rules:\n", - " - action: set\n", - " field_name: test_field\n", - " selector: /html/body/a/@title\n", - " join_as: string\n", - " value: some_rando_value\n", - " source: html\n", - "\n", - "--------------------------------------------------------------------------------\n", - "YAML config => search-basic-config.yml\n", - "--------\n", - "output_index: search-basic\n", - "schedule:\n", - "- unit: hour\n", - " frequency: 24\n", - "output_sink: elasticsearch\n", - "full_html_extraction_enabled: false\n", - "elasticsearch:\n", - " host: ''\n", - " port: ''\n", - " api_key: ''\n", - "domains:\n", - "- url: https://www.elastic.co\n", - " seed_urls:\n", - " - https://www.elastic.co/\n", - "\n", - "--------------------------------------------------------------------------------\n" - ] - } - ], + "outputs": [], "source": [ "for crawler_config in inflight_configuration_data.values():\n", " yaml_out = yaml.safe_dump(\n", From defca9066e1f1378f9ba59a6de84385e34f1df45 Mon Sep 17 00:00:00 2001 From: mattnowzari Date: Fri, 21 Feb 2025 10:02:00 -0500 Subject: [PATCH 4/6] removed exploration notebook --- migration/crawler_migration_exploration.ipynb | 564 ------------------ 1 file changed, 564 deletions(-) delete mode 100644 migration/crawler_migration_exploration.ipynb diff --git a/migration/crawler_migration_exploration.ipynb b/migration/crawler_migration_exploration.ipynb deleted file mode 100644 index 42a3922b..00000000 --- a/migration/crawler_migration_exploration.ipynb +++ /dev/null @@ -1,564 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "id": "4f198cd5-cc9c-4080-8dd4-425628b05d4d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - 
"This is the start of the notebook\n", - "Hopefully we can learn a thing or two, before doing a thing or two.\n" - ] - } - ], - "source": [ - "print (\"This is the start of the notebook\")\n", - "print (\"Hopefully we can learn a thing or two, before doing a thing or two.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "1914cbf9-a18b-4b1d-9ea1-5bc04e23ceff", - "metadata": {}, - "outputs": [], - "source": [ - "from elasticsearch import Elasticsearch\n", - "import json\n", - "\n", - "endpoint = \"https://5a5b8a5cdd84464dae4c7c7ae8a59562.us-east1.gcp.elastic-cloud.com:443\"\n", - "api_key = \"aTN4MUdwVUJLTFFTSmFFWjBlTFM6dmU0ZXJnTjdUaUs5dXhIUU1fd0xiZw==\"\n", - "\n", - "es_client = Elasticsearch(\n", - " endpoint,\n", - " api_key=api_key,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "711bb339-bbc8-4112-a392-dde01f5e5729", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The index [search-migration_crawler] was found!\n" - ] - } - ], - "source": [ - "index_name = \"search-migration_crawler\"\n", - "if not es_client.indices.exists(index=index_name):\n", - " print (\"Eek! The index does not exist!\")\n", - "else:\n", - " print (f\"The index [{index_name}] was found!\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "a14009b2-2d34-465e-b43d-a274f01fbff0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Let's see if we can get _all_ indices in this ES instance related to crawler?\n", - "\n", - "We got the list as a JSON dictionary! We received 20 indices.\n", - "\n", - " 4986743 docs -> .ds-logs-elastic_crawler-default-2025.02.05-000001\n", - " 0 docs -> .ent-search-actastic-app_search_crawler_content_metadata\n", - " 0 docs -> .ent-search-actastic-app_search_crawler_content_metadata-content_hash-engine_oid-unique-constraint\n", - " 0 docs -> .ent-search-actastic-app_search_crawler_content_url_metadata\n", - " 3 docs -> .ent-search-actastic-crawler2_configurations_v2\n", - " 3 docs -> .ent-search-actastic-crawler2_configurations_v2-index_name-unique-constraint\n", - " 9385 docs -> .ent-search-actastic-crawler2_content_metadata\n", - " 9385 docs -> .ent-search-actastic-crawler2_content_metadata-configuration_oid-content_hash-unique-constraint\n", - " 9532 docs -> .ent-search-actastic-crawler2_content_url_metadata\n", - " 332 docs -> .ent-search-actastic-crawler2_crawl_requests_v2\n", - " 4 docs -> .ent-search-actastic-crawler2_domains\n", - " 4 docs -> .ent-search-actastic-crawler2_domains-configuration_oid-name-unique-constraint\n", - " 2 docs -> .ent-search-actastic-crawler2_extraction_rules\n", - " 0 docs -> .ent-search-actastic-crawler2_process_crawls\n", - " 651 docs -> .ent-search-actastic-crawler2_robots_txts\n", - " 0 docs -> .ent-search-actastic-crawler_crawl_requests_v7\n", - " 0 docs -> .ent-search-actastic-crawler_domains_v6\n", - " 0 docs -> .ent-search-actastic-crawler_domains_v6-engine_oid-name-unique-constraint\n", - " 0 docs -> .ent-search-actastic-crawler_process_crawls\n", - " 0 docs -> .ent-search-actastic-crawler_robots_txts_v3\n", - "\n", - "There are 20 healthy indices, 0 sick indices and 0 unhealthy indices.\n", - "11 indices have docs, and 9 indices do not.\n" - ] - } - ], - "source": [ - "print (\"Let's see if we can get _all_ indices in this ES instance related to crawler?\\n\")\n", - "\n", - "json_response = es_client.cat.indices(\n", - " index=\".*crawler*\", # take note of the . 
before *crawler* - this tells the API to query 'hidden' indices as well\n", - " s=\"index\",\n", - " format=\"json\"\n", - ")\n", - "\n", - "print (f\"We got the list as a JSON dictionary! We received {len(json_response)} indices.\\n\")\n", - "\n", - "health_histogram = {\n", - " \"green\": 0,\n", - " \"yellow\": 0,\n", - " \"red\": 0,\n", - "}\n", - "\n", - "indices_with_docs = {\n", - " \"with_docs\": 0,\n", - " \"without_docs\": 0,\n", - "}\n", - "\n", - "index_names = [] # save the index names to run through all of them at some point in the future\n", - "\n", - "for item in json_response.body: # Note that calling .body on the response will get us a List of dictionaries:\n", - " health_status = item[\"health\"]\n", - " health_histogram[health_status] += 1\n", - "\n", - " if int(item['docs.count']) > 0:\n", - " indices_with_docs['with_docs'] += 1\n", - " index_names.append(item['index'])\n", - " else:\n", - " indices_with_docs['without_docs'] += 1\n", - "\n", - " print (f\" {item['docs.count']} docs -> {item['index']}\")\n", - "\n", - "print (f\"\\nThere are {health_histogram['green']} healthy indices, {health_histogram['yellow']} sick indices \\\n", - "and {health_histogram['red']} unhealthy indices.\")\n", - "\n", - "print (f\"{indices_with_docs['with_docs']} indices have docs, and {indices_with_docs['without_docs']} \\\n", - "indices do not.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "db96133f-bb1b-4f40-b02f-4812a585964d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Let's grab the extraction rules for our crawler.\n", - "The extraction rules are defined in the index .ent-search-actastic-crawler2_extraction_rules\n", - "\n", - "{'hits': [{'_id': '67a393cc17490cc0d24853ef',\n", - " '_index': '.ent-search-actastic-crawler2_extraction_rules',\n", - " '_score': 1.0,\n", - " '_source': {'configuration_oid': '67a3857117490c2adf48274d',\n", - " 'created_at': '2025-02-05T16:37:32Z',\n", - " 'description': 'extraction_rule_generic',\n", - " 'domain_oid': '67a3858717490ccc74482755',\n", - " 'edited_by': '1432693181',\n", - " 'id': '67a393cc17490cc0d24853ef',\n", - " 'rules': [{'content_from': {'value': '\"some_rando_value\"',\n", - " 'value_type': 'fixed'},\n", - " 'field_name': 'test_field',\n", - " 'multiple_objects_handling': 'string',\n", - " 'selector': '/html/body/a/@title',\n", - " 'source_type': 'html'}],\n", - " 'updated_at': '2025-02-05T16:37:32Z',\n", - " 'url_filters': []}},\n", - " {'_id': '67adfbd0cf0332091f8af6e2',\n", - " '_index': '.ent-search-actastic-crawler2_extraction_rules',\n", - " '_score': 1.0,\n", - " '_source': {'configuration_oid': '67adf643cf03320a318aeefa',\n", - " 'created_at': '2025-02-13T14:04:00Z',\n", - " 'description': 'df_ex_rule',\n", - " 'domain_oid': '67adf64bcf0332cb308aef02',\n", - " 'edited_by': '1432693181',\n", - " 'id': '67adfbd0cf0332091f8af6e2',\n", - " 'rules': [{'content_from': {'value': '',\n", - " 'value_type': 'extracted'},\n", - " 'field_name': 'df_xtraction_rule',\n", - " 'multiple_objects_handling': 'string',\n", - " 'selector': '/html/body/a/@title',\n", - " 'source_type': 'html'}],\n", - " 'updated_at': '2025-02-13T14:04:00Z',\n", - " 'url_filters': []}},\n", - " {'_id': '67b5e3a2cf0332ddecbf47e4',\n", - " '_index': '.ent-search-actastic-crawler2_extraction_rules',\n", - " '_score': 1.0,\n", - " '_source': {'configuration_oid': '67adf643cf03320a318aeefa',\n", - " 'created_at': '2025-02-19T13:58:58Z',\n", - " 'description': 'new_url_extraction_rule',\n", - 
" 'domain_oid': '67adf64bcf0332cb308aef02',\n", - " 'edited_by': '1432693181',\n", - " 'id': '67b5e3a2cf0332ddecbf47e4',\n", - " 'rules': [{'content_from': {'value': '',\n", - " 'value_type': 'extracted'},\n", - " 'field_name': 'df_new_url_extractor',\n", - " 'multiple_objects_handling': 'string',\n", - " 'selector': '/df_url/*',\n", - " 'source_type': 'url'}],\n", - " 'updated_at': '2025-02-19T13:58:58Z',\n", - " 'url_filters': [{'filter': 'begins',\n", - " 'pattern': '/df/*'}]}},\n", - " {'_id': '67b5e822cf0332c6c4bf738c',\n", - " '_index': '.ent-search-actastic-crawler2_extraction_rules',\n", - " '_score': 1.0,\n", - " '_source': {'configuration_oid': '67adf643cf03320a318aeefa',\n", - " 'created_at': '2025-02-19T14:18:10Z',\n", - " 'description': 'yet_another_rule',\n", - " 'domain_oid': '67adf64bcf0332cb308aef02',\n", - " 'edited_by': '1432693181',\n", - " 'id': '67b5e822cf0332c6c4bf738c',\n", - " 'rules': [{'content_from': {'value': 'ping',\n", - " 'value_type': 'fixed'},\n", - " 'field_name': 'elder_field',\n", - " 'multiple_objects_handling': 'string',\n", - " 'selector': '/elderscrolls/*',\n", - " 'source_type': 'url'}],\n", - " 'updated_at': '2025-02-19T14:18:10Z',\n", - " 'url_filters': [{'filter': 'regex',\n", - " 'pattern': '/elderscrolls/*'}]}}],\n", - " 'max_score': 1.0,\n", - " 'total': {'relation': 'eq', 'value': 4}}\n", - "\n", - "Index: .ent-search-actastic-crawler2_extraction_rules\n", - "[{'content_from': {'value_type': 'fixed', 'value': '\"some_rando_value\"'}, 'field_name': 'test_field', 'multiple_objects_handling': 'string', 'selector': '/html/body/a/@title', 'source_type': 'html'}]\n", - "Description: extraction_rule_generic\n", - "Configuration OID: 67a3857117490c2adf48274d\n", - "Domain OID: 67a3858717490ccc74482755\n", - "Value type: fixed\n", - "Value: \"some_rando_value\"\n", - "Field name: test_field\n", - "Multiple Objects Handling: string\n", - "Selector: /html/body/a/@title\n", - "Source type: html\n", - "\n", - "Index: .ent-search-actastic-crawler2_extraction_rules\n", - "[{'content_from': {'value_type': 'extracted', 'value': ''}, 'field_name': 'df_xtraction_rule', 'multiple_objects_handling': 'string', 'selector': '/html/body/a/@title', 'source_type': 'html'}]\n", - "Description: df_ex_rule\n", - "Configuration OID: 67adf643cf03320a318aeefa\n", - "Domain OID: 67adf64bcf0332cb308aef02\n", - "Value type: extracted\n", - "Value: \n", - "Field name: df_xtraction_rule\n", - "Multiple Objects Handling: string\n", - "Selector: /html/body/a/@title\n", - "Source type: html\n", - "\n", - "Index: .ent-search-actastic-crawler2_extraction_rules\n", - "[{'content_from': {'value_type': 'extracted', 'value': ''}, 'field_name': 'df_new_url_extractor', 'multiple_objects_handling': 'string', 'selector': '/df_url/*', 'source_type': 'url'}]\n", - "Description: new_url_extraction_rule\n", - "Configuration OID: 67adf643cf03320a318aeefa\n", - "Domain OID: 67adf64bcf0332cb308aef02\n", - "Value type: extracted\n", - "Value: \n", - "Field name: df_new_url_extractor\n", - "Multiple Objects Handling: string\n", - "Selector: /df_url/*\n", - "Source type: url\n", - "\n", - "Index: .ent-search-actastic-crawler2_extraction_rules\n", - "[{'content_from': {'value_type': 'fixed', 'value': 'ping'}, 'field_name': 'elder_field', 'multiple_objects_handling': 'string', 'selector': '/elderscrolls/*', 'source_type': 'url'}]\n", - "Description: yet_another_rule\n", - "Configuration OID: 67adf643cf03320a318aeefa\n", - "Domain OID: 67adf64bcf0332cb308aef02\n", - "Value type: fixed\n", - "Value: 
ping\n", - "Field name: elder_field\n", - "Multiple Objects Handling: string\n", - "Selector: /elderscrolls/*\n", - "Source type: url\n", - "\n" - ] - } - ], - "source": [ - "print (\"Let's grab the extraction rules for our crawler.\")\n", - "print (\"The extraction rules are defined in the index .ent-search-actastic-crawler2_extraction_rules\\n\")\n", - "\n", - "ex_r = es_client.search(\n", - " index=\".ent-search-actastic-crawler2_extraction_rules\",\n", - " # _source=\"rules\"\n", - ")\n", - "import pprint as pp\n", - "\n", - "pp.pprint (dict(ex_r[\"hits\"]))\n", - "print ()\n", - "\n", - "for i in ex_r[\"hits\"][\"hits\"]:\n", - " print (f\"Index: {i['_index']}\")\n", - " rules = i[\"_source\"][\"rules\"]\n", - " print (rules)\n", - " for rule in rules:\n", - " print (f\"Description: {i['_source']['description']}\")\n", - " print (f\"Configuration OID: {i['_source']['configuration_oid']}\") # <-- I wonder if we can use these to\n", - " print (f\"Domain OID: {i['_source']['domain_oid']}\") # <-- match specific crawlers\n", - " \n", - " print (f\"Value type: {rule['content_from']['value_type']}\") # <-- this maps to 'action'\n", - " print (f\"Value: {rule['content_from']['value']}\") # <--\n", - " print (f\"Field name: {rule['field_name']}\") # <--\n", - " print (f\"Multiple Objects Handling: {rule['multiple_objects_handling']}\") # <-- this maps to \"join_as\"\n", - " print (f\"Selector: {rule['selector']}\") # <--\n", - " print (f\"Source type: {rule['source_type']}\\n\") # <--" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "8c0b6c8d-52f5-4325-92ca-8b7976c69abf", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Let's try grabbing the configurations docs in .ent-search-actastic-crawler2_configurations_v2\n", - "{'took': 1, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 3, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': '.ent-search-actastic-crawler2_configurations_v2', '_id': '67a3857117490c2adf48274d', '_score': 1.0, '_source': {'id': '67a3857117490c2adf48274d', 'created_at': '2025-02-05T15:36:17Z', 'updated_at': '2025-02-05T16:34:04Z', 'index_name': 'search-migration_crawler', 'crawl_schedule': [{'unit': 'hour', 'frequency': 1}], 'use_connector_schedule': False}}, {'_index': '.ent-search-actastic-crawler2_configurations_v2', '_id': '67adf643cf03320a318aeefa', '_score': 1.0, '_source': {'id': '67adf643cf03320a318aeefa', 'created_at': '2025-02-13T13:40:19Z', 'updated_at': '2025-02-13T14:29:16Z', 'index_name': 'search-daggerfall-unity-website-crawler', 'crawl_schedule': [{'unit': 'hour', 'frequency': 24}], 'use_connector_schedule': True}}, {'_index': '.ent-search-actastic-crawler2_configurations_v2', '_id': '67abc13fcf0332544683f928', '_score': 1.0, '_source': {'id': '67abc13fcf0332544683f928', 'created_at': '2025-02-11T21:29:35Z', 'updated_at': '2025-02-14T16:58:50Z', 'index_name': 'search-crawler-fully-loaded', 'crawl_schedule': None, 'use_connector_schedule': False}}]}}\n", - "\n", - "Inside this index, we can find the following values:\n", - "\n", - "Index: .ent-search-actastic-crawler2_configurations_v2\n", - "Configuration ID: 67a3857117490c2adf48274d\n", - "Index name: search-migration_crawler\n", - "Crawl schedule: [{'unit': 'hour', 'frequency': 1}]\n", - "Use crawl schedule?: False\n", - "\n", - "Index: .ent-search-actastic-crawler2_configurations_v2\n", - "Configuration ID: 67adf643cf03320a318aeefa\n", - "Index name: 
search-daggerfall-unity-website-crawler\n", - "Crawl schedule: [{'unit': 'hour', 'frequency': 24}]\n", - "Use crawl schedule?: True\n", - "\n", - "Index: .ent-search-actastic-crawler2_configurations_v2\n", - "Configuration ID: 67abc13fcf0332544683f928\n", - "Index name: search-crawler-fully-loaded\n", - "Crawl schedule: None\n", - "Use crawl schedule?: False\n", - "\n" - ] - } - ], - "source": [ - "print (\"Let's try grabbing the configurations docs in .ent-search-actastic-crawler2_configurations_v2\")\n", - "\n", - "config_r = es_client.search(\n", - " index=\".ent-search-actastic-crawler2_configurations_v2\",\n", - ")\n", - "\n", - "print (config_r)\n", - "print ()\n", - "\n", - "print (\"Inside this index, we can find the following values:\\n\")\n", - "for i in config_r[\"hits\"][\"hits\"]:\n", - " source = i[\"_source\"]\n", - " print (f\"Index: {i['_index']}\")\n", - " print (f\"Configuration ID: {source['id']}\") # <--\n", - " print (f\"Index name: {source['index_name']}\") # <--\n", - " print (f\"Crawl schedule: {source['crawl_schedule']}\")\n", - " print (f\"Use crawl schedule?: {source['use_connector_schedule']}\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "85adaae5-38d1-4410-b38c-3dd827d3170c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - ".ent-search-actastic-crawler2_domains\n", - "\n", - "{'took': 1, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 4, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': '.ent-search-actastic-crawler2_domains', '_id': '67abc186cf0332829183f93b', '_score': 1.0, '_source': {'id': '67abc186cf0332829183f93b', 'configuration_oid': '67abc13fcf0332544683f928', 'name': 'https://www.speedhunters.com', 'crawl_rules': [], 'seed_urls': [{'created_at': '2025-02-11T21:31:04Z', 'id': '67abc198cf0332d5ae83f93d', 'url': 'https://www.speedhunters.com/2025/01/the-mystery-of-the-hks-zero-r/'}], 'sitemaps': [{'created_at': '2025-02-14T16:58:47Z', 'id': '67af7647cf0332fdd191f64d', 'url': 'https://www.speedhunters.com/post_tag-sitemap2.xml'}], 'auth': 'akh4My83Myt3Y2NkbnJHUll2RFVlZz09LS1wbjU5TEZmNVdxUDZCK0lSa2dwRlhRPT0=--21413c9faa1309012319bdf4132ef5f4d0add9fb'}}, {'_index': '.ent-search-actastic-crawler2_domains', '_id': '67adf64bcf0332cb308aef02', '_score': 1.0, '_source': {'id': '67adf64bcf0332cb308aef02', 'configuration_oid': '67adf643cf03320a318aeefa', 'name': 'https://www.dfworkshop.net', 'crawl_rules': [{'pattern': 'DOS', 'created_at': '2025-02-13T13:43:27Z', 'rule': 'contains', 'id': '67adf6ffcf03320a318aef0a', 'order': 0, 'policy': 'deny'}], 'seed_urls': [{'created_at': '2025-02-13T13:40:26Z', 'id': '67adf64acf0332cb308aef01', 'url': 'https://www.dfworkshop.net/'}], 'sitemaps': [], 'auth': 'V001b0R1a2xCQmsyYloyQUYvTzJJQT09LS1md284bXJjOGhyRllMeGR3RlVpYU13PT0=--34940590d810db53c8dd783f58244db6c74dad20'}}, {'_index': '.ent-search-actastic-crawler2_domains', '_id': '67a3867317490c4bca482807', '_score': 1.0, '_source': {'id': '67a3867317490c4bca482807', 'configuration_oid': '67a3857117490c2adf48274d', 'name': 'https://justinjackson.ca', 'crawl_rules': [], 'seed_urls': [{'url': 'https://justinjackson.ca/words.html', 'id': '67a3867317490c4bca482806', 'created_at': '2025-02-05T15:40:35Z'}], 'sitemaps': [], 'auth': 'SWpyRVpYdUp0WHlCL1hHOCtXaEhaQT09LS1Ub3ZpV1dpL2FONGpod0ZSeUFYV1RBPT0=--c83fbe57eeffa5429b450de0ffe1fe4aade85e43'}}, {'_index': '.ent-search-actastic-crawler2_domains', '_id': '67a3858717490ccc74482755', 
'_score': 1.0, '_source': {'id': '67a3858717490ccc74482755', 'configuration_oid': '67a3857117490c2adf48274d', 'name': 'https://matt-nowzari.myportfolio.com', 'crawl_rules': [{'pattern': '/The', 'created_at': '2025-02-05T16:33:59Z', 'rule': 'begins', 'id': '67a392f717490ccb5b4853ec', 'order': 0, 'policy': 'deny'}], 'seed_urls': [{'url': 'https://matt-nowzari.myportfolio.com/', 'id': '67a3858717490ccc74482754', 'created_at': '2025-02-05T15:36:39Z'}], 'sitemaps': [], 'auth': 'U1J4d0xBTW1EN2Ryb05iNXoxRmF1UT09LS1kMDhmMEZWMys0RlJDd2hzWmZMNWx3PT0=--2f4cae86febac77fb8e1fb0133d6520029716a37'}}]}}\n", - "\n", - "Name: https://www.speedhunters.com\n", - "Configuration OID: 67abc13fcf0332544683f928\n", - "Domain ID: 67abc186cf0332829183f93b\n", - "Sitemaps : [{'created_at': '2025-02-14T16:58:47Z', 'id': '67af7647cf0332fdd191f64d', 'url': 'https://www.speedhunters.com/post_tag-sitemap2.xml'}]\n", - "Crawl rules: []\n", - "Seed URLs: [{'created_at': '2025-02-11T21:31:04Z', 'id': '67abc198cf0332d5ae83f93d', 'url': 'https://www.speedhunters.com/2025/01/the-mystery-of-the-hks-zero-r/'}]\n", - "Auth (decode and split on the ':' delimiter): akh4My83Myt3Y2NkbnJHUll2RFVlZz09LS1wbjU5TEZmNVdxUDZCK0lSa2dwRlhRPT0=--21413c9faa1309012319bdf4132ef5f4d0add9fb\n", - "\n", - "Name: https://www.dfworkshop.net\n", - "Configuration OID: 67adf643cf03320a318aeefa\n", - "Domain ID: 67adf64bcf0332cb308aef02\n", - "Sitemaps : []\n", - "Crawl rules: [{'pattern': 'DOS', 'created_at': '2025-02-13T13:43:27Z', 'rule': 'contains', 'id': '67adf6ffcf03320a318aef0a', 'order': 0, 'policy': 'deny'}]\n", - "Seed URLs: [{'created_at': '2025-02-13T13:40:26Z', 'id': '67adf64acf0332cb308aef01', 'url': 'https://www.dfworkshop.net/'}]\n", - "Auth (decode and split on the ':' delimiter): V001b0R1a2xCQmsyYloyQUYvTzJJQT09LS1md284bXJjOGhyRllMeGR3RlVpYU13PT0=--34940590d810db53c8dd783f58244db6c74dad20\n", - "\n", - "Name: https://justinjackson.ca\n", - "Configuration OID: 67a3857117490c2adf48274d\n", - "Domain ID: 67a3867317490c4bca482807\n", - "Sitemaps : []\n", - "Crawl rules: []\n", - "Seed URLs: [{'url': 'https://justinjackson.ca/words.html', 'id': '67a3867317490c4bca482806', 'created_at': '2025-02-05T15:40:35Z'}]\n", - "Auth (decode and split on the ':' delimiter): SWpyRVpYdUp0WHlCL1hHOCtXaEhaQT09LS1Ub3ZpV1dpL2FONGpod0ZSeUFYV1RBPT0=--c83fbe57eeffa5429b450de0ffe1fe4aade85e43\n", - "\n", - "Name: https://matt-nowzari.myportfolio.com\n", - "Configuration OID: 67a3857117490c2adf48274d\n", - "Domain ID: 67a3858717490ccc74482755\n", - "Sitemaps : []\n", - "Crawl rules: [{'pattern': '/The', 'created_at': '2025-02-05T16:33:59Z', 'rule': 'begins', 'id': '67a392f717490ccb5b4853ec', 'order': 0, 'policy': 'deny'}]\n", - "Seed URLs: [{'url': 'https://matt-nowzari.myportfolio.com/', 'id': '67a3858717490ccc74482754', 'created_at': '2025-02-05T15:36:39Z'}]\n", - "Auth (decode and split on the ':' delimiter): U1J4d0xBTW1EN2Ryb05iNXoxRmF1UT09LS1kMDhmMEZWMys0RlJDd2hzWmZMNWx3PT0=--2f4cae86febac77fb8e1fb0133d6520029716a37\n", - "\n" - ] - } - ], - "source": [ - "print (\".ent-search-actastic-crawler2_domains\\n\")\n", - "\n", - "domains_r = es_client.search(\n", - " index=\".ent-search-actastic-crawler2_domains\",\n", - " _source=[\"name\",\n", - " \"configuration_oid\",\n", - " \"id\",\n", - " \"sitemaps\",\n", - " \"crawl_rules\",\n", - " \"seed_urls\",\n", - " \"auth\"]\n", - ")\n", - "\n", - "print (domains_r)\n", - "print ()\n", - "\n", - "for i in domains_r[\"hits\"][\"hits\"]:\n", - " source = i[\"_source\"]\n", - " print (f\"Name: {source['name']}\") # 
<--\n", - " \n", - " print (f\"Configuration OID: {source['configuration_oid']}\") # <--\n", - " print (f\"Domain ID: {source['id']}\") # <--\n", - " \n", - " print (f\"Sitemaps : {source['sitemaps']}\") # <--\n", - " print (f\"Crawl rules: {source['crawl_rules']}\") # <--\n", - " print (f\"Seed URLs: {source['seed_urls']}\") # <--\n", - " \n", - " print (f\"Auth (decode and split on the ':' delimiter): {source['auth']}\\n\") # <--" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "eef6570b-26d0-4d98-a279-cf55fd9d31a5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['67adf64bcf0332cb308aef02', '67abc186cf0332829183f93b', '67a3867317490c4bca482807', '67a3858717490ccc74482755']\n", - "['67adf643cf03320a318aeefa', '67abc13fcf0332544683f928', '67a3857117490c2adf48274d']\n" - ] - } - ], - "source": [ - "all_domain_ids = []\n", - "all_config_ids = []\n", - "for i in domains_r[\"hits\"][\"hits\"]:\n", - " source = i[\"_source\"]\n", - " \n", - " all_domain_ids.append(source['id'])\n", - "\n", - " if source['configuration_oid'] not in all_config_ids:\n", - " all_config_ids.append(source['configuration_oid'])\n", - "\n", - "print (all_domain_ids)\n", - "print (all_config_ids)" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "fbddf119-e73c-4e97-9759-3f2f34b8000f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/Users/mattnowzari/repos/search_and_transform/crawler/migration/my-output-conf.yml\n" - ] - } - ], - "source": [ - "import yaml\n", - "import os\n", - "import pprint as pp\n", - "\n", - "##### You may change the value of file_name #####\n", - "file_name = \"my-output-conf.yml\"\n", - "#################################################\n", - "\n", - "data = {\n", - " \"url\": \"\",\n", - " \"seed_urls\": [\"seed1\", \"seed2\", \"seed3\"],\n", - " \"sitemap_urls\": [\"sitemap1\", \"sitemap2\"],\n", - " 'extraction_rulesets':[{\n", - " \"url_filters\": [{\n", - " \"type\": \"begins\",\n", - " \"pattern\": \"/cool/pattern/*\"\n", - " }],\n", - " \"rules\": [{\n", - " \"action\": \"extract\",\n", - " \"field_name\": \"author\",\n", - " \"selector\": \".author\",\n", - " \"join_as\": \"array\",\n", - " \"value\": \"yes\",\n", - " \"source\": \"html\"\n", - " }]\n", - " }]\n", - "}\n", - "# pp.pprint(data)\n", - "\n", - "base_dir = os.getcwd()\n", - "\n", - "output_path = os.path.join(base_dir, file_name)\n", - "print (output_path)\n", - "\n", - "if os.path.exists(base_dir):\n", - " with open(output_path, 'w') as file:\n", - " yaml.dump(data, file)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d525c194-643b-4f97-a59b-f0fa17ba3bfd", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.8" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 3ebb15fd10922de0be56ce2dc1a1758b467f6dd1 Mon Sep 17 00:00:00 2001 From: mattnowzari Date: Fri, 21 Feb 2025 10:54:53 -0500 Subject: [PATCH 5/6] removed checkpoint files --- .../crawler_migration-checkpoint.ipynb | 541 ------------------ ...ler_migration_exploration-checkpoint.ipynb | 466 --------------- 2 files changed, 1007 
 deletions(-)
 delete mode 100644 migration/.ipynb_checkpoints/crawler_migration-checkpoint.ipynb
 delete mode 100644 migration/.ipynb_checkpoints/crawler_migration_exploration-checkpoint.ipynb

diff --git a/migration/.ipynb_checkpoints/crawler_migration-checkpoint.ipynb b/migration/.ipynb_checkpoints/crawler_migration-checkpoint.ipynb
deleted file mode 100644
index b8799165..00000000
--- a/migration/.ipynb_checkpoints/crawler_migration-checkpoint.ipynb
+++ /dev/null
@@ -1,541 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "89b4646f-6a71-44e0-97b9-846319bf0162",
-   "metadata": {},
-   "source": [
-    "## Hello, future Elastic Open Crawler user!\n",
-    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)]()\n",
-    "\n",
-    "This notebook is designed to help you migrate your Elastic Crawler configurations to Open Crawler-friendly YAML!\n",
-    "\n",
-    "We recommend running each cell individually in a sequential fashion, as each cell is dependent on previous cells having been run."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f8b41584-1cce-440e-b3af-e8ae0cb1312c",
-   "metadata": {},
-   "source": [
-    "_If you are running this notebook inside Google Colab, or have not installed elasticsearch in your local environment yet, please run the following cell to make sure the Python `elasticsearch` client is installed._"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8bc65371-58ea-4be9-a319-2f7ed9713145",
-   "metadata": {},
-   "source": [
-    "### Setup\n",
-    "First, let's start by making sure `elasticsearch` and other required dependencies are installed and imported by running the following cell:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "da411d2f-9aff-46af-845a-5fe9be19ea3c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!pip install elasticsearch\n",
-    "\n",
-    "from getpass import getpass\n",
-    "from elasticsearch import Elasticsearch\n",
-    "\n",
-    "import os\n",
-    "import json\n",
-    "import yaml\n",
-    "import pprint\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f4131f88-9895-4c0e-8b0a-6ec7b3b45653",
-   "metadata": {},
-   "source": [
-    "We are going to need a few things from your Elasticsearch deployment before we can migrate your configurations:\n",
-    "- Your **Elasticsearch Cloud ID**\n",
-    "- An **API key**\n",
-    "\n",
-    "To find the Cloud ID for your deployment, go to https://cloud.elastic.co/deployments and select your deployment.\n",
-    "You can create a new API key from the Stack Management -> API keys menu in Kibana. Be sure to copy or write down your key in a safe place; once it is created, it will be displayed only upon creation."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "08e6e3d2-62d3-4890-a6be-41fe0a931ef6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n",
-    "API_KEY = getpass(\"Elastic Api Key: \")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4993104f-ebb6-4715-b758-1fa262a224f3",
-   "metadata": {},
-   "source": [
-    "Great! Now let's try connecting to your Elasticsearch instance."
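For a more defensive variant of the connection check in the next cell, a sketch along these lines could separate authentication failures from other errors. This is a minimal sketch assuming the `elasticsearch` 8.x Python client; `verify_connection` is a hypothetical helper, not part of the notebook:

```python
# Minimal sketch (assumes elasticsearch-py 8.x exception classes).
from elasticsearch import ApiError, AuthenticationException, Elasticsearch

def verify_connection(client: Elasticsearch) -> bool:
    try:
        info = client.info()  # round-trip to the cluster
        print(f"Connected to '{info['cluster_name']}' (v{info['version']['number']})")
        return True
    except AuthenticationException:
        print("Authentication failed - re-check your Cloud ID and API key.")
        return False
    except ApiError as err:
        print(f"The cluster responded with an error: {err}")
        return False
```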
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f3ada2cb-b00f-4b1d-be09-57b2ccf25c7c", - "metadata": {}, - "outputs": [], - "source": [ - "es_client = Elasticsearch(\n", - " cloud_id=ELASTIC_CLOUD_ID,\n", - " api_key=API_KEY,\n", - ")\n", - "\n", - "# ping ES to make sure we have positive connection\n", - "es_client.info()['tagline']" - ] - }, - { - "cell_type": "markdown", - "id": "85f99942-58ae-437d-a72b-70b8d1f4432c", - "metadata": {}, - "source": [ - "Hopefully you received our tagline 'You Know, for Search'. If so, we are connected and ready to go!\n", - "\n", - "If not, please double-check your Cloud ID and API key that you provided above. " - ] - }, - { - "cell_type": "markdown", - "id": "a55236e7-19dc-4f4c-92b9-d10848dd6af9", - "metadata": {}, - "source": [ - "### Step 1: Acquire Basic Configurations\n", - "\n", - "The first order of business is to establish what Crawlers you have and their basic configuration details.\n", - "This migration notebook will attempt to pull configurations for every distinct Crawler you have in your Elasticsearch instance." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0a698b05-e939-42a5-aa31-51b1b1883e6f", - "metadata": {}, - "outputs": [], - "source": [ - " # in-memory data structure that maintains current state of the configs we've pulled\n", - "inflight_configuration_data = {}\n", - "\n", - "crawler_configurations = es_client.search(\n", - " index=\".ent-search-actastic-crawler2_configurations_v2\",\n", - ")\n", - "\n", - "crawler_counter = 1\n", - "for configuration in crawler_configurations[\"hits\"][\"hits\"]:\n", - " source = configuration[\"_source\"]\n", - "\n", - " # extract values\n", - " crawler_oid = source[\"id\"]\n", - " output_index = source[\"index_name\"]\n", - "\n", - " print (f\"{crawler_counter}. {output_index}\")\n", - " print (f\" Crawler ID is {crawler_oid}\\n\")\n", - " crawler_counter += 1\n", - "\n", - " crawl_schedule = [] # either no schedule or a specific schedule - determined in Step 4\n", - " if source[\"use_connector_schedule\"] == False and source[\"crawl_schedule\"]: # an interval schedule is being used\n", - " crawl_schedule = source[\"crawl_schedule\"] # this will be transformed in Step 4\n", - "\n", - " # populate a temporary hashmap\n", - " temp_conf_map = {\n", - " \"output_index\": output_index,\n", - " \"schedule\": crawl_schedule\n", - " }\n", - " # pre-populate some necessary fields in preparation for upcoming steps\n", - " temp_conf_map[\"domains_temp\"] = {}\n", - " temp_conf_map[\"output_sink\"] = \"elasticsearch\"\n", - " temp_conf_map[\"full_html_extraction_enabled\"] = False\n", - " temp_conf_map[\"elasticsearch\"] = {\n", - " \"host\": \"\",\n", - " \"port\": \"\",\n", - " \"api_key\": \"\",\n", - " # \"username\": \"\",\n", - " # \"password\": \"\",\n", - " }\n", - " # populate the in-memory data structure\n", - " inflight_configuration_data[crawler_oid] = temp_conf_map\n", - "\n", - "# pprint.pprint(inflight_configuration_data) # REMOVE BEFORE FLIGHT" - ] - }, - { - "cell_type": "markdown", - "id": "34f5e024-688c-4ffb-a16f-35f5171ba7a8", - "metadata": {}, - "source": [ - "**Before continuing, please verify in the output above that the correct number of Crawlers was found!**\n", - "\n", - "Now that we have some basic data about your Crawlers, let's use this information to get more configuration values!" 
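Concretely, after Step 1 each entry of `inflight_configuration_data` should look roughly like the sketch below. The crawler OID and index name here are made-up placeholders, and the empty fields are filled in by the later steps:

```python
# Illustrative shape only - the OID "0f0f0f0f0f0f0f0f0f0f0f0f" and the index
# name are made-up placeholders, not values from a real deployment.
inflight_configuration_data = {
    "0f0f0f0f0f0f0f0f0f0f0f0f": {
        "output_index": "search-my-crawler",
        "schedule": [],                    # resolved in Step 4
        "domains_temp": {},                # populated in Steps 2 and 3
        "output_sink": "elasticsearch",
        "full_html_extraction_enabled": False,
        "elasticsearch": {
            "host": "",                    # collected just before writing YAML
            "port": "",
            "api_key": "",
        },
    },
}
```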
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "2b9e2da7-853c-40bd-9ee1-02c4d92b3b43",
-   "metadata": {},
-   "source": [
-    "### Step 2: URLs, Sitemaps, and Crawl Rules\n",
-    "\n",
-    "In this cell, we will need to query Elasticsearch for information about each Crawler's domain URLs, seed URLs, sitemaps, and crawling rules."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e1c64c3d-c8d7-4236-9ed9-c9b1cb5e7972",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "crawler_ids_to_query = inflight_configuration_data.keys()\n",
-    "\n",
-    "crawler_counter = 1\n",
-    "for crawler_oid in crawler_ids_to_query:\n",
-    "    # query ES to get the crawler's domain configurations\n",
-    "    crawler_domains = es_client.search(\n",
-    "        index=\".ent-search-actastic-crawler2_domains\",\n",
-    "        query={\"match\": {\"configuration_oid\": crawler_oid}},\n",
-    "        _source=[\"name\",\n",
-    "                 \"configuration_oid\",\n",
-    "                 \"id\",\n",
-    "                 \"sitemaps\",\n",
-    "                 \"crawl_rules\",\n",
-    "                 \"seed_urls\",\n",
-    "                 \"auth\"]\n",
-    "    )\n",
-    "    print (f\"{crawler_counter}.) Crawler ID {crawler_oid}\")\n",
-    "    crawler_counter += 1\n",
-    "\n",
-    "    # for each domain the Crawler has, grab its config values\n",
-    "    # and update the in-memory data structure\n",
-    "    for domain_info in crawler_domains[\"hits\"][\"hits\"]:\n",
-    "        source = domain_info[\"_source\"]\n",
-    "\n",
-    "        # extract values\n",
-    "        domain_oid = str(source[\"id\"])\n",
-    "        domain_url = source[\"name\"]\n",
-    "        seed_urls = source[\"seed_urls\"]\n",
-    "        sitemap_urls = source[\"sitemaps\"]\n",
-    "        crawl_rules = source[\"crawl_rules\"]\n",
-    "\n",
-    "        print (f\"  Domain {domain_url} found!\")\n",
-    "\n",
-    "        # transform seed, sitemap, and crawl rules into arrays\n",
-    "        seed_urls_list = []\n",
-    "        for seed_obj in seed_urls:\n",
-    "            seed_urls_list.append(seed_obj[\"url\"])\n",
-    "\n",
-    "        sitemap_urls_list = []\n",
-    "        for sitemap_obj in sitemap_urls:\n",
-    "            sitemap_urls_list.append(sitemap_obj[\"url\"])\n",
-    "\n",
-    "        crawl_rules_list = []\n",
-    "        for crawl_rules_obj in crawl_rules:\n",
-    "            crawl_rules_list.append({\n",
-    "                \"policy\": crawl_rules_obj[\"policy\"],\n",
-    "                \"type\": crawl_rules_obj[\"rule\"],\n",
-    "                \"pattern\": crawl_rules_obj[\"pattern\"]\n",
-    "            })\n",
-    "\n",
-    "        # populate a temporary hashmap\n",
-    "        temp_domain_conf = {\"url\": domain_url}\n",
-    "        if seed_urls_list:\n",
-    "            temp_domain_conf[\"seed_urls\"] = seed_urls_list\n",
-    "            print (f\"  Seed URLs found: {seed_urls_list}\")\n",
-    "        if sitemap_urls_list:\n",
-    "            temp_domain_conf[\"sitemap_urls\"] = sitemap_urls_list\n",
-    "            print (f\"  Sitemap URLs found: {sitemap_urls_list}\")\n",
-    "        if crawl_rules_list:\n",
-    "            temp_domain_conf[\"crawl_rules\"] = crawl_rules_list\n",
-    "            print (f\"  Crawl rules found: {crawl_rules_list}\")\n",
-    "\n",
-    "        # populate the in-memory data structure\n",
-    "        inflight_configuration_data[crawler_oid][\"domains_temp\"][domain_oid] = temp_domain_conf\n",
-    "\n",
-    "# pprint.pprint(inflight_configuration_data) # REMOVE BEFORE FLIGHT"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "575c00ac-7c84-465e-83d7-aa51f8e5310d",
-   "metadata": {},
-   "source": [
-    "### Step 3: Extracting the Extraction Rules\n",
-    "\n",
-    "In the following cell, we will be acquiring any extraction rules you may have set in your Elastic Crawlers."
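The translation in the next cell is mostly a field-by-field rename. Under that reading, a single Elastic Crawler rule maps to an Open Crawler rule roughly as in this sketch (`translate_rule` is a hypothetical name for illustration; the field pairs are taken from the cell's own code, and the sample input is the `yet_another_rule` document that appears in the exploration notebook earlier in this series):

```python
# Sketch of the per-rule translation performed below (hypothetical helper).
def translate_rule(rule: dict) -> dict:
    action_translation_map = {"fixed": "set", "extracted": "extract"}
    return {
        "action": action_translation_map[rule["content_from"]["value_type"]],
        "field_name": rule["field_name"],
        "selector": rule["selector"],
        "join_as": rule["multiple_objects_handling"],
        "value": rule["content_from"]["value"],
        "source": rule["source_type"],
    }

translate_rule({
    "content_from": {"value_type": "fixed", "value": "ping"},
    "field_name": "elder_field",
    "multiple_objects_handling": "string",
    "selector": "/elderscrolls/*",
    "source_type": "url",
})
# -> {'action': 'set', 'field_name': 'elder_field', 'selector': '/elderscrolls/*',
#     'join_as': 'string', 'value': 'ping', 'source': 'url'}
```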
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "61a7df7a-72ad-4330-a30c-da319befd55c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "extraction_rules = es_client.search(\n",
-    "    index=\".ent-search-actastic-crawler2_extraction_rules\",\n",
-    "    _source=[\"configuration_oid\", \"domain_oid\", \"rules\", \"url_filters\"]\n",
-    ")\n",
-    "\n",
-    "for exr_rule in extraction_rules[\"hits\"][\"hits\"]:\n",
-    "    source = exr_rule[\"_source\"]\n",
-    "\n",
-    "    config_oid = source[\"configuration_oid\"]\n",
-    "    domain_oid = source[\"domain_oid\"]\n",
-    "\n",
-    "    all_rules = source[\"rules\"]\n",
-    "    all_url_filters = source[\"url_filters\"]\n",
-    "\n",
-    "    # extract url filters\n",
-    "    url_filters = []\n",
-    "    if all_url_filters:\n",
-    "        url_filters = [{\n",
-    "            \"type\": all_url_filters[0][\"filter\"],\n",
-    "            \"pattern\": all_url_filters[0][\"pattern\"],\n",
-    "        }]\n",
-    "\n",
-    "    # extract rulesets\n",
-    "    action_translation_map = {\n",
-    "        \"fixed\": \"set\",\n",
-    "        \"extracted\": \"extract\",\n",
-    "    }\n",
-    "\n",
-    "    ruleset = []\n",
-    "    if all_rules:\n",
-    "        ruleset = [{\n",
-    "            \"action\": action_translation_map[all_rules[0][\"content_from\"][\"value_type\"]],\n",
-    "            \"field_name\": all_rules[0][\"field_name\"],\n",
-    "            \"selector\": all_rules[0][\"selector\"],\n",
-    "            \"join_as\": all_rules[0][\"multiple_objects_handling\"],\n",
-    "            \"value\": all_rules[0][\"content_from\"][\"value\"],\n",
-    "            \"source\": all_rules[0][\"source_type\"],\n",
-    "        }]\n",
-    "\n",
-    "    # populate the in-memory data structure\n",
-    "    temp_extraction_rulesets = [{\n",
-    "        \"url_filters\": url_filters,\n",
-    "        \"rules\": ruleset,\n",
-    "    }]\n",
-    "    inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid][\"extraction_rulesets\"] = temp_extraction_rulesets\n",
-    "\n",
-    "# pprint.pprint(inflight_configuration_data) # REMOVE BEFORE FLIGHT"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "538fb054-1399-4b88-bd1e-fef116491421",
-   "metadata": {},
-   "source": [
-    "### Step 4: Schedules\n",
-    "\n",
-    "In the upcoming cell, we will be gathering any schedules your Crawlers have set."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d880e081-f960-41c7-921e-26896f248eab",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def generate_cron_expression(interval_values: dict) -> str:\n",
-    "    return interval_values  # TODO: this ** might not be needed? 
**\n", - "\n", - "# ---------------------------\n", - "\n", - "for crawler_oid, crawler_config in inflight_configuration_data.items():\n", - " output_index = crawler_config[\"output_index\"]\n", - " \n", - " existing_schedule_value = crawler_config[\"schedule\"]\n", - "\n", - " if not existing_schedule_value:\n", - " # query ES to get this Crawler's specific time schedule\n", - " schedules_result = es_client.search(\n", - " index=\".elastic-connectors-v1\",\n", - " query={\"match\": {\"index_name\": output_index}},\n", - " _source=[\"index_name\", \"scheduling\"]\n", - " )\n", - " # update schedule field with cron expression if specific time scheduling is enabled\n", - " if schedules_result[\"hits\"][\"hits\"][0][\"_source\"][\"scheduling\"][\"full\"][\"enabled\"]:\n", - " specific_time_schedule = schedules_result[\"hits\"][\"hits\"][0][\"_source\"][\"scheduling\"][\"full\"][\"interval\"]\n", - " crawler_config[\"schedule\"] = specific_time_schedule\n", - " elif isinstance(existing_schedule_value[0], dict):\n", - " crawler_config[\"schedule\"] = generate_cron_expression(existing_schedule_value)\n", - " \n", - "# pprint.pprint(inflight_configuration_data) # REMOVE BEFORE FLIGHT " - ] - }, - { - "cell_type": "markdown", - "id": "b1586df2-283d-435f-9b08-ba9fad3a7e0a", - "metadata": {}, - "source": [ - "### Step 5: Creating the Open Crawler YAML configuration files\n", - "\n", - "In this final step, we will be creating the actual YAML files you need to get up and running with Open Crawler!\n", - "\n", - "The upcoming cell performs some final transformations to the in-memory data structure that is keeping track of your configurations." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dd70f102-33ee-4106-8861-0aa0f9a223a1", - "metadata": {}, - "outputs": [], - "source": [ - "# Final transform of the in-memory data structure to a form we can dump to YAML\n", - "# for each crawler, collect all of its domain configurations into a list\n", - "for crawler_config in inflight_configuration_data.values():\n", - " all_crawler_domains = []\n", - " \n", - " for domain_config in crawler_config[\"domains_temp\"].values():\n", - " all_crawler_domains.append(domain_config)\n", - " # create a new key called \"domains\" that points to a list of domain configs only - no domain_oid values as keys\n", - " crawler_config[\"domains\"] = all_crawler_domains\n", - " # delete the temporary domain key\n", - " del crawler_config[\"domains_temp\"]\n", - "\n", - "# pprint.pprint(inflight_configuration_data) # REMOVE BEFORE FLIGHT " - ] - }, - { - "cell_type": "markdown", - "id": "e611a486-e12f-4951-ab95-ca54241a7a06", - "metadata": {}, - "source": [ - "#### **Wait! 
Before we continue onto creating our YAML files, we're going to need your input on a few things.**\n", - "\n", - "In the following cell, please enter the following details about the _Elasticsearch instance you will be using with Open Crawler_:\n", - "- The Elasticsearch endpoint URL\n", - "- The port number of your Elasticsearch endpoint\n", - "- An API key" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "213880cc-cbf3-40d9-8c7d-6fcf6428c16b", - "metadata": {}, - "outputs": [], - "source": [ - "ENDPOINT = input(\"Elasticsearch endpoint URL: \")\n", - "PORT = input(\"The Elasticsearch endpoint's port number: \")\n", - "API_KEY = getpass(\"Elasticsearch API key: \")\n", - "\n", - "# set the above values in each Crawler's configuration\n", - "for crawler_config in inflight_configuration_data.values():\n", - " crawler_config[\"elasticsearch\"][\"host\"] = ENDPOINT\n", - " crawler_config[\"elasticsearch\"][\"port\"] = int(PORT)\n", - " crawler_config[\"elasticsearch\"][\"api_key\"] = API_KEY" - ] - }, - { - "cell_type": "markdown", - "id": "67dfc7c6-429e-42f0-ab08-2c84d72945cb", - "metadata": {}, - "source": [ - "#### **This is the final step! You have two options here:**\n", - "\n", - "- The \"Write to YAML\" cell will create _n_ number of YAML files, one for each Crawler you have.\n", - "- The \"Print to output\" cell will print each Crawler's configuration YAML in the Notebook, so you can copy-paste them into your Open Crawler YAML files manually.\n", - "\n", - "Feel free to run both! You can run Option 2 first to see the output before running Option 1 to save the configs into YAML files." - ] - }, - { - "cell_type": "markdown", - "id": "7ca5ad33-364c-4d13-88fc-db19052363d5", - "metadata": {}, - "source": [ - "#### Option 1: Write to YAML file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6adc53db-d781-4b72-a5f3-441364f354b8", - "metadata": {}, - "outputs": [], - "source": [ - "# Dump each Crawler's configuration into its own YAML file\n", - "for crawler_config in inflight_configuration_data.values():\n", - " base_dir = os.getcwd()\n", - " file_name = f\"{crawler_config['output_index']}-config.yml\" # autogen a custom filename\n", - " output_path = os.path.join(base_dir, file_name)\n", - "\n", - " if os.path.exists(base_dir):\n", - " with open(output_path, 'w') as file:\n", - " yaml.safe_dump(\n", - " crawler_config,\n", - " file,\n", - " sort_keys=False\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "35c56a2b-4acd-47f5-90e3-9dd39fa4383f", - "metadata": {}, - "source": [ - "#### Option 2: Print to output" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "525aabb8-0537-4ba6-8109-109490dddafe", - "metadata": {}, - "outputs": [], - "source": [ - "for crawler_config in inflight_configuration_data.values():\n", - " yaml_out = yaml.safe_dump(\n", - " crawler_config,\n", - " sort_keys=False\n", - " )\n", - " \n", - " print (f\"YAML config => {crawler_config['output_index']}-config.yml\\n--------\")\n", - " print (yaml_out)\n", - " print (\"--------------------------------------------------------------------------------\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "55888204-f823-48cd-bca4-a7663e0fe56a", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": 
".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.8" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/migration/.ipynb_checkpoints/crawler_migration_exploration-checkpoint.ipynb b/migration/.ipynb_checkpoints/crawler_migration_exploration-checkpoint.ipynb deleted file mode 100644 index a62478bb..00000000 --- a/migration/.ipynb_checkpoints/crawler_migration_exploration-checkpoint.ipynb +++ /dev/null @@ -1,466 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "id": "4f198cd5-cc9c-4080-8dd4-425628b05d4d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "This is the start of the notebook\n", - "Hopefully we can learn a thing or two, before doing a thing or two.\n" - ] - } - ], - "source": [ - "print (\"This is the start of the notebook\")\n", - "print (\"Hopefully we can learn a thing or two, before doing a thing or two.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "1914cbf9-a18b-4b1d-9ea1-5bc04e23ceff", - "metadata": {}, - "outputs": [], - "source": [ - "from elasticsearch import Elasticsearch\n", - "import json\n", - "\n", - "endpoint = \"https://5a5b8a5cdd84464dae4c7c7ae8a59562.us-east1.gcp.elastic-cloud.com:443\"\n", - "api_key = \"aTN4MUdwVUJLTFFTSmFFWjBlTFM6dmU0ZXJnTjdUaUs5dXhIUU1fd0xiZw==\"\n", - "\n", - "es_client = Elasticsearch(\n", - " endpoint,\n", - " api_key=api_key,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "711bb339-bbc8-4112-a392-dde01f5e5729", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The index [search-migration_crawler] was found!\n" - ] - } - ], - "source": [ - "index_name = \"search-migration_crawler\"\n", - "if not es_client.indices.exists(index=index_name):\n", - " print (\"Eek! The index does not exist!\")\n", - "else:\n", - " print (f\"The index [{index_name}] was found!\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "a14009b2-2d34-465e-b43d-a274f01fbff0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Let's see if we can get _all_ indices in this ES instance related to crawler?\n", - "\n", - "We got the list as a JSON dictionary! 
We received 20 indices.\n", - "\n", - " 4986743 docs -> .ds-logs-elastic_crawler-default-2025.02.05-000001\n", - " 0 docs -> .ent-search-actastic-app_search_crawler_content_metadata\n", - " 0 docs -> .ent-search-actastic-app_search_crawler_content_metadata-content_hash-engine_oid-unique-constraint\n", - " 0 docs -> .ent-search-actastic-app_search_crawler_content_url_metadata\n", - " 3 docs -> .ent-search-actastic-crawler2_configurations_v2\n", - " 3 docs -> .ent-search-actastic-crawler2_configurations_v2-index_name-unique-constraint\n", - " 9385 docs -> .ent-search-actastic-crawler2_content_metadata\n", - " 9385 docs -> .ent-search-actastic-crawler2_content_metadata-configuration_oid-content_hash-unique-constraint\n", - " 9532 docs -> .ent-search-actastic-crawler2_content_url_metadata\n", - " 332 docs -> .ent-search-actastic-crawler2_crawl_requests_v2\n", - " 4 docs -> .ent-search-actastic-crawler2_domains\n", - " 4 docs -> .ent-search-actastic-crawler2_domains-configuration_oid-name-unique-constraint\n", - " 2 docs -> .ent-search-actastic-crawler2_extraction_rules\n", - " 0 docs -> .ent-search-actastic-crawler2_process_crawls\n", - " 651 docs -> .ent-search-actastic-crawler2_robots_txts\n", - " 0 docs -> .ent-search-actastic-crawler_crawl_requests_v7\n", - " 0 docs -> .ent-search-actastic-crawler_domains_v6\n", - " 0 docs -> .ent-search-actastic-crawler_domains_v6-engine_oid-name-unique-constraint\n", - " 0 docs -> .ent-search-actastic-crawler_process_crawls\n", - " 0 docs -> .ent-search-actastic-crawler_robots_txts_v3\n", - "\n", - "There are 20 healthy indices, 0 sick indices and 0 unhealthy indices.\n", - "11 indices have docs, and 9 indices do not.\n" - ] - } - ], - "source": [ - "print (\"Let's see if we can get _all_ indices in this ES instance related to crawler?\\n\")\n", - "\n", - "json_response = es_client.cat.indices(\n", - " index=\".*crawler*\", # take note of the . before *crawler* - this tells the API to query 'hidden' indices as well\n", - " s=\"index\",\n", - " format=\"json\"\n", - ")\n", - "\n", - "print (f\"We got the list as a JSON dictionary! 
We received {len(json_response)} indices.\\n\")\n", - "\n", - "health_histogram = {\n", - " \"green\": 0,\n", - " \"yellow\": 0,\n", - " \"red\": 0,\n", - "}\n", - "\n", - "indices_with_docs = {\n", - " \"with_docs\": 0,\n", - " \"without_docs\": 0,\n", - "}\n", - "\n", - "index_names = [] # save the index names to run through all of them at some point in the future\n", - "\n", - "for item in json_response.body: # Note that calling .body on the response will get us a List of dictionaries:\n", - " health_status = item[\"health\"]\n", - " health_histogram[health_status] += 1\n", - "\n", - " if int(item['docs.count']) > 0:\n", - " indices_with_docs['with_docs'] += 1\n", - " index_names.append(item['index'])\n", - " else:\n", - " indices_with_docs['without_docs'] += 1\n", - "\n", - " print (f\" {item['docs.count']} docs -> {item['index']}\")\n", - "\n", - "print (f\"\\nThere are {health_histogram['green']} healthy indices, {health_histogram['yellow']} sick indices \\\n", - "and {health_histogram['red']} unhealthy indices.\")\n", - "\n", - "print (f\"{indices_with_docs['with_docs']} indices have docs, and {indices_with_docs['without_docs']} \\\n", - "indices do not.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "db96133f-bb1b-4f40-b02f-4812a585964d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Let's grab the extraction rules for our crawler.\n", - "The extraction rules are defined in the index .ent-search-actastic-crawler2_extraction_rules\n", - "\n", - "{'took': 0, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 2, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': '.ent-search-actastic-crawler2_extraction_rules', '_id': '67a393cc17490cc0d24853ef', '_score': 1.0, '_source': {'id': '67a393cc17490cc0d24853ef', 'created_at': '2025-02-05T16:37:32Z', 'updated_at': '2025-02-05T16:37:32Z', 'description': 'extraction_rule_generic', 'domain_oid': '67a3858717490ccc74482755', 'configuration_oid': '67a3857117490c2adf48274d', 'url_filters': [], 'rules': [{'content_from': {'value_type': 'fixed', 'value': '\"some_rando_value\"'}, 'field_name': 'test_field', 'multiple_objects_handling': 'string', 'selector': '/html/body/a/@title', 'source_type': 'html'}], 'edited_by': '1432693181'}}, {'_index': '.ent-search-actastic-crawler2_extraction_rules', '_id': '67adfbd0cf0332091f8af6e2', '_score': 1.0, '_source': {'id': '67adfbd0cf0332091f8af6e2', 'created_at': '2025-02-13T14:04:00Z', 'updated_at': '2025-02-13T14:04:00Z', 'description': 'df_ex_rule', 'domain_oid': '67adf64bcf0332cb308aef02', 'configuration_oid': '67adf643cf03320a318aeefa', 'url_filters': [], 'rules': [{'content_from': {'value_type': 'extracted', 'value': ''}, 'field_name': 'df_xtraction_rule', 'multiple_objects_handling': 'string', 'selector': '/html/body/a/@title', 'source_type': 'html'}], 'edited_by': '1432693181'}}]}}\n", - "\n", - "Index: .ent-search-actastic-crawler2_extraction_rules\n", - "[{'content_from': {'value_type': 'fixed', 'value': '\"some_rando_value\"'}, 'field_name': 'test_field', 'multiple_objects_handling': 'string', 'selector': '/html/body/a/@title', 'source_type': 'html'}]\n", - "Description: extraction_rule_generic\n", - "Configuration OID: 67a3857117490c2adf48274d\n", - "Domain OID: 67a3858717490ccc74482755\n", - "Value type: fixed\n", - "Value: \"some_rando_value\"\n", - "Field name: test_field\n", - "Multiple Objects Handling: string\n", - "Selector: /html/body/a/@title\n", 
- "Source type: html\n", - "\n", - "Index: .ent-search-actastic-crawler2_extraction_rules\n", - "[{'content_from': {'value_type': 'extracted', 'value': ''}, 'field_name': 'df_xtraction_rule', 'multiple_objects_handling': 'string', 'selector': '/html/body/a/@title', 'source_type': 'html'}]\n", - "Description: df_ex_rule\n", - "Configuration OID: 67adf643cf03320a318aeefa\n", - "Domain OID: 67adf64bcf0332cb308aef02\n", - "Value type: extracted\n", - "Value: \n", - "Field name: df_xtraction_rule\n", - "Multiple Objects Handling: string\n", - "Selector: /html/body/a/@title\n", - "Source type: html\n", - "\n" - ] - } - ], - "source": [ - "print (\"Let's grab the extraction rules for our crawler.\")\n", - "print (\"The extraction rules are defined in the index .ent-search-actastic-crawler2_extraction_rules\\n\")\n", - "\n", - "ex_r = es_client.search(\n", - " index=\".ent-search-actastic-crawler2_extraction_rules\",\n", - " # _source=\"rules\"\n", - ")\n", - "\n", - "print (ex_r)\n", - "print ()\n", - "\n", - "for i in ex_r[\"hits\"][\"hits\"]:\n", - " print (f\"Index: {i['_index']}\")\n", - " rules = i[\"_source\"][\"rules\"]\n", - " print (rules)\n", - " for rule in rules:\n", - " print (f\"Description: {i['_source']['description']}\")\n", - " print (f\"Configuration OID: {i['_source']['configuration_oid']}\") # <-- I wonder if we can use these to\n", - " print (f\"Domain OID: {i['_source']['domain_oid']}\") # <-- match specific crawlers\n", - " \n", - " print (f\"Value type: {rule['content_from']['value_type']}\") # <-- this maps to 'action'\n", - " print (f\"Value: {rule['content_from']['value']}\") # <--\n", - " print (f\"Field name: {rule['field_name']}\") # <--\n", - " print (f\"Multiple Objects Handling: {rule['multiple_objects_handling']}\") # <-- this maps to \"join_as\"\n", - " print (f\"Selector: {rule['selector']}\") # <--\n", - " print (f\"Source type: {rule['source_type']}\\n\") # <--" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "8c0b6c8d-52f5-4325-92ca-8b7976c69abf", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Let's try grabbing the configurations docs in .ent-search-actastic-crawler2_configurations_v2\n", - "{'took': 1, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 3, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': '.ent-search-actastic-crawler2_configurations_v2', '_id': '67a3857117490c2adf48274d', '_score': 1.0, '_source': {'id': '67a3857117490c2adf48274d', 'created_at': '2025-02-05T15:36:17Z', 'updated_at': '2025-02-05T16:34:04Z', 'index_name': 'search-migration_crawler', 'crawl_schedule': [{'unit': 'hour', 'frequency': 1}], 'use_connector_schedule': False}}, {'_index': '.ent-search-actastic-crawler2_configurations_v2', '_id': '67adf643cf03320a318aeefa', '_score': 1.0, '_source': {'id': '67adf643cf03320a318aeefa', 'created_at': '2025-02-13T13:40:19Z', 'updated_at': '2025-02-13T14:29:16Z', 'index_name': 'search-daggerfall-unity-website-crawler', 'crawl_schedule': [{'unit': 'hour', 'frequency': 24}], 'use_connector_schedule': True}}, {'_index': '.ent-search-actastic-crawler2_configurations_v2', '_id': '67abc13fcf0332544683f928', '_score': 1.0, '_source': {'id': '67abc13fcf0332544683f928', 'created_at': '2025-02-11T21:29:35Z', 'updated_at': '2025-02-14T16:58:50Z', 'index_name': 'search-crawler-fully-loaded', 'crawl_schedule': None, 'use_connector_schedule': False}}]}}\n", - "\n", - "Inside this index, we can find the 
following values:\n", - "\n", - "Index: .ent-search-actastic-crawler2_configurations_v2\n", - "Configuration ID: 67a3857117490c2adf48274d\n", - "Index name: search-migration_crawler\n", - "Crawl schedule: [{'unit': 'hour', 'frequency': 1}]\n", - "Use crawl schedule?: False\n", - "\n", - "Index: .ent-search-actastic-crawler2_configurations_v2\n", - "Configuration ID: 67adf643cf03320a318aeefa\n", - "Index name: search-daggerfall-unity-website-crawler\n", - "Crawl schedule: [{'unit': 'hour', 'frequency': 24}]\n", - "Use crawl schedule?: True\n", - "\n", - "Index: .ent-search-actastic-crawler2_configurations_v2\n", - "Configuration ID: 67abc13fcf0332544683f928\n", - "Index name: search-crawler-fully-loaded\n", - "Crawl schedule: None\n", - "Use crawl schedule?: False\n", - "\n" - ] - } - ], - "source": [ - "print (\"Let's try grabbing the configurations docs in .ent-search-actastic-crawler2_configurations_v2\")\n", - "\n", - "config_r = es_client.search(\n", - " index=\".ent-search-actastic-crawler2_configurations_v2\",\n", - ")\n", - "\n", - "print (config_r)\n", - "print ()\n", - "\n", - "print (\"Inside this index, we can find the following values:\\n\")\n", - "for i in config_r[\"hits\"][\"hits\"]:\n", - " source = i[\"_source\"]\n", - " print (f\"Index: {i['_index']}\")\n", - " print (f\"Configuration ID: {source['id']}\") # <--\n", - " print (f\"Index name: {source['index_name']}\") # <--\n", - " print (f\"Crawl schedule: {source['crawl_schedule']}\")\n", - " print (f\"Use crawl schedule?: {source['use_connector_schedule']}\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "85adaae5-38d1-4410-b38c-3dd827d3170c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - ".ent-search-actastic-crawler2_domains\n", - "\n", - "{'took': 1, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 4, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': '.ent-search-actastic-crawler2_domains', '_id': '67abc186cf0332829183f93b', '_score': 1.0, '_source': {'id': '67abc186cf0332829183f93b', 'configuration_oid': '67abc13fcf0332544683f928', 'name': 'https://www.speedhunters.com', 'crawl_rules': [], 'seed_urls': [{'created_at': '2025-02-11T21:31:04Z', 'id': '67abc198cf0332d5ae83f93d', 'url': 'https://www.speedhunters.com/2025/01/the-mystery-of-the-hks-zero-r/'}], 'sitemaps': [{'created_at': '2025-02-14T16:58:47Z', 'id': '67af7647cf0332fdd191f64d', 'url': 'https://www.speedhunters.com/post_tag-sitemap2.xml'}], 'auth': 'akh4My83Myt3Y2NkbnJHUll2RFVlZz09LS1wbjU5TEZmNVdxUDZCK0lSa2dwRlhRPT0=--21413c9faa1309012319bdf4132ef5f4d0add9fb'}}, {'_index': '.ent-search-actastic-crawler2_domains', '_id': '67adf64bcf0332cb308aef02', '_score': 1.0, '_source': {'id': '67adf64bcf0332cb308aef02', 'configuration_oid': '67adf643cf03320a318aeefa', 'name': 'https://www.dfworkshop.net', 'crawl_rules': [{'pattern': 'DOS', 'created_at': '2025-02-13T13:43:27Z', 'rule': 'contains', 'id': '67adf6ffcf03320a318aef0a', 'order': 0, 'policy': 'deny'}], 'seed_urls': [{'created_at': '2025-02-13T13:40:26Z', 'id': '67adf64acf0332cb308aef01', 'url': 'https://www.dfworkshop.net/'}], 'sitemaps': [], 'auth': 'V001b0R1a2xCQmsyYloyQUYvTzJJQT09LS1md284bXJjOGhyRllMeGR3RlVpYU13PT0=--34940590d810db53c8dd783f58244db6c74dad20'}}, {'_index': '.ent-search-actastic-crawler2_domains', '_id': '67a3867317490c4bca482807', '_score': 1.0, '_source': {'id': '67a3867317490c4bca482807', 'configuration_oid': '67a3857117490c2adf48274d', 
'name': 'https://justinjackson.ca', 'crawl_rules': [], 'seed_urls': [{'url': 'https://justinjackson.ca/words.html', 'id': '67a3867317490c4bca482806', 'created_at': '2025-02-05T15:40:35Z'}], 'sitemaps': [], 'auth': 'SWpyRVpYdUp0WHlCL1hHOCtXaEhaQT09LS1Ub3ZpV1dpL2FONGpod0ZSeUFYV1RBPT0=--c83fbe57eeffa5429b450de0ffe1fe4aade85e43'}}, {'_index': '.ent-search-actastic-crawler2_domains', '_id': '67a3858717490ccc74482755', '_score': 1.0, '_source': {'id': '67a3858717490ccc74482755', 'configuration_oid': '67a3857117490c2adf48274d', 'name': 'https://matt-nowzari.myportfolio.com', 'crawl_rules': [{'pattern': '/The', 'created_at': '2025-02-05T16:33:59Z', 'rule': 'begins', 'id': '67a392f717490ccb5b4853ec', 'order': 0, 'policy': 'deny'}], 'seed_urls': [{'url': 'https://matt-nowzari.myportfolio.com/', 'id': '67a3858717490ccc74482754', 'created_at': '2025-02-05T15:36:39Z'}], 'sitemaps': [], 'auth': 'U1J4d0xBTW1EN2Ryb05iNXoxRmF1UT09LS1kMDhmMEZWMys0RlJDd2hzWmZMNWx3PT0=--2f4cae86febac77fb8e1fb0133d6520029716a37'}}]}}\n", - "\n", - "Name: https://www.speedhunters.com\n", - "Configuration OID: 67abc13fcf0332544683f928\n", - "Domain ID: 67abc186cf0332829183f93b\n", - "Sitemaps : [{'created_at': '2025-02-14T16:58:47Z', 'id': '67af7647cf0332fdd191f64d', 'url': 'https://www.speedhunters.com/post_tag-sitemap2.xml'}]\n", - "Crawl rules: []\n", - "Seed URLs: [{'created_at': '2025-02-11T21:31:04Z', 'id': '67abc198cf0332d5ae83f93d', 'url': 'https://www.speedhunters.com/2025/01/the-mystery-of-the-hks-zero-r/'}]\n", - "Auth (decode and split on the ':' delimiter): akh4My83Myt3Y2NkbnJHUll2RFVlZz09LS1wbjU5TEZmNVdxUDZCK0lSa2dwRlhRPT0=--21413c9faa1309012319bdf4132ef5f4d0add9fb\n", - "\n", - "Name: https://www.dfworkshop.net\n", - "Configuration OID: 67adf643cf03320a318aeefa\n", - "Domain ID: 67adf64bcf0332cb308aef02\n", - "Sitemaps : []\n", - "Crawl rules: [{'pattern': 'DOS', 'created_at': '2025-02-13T13:43:27Z', 'rule': 'contains', 'id': '67adf6ffcf03320a318aef0a', 'order': 0, 'policy': 'deny'}]\n", - "Seed URLs: [{'created_at': '2025-02-13T13:40:26Z', 'id': '67adf64acf0332cb308aef01', 'url': 'https://www.dfworkshop.net/'}]\n", - "Auth (decode and split on the ':' delimiter): V001b0R1a2xCQmsyYloyQUYvTzJJQT09LS1md284bXJjOGhyRllMeGR3RlVpYU13PT0=--34940590d810db53c8dd783f58244db6c74dad20\n", - "\n", - "Name: https://justinjackson.ca\n", - "Configuration OID: 67a3857117490c2adf48274d\n", - "Domain ID: 67a3867317490c4bca482807\n", - "Sitemaps : []\n", - "Crawl rules: []\n", - "Seed URLs: [{'url': 'https://justinjackson.ca/words.html', 'id': '67a3867317490c4bca482806', 'created_at': '2025-02-05T15:40:35Z'}]\n", - "Auth (decode and split on the ':' delimiter): SWpyRVpYdUp0WHlCL1hHOCtXaEhaQT09LS1Ub3ZpV1dpL2FONGpod0ZSeUFYV1RBPT0=--c83fbe57eeffa5429b450de0ffe1fe4aade85e43\n", - "\n", - "Name: https://matt-nowzari.myportfolio.com\n", - "Configuration OID: 67a3857117490c2adf48274d\n", - "Domain ID: 67a3858717490ccc74482755\n", - "Sitemaps : []\n", - "Crawl rules: [{'pattern': '/The', 'created_at': '2025-02-05T16:33:59Z', 'rule': 'begins', 'id': '67a392f717490ccb5b4853ec', 'order': 0, 'policy': 'deny'}]\n", - "Seed URLs: [{'url': 'https://matt-nowzari.myportfolio.com/', 'id': '67a3858717490ccc74482754', 'created_at': '2025-02-05T15:36:39Z'}]\n", - "Auth (decode and split on the ':' delimiter): U1J4d0xBTW1EN2Ryb05iNXoxRmF1UT09LS1kMDhmMEZWMys0RlJDd2hzWmZMNWx3PT0=--2f4cae86febac77fb8e1fb0133d6520029716a37\n", - "\n" - ] - } - ], - "source": [ - "print (\".ent-search-actastic-crawler2_domains\\n\")\n", - "\n", - "domains_r = 
es_client.search(\n", - " index=\".ent-search-actastic-crawler2_domains\",\n", - " _source=[\"name\",\n", - " \"configuration_oid\",\n", - " \"id\",\n", - " \"sitemaps\",\n", - " \"crawl_rules\",\n", - " \"seed_urls\",\n", - " \"auth\"]\n", - ")\n", - "\n", - "print (domains_r)\n", - "print ()\n", - "\n", - "for i in domains_r[\"hits\"][\"hits\"]:\n", - " source = i[\"_source\"]\n", - " print (f\"Name: {source['name']}\") # <--\n", - " \n", - " print (f\"Configuration OID: {source['configuration_oid']}\") # <--\n", - " print (f\"Domain ID: {source['id']}\") # <--\n", - " \n", - " print (f\"Sitemaps : {source['sitemaps']}\") # <--\n", - " print (f\"Crawl rules: {source['crawl_rules']}\") # <--\n", - " print (f\"Seed URLs: {source['seed_urls']}\") # <--\n", - " \n", - " print (f\"Auth (decode and split on the ':' delimiter): {source['auth']}\\n\") # <--" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "eef6570b-26d0-4d98-a279-cf55fd9d31a5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['67adf64bcf0332cb308aef02', '67abc186cf0332829183f93b', '67a3867317490c4bca482807', '67a3858717490ccc74482755']\n", - "['67adf643cf03320a318aeefa', '67abc13fcf0332544683f928', '67a3857117490c2adf48274d']\n" - ] - } - ], - "source": [ - "all_domain_ids = []\n", - "all_config_ids = []\n", - "for i in domains_r[\"hits\"][\"hits\"]:\n", - " source = i[\"_source\"]\n", - " \n", - " all_domain_ids.append(source['id'])\n", - "\n", - " if source['configuration_oid'] not in all_config_ids:\n", - " all_config_ids.append(source['configuration_oid'])\n", - "\n", - "print (all_domain_ids)\n", - "print (all_config_ids)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "fbddf119-e73c-4e97-9759-3f2f34b8000f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'A': [{'a': 1}],\n", - " 'extraction_rulesets': [{'rules': {'action': '',\n", - " 'field_name': '',\n", - " 'join_as': '',\n", - " 'selector': '',\n", - " 'source': '',\n", - " 'value': ''},\n", - " 'url_filters': {'pattern': '', 'type': ''}}]}\n", - "/Users/mattnowzari/repos/search_and_transform/crawler/migration/my-output-conf.yml\n" - ] - } - ], - "source": [ - "import yaml\n", - "import os\n", - "import pprint as pp\n", - "\n", - "##### You may change the value of file_name #####\n", - "file_name = \"my-output-conf.yml\"\n", - "#################################################\n", - "\n", - "data = {\n", - " 'A':[{'a':1}],\n", - " 'extraction_rulesets':[{\n", - " \"url_filters\": {\"type\": \"\", \"pattern\": \"\"},\n", - " \"rules\": {\"action\": \"\", \"field_name\": \"\", \"selector\": \"\", \"join_as\": \"\", \"value\": \"\", \"source\": \"\"}\n", - " }\n", - " ]\n", - " \n", - "}\n", - "# pp.pprint(data)\n", - "\n", - "base_dir = os.getcwd()\n", - "\n", - "output_path = os.path.join(base_dir, file_name)\n", - "print (output_path)\n", - "\n", - "if os.path.exists(base_dir):\n", - " with open(output_path, 'w') as file:\n", - " yaml.dump(data, file)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d525c194-643b-4f97-a59b-f0fa17ba3bfd", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": 
"python", - "pygments_lexer": "ipython3", - "version": "3.12.8" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From b32a9f97c280cac5d576b1020a74df2cbbdba046 Mon Sep 17 00:00:00 2001 From: mattnowzari Date: Fri, 21 Feb 2025 15:42:12 -0500 Subject: [PATCH 6/6] Cleanup --- .gitignore | 3 + migration/crawler_migration.ipynb | 363 +++++++++++++++++++++++------- 2 files changed, 285 insertions(+), 81 deletions(-) diff --git a/.gitignore b/.gitignore index 2b67b8b9..72be2849 100644 --- a/.gitignore +++ b/.gitignore @@ -73,3 +73,6 @@ script/licenses/**/_downloaded_*-LICENSE.txt bin/container-structure-test .artifacts .buildkite/publish/container-structure-test.yaml + +# Migration +/migration/.ipynb_checkpoints/* diff --git a/migration/crawler_migration.ipynb b/migration/crawler_migration.ipynb index b8799165..82eaf99a 100644 --- a/migration/crawler_migration.ipynb +++ b/migration/crawler_migration.ipynb @@ -10,32 +10,31 @@ "\n", "This notebook is designed to help you migrate your Elastic Crawler configurations to Open Crawler-friendly YAML!\n", "\n", - "We recommend running each cell individually in a sequential fashion, as each cell is dependent on previous cells having been run." - ] - }, - { - "cell_type": "markdown", - "id": "f8b41584-1cce-440e-b3af-e8ae0cb1312c", - "metadata": {}, - "source": [ - "_If you are running this notebook inside Google Colab, or have not installed elasticsearch in your local environment yet, please run the following cell to make sure the Python `elasticsearch` client is installed._" - ] - }, - { - "cell_type": "markdown", - "id": "8bc65371-58ea-4be9-a319-2f7ed9713145", - "metadata": {}, - "source": [ + "We recommend running each cell individually in a sequential fashion, as each cell is dependent on previous cells having been run.\n", + "\n", + "_If you are running this notebook inside Google Colab, or have not installed elasticsearch in your local environment yet, please run the following cell to make sure the Python `elasticsearch` client is installed._\n", + "\n", "### Setup\n", "First, let's start by making sure `elasticsearch` and other required dependencies are installed and imported by running the following cell:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 675, "id": "da411d2f-9aff-46af-845a-5fe9be19ea3c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: elasticsearch in /Users/mattnowzari/repos/python/mn_venv/lib/python3.12/site-packages (8.17.1)\n", + "Requirement already satisfied: elastic-transport<9,>=8.15.1 in /Users/mattnowzari/repos/python/mn_venv/lib/python3.12/site-packages (from elasticsearch) (8.17.0)\n", + "Requirement already satisfied: urllib3<3,>=1.26.2 in /Users/mattnowzari/repos/python/mn_venv/lib/python3.12/site-packages (from elastic-transport<9,>=8.15.1->elasticsearch) (2.3.0)\n", + "Requirement already satisfied: certifi in /Users/mattnowzari/repos/python/mn_venv/lib/python3.12/site-packages (from elastic-transport<9,>=8.15.1->elasticsearch) (2024.12.14)\n" + ] + } + ], "source": [ "!pip install elasticsearch\n", "\n", @@ -63,30 +62,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 753, "id": "08e6e3d2-62d3-4890-a6be-41fe0a931ef6", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Elastic Cloud ID: ········\n", + "Elastic Api Key: ········\n" + ] + }, + { + "data": { + "text/plain": [ + "'You Know, for Search'" + ] + 
}, + "execution_count": 753, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", - "API_KEY = getpass(\"Elastic Api Key: \")" - ] - }, - { - "cell_type": "markdown", - "id": "4993104f-ebb6-4715-b758-1fa262a224f3", - "metadata": {}, - "source": [ - "Great! Now let's try connecting to your Elasticsearch instance." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f3ada2cb-b00f-4b1d-be09-57b2ccf25c7c", - "metadata": {}, - "outputs": [], - "source": [ + "API_KEY = getpass(\"Elastic Api Key: \")\n", + "\n", "es_client = Elasticsearch(\n", " cloud_id=ELASTIC_CLOUD_ID,\n", " api_key=API_KEY,\n", @@ -119,10 +121,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 816, "id": "0a698b05-e939-42a5-aa31-51b1b1883e6f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1. search-search-crawler-fully-loaded-8.18\n", + "2. search-daggerfall-unity-website-crawler-8.18\n", + "3. search-migration-crawler\n", + "4. search-basic\n", + " search-basic uses an interval schedule, which is not supported in Open Crawler!\n" + ] + } + ], "source": [ " # in-memory data structure that maintains current state of the configs we've pulled\n", "inflight_configuration_data = {}\n", @@ -140,12 +154,11 @@ " output_index = source[\"index_name\"]\n", "\n", " print (f\"{crawler_counter}. {output_index}\")\n", - " print (f\" Crawler ID is {crawler_oid}\\n\")\n", " crawler_counter += 1\n", "\n", " crawl_schedule = [] # either no schedule or a specific schedule - determined in Step 4\n", " if source[\"use_connector_schedule\"] == False and source[\"crawl_schedule\"]: # an interval schedule is being used\n", - " crawl_schedule = source[\"crawl_schedule\"] # this will be transformed in Step 4\n", + " print (f\" {output_index} uses an interval schedule, which is not supported in Open Crawler!\")\n", "\n", " # populate a temporary hashmap\n", " temp_conf_map = {\n", @@ -164,14 +177,12 @@ " # \"password\": \"\",\n", " }\n", " # populate the in-memory data structure\n", - " inflight_configuration_data[crawler_oid] = temp_conf_map\n", - "\n", - "# pprint.pprint(inflight_configuration_data) # REMOVE BEFORE FLIGHT" + " inflight_configuration_data[crawler_oid] = temp_conf_map" ] }, { "cell_type": "markdown", - "id": "34f5e024-688c-4ffb-a16f-35f5171ba7a8", + "id": "2804d02b-870d-4173-9c5f-6d5eb434d49b", "metadata": {}, "source": [ "**Before continuing, please verify in the output above that the correct number of Crawlers was found!**\n", @@ -191,10 +202,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 817, "id": "e1c64c3d-c8d7-4236-9ed9-c9b1cb5e7972", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.) Crawler ID 67b74f16204956a3ce9fd0a4\n", + " Domain https://www.speedhunters.com found!\n", + " Seed URls found: ['https://www.speedhunters.com/2025/01/the-mystery-of-the-hks-zero-r/', 'https://www.speedhunters.com/2025/02/daniel-arsham-eroded-porsche-911/', 'https://www.speedhunters.com/2025/02/5-plus-7-equals-v12-a-custom-bmw-super-saloon/']\n", + " Sitemap URLs found: ['https://www.speedhunters.com/post_tag-sitemap2.xml']\n", + "2.) 
Crawler ID 67b74f84204956efce9fd0b7\n", + " Domain https://www.dfworkshop.net found!\n", + " Seed URls found: ['https://www.dfworkshop.net/']\n", + " Crawl rules found: [{'policy': 'allow', 'type': 'begins', 'pattern': '/word'}, {'policy': 'deny', 'type': 'contains', 'pattern': 'DOS'}]\n", + " Domain https://www.speedhunters.com found!\n", + " Seed URls found: ['https://www.speedhunters.com/']\n", + " Crawl rules found: [{'policy': 'deny', 'type': 'begins', 'pattern': '/BMW'}]\n", + "3.) Crawler ID 67b7509b2049567f859fd0d4\n", + " Domain https://justinjackson.ca found!\n", + " Seed URls found: ['https://justinjackson.ca/']\n", + " Domain https://matt-nowzari.myportfolio.com found!\n", + " Seed URls found: ['https://matt-nowzari.myportfolio.com/']\n", + " Crawl rules found: [{'policy': 'deny', 'type': 'begins', 'pattern': '/The'}]\n", + "4.) Crawler ID 67b75aeb20495617d59fd0ea\n", + " Domain https://www.elastic.co found!\n", + " Seed URls found: ['https://www.elastic.co/']\n" + ] + } + ], "source": [ "crawler_ids_to_query = inflight_configuration_data.keys()\n", "\n", @@ -259,9 +297,7 @@ " print (f\" Crawl rules found: {crawl_rules_list}\")\n", " \n", " # populate the in-memory data structure\n", - " inflight_configuration_data[crawler_oid][\"domains_temp\"][domain_oid] = temp_domain_conf\n", - "\n", - "# pprint.pprint(inflight_configuration_data) # REMOVE BEFORE FLIGHT" + " inflight_configuration_data[crawler_oid][\"domains_temp\"][domain_oid] = temp_domain_conf" ] }, { @@ -276,16 +312,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 818, "id": "61a7df7a-72ad-4330-a30c-da319befd55c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4 total extraction rules found!\n" + ] + } + ], "source": [ "extraction_rules = es_client.search(\n", " index=\".ent-search-actastic-crawler2_extraction_rules\",\n", " _source=[\"configuration_oid\", \"domain_oid\", \"rules\", \"url_filters\"]\n", ")\n", "\n", + "extr_count = 0\n", "for exr_rule in extraction_rules[\"hits\"][\"hits\"]:\n", " source = exr_rule[\"_source\"]\n", "\n", @@ -325,9 +370,10 @@ " \"url_filters\": url_filters,\n", " \"rules\": ruleset,\n", " }]\n", + " extr_count += 1\n", " inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid][\"extraction_rulesets\"] = temp_extraction_rulesets\n", "\n", - "# pprint.pprint(inflight_configuration_data) # REMOVE BEFORE FLIGHT" + "print (f\"{extr_count} total extraction rules found!\")" ] }, { @@ -337,21 +383,25 @@ "source": [ "### Step 4: Schedules\n", "\n", - "In the upcoming cell, we will be gathing any schedules your Crawlers have set." + "In the upcoming cell, we will be gathering any specific time schedules your Crawlers have set. Please note that _interval time schedules_ are not supported by Open Crawler and will be ignored." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 819, "id": "d880e081-f960-41c7-921e-26896f248eab", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.) Crawler search-daggerfall-unity-website-crawler-8.18 has the schedule '0 30 8 * * ?'\n" + ] + } + ], "source": [ - "def generate_cron_expression(interval_values: dict) -> str:\n", - " return interval_values # TODO TODO this ** might not be needed? 
**\n",
-    "\n",
-    "# ---------------------------\n",
-    "\n",
+    "crawler_counter = 1\n",
     "for crawler_oid, crawler_config in inflight_configuration_data.items():\n",
     "    output_index = crawler_config[\"output_index\"]\n",
     "    \n",
@@ -366,12 +416,9 @@
     "    )\n",
     "    # update schedule field with cron expression if specific time scheduling is enabled\n",
     "    if schedules_result[\"hits\"][\"hits\"][0][\"_source\"][\"scheduling\"][\"full\"][\"enabled\"]:\n",
-    "        specific_time_schedule = schedules_result[\"hits\"][\"hits\"][0][\"_source\"][\"scheduling\"][\"full\"][\"interval\"]\n",
-    "        crawler_config[\"schedule\"] = specific_time_schedule\n",
-    "    elif isinstance(existing_schedule_value[0], dict):\n",
-    "        crawler_config[\"schedule\"] = generate_cron_expression(existing_schedule_value)\n",
-    "    \n",
-    "# pprint.pprint(inflight_configuration_data) # REMOVE BEFORE FLIGHT "
+    "        crawler_config[\"schedule\"] = schedules_result[\"hits\"][\"hits\"][0][\"_source\"][\"scheduling\"][\"full\"][\"interval\"]\n",
+    "        print (f\"{crawler_counter}.) Crawler {output_index} has the schedule '{crawler_config['schedule']}'\")\n",
+    "        crawler_counter += 1"
    ]
   },
   {
@@ -388,7 +435,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 820,
    "id": "dd70f102-33ee-4106-8861-0aa0f9a223a1",
    "metadata": {},
    "outputs": [],
@@ -403,9 +450,7 @@
     "    # create a new key called \"domains\" that points to a list of domain configs only - no domain_oid values as keys\n",
     "    crawler_config[\"domains\"] = all_crawler_domains\n",
     "    # delete the temporary domain key\n",
-    "    del crawler_config[\"domains_temp\"]\n",
-    "\n",
-    "# pprint.pprint(inflight_configuration_data) # REMOVE BEFORE FLIGHT "
+    "    del crawler_config[\"domains_temp\"]"
    ]
   },
   {
@@ -423,20 +468,48 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 826,
    "id": "213880cc-cbf3-40d9-8c7d-6fcf6428c16b",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdin",
+     "output_type": "stream",
+     "text": [
+      "Elasticsearch endpoint URL: https://4911ebad5ed44d149fe8ddad4a4b3751.us-west2.gcp.elastic-cloud.com\n",
+      "The Elasticsearch endpoint's port number: 443\n",
+      "Elasticsearch API key: ········\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'You Know, for Search'"
+      ]
+     },
+     "execution_count": 826,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "ENDPOINT = input(\"Elasticsearch endpoint URL: \")\n",
     "PORT = input(\"The Elasticsearch endpoint's port number: \")\n",
-    "API_KEY = getpass(\"Elasticsearch API key: \")\n",
+    "OUTPUT_API_KEY = getpass(\"Elasticsearch API key: \")\n",
     "\n",
     "# set the above values in each Crawler's configuration\n",
     "for crawler_config in inflight_configuration_data.values():\n",
     "    crawler_config[\"elasticsearch\"][\"host\"] = ENDPOINT\n",
     "    crawler_config[\"elasticsearch\"][\"port\"] = int(PORT)\n",
-    "    crawler_config[\"elasticsearch\"][\"api_key\"] = API_KEY"
+    "    crawler_config[\"elasticsearch\"][\"api_key\"] = OUTPUT_API_KEY\n",
+    "\n",
+    "# ping ES to make sure we have a positive connection\n",
+    "es_client = Elasticsearch(\n",
+    "    \":\".join([ENDPOINT, PORT]),\n",
+    "    api_key=OUTPUT_API_KEY,\n",
+    ")\n",
+    "\n",
+    "es_client.info()['tagline']"
    ]
   },
   {
@@ -462,10 +535,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 827,
    "id": "6adc53db-d781-4b72-a5f3-441364f354b8",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  Wrote search-search-crawler-fully-loaded-8.18-config.yml to 
/Users/mattnowzari/repos/search_and_transform/crawler/migration/search-search-crawler-fully-loaded-8.18-config.yml\n", + " Wrote search-daggerfall-unity-website-crawler-8.18-config.yml to /Users/mattnowzari/repos/search_and_transform/crawler/migration/search-daggerfall-unity-website-crawler-8.18-config.yml\n", + " Wrote search-migration-crawler-config.yml to /Users/mattnowzari/repos/search_and_transform/crawler/migration/search-migration-crawler-config.yml\n", + " Wrote search-basic-config.yml to /Users/mattnowzari/repos/search_and_transform/crawler/migration/search-basic-config.yml\n" + ] + } + ], "source": [ "# Dump each Crawler's configuration into its own YAML file\n", "for crawler_config in inflight_configuration_data.values():\n", @@ -479,7 +563,8 @@ " crawler_config,\n", " file,\n", " sort_keys=False\n", - " )" + " )\n", + " print (f\" Wrote {file_name} to {output_path}\")" ] }, { @@ -492,10 +577,126 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 828, "id": "525aabb8-0537-4ba6-8109-109490dddafe", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "YAML config => search-search-crawler-fully-loaded-8.18-config.yml\n", + "--------\n", + "output_index: search-search-crawler-fully-loaded-8.18\n", + "schedule: []\n", + "output_sink: elasticsearch\n", + "full_html_extraction_enabled: false\n", + "elasticsearch:\n", + " host: https://4911ebad5ed44d149fe8ddad4a4b3751.us-west2.gcp.elastic-cloud.com\n", + " port: 443\n", + " api_key: d1RBMktaVUJRdEdzS0U4d05BSWI6ZDlGaE9PbWdrVER3VEZFVlBPWkxVQQ==\n", + "domains:\n", + "- url: https://www.speedhunters.com\n", + " seed_urls:\n", + " - https://www.speedhunters.com/2025/01/the-mystery-of-the-hks-zero-r/\n", + " - https://www.speedhunters.com/2025/02/daniel-arsham-eroded-porsche-911/\n", + " - https://www.speedhunters.com/2025/02/5-plus-7-equals-v12-a-custom-bmw-super-saloon/\n", + " sitemap_urls:\n", + " - https://www.speedhunters.com/post_tag-sitemap2.xml\n", + "\n", + "--------------------------------------------------------------------------------\n", + "YAML config => search-daggerfall-unity-website-crawler-8.18-config.yml\n", + "--------\n", + "output_index: search-daggerfall-unity-website-crawler-8.18\n", + "schedule: 0 30 8 * * ?\n", + "output_sink: elasticsearch\n", + "full_html_extraction_enabled: false\n", + "elasticsearch:\n", + " host: https://4911ebad5ed44d149fe8ddad4a4b3751.us-west2.gcp.elastic-cloud.com\n", + " port: 443\n", + " api_key: d1RBMktaVUJRdEdzS0U4d05BSWI6ZDlGaE9PbWdrVER3VEZFVlBPWkxVQQ==\n", + "domains:\n", + "- url: https://www.dfworkshop.net\n", + " seed_urls:\n", + " - https://www.dfworkshop.net/\n", + " crawl_rules:\n", + " - policy: allow\n", + " type: begins\n", + " pattern: /word\n", + " - policy: deny\n", + " type: contains\n", + " pattern: DOS\n", + " extraction_rulesets:\n", + " - url_filters:\n", + " - type: begins\n", + " pattern: /elderscrolls/*\n", + " rules:\n", + " - action: set\n", + " field_name: elder_field\n", + " selector: /elderscrolls/*\n", + " join_as: string\n", + " value: ping\n", + " source: url\n", + "- url: https://www.speedhunters.com\n", + " seed_urls:\n", + " - https://www.speedhunters.com/\n", + " crawl_rules:\n", + " - policy: deny\n", + " type: begins\n", + " pattern: /BMW\n", + "\n", + "--------------------------------------------------------------------------------\n", + "YAML config => search-migration-crawler-config.yml\n", + "--------\n", + "output_index: search-migration-crawler\n", + "schedule: 
[]\n", + "output_sink: elasticsearch\n", + "full_html_extraction_enabled: false\n", + "elasticsearch:\n", + " host: https://4911ebad5ed44d149fe8ddad4a4b3751.us-west2.gcp.elastic-cloud.com\n", + " port: 443\n", + " api_key: d1RBMktaVUJRdEdzS0U4d05BSWI6ZDlGaE9PbWdrVER3VEZFVlBPWkxVQQ==\n", + "domains:\n", + "- url: https://justinjackson.ca\n", + " seed_urls:\n", + " - https://justinjackson.ca/\n", + "- url: https://matt-nowzari.myportfolio.com\n", + " seed_urls:\n", + " - https://matt-nowzari.myportfolio.com/\n", + " crawl_rules:\n", + " - policy: deny\n", + " type: begins\n", + " pattern: /The\n", + " extraction_rulesets:\n", + " - url_filters: []\n", + " rules:\n", + " - action: set\n", + " field_name: test_field\n", + " selector: /html/body/a/@title\n", + " join_as: string\n", + " value: some_rando_value\n", + " source: html\n", + "\n", + "--------------------------------------------------------------------------------\n", + "YAML config => search-basic-config.yml\n", + "--------\n", + "output_index: search-basic\n", + "schedule: []\n", + "output_sink: elasticsearch\n", + "full_html_extraction_enabled: false\n", + "elasticsearch:\n", + " host: https://4911ebad5ed44d149fe8ddad4a4b3751.us-west2.gcp.elastic-cloud.com\n", + " port: 443\n", + " api_key: d1RBMktaVUJRdEdzS0U4d05BSWI6ZDlGaE9PbWdrVER3VEZFVlBPWkxVQQ==\n", + "domains:\n", + "- url: https://www.elastic.co\n", + " seed_urls:\n", + " - https://www.elastic.co/\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], "source": [ "for crawler_config in inflight_configuration_data.values():\n", " yaml_out = yaml.safe_dump(\n",