From 6c69ca68fd5c1303710aa4771100cc9ed9a2c7ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20Stra=C3=9Fer?= Date: Mon, 17 Mar 2025 16:00:05 +0100 Subject: [PATCH] code clean up --- .../colpali/02_bit_vectors.ipynb | 28 +++---- .../colpali/03_average_vector.ipynb | 30 ++++---- .../colpali/04_token_pooling.ipynb | 75 +++++-------------- 3 files changed, 43 insertions(+), 90 deletions(-) diff --git a/supporting-blog-content/colpali/02_bit_vectors.ipynb b/supporting-blog-content/colpali/02_bit_vectors.ipynb index ee406e0f..eeaee725 100644 --- a/supporting-blog-content/colpali/02_bit_vectors.ipynb +++ b/supporting-blog-content/colpali/02_bit_vectors.ipynb @@ -39,18 +39,14 @@ "import numpy as np\n", "\n", "\n", - "def to_bit_vectors(embedding: list) -> list:\n", - " embeddings = []\n", - " for idx, patch_embedding in enumerate(embedding):\n", - " patch_embedding = np.array(patch_embedding)\n", - " binary_vector = (\n", - " np.packbits(np.where(patch_embedding > 0, 1, 0))\n", - " .astype(np.int8)\n", - " .tobytes()\n", - " .hex()\n", - " )\n", - " embeddings.append(binary_vector)\n", - " return embeddings" + "def to_bit_vectors(embeddings: list) -> list:\n", + " return [\n", + " np.packbits(np.where(np.array(embedding) > 0, 1, 0))\n", + " .astype(np.int8)\n", + " .tobytes()\n", + " .hex()\n", + " for embedding in embeddings\n", + " ]" ] }, { @@ -71,7 +67,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[INFO] Index 'searchlabs-colpali-hamming' already exists.\n" + "[INFO] Creating index: searchlabs-colpali-hamming\n" ] } ], @@ -126,7 +122,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "022b4af8891b4a06962e023c7f92d8f4", + "model_id": "8be1b809674143c486705f1699f440dd", "version_major": 2, "version_minor": 0 }, @@ -191,7 +187,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "064e33061bac40e4802138e30599225b", + "model_id": "63a206f0fc2d491196b83b450eb4b93a", "version_major": 2, "version_minor": 0 }, @@ -251,7 +247,7 @@ { "data": { "text/html": [ - "
\"image_104.jpg\"\"image_3.jpg\"\"image_12.jpg\"\"image_2.jpg\"\"image_92.jpg\"
" + "
\"image_104.jpg\"\"image_3.jpg\"\"image_12.jpg\"\"image_2.jpg\"\"image_110.jpg\"
" ], "text/plain": [ "" diff --git a/supporting-blog-content/colpali/03_average_vector.ipynb b/supporting-blog-content/colpali/03_average_vector.ipynb index f944f3aa..7dcf4968 100644 --- a/supporting-blog-content/colpali/03_average_vector.ipynb +++ b/supporting-blog-content/colpali/03_average_vector.ipynb @@ -32,18 +32,14 @@ "import numpy as np\n", "\n", "\n", - "def to_bit_vectors(embedding: list) -> list:\n", - " embeddings = []\n", - " for idx, patch_embedding in enumerate(embedding):\n", - " patch_embedding = np.array(patch_embedding)\n", - " binary_vector = (\n", - " np.packbits(np.where(patch_embedding > 0, 1, 0))\n", - " .astype(np.int8)\n", - " .tobytes()\n", - " .hex()\n", - " )\n", - " embeddings.append(binary_vector)\n", - " return embeddings\n", + "def to_bit_vectors(embeddings: list) -> list:\n", + " return [\n", + " np.packbits(np.where(np.array(embedding) > 0, 1, 0))\n", + " .astype(np.int8)\n", + " .tobytes()\n", + " .hex()\n", + " for embedding in embeddings\n", + " ]\n", "\n", "\n", "def to_avg_vector(vectors):\n", @@ -79,7 +75,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[INFO] Index 'searchlabs-colpali-average-vector' already exists.\n" + "[INFO] Creating index: searchlabs-colpali-average-vector\n" ] } ], @@ -149,7 +145,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "4a9d08424a504956a4f50208a19cce90", + "model_id": "bb2e8a4c74b5494c8203308df49ec750", "version_major": 2, "version_minor": 0 }, @@ -216,7 +212,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "b7967bc15d67486ab5e2db2c673db036", + "model_id": "bdf787df90d24a289fa70e6abd0d9597", "version_major": 2, "version_minor": 0 }, @@ -267,7 +263,7 @@ { "data": { "text/html": [ - "
\"image_12.jpg\"\"image_3.jpg\"\"image_49.jpg\"\"image_123.jpg\"\"image_104.jpg\"
" + "
\"image_3.jpg\"\"image_12.jpg\"\"image_104.jpg\"\"image_2.jpg\"\"image_250.jpg\"
" ], "text/plain": [ "" @@ -329,7 +325,7 @@ { "data": { "text/html": [ - "
\"image_104.jpg\"\"image_3.jpg\"\"image_2.jpg\"\"image_12.jpg\"\"image_49.jpg\"
" + "
\"image_104.jpg\"\"image_3.jpg\"\"image_2.jpg\"\"image_12.jpg\"\"image_250.jpg\"
" ], "text/plain": [ "" diff --git a/supporting-blog-content/colpali/04_token_pooling.ipynb b/supporting-blog-content/colpali/04_token_pooling.ipynb index 555e2977..0d8155a1 100644 --- a/supporting-blog-content/colpali/04_token_pooling.ipynb +++ b/supporting-blog-content/colpali/04_token_pooling.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "be6ffdc5-fbaa-40b5-8b33-5540a3f957ba", "metadata": {}, "outputs": [], @@ -24,18 +24,14 @@ "import numpy as np\n", "\n", "\n", - "def to_bit_vectors(embedding: list) -> list:\n", - " embeddings = []\n", - " for idx, patch_embedding in enumerate(embedding):\n", - " patch_embedding = np.array(patch_embedding)\n", - " binary_vector = (\n", - " np.packbits(np.where(patch_embedding > 0, 1, 0))\n", - " .astype(np.int8)\n", - " .tobytes()\n", - " .hex()\n", - " )\n", - " embeddings.append(binary_vector)\n", - " return embeddings" + "def to_bit_vectors(embeddings: list) -> list:\n", + " return [\n", + " np.packbits(np.where(np.array(embedding) > 0, 1, 0))\n", + " .astype(np.int8)\n", + " .tobytes()\n", + " .hex()\n", + " for embedding in embeddings\n", + " ]" ] }, { @@ -49,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "9871c9c5-c923-4deb-9f5b-aa6796ba0bbf", "metadata": {}, "outputs": [], @@ -70,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "id": "2de5872d-b372-40fe-85c5-111b9f9fa6c8", "metadata": {}, "outputs": [ @@ -78,7 +74,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[INFO] Index 'searchlabs-colpali-token-pooling' already exists.\n" + "[INFO] Creating index: searchlabs-colpali-token-pooling\n" ] } ], @@ -126,14 +122,14 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "bdf6ff33-3e22-43c1-9f3e-c3dd663b40e2", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "cef0c48b9b5d4b3982fbdb4773494ec8", + "model_id": "047c33b3344f49328bda552b123c168d", "version_major": 2, "version_minor": 0 }, @@ -143,13 +139,6 @@ }, "metadata": {}, "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Completed indexing 500 documents\n" - ] } ], "source": [ @@ -193,25 +182,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "1dfc3713-d649-46db-aa81-171d6d92668e", "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d4361ebd1e59483aa8060a4fbe71715b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Loading checkpoint shards: 0%| | 0/2 [00:00\"image_3.jpg\"\"image_104.jpg\"\"image_2.jpg\"\"image_12.jpg\"\"image_120.jpg\"" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from IPython.display import display, HTML\n", "import os\n",