diff --git a/source_code/intersection_trees/intersection_trees.ipynb b/source_code/intersection_trees/intersection_trees.ipynb index 64dccbd..1704bcc 100644 --- a/source_code/intersection_trees/intersection_trees.ipynb +++ b/source_code/intersection_trees/intersection_trees.ipynb @@ -1169,6 +1169,166 @@ "It is clear that the intersection tree implementation is more efficient than the naive approaches. The expected time complexity is $\\bigO(N \\log N)$ for $|Q| = |D| = N$. Likely, the actual time complexity will be worse since the vinary tree is unlikely to be balanced." ] }, + { + "cell_type": "markdown", + "id": "eeb0be96-8d68-4674-887d-75225e6e697a", + "metadata": {}, + "source": [ + "## Array-based intersection trees" + ] + }, + { + "cell_type": "markdown", + "id": "f6c03e11-8d6f-43bf-a3b6-a63f61f86567", + "metadata": {}, + "source": [ + "The array-based implementation offers an alternative approach to the traditional node-based intersection tree. ", + "Instead of using individual node objects, this implementation stores tree data in arrays, which can provide ", + "better cache locality and memory efficiency for large datasets.\n", + "\n", + "Each node is represented by an index into the arrays, and the tree structure is maintained through indices ", + "rather than object references." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "caca7934-32fe-46ca-a460-8c7b86669cf8", + "metadata": {}, + "outputs": [], + "source": [ + "%pycat array_intersection_tree.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b04c5e39-efb2-4a18-982e-fc6202b238c2", + "metadata": {}, + "outputs": [], + "source": [ + "import array_intersection_tree" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23334ee0-0e42-4e06-97e3-190406cf5918", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a small database for demonstration\n", + "array_db = array_intersection_tree.create_db(size=10, max_end=500)\n", + "print(f\"Created array-based tree with {array_db.size()} intervals\")\n", + "print(\"\\nTree structure:\")\n", + "print(array_db)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59d765c7-c1c8-47a3-a433-4b996028297a", + "metadata": {}, + "outputs": [], + "source": [ + "# Demonstrate search functionality\n", + "query_interval = (25, 30)\n", + "results = []\n", + "array_db.search(query_interval, results)\n", + "print(f\"Query interval {query_interval} intersects with:\")\n", + "for result in results:\n", + " print(f\" {result}\")\n", + "print(f\"\\nTotal intersections found: {len(results)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83dab894-b05e-4cc6-b5bb-ec9e7f853b9c", + "metadata": {}, + "outputs": [], + "source": [ + "# Performance benchmarking for array-based implementation\n", + "sizes = [2**i for i in range(7, 14)]\n", + "nr_repeats = 5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76b749f6-3425-4e8f-a055-fe1aad9a5a46", + "metadata": {}, + "outputs": [], + "source": [ + "random.seed(1234)\n", + "array_intersection_tree_times = [\n", + " timeit.repeat(\n", + " stmt='array_intersection_tree.execute_queries(queries, db)',\n", + " setup=f'queries, db = array_intersection_tree.create_queries({size}), 
array_intersection_tree.create_db({size})',\n", + " repeat=nr_repeats,\n", + " number=1,\n", + " globals=globals(),\n", + " ) for size in sizes\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7954528d-7222-4cf7-9c18-2477ce417381", + "metadata": {}, + "outputs": [], + "source": [ + "# Plot comparison of all implementations including array-based\n", + "plt.figure(figsize=(12, 8))\n", + "plt.boxplot(naive_times, widths=[40.0*size/100 for size in sizes], positions=sizes)\n", + "plt.plot(sizes, list(map(statistics.median, naive_times)), label='naive non-Pythonic', marker='o')\n", + "plt.boxplot(naive_pythonic_times, widths=[40.0*size/100 for size in sizes], positions=sizes)\n", + "plt.plot(sizes, list(map(statistics.median, naive_pythonic_times)), label='naive Pythonic', marker='s')\n", + "plt.boxplot(intersection_tree_times, widths=[40.0*size/100 for size in sizes], positions=sizes)\n", + "plt.plot(sizes, list(map(statistics.median, intersection_tree_times)), label='intersection tree (nodes)', marker='^')\n", + "plt.boxplot(array_intersection_tree_times, widths=[40.0*size/100 for size in sizes], positions=sizes)\n", + "plt.plot(sizes, list(map(statistics.median, array_intersection_tree_times)), label='intersection tree (arrays)', marker='d')\n", + "plt.legend()\n", + "plt.xlabel('Database size')\n", + "plt.ylabel('Execution time (seconds)')\n", + "plt.title('Performance Comparison: All Implementations')\n", + "plt.xscale('log')\n", + "plt.yscale('log')\n", + "plt.grid(True, alpha=0.3)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18e8dfa1-afe5-492d-8d78-cfc8e77c5f31", + "metadata": {}, + "outputs": [], + "source": [ + "# Compare array-based vs node-based intersection tree performance\n", + "array_vs_node_ratio = statistics.mean(array_intersection_tree_times[-1])/statistics.mean(intersection_tree_times[-1])\n", + "print(f\"Array-based vs Node-based tree performance ratio: 
{array_vs_node_ratio:.3f}\")\n", + "print(f\"Array-based vs Naive performance ratio: {statistics.mean(array_intersection_tree_times[-1])/statistics.mean(naive_times[-1]):.3f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "7da78d0a-1737-4f86-96b3-b605e3819d21", + "metadata": {}, + "source": [ + "### Analysis\n", + "\n", + "The array-based implementation provides similar algorithmic performance to the node-based intersection tree ", + "with $\\mathcal{O}(N \\log N)$ expected time complexity for $|Q| = |D| = N$. The main differences are:\n", + "\n", + "- **Memory layout**: Arrays can provide better cache locality than scattered node objects\n", + "- **Memory overhead**: Reduced per-node memory overhead (no object headers)\n", + "- **Capacity management**: Uses dynamic array resizing instead of individual node allocation\n", + "\n", + "The performance difference depends on factors such as dataset size, query patterns, and system architecture." + ] + }, { "cell_type": "markdown", "id": "eff509fe-8612-4ccf-bd91-d6fb7535f551", @@ -1194,6 +1354,7 @@ "source": [ "import intersection_tree\n", "import naive_intersectionic_queries\n", + "import array_intersection_tree\n", "\n", "random.seed(1234)\n", "max_end = 1_000_000\n", @@ -1204,14 +1365,21 @@ "]\n", "queries = intersection_tree.create_queries(size=nr_queries, max_end=max_end)\n", "\n", + "# Test node-based intersection tree\n", "db = intersection_tree.populate_db(None, db_intervals)\n", "db_results = intersection_tree.execute_queries(queries, db)\n", "\n", + "# Test naive implementation\n", "naive_db = db_intervals\n", "naive_db_result = naive_intersectionic_queries.execute_queries(queries, naive_db)\n", "\n", - "assert len(db_results) == len(naive_db_result)\n", - "assert set(db_results) == set(naive_db_result)" + "# Test array-based intersection tree\n", + "array_db = array_intersection_tree.populate_db(None, db_intervals)\n", + "array_db_results = array_intersection_tree.execute_queries(queries, array_db)\n", + "\n", + "# Verify 
all implementations produce identical results\n", + "assert len(db_results) == len(naive_db_result) == len(array_db_results)\n", + "assert set(db_results) == set(naive_db_result) == set(array_db_results)" ] }, { @@ -1219,7 +1387,7 @@ "id": "eb4b24f3-f503-4ecf-9c10-0c371b3119b2", "metadata": {}, "source": [ - "Both algorithms yield identical results." + "All algorithms yield identical results." ] } ], @@ -1244,4 +1412,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file