1+ #!/usr/bin/env python3
2+ '''
3+ Focused analysis of build time performance comparing node-based and array-based
4+ intersection tree implementations. This script specifically addresses the question:
5+ "The query performance of the array-based implementation is lower than that of the
6+ naive implementation, but what about the build time? How does insertion of intervals
7+ stack up between the two approaches?"
8+ '''
9+
10+ import random
11+ import time
12+ import typing
13+ import gc
14+ import intersection_tree as node_based
15+ import array_intersection_tree as array_based
16+ import naive_intersectionic_queries as naive
17+
18+
def time_build_process(intervals: list[tuple[int, int]], implementation: str) -> tuple[float, float]:
    '''Time the build process for a given implementation.

    Parameters
    ----------
    intervals: list[tuple[int, int]]
        list of intervals to insert
    implementation: str
        either 'node', 'array', or 'naive'

    Returns
    -------
    tuple[float, float]
        (total_build_time, average_insertion_time), both in seconds; the
        average is 0.0 when intervals is empty

    Raises
    ------
    ValueError
        if implementation is not one of the supported names
    '''
    # Resolve the builder first so an unknown name fails before any garbage
    # collection or timing work is done.
    if implementation == 'node':
        build, build_args = node_based.populate_db, (None, intervals)
    elif implementation == 'array':
        build, build_args = array_based.populate_db, (None, intervals)
    elif implementation == 'naive':
        # Naive doesn't have populate_db; create_db is only given the number
        # of intervals, not the intervals themselves.
        build, build_args = naive.create_db, (len(intervals),)
    else:
        raise ValueError(f"Unknown implementation: { implementation } ")

    gc.collect()  # Clean up before timing

    # perf_counter is monotonic and high resolution; the wall clock
    # (time.time) can jump and may be too coarse for short builds.
    start_time = time.perf_counter()
    db = build(*build_args)
    build_time = time.perf_counter() - start_time
    # The result is kept referenced until the clock has stopped so that its
    # deallocation is never part of the measured interval.
    del db

    avg_insertion_time = build_time / len(intervals) if intervals else 0.0
    return build_time, avg_insertion_time
56+
57+
def incremental_insertion_test(max_size: int = 10000, step: int = 1000) -> None:
    '''Print a build-time comparison table for growing interval counts.

    For every size from step up to max_size (inclusive, in increments of
    step) the same seeded interval set is built with the node-based,
    array-based and naive implementations, and one table row with the three
    build times and their pairwise ratios is printed.

    Parameters
    ----------
    max_size: int
        maximum number of intervals to test
    step: int
        step size for increasing interval count
    '''

    def seeded_build_time(kind: str, data: list) -> float:
        # Re-seed before every build so each implementation starts from the
        # same random state.
        random.seed(42)
        total, _ = time_build_process(data, kind)
        return total

    def ratio(numerator: float, denominator: float) -> float:
        # A zero denominator is reported as infinity instead of raising.
        if denominator > 0:
            return numerator / denominator
        return float('inf')

    print("Incremental Insertion Performance Test")
    print("=" * 60)
    print("Size\t Node Build\t Array Build\t Naive Build\t Node/Array\t Node/Naive\t Array/Naive")
    print("-" * 95)

    for size in range(step, max_size + 1, step):
        # Generate same intervals for all tests
        random.seed(42)
        intervals = [node_based.generate_interval(1_000_000) for _ in range(size)]

        node_build_time = seeded_build_time('node', intervals)
        array_build_time = seeded_build_time('array', intervals)
        naive_build_time = seeded_build_time('naive', intervals)

        node_array_ratio = ratio(node_build_time, array_build_time)
        node_naive_ratio = ratio(node_build_time, naive_build_time)
        array_naive_ratio = ratio(array_build_time, naive_build_time)

        print(f"{ size } \t { node_build_time :.4f} s\t { array_build_time :.4f} s\t { naive_build_time :.4f} s\t { node_array_ratio :.2f} x\t { node_naive_ratio :.2f} x\t { array_naive_ratio :.2f} x")
98+
99+
def single_insertion_timing_test(num_trials: int = 1000) -> None:
    '''Test individual insertion timing.

    The first generated interval seeds each tree; the remaining
    num_trials - 1 intervals are inserted one by one inside the timed
    region, and totals, per-insertion averages and the node/array ratio
    are printed.

    Parameters
    ----------
    num_trials: int
        number of intervals to generate; must be at least 1. With exactly
        1 there are no timed insertions and the averages are reported as 0.

    Raises
    ------
    ValueError
        if num_trials is smaller than 1
    '''
    # At least one interval is needed to create the trees; without this
    # check the function would fail later with an IndexError.
    if num_trials < 1:
        raise ValueError(f"num_trials must be at least 1, got {num_trials}")

    print(f"\n Single Insertion Timing Test ({ num_trials } trials)")
    print("=" * 50)

    # Prepare intervals
    random.seed(42)
    intervals = [node_based.generate_interval(1_000_000) for _ in range(num_trials)]
    # Slice once, outside the timed regions, so the list copy is not measured.
    first_interval, timed_intervals = intervals[0], intervals[1:]

    # Test node-based individual insertions
    node_tree = node_based.Node(first_interval)
    gc.collect()

    start_time = time.perf_counter()
    for interval in timed_intervals:
        node_tree.insert(interval)
    node_insertion_time = time.perf_counter() - start_time

    # Test array-based individual insertions
    array_tree = array_based.ArrayTree()
    array_tree.insert(first_interval)
    gc.collect()

    start_time = time.perf_counter()
    for interval in timed_intervals:
        array_tree.insert(interval)
    array_insertion_time = time.perf_counter() - start_time

    # Guard the averages: num_trials == 1 means zero timed insertions.
    timed_count = len(timed_intervals)
    node_avg = node_insertion_time / timed_count if timed_count else 0.0
    array_avg = array_insertion_time / timed_count if timed_count else 0.0
    speedup = node_insertion_time / array_insertion_time if array_insertion_time > 0 else float('inf')

    print(f"Node-based total time: { node_insertion_time :.4f} s")
    print(f"Array-based total time: { array_insertion_time :.4f} s")
    print(f"Node-based avg per insertion: { node_avg * 1000 :.3f} ms")
    print(f"Array-based avg per insertion: { array_avg * 1000 :.3f} ms")
    print(f"Insertion speedup (Node/Array): { speedup :.2f} x")
143+
144+
def _estimate_tree_size(root) -> int:
    '''Roughly estimate the bytes used by a node-based tree (shallow sizes only).'''
    import sys

    if root is None:
        return 0

    total = 0
    # Explicit stack instead of recursion so a very deep (degenerate) tree
    # cannot hit the interpreter's recursion limit.
    pending = [root]
    while pending:
        node = pending.pop()
        total += sys.getsizeof(node)
        total += sys.getsizeof(node._start) + sys.getsizeof(node._end) + sys.getsizeof(node._max_end)
        for child_name in ('_left', '_right'):
            child = getattr(node, child_name, None)
            if child:
                pending.append(child)
    return total


def memory_efficiency_during_build(sizes: list[int] = [1000, 5000, 10000]) -> None:
    '''Analyze memory efficiency during the build process.

    For every size a seeded interval set is built with the node-based and
    the array-based implementation. Build time is measured, memory is
    estimated with sys.getsizeof, and one table row is printed. getsizeof
    is shallow, so the memory figures are rough approximations.

    Parameters
    ----------
    sizes: list[int]
        list of database sizes to test (only iterated, never modified)
    '''
    import sys

    print(f"\n Memory Efficiency During Build")
    print("=" * 40)
    print("Size\t Node Memory\t Array Memory\t Savings\t Node Build\t Array Build\t Mem/Time Ratio")
    print("-" * 85)

    for size in sizes:
        # Generate intervals
        random.seed(42)
        intervals = [node_based.generate_interval(1_000_000) for _ in range(size)]

        # Build and measure node-based
        random.seed(42)
        gc.collect()
        start_time = time.perf_counter()
        node_db = node_based.populate_db(None, intervals)
        node_build_time = time.perf_counter() - start_time

        # Estimate node memory (rough approximation)
        node_memory = _estimate_tree_size(node_db)

        # Build and measure array-based
        random.seed(42)
        gc.collect()
        start_time = time.perf_counter()
        array_db = array_based.populate_db(None, intervals)
        array_build_time = time.perf_counter() - start_time

        # Container object plus the five per-field sequences it exposes.
        array_memory = sys.getsizeof(array_db) + sum(
            sys.getsizeof(getattr(array_db, column))
            for column in ('start', 'end', 'max_end', 'left', 'right')
        )

        savings = (node_memory - array_memory) / node_memory * 100 if node_memory > 0 else 0
        mem_time_ratio_node = node_memory / node_build_time if node_build_time > 0 else float('inf')
        mem_time_ratio_array = array_memory / array_build_time if array_build_time > 0 else float('inf')
        ratio_improvement = mem_time_ratio_node / mem_time_ratio_array if mem_time_ratio_array > 0 else float('inf')

        print(f"{ size } \t { node_memory / 1024 :.0f} KB\t { array_memory / 1024 :.0f} KB\t { savings :.1f} %\t { node_build_time :.4f} s\t { array_build_time :.4f} s\t { ratio_improvement :.2f} x")
208+
209+
def _timed_call(func, *args):
    '''Run func(*args) after a garbage collection; return (result, elapsed_seconds).'''
    gc.collect()
    start_time = time.perf_counter()
    result = func(*args)
    elapsed = time.perf_counter() - start_time
    return result, elapsed


def build_vs_query_tradeoff_analysis(sizes: list[int] = [1000, 5000, 10000], num_queries: int = 100) -> None:
    '''Analyze the tradeoff between build time and query time.

    For every size, a seeded set of intervals and queries is generated.
    Each implementation's database is built once; that single build is
    timed and the resulting database is then reused for the timed query
    run, so no tree is constructed twice.

    Parameters
    ----------
    sizes: list[int]
        list of database sizes to test (only iterated, never modified)
    num_queries: int
        number of queries to execute
    '''
    print(f"\n Build vs Query Performance Tradeoff Analysis")
    print("=" * 60)
    print("Size\t Node Build\t Array Build\t Node Query\t Array Query\t Build Advantage\t Query Disadvantage")
    print("-" * 95)

    for size in sizes:
        # Generate test data
        random.seed(42)
        intervals = [node_based.generate_interval(1_000_000) for _ in range(size)]
        queries = [node_based.generate_interval(1_000_000) for _ in range(num_queries)]

        # Node-based timing
        random.seed(42)
        node_db, node_build_time = _timed_call(node_based.populate_db, None, intervals)
        _, node_query_time = _timed_call(node_based.execute_queries, queries, node_db)

        # Array-based timing
        random.seed(42)
        array_db, array_build_time = _timed_call(array_based.populate_db, None, intervals)
        _, array_query_time = _timed_call(array_based.execute_queries, queries, array_db)

        # Both figures are the array-based time relative to the node-based
        # time, in percent; a positive value means array-based took longer.
        build_advantage = (array_build_time - node_build_time) / node_build_time * 100 if node_build_time > 0 else 0
        query_disadvantage = (array_query_time - node_query_time) / node_query_time * 100 if node_query_time > 0 else 0

        print(f"{ size } \t { node_build_time :.4f} s\t { array_build_time :.4f} s\t { node_query_time :.4f} s\t { array_query_time :.4f} s\t { build_advantage :+.1f} %\t { query_disadvantage :+.1f} %")
256+
257+
def main():
    '''Run every build-time analysis in sequence and print a closing summary.'''
    divider = "=" * 70

    print("Build Time Performance Analysis")
    print("Comparing Node-based vs Array-based Intersection Trees")
    print(divider)

    # The four analyses, in the order they are reported.
    incremental_insertion_test(max_size=10000, step=1000)
    single_insertion_timing_test(num_trials=1000)
    memory_efficiency_during_build()
    build_vs_query_tradeoff_analysis()

    print("\n " + divider)
    summary_lines = (
        "SUMMARY:",
        "- Array-based implementation may have different build time characteristics",
        "- Individual insertion performance comparison completed",
        "- Memory efficiency during build process analyzed",
        "- Build vs Query performance tradeoff quantified",
    )
    for line in summary_lines:
        print(line)
    print(divider)
283+
284+
# Script entry point: run the full analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
0 commit comments