In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.formula.api as sm


In [None]:

# Read the first results file: N varies while D = 2 and K stays fixed.
df_1 = pd.read_csv(filepath_or_buffer="1vary_N_Fixed_D2_and_K.csv")

In [None]:
# Split the results into one frame per data structure.
# .copy() makes each subset an independent DataFrame: the AverageTime column of
# these frames is overwritten later in the notebook, and assigning into a
# filtered slice of df_1 would otherwise raise SettingWithCopyWarning.
df1_kd = df_1[df_1['dataStructure'] == 'KD'].copy()
df1_quad = df_1[df_1['dataStructure'] == 'Quad'].copy()
df1_bucket = df_1[df_1['dataStructure'] == 'Bucket'].copy()

In [None]:
# One figure holding the timing curve of every data structure.
plt.figure(figsize=(10, 6))

# AverageTime is recorded in microseconds; plot it in seconds.
kd_avg_time_sec = df1_kd['AverageTime'] / 1000000
quad_avg_time_sec = df1_quad['AverageTime'] / 1000000
bucket_avg_time_sec = df1_bucket['AverageTime'] / 1000000

# (x values, y values, legend label) for each curve, drawn in this order.
curves = (
    (df1_kd['numTrainingPoints'], kd_avg_time_sec, 'KD Tree'),
    (df1_quad['numTrainingPoints'], quad_avg_time_sec, 'Quad Tree'),
    (df1_bucket['numTrainingPoints'], bucket_avg_time_sec, 'Bucket Tree'),
)
for x_values, y_values, legend_label in curves:
    plt.plot(x_values, y_values, marker='o', label=legend_label)

# Title, axis labels and legend.
plt.title(' Comparison of Average Time\nWhere N Varies and D = 2 with Fixed K')
plt.xlabel('numTrainingPoints')
plt.ylabel('Average Time (seconds)')
plt.legend()

# Render the figure.
plt.show()



Linear regression

In [None]:
# Convert AverageTime from microseconds to seconds for the regression cells.
# Each frame is rebuilt from the raw df_1 rather than divided in place, so
# re-running this cell does not divide by 1000000 a second time, and .assign
# returns a new frame instead of writing into a filtered slice
# (which raised SettingWithCopyWarning).
df1_kd = df_1[df_1['dataStructure'] == 'KD'].assign(
    AverageTime=lambda frame: frame['AverageTime'] / 1000000
)
df1_quad = df_1[df_1['dataStructure'] == 'Quad'].assign(
    AverageTime=lambda frame: frame['AverageTime'] / 1000000
)
df1_bucket = df_1[df_1['dataStructure'] == 'Bucket'].assign(
    AverageTime=lambda frame: frame['AverageTime'] / 1000000
)

In [None]:
# Ordinary least squares fit of AverageTime against numTrainingPoints,
# one model per data structure.
regression_formula = 'AverageTime ~ numTrainingPoints'
lr_model_kd = sm.ols(formula=regression_formula, data=df1_kd).fit()
lr_model_quad = sm.ols(formula=regression_formula, data=df1_quad).fit()
lr_model_bucket = sm.ols(formula=regression_formula, data=df1_bucket).fit()

# (frame, fitted model, panel title), listed top to bottom.
panels = (
    (df1_kd, lr_model_kd, 'KD Tree\nAverageTime vs numTrainingPoints'),
    (df1_quad, lr_model_quad, 'QUAD Tree\nAverageTime vs numTrainingPoints'),
    (df1_bucket, lr_model_bucket, 'Bucket Tree\nAverageTime vs numTrainingPoints'),
)

plt.figure(figsize=(10, 10))
for row, (frame, model, heading) in enumerate(panels, start=1):
    plt.subplot(3, 1, row)
    # Draw the model's in-sample predictions as the regression line.
    plt.plot(frame['numTrainingPoints'], model.predict(), color='red')
    plt.xlabel('numTrainingPoints')
    plt.ylabel('AverageTime(seconds)')
    plt.title(heading)

# Keep titles and axis labels of neighbouring panels from overlapping.
plt.tight_layout()
plt.show()


SUMMARY

In [None]:
# Show the OLS fit summary for the KD Tree model (AverageTime ~ numTrainingPoints, data from df_1).
lr_model_kd.summary()

In [None]:
# Show the OLS fit summary for the Quad Tree model (AverageTime ~ numTrainingPoints, data from df_1).
lr_model_quad.summary()

In [None]:
# Show the OLS fit summary for the Bucket Tree model (AverageTime ~ numTrainingPoints, data from df_1).
lr_model_bucket.summary()

In [None]:
# Read the second results file: K varies while D = 2 and N stays fixed.
df_2 = pd.read_csv(filepath_or_buffer='2vary_K_Fixed_D2_and_N.csv')

In [None]:
# Split the results into one frame per data structure.
# .copy() makes each subset an independent DataFrame: the AverageTime column of
# these frames is overwritten later in the notebook, and assigning into a
# filtered slice of df_2 would otherwise raise SettingWithCopyWarning.
df2_kd = df_2[df_2['dataStructure'] == 'KD'].copy()
df2_quad = df_2[df_2['dataStructure'] == 'Quad'].copy()
df2_bucket = df_2[df_2['dataStructure'] == 'Bucket'].copy()

# Display the available column names as the cell output.
df2_bucket.columns

In [None]:
# Convert AverageTime from microseconds to seconds.
# Each frame is rebuilt from the raw df_2 rather than divided in place, so
# re-running this cell does not divide by 1000000 a second time, and .assign
# returns a new frame instead of writing into a filtered slice
# (which raised SettingWithCopyWarning).
df2_kd = df_2[df_2['dataStructure'] == 'KD'].assign(
    AverageTime=lambda frame: frame['AverageTime'] / 1000000
)
df2_quad = df_2[df_2['dataStructure'] == 'Quad'].assign(
    AverageTime=lambda frame: frame['AverageTime'] / 1000000
)
df2_bucket = df_2[df_2['dataStructure'] == 'Bucket'].assign(
    AverageTime=lambda frame: frame['AverageTime'] / 1000000
)

# One figure holding the timing curve of every data structure.
plt.figure(figsize=(10, 6))

# (frame, legend label) for each curve, drawn in this order.
curves = (
    (df2_kd, 'KD Tree'),
    (df2_quad, 'Quad Tree'),
    (df2_bucket, 'Bucket Tree'),
)
for frame, legend_label in curves:
    plt.plot(frame['K'], frame['AverageTime'], marker='o', label=legend_label)

plt.title('Comparison of Average Time\nWhere K Varies and D = 2 with Fixed N')
plt.xlabel('K')
plt.ylabel('Average Time (seconds)')
plt.legend()

plt.show()


Linear regression where K varies and D = 2 with fixed N

In [None]:
# Ordinary least squares fit of AverageTime against K,
# one model per data structure.
regression_formula = 'AverageTime ~ K'
lr_model_kd2 = sm.ols(formula=regression_formula, data=df2_kd).fit()
lr_model_quad2 = sm.ols(formula=regression_formula, data=df2_quad).fit()
lr_model_bucket2 = sm.ols(formula=regression_formula, data=df2_bucket).fit()

# (frame, fitted model, panel title), listed top to bottom.
panels = (
    (df2_kd, lr_model_kd2, 'KD Tree\nAverageTime vs K'),
    (df2_quad, lr_model_quad2, 'QUAD Tree\nAverageTime vs K'),
    (df2_bucket, lr_model_bucket2, 'Bucket Tree\nAverageTime vs K'),
)

plt.figure(figsize=(10, 10))
for row, (frame, model, heading) in enumerate(panels, start=1):
    plt.subplot(3, 1, row)
    # Draw the model's in-sample predictions as the regression line.
    plt.plot(frame['K'], model.predict(), color='red')
    plt.xlabel('K')
    plt.ylabel('AverageTime(seconds)')
    plt.title(heading)

# Keep titles and axis labels of neighbouring panels from overlapping.
plt.tight_layout()
plt.show()


SUMMARY when varying K with dim = 2 and fixed N

In [None]:
# Show the OLS fit summary for the KD Tree model (AverageTime ~ K, data from df_2).
lr_model_kd2.summary()

In [None]:
# Show the OLS fit summary for the Quad Tree model (AverageTime ~ K, data from df_2).
lr_model_quad2.summary()

In [None]:
# Show the OLS fit summary for the Bucket Tree model (AverageTime ~ K, data from df_2).
lr_model_bucket2.summary()

In [None]:
# Read the third results file: D varies while N and K stay fixed.
df_3 = pd.read_csv(filepath_or_buffer='3vary_D_Fixed_N_and_K.csv')

# Preview the first 30 rows as the cell output.
df_3.head(n=30)

In [None]:
# Split the results into one frame per data structure.
# .copy() makes each subset an independent DataFrame: the AverageTime column of
# these frames is overwritten later in the notebook, and assigning into a
# filtered slice of df_3 would otherwise raise SettingWithCopyWarning.
df3_kd = df_3[df_3['dataStructure'] == 'KD'].copy()
df3_bucket = df_3[df_3['dataStructure'] == 'Bucket'].copy()

In [None]:
# Preview the first rows of the Bucket subset of df_3.
df3_bucket.head()

In [None]:
# AverageTime is recorded in microseconds; plot it in seconds.
kd_avg_time_sec3 = df3_kd['AverageTime'] / 1000000
bucket_avg_time_sec3 = df3_bucket['AverageTime'] / 1000000

# One figure holding the timing curve of both data structures.
plt.figure(figsize=(10, 6))

# (x values, y values, legend label) for each curve, drawn in this order.
curves = (
    (df3_kd['dim'], kd_avg_time_sec3, 'KD Tree'),
    (df3_bucket['dim'], bucket_avg_time_sec3, 'Bucket Tree'),
)
for x_values, y_values, legend_label in curves:
    plt.plot(x_values, y_values, marker='o', label=legend_label)

plt.title('Comparison of Average Time\nWhere D Varies with Fixed K and with Fixed N')
plt.xlabel('Dim')
plt.ylabel('Average Time (seconds)')
plt.legend()

plt.show()

Linear regression with df_3

In [None]:
# Convert AverageTime from microseconds to seconds for the regression cells.
# Each frame is rebuilt from the raw df_3 rather than divided in place, so
# re-running this cell does not divide by 1000000 a second time, and .assign
# returns a new frame instead of writing into a filtered slice
# (which raised SettingWithCopyWarning).
df3_kd = df_3[df_3['dataStructure'] == 'KD'].assign(
    AverageTime=lambda frame: frame['AverageTime'] / 1000000
)
df3_bucket = df_3[df_3['dataStructure'] == 'Bucket'].assign(
    AverageTime=lambda frame: frame['AverageTime'] / 1000000
)

In [None]:
# Ordinary least squares fit of AverageTime against dim,
# one model per data structure.
lr_model_kd3 = sm.ols(formula='AverageTime ~ dim', data=df3_kd).fit()
lr_model_bucket3 = sm.ols(formula='AverageTime ~ dim', data=df3_bucket).fit()

# (frame, fitted model, panel title), listed top to bottom.
panels = (
    (df3_kd, lr_model_kd3, 'KD Tree\nAverageTime vs dim'),
    (df3_bucket, lr_model_bucket3, 'Bucket Tree\nAverageTime vs dim'),
)

# Only two structures are compared here, so the grid has two rows; the
# previous 3-row grid left the bottom third of the figure empty.
fig, axes = plt.subplots(nrows=len(panels), ncols=1, figsize=(10, 10))
for ax, (frame, model, heading) in zip(axes, panels):
    # Draw the model's in-sample predictions as the regression line.
    ax.plot(frame['dim'], model.predict(), color='red')
    ax.set_xlabel('dim')
    ax.set_ylabel('AverageTime(seconds)')
    ax.set_title(heading)

# Keep titles and axis labels of neighbouring panels from overlapping.
fig.tight_layout()
plt.show()

SUMMARY when varying dimension with fixed K and fixed N

In [None]:
# Show the OLS fit summary for the KD Tree model (AverageTime ~ dim, data from df_3).
lr_model_kd3.summary()

In [None]:
# Show the OLS fit summary for the Bucket Tree model (AverageTime ~ dim, data from df_3).
lr_model_bucket3.summary()

In [None]:
# Read the fourth results file: K varies while D and N stay fixed.
df_4 = pd.read_csv(filepath_or_buffer='4vary_K_Fixed_D_and_N.csv')


In [None]:
# Split the results into one frame per data structure.
# .copy() makes each subset an independent DataFrame: the AverageTime column of
# these frames is overwritten later in the notebook, and assigning into a
# filtered slice of df_4 would otherwise raise SettingWithCopyWarning.
df4_kd = df_4[df_4['dataStructure'] == 'KD'].copy()
df4_bucket = df_4[df_4['dataStructure'] == 'Bucket'].copy()

In [None]:
# Convert AverageTime from microseconds to seconds.
# Each frame is rebuilt from the raw df_4 rather than divided in place, so
# re-running this cell does not divide by 1000000 a second time, and .assign
# returns a new frame instead of writing into a filtered slice
# (which raised SettingWithCopyWarning).
df4_kd = df_4[df_4['dataStructure'] == 'KD'].assign(
    AverageTime=lambda frame: frame['AverageTime'] / 1000000
)
df4_bucket = df_4[df_4['dataStructure'] == 'Bucket'].assign(
    AverageTime=lambda frame: frame['AverageTime'] / 1000000
)

In [None]:
# One figure holding the timing curve of both data structures.
plt.figure(figsize=(10, 6))

# (frame, legend label) for each curve, drawn in this order.
curves = (
    (df4_kd, 'KD Tree'),
    (df4_bucket, 'Bucket Tree'),
)
for frame, legend_label in curves:
    plt.plot(frame['K'], frame['AverageTime'], marker='o', label=legend_label)

plt.title('Comparison of Average Time\nWhere K Varies with Fixed D and with Fixed N')
plt.xlabel('K')
plt.ylabel('Average Time (seconds)')
plt.legend()

plt.show()

Linear regression on df_4, where K varies with fixed D and fixed N

In [None]:
# Ordinary least squares fit of AverageTime against K,
# one model per data structure.
lr_model_kd4 = sm.ols(formula='AverageTime ~ K', data=df4_kd).fit()
lr_model_bucket4 = sm.ols(formula='AverageTime ~ K', data=df4_bucket).fit()

# (frame, fitted model, panel title), listed top to bottom.
panels = (
    (df4_kd, lr_model_kd4, 'KD Tree\nAverageTime vs K'),
    (df4_bucket, lr_model_bucket4, 'Bucket Tree\nAverageTime vs K'),
)

# Only two structures are compared here, so the grid has two rows; the
# previous 3-row grid left the bottom third of the figure empty.
fig, axes = plt.subplots(nrows=len(panels), ncols=1, figsize=(10, 10))
for ax, (frame, model, heading) in zip(axes, panels):
    # Draw the model's in-sample predictions as the regression line.
    ax.plot(frame['K'], model.predict(), color='red')
    ax.set_xlabel('K')
    ax.set_ylabel('AverageTime(seconds)')
    ax.set_title(heading)

# Keep titles and axis labels of neighbouring panels from overlapping.
fig.tight_layout()
plt.show()

SUMMARY when varying K with fixed dimension and fixed N

In [None]:
# Show the OLS fit summary for the KD Tree model (AverageTime ~ K, data from df_4).
lr_model_kd4.summary()

In [None]:
# Show the OLS fit summary for the Bucket Tree model (AverageTime ~ K, data from df_4).
lr_model_bucket4.summary()

In [None]:
# Read the fifth results file (named for N varying with fixed K and D).
df_5 = pd.read_csv("5vary_N_Fixed_K_and_D.csv")

# Split the results into one frame per data structure.
# .copy() makes each subset an independent DataFrame: the AverageTime column of
# these frames is overwritten later in the notebook, and assigning into a
# filtered slice of df_5 would otherwise raise SettingWithCopyWarning.
df5_kd = df_5[df_5['dataStructure'] == 'KD'].copy()
df5_bucket = df_5[df_5['dataStructure'] == 'Bucket'].copy()

In [None]:
# AverageTime is recorded in microseconds; plot it in seconds.
kd_avg_time_sec5 = df5_kd['AverageTime'] / 1000000
bucket_avg_time_sec5 = df5_bucket['AverageTime'] / 1000000

# One figure holding the timing curve of both data structures.
plt.figure(figsize=(10, 6))

# (x values, y values, legend label) for each curve, drawn in this order.
curves = (
    (df5_kd['numTrainingPoints'], kd_avg_time_sec5, 'KD Tree'),
    (df5_bucket['numTrainingPoints'], bucket_avg_time_sec5, 'Bucket Tree'),
)
for x_values, y_values, legend_label in curves:
    plt.plot(x_values, y_values, marker='o', label=legend_label)

plt.title('Comparison of Average Time\nWhere N Varies with Fixed D and with Fixed K')
plt.xlabel('numTrainingPoints')
plt.ylabel('Average Time (seconds)')
plt.legend()

plt.show()

Linear regression where N varies with fixed D and fixed K


In [None]:
# Convert AverageTime from microseconds to seconds for the regression cells.
# Each frame is rebuilt from the raw df_5 rather than divided in place, so
# re-running this cell does not divide by 1000000 a second time, and .assign
# returns a new frame instead of writing into a filtered slice
# (which raised SettingWithCopyWarning).
df5_kd = df_5[df_5['dataStructure'] == 'KD'].assign(
    AverageTime=lambda frame: frame['AverageTime'] / 1000000
)
df5_bucket = df_5[df_5['dataStructure'] == 'Bucket'].assign(
    AverageTime=lambda frame: frame['AverageTime'] / 1000000
)

In [None]:
# Ordinary least squares fit of AverageTime against numTrainingPoints,
# one model per data structure.
lr_model_kd5 = sm.ols(formula='AverageTime ~ numTrainingPoints', data=df5_kd).fit()
lr_model_bucket5 = sm.ols(formula='AverageTime ~ numTrainingPoints', data=df5_bucket).fit()

# (frame, fitted model, panel title), listed top to bottom.
panels = (
    (df5_kd, lr_model_kd5, 'KD Tree\nAverageTime vs numTrainingPoints'),
    (df5_bucket, lr_model_bucket5, 'Bucket Tree\nAverageTime vs numTrainingPoints'),
)

# Only two structures are compared here, so the grid has two rows; the
# previous 3-row grid left the bottom third of the figure empty.
fig, axes = plt.subplots(nrows=len(panels), ncols=1, figsize=(10, 10))
for ax, (frame, model, heading) in zip(axes, panels):
    # Draw the model's in-sample predictions as the regression line.
    ax.plot(frame['numTrainingPoints'], model.predict(), color='red')
    ax.set_xlabel('numTrainingPoints')
    ax.set_ylabel('AverageTime(seconds)')
    ax.set_title(heading)

# Keep titles and axis labels of neighbouring panels from overlapping.
fig.tight_layout()
plt.show()

SUMMARY when varying N with fixed K and fixed D

In [None]:
# Show the OLS fit summary for the KD Tree model (AverageTime ~ numTrainingPoints, data from df_5).
lr_model_kd5.summary()

In [None]:
# Show the OLS fit summary for the Bucket Tree model (AverageTime ~ numTrainingPoints, data from df_5).
lr_model_bucket5.summary()