# **SPACE AND TIME COMPLEXITY T-TEST**

Import needed packages

In [40]:
import math
import os
import pandas as pd
from scipy import stats

Read the CSVs containing the measured space and time complexity results

In [41]:
# Load both result tables and key each one by its "Input size" column so the
# two tables can be compared row by row.
developed_df = pd.read_csv("developed/developed-complexity-result.csv").set_index("Input size")
existing_df = pd.read_csv("existing/existing-complexity-result.csv").set_index("Input size")

Verify tables

In [42]:
# Show the table read from "developed/developed-complexity-result.csv" (indexed by "Input size").
developed_df

Unnamed: 0_level_0,Space,Time
Input size,Unnamed: 1_level_1,Unnamed: 2_level_1
20,999.78125,69.196857
40,1043.421875,136.102185
60,1212.65625,203.081744
80,1119.171875,272.150621
100,973.1875,358.68025
120,942.59375,416.420019
140,976.65625,480.869472
160,937.84375,548.813015
180,864.453125,619.767451
200,975.6875,686.481075


In [43]:
# Show the table read from "existing/existing-complexity-result.csv" (indexed by "Input size").
existing_df

Unnamed: 0_level_0,Space,Time
Input size,Unnamed: 1_level_1,Unnamed: 2_level_1
20,445.1875,0.614236
40,445.21875,0.594085
60,445.4375,0.600756
80,445.546875,0.601893
100,445.640625,0.611381
120,446.09375,0.705962
140,446.34375,0.657288
160,446.375,0.670021
180,446.390625,0.678678
200,447.046875,0.673917


Compute difference via paired t-test

In [44]:
# Paired (dependent-samples) t-test on the "Space" column of both tables.
# `alternative` is left at SciPy's default (two-sided), unlike the one-tailed
# critical-value check performed manually later in this notebook.
space_ttest = stats.ttest_rel(
    a=developed_df["Space"],
    b=existing_df["Space"],
)

space_ttest

TtestResult(statistic=17.752741048517994, pvalue=2.59087916630951e-08, df=9)

In [45]:
# Paired (dependent-samples) t-test on the "Time" column of both tables.
# `alternative` is left at SciPy's default (two-sided), unlike the one-tailed
# critical-value check performed manually later in this notebook.
time_ttest = stats.ttest_rel(
    a=developed_df["Time"],
    b=existing_df["Time"],
)

time_ttest

TtestResult(statistic=5.744451881989359, pvalue=0.00027823485295741545, df=9)

Combine both test results into a single table

In [46]:
# Collect the statistic and p-value of both SciPy results into one table with
# one column per metric and one row per reported quantity.
result_df = pd.DataFrame({
    "Metrics": ["Statistic", "p-value"],
    "Space": [space_ttest.statistic, space_ttest.pvalue],
    "Time": [time_ttest.statistic, time_ttest.pvalue],
}).set_index("Metrics")

result_df

Unnamed: 0_level_0,Space,Time
Metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
Statistic,17.75274,5.744452
p-value,2.590879e-08,0.000278


Save result to CSV

In [47]:
# Write the combined SciPy results next to the notebook, then confirm completion.
result_df.to_csv(path_or_buf="t-test.csv")
print("Done!")

Done!


## **BREAKDOWN**

Reproducing the same result manually, step by step:

Mean differences:

In [48]:
def get_diff_mean(df_a: pd.DataFrame, df_b: pd.DataFrame, column: str) -> float:
    """
    Return the mean of the element-wise differences `df_a[column] - df_b[column]`.

    The two columns are subtracted index by index (pandas aligns them on their
    index), and the resulting differences are averaged.

    Parameters:
        df_a: table holding the minuend column.
        df_b: table holding the subtrahend column.
        column: name of the column to read from both tables.

    Returns:
        A single number: the average of the per-row differences (not the
        Series of differences itself).
    """
    differences = df_a[column] - df_b[column]
    return differences.mean()

In [49]:
# Mean of (developed - existing) for each metric, in the order Space, Time.
space_diff_mean, time_diff_mean = (
    get_diff_mean(developed_df, existing_df, metric)
    for metric in ("Space", "Time")
)

Standard deviation of the differences for each metric:

In [50]:
def get_sd(df_a: pd.DataFrame, df_b: pd.DataFrame, column: str, mean) -> float:
    """
    Return the sample standard deviation of `df_a[column] - df_b[column]`.

    Parameters:
        df_a: table holding the minuend column.
        df_b: table holding the subtrahend column.
        column: name of the column to read from both tables.
        mean: mean of the differences; it is used as given and is not
            recomputed here.

    Returns:
        sqrt(sum((difference - mean)^2) / (n - 1)), where n is the number of
        rows in `df_a[column]`.
    """
    differences = df_a[column].subtract(df_b[column])
    # n - 1 in the divisor gives the sample (not population) standard deviation.
    degrees_of_freedom = len(df_a[column]) - 1

    # Accumulate the squared deviations one at a time, in row order.
    squared_deviation_total = 0
    for difference in differences:
        squared_deviation_total += (difference - mean) ** 2

    return math.sqrt(squared_deviation_total / degrees_of_freedom)

In [51]:
# Standard deviation of (developed - existing) for each metric, reusing the
# mean differences computed earlier.
space_sd = get_sd(
    df_a=developed_df, df_b=existing_df, column="Space", mean=space_diff_mean
)
time_sd = get_sd(
    df_a=developed_df, df_b=existing_df, column="Time", mean=time_diff_mean
)

Finally, for t-test output:

In [52]:
# t = mean difference / (sd of differences / sqrt(n)).
# NOTE: this rebinds `space_ttest` from the SciPy result object to a plain
# number, so the earlier cell that indexes `space_ttest[0]` cannot be re-run
# after this one without first re-running the SciPy cell.
space_ttest = space_diff_mean / (
    space_sd / math.sqrt(len(developed_df["Space"]))
)
space_ttest

17.752741048517994

In [53]:
# t = mean difference / (sd of differences / sqrt(n)).
# n is taken from `developed_df`, matching the Space statistic and the `n`
# reported in the variables table; the two tables have the same number of
# rows, so the value is unchanged.
# NOTE: this rebinds `time_ttest` from the SciPy result object to a plain
# number, so the earlier cell that indexes `time_ttest[0]` cannot be re-run
# after this one without first re-running the SciPy cell.
time_ttest = time_diff_mean / (
    time_sd / math.sqrt(len(developed_df["Time"]))
)
time_ttest

5.744451881989359

Significance check against the critical value (no p-value is computed in this manual version):

In [54]:
# One-tailed paired t-test at alpha level 0.05. The t-table value for df = 9
# (1.833) is used with a negative sign, so a metric is flagged as significant
# only when its t-statistic (developed minus existing) falls BELOW -1.833,
# i.e. this is a left-tailed check.
# NOTE(review): the original note said the concern is changes "in the positive
# direction"; as written, positive t-statistics are never flagged. Confirm the
# intended direction of the test.
# Source: https://www.sjsu.edu/faculty/gerstman/StatPrimer/t-table.pdf
critical_value = -1.833

space_is_significant, time_is_significant = (
    t_statistic < critical_value for t_statistic in (space_ttest, time_ttest)
)

space_is_significant, time_is_significant

(False, False)

Compile all values for uniformity:

In [55]:
# Make sure the output folder exists before anything is written into it.
# `exist_ok=True` keeps this safe to re-run and replaces the separate
# "check, then create" steps with a single call.
os.makedirs("manual", exist_ok=True)

# One row per intermediate quantity of the manual computation, one column per
# metric: the t-statistic, the sample size n, the mean difference d, the
# critical value, and the standard deviation of the differences.
variables_df = pd.DataFrame({
    "Variable": ["T-statistic", "n", "d", "Critical value", "Sd"],
    "Space": [space_ttest, len(developed_df["Space"]), space_diff_mean, critical_value, space_sd],
    "Time": [time_ttest, len(developed_df["Time"]), time_diff_mean, critical_value, time_sd],
}).set_index("Variable")

variables_df.to_csv("manual/variables.csv")
print("Done!")

Done!


In [56]:
# Summary of the manual test: one row per metric with its t-statistic, the
# shared critical value, and the outcome of the significance check.
# Written into the "manual" folder, which the previous cell creates.
ttest_df = pd.DataFrame({
    "Metric": ["Space", "Time"],
    "T-statistic": [space_ttest, time_ttest],
    "Critical value": [critical_value] * 2,
    "Is significant": [space_is_significant, time_is_significant],
}).set_index("Metric")

ttest_df.to_csv("manual/t-test.csv")
print("Done!")

Done!
