In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = []

In [3]:
def parse_filename(filename):
    # csorting_timsort_normal1000KBpartialsorting.txt
    name, _ = os.path.splitext(filename)
    parts = name.split("_")
    language = parts[0]  # Example: "csorting"
    algorithm = parts[1]  # Example: "timsort"

    rest = parts[2]  # normal1000KBpartialsorting
    distribution = "".join([c for c in rest if not c.isdigit()]).replace("KB", "").split("partialsorting")[0]
    size = int("".join([c for c in rest if c.isdigit()]))  # Extract size as an integer
    organization = "partialsorting" if "partialsorting" in rest else "unsorted" if "unsorted" in rest else "fullysorted"

    return language, algorithm, distribution, size, organization

In [4]:
def compute_stats(file_path):
    with open(file_path, 'r') as file:
        numbers = [float(line.strip()) for line in file if line.strip()]
        mean = np.mean(numbers)
        q1 = np.percentile(numbers, 25)
        q3 = np.percentile(numbers, 75)
        std_dev = np.std(numbers)
        med = np.median(numbers)
        return mean, q1, q3, std_dev, med

In [5]:
folder_path = "/Users/aliciayang/Python Scripts/timing_measurements"

In [6]:
for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        
        language, algorithm, distribution, size, organization = parse_filename(filename)
        (mean, q1, q3, std_dev, med) = compute_stats(file_path)
        
        data.append([language, algorithm, distribution, size, organization, mean, q1, q3, std_dev, med])

In [7]:
columns = ["Language", "Algorithm", "Distribution", "DataSize", "Organization", "Mean", "Q1", "Q3", "SD", "Median"]
df = pd.DataFrame(data, columns=columns)

# df.to_csv("medians.csv", index=False)

print(df.groupby(["Algorithm", "DataSize"])["Median"].mean())

Algorithm  DataSize
quicksort  1           1.545249e+07
           10          8.428816e+08
           100         7.681731e+10
           1000        2.631127e+08
radixsort  1           1.380716e+08
           10          1.382152e+09
           100         1.433399e+10
           1000        1.990187e+08
timsort    1           1.637096e+05
           10          1.998786e+06
           100         2.505245e+07
           1000        2.730797e+08
Name: Median, dtype: float64


In [8]:
df.head()

Unnamed: 0,Language,Algorithm,Distribution,DataSize,Organization,Mean,Q1,Q3,SD,Median
0,cppsorting,quicksort,lognormalnosorting,10,fullysorted,1603721.0,1573045.0,1608222.0,61541.0,1585199.5
1,csorting,timsort,uniform,1,partialsorting,175209.8,167603.2,178550.2,15955.75,173598.0
2,csorting,radixsort,lognormalfullsorting,1000,fullysorted,196089700.0,195358900.0,196620400.0,1965156.0,195904781.0
3,python3pythonsortingpy,timsort,uniformnosorting,100,fullysorted,59978850.0,59681570.0,60045810.0,461276.5,59918061.0
4,python3pythonsortingpy,radixsort,lognormal,1,partialsorting,413802600.0,410778800.0,414901200.0,6129199.0,412056020.5
