In [None]:
import os

"""
00 - Configuration
"""

# Name of the dataset to process. Both paths below are derived from it so that
# switching datasets is a one-line change and the input/output can never drift apart.
dataset_name = "11.7_yolo_20_consumer"

# Input: zip archive that is scanned for .csv files by the processing cell.
dataset_zip_path = f"../../data/raw_datasets/{dataset_name}.zip"
# Output: folder that receives the combined .feather file (created if missing).
output_folder = f"../../data/processed/{dataset_name}/"


In [None]:
"""
01 - Process yolo dataset

- Combine all .csv-files in the given .zip to a DataFrame
- Save the dataframe to output_path
"""


import zipfile
import time
import pandas as pd

# Reads every .csv member of `dataset_zip_path`, concatenates them into `df`,
# and writes the result to "<output_folder>/yolo_qos.feather".
# Raises ValueError if the archive contains no .csv files.

# NOTE(review): these three dicts are never read or written in the visible cells;
# they are kept only in case something outside this view still refers to them.
names_container = {}
values_container = {}
timestamps_container = {}

# Open as zip
with zipfile.ZipFile(dataset_zip_path, 'r') as zip_ref:

    # Get a list of all files in the zip
    items = zip_ref.namelist()

    # Get a list of all .csv-files in the zip
    # NOTE: Sorting does not matter, but may be useful for debugging
    csv_files = sorted((x for x in items if x.endswith('.csv')), reverse=True)

    # Fail early with a clear message; pd.concat on an empty list would only
    # report "No objects to concatenate".
    if not csv_files:
        raise ValueError(f"No .csv files found in {dataset_zip_path}")

    # perf_counter is monotonic, so elapsed times cannot jump if the system clock changes.
    start_time = time.perf_counter()

    # Iterate over all csv-files
    dataframes = []
    for count, path in enumerate(csv_files, start=1):
        with zip_ref.open(path) as csv_file:
            dataframes.append(pd.read_csv(csv_file))

        # Report after the read so that `count` files have really been processed
        # when the average per-file time is computed.
        elapsed = time.perf_counter() - start_time
        print(f"Progress {count}/{len(csv_files):6}, ({count/len(csv_files)*100:5.3} %) (time_spent: {elapsed:.3} s  - avg: {elapsed / count} s)")

# The archive is closed at this point; everything below works on in-memory data.

# Combine data to a single DataFrame, order it by index and move the index
# into a regular column called "timestamp".
# NOTE(review): read_csv is called without index_col, so unless pandas infers an
# index column from the file layout, "timestamp" will hold per-file row numbers
# rather than real timestamps — confirm this is intended.
df = (
    pd.concat(dataframes)
    .sort_index()
    .reset_index(drop=False, names=["timestamp"])
)

os.makedirs(output_folder, exist_ok=True)
output_path = os.path.join(output_folder, "yolo_qos.feather")
df.to_feather(output_path)
print(f"Saved to {output_path}")



In [None]:
"""
Quick plot to partially visualize the data.
"""
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter of the `queue` column over `start_time`, one colour per `source`.
sns.scatterplot(
    data=df,
    x="start_time",
    y="queue",
    hue="source",
)
# Write the current figure to disk (path is relative to the working directory).
plt.savefig("yolo_queue.png")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter of q = end_time - start_time over `start_time`, one colour per `source`.
# The extra column is added to a copy so that `df` itself stays unchanged.
df2 = df.copy()
df2["q"] = df2["end_time"] - df2["start_time"]

sns.scatterplot(y="q", x="start_time", hue="source", data=df2)
# Fixed y-range: rows with q outside 0..200 are not visible in this plot.
plt.ylim(0, 200)

# Saved under its own file name. This cell used to write "yolo_queue.png" as well,
# which silently overwrote the `queue` scatter plot saved by the previous cell.
plt.savefig("yolo_q.png")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Working copy of `df` with one extra column: q = end_time - start_time.
df2 = df.assign(q=df["end_time"] - df["start_time"])

# One ECDF per column, split by `source`. plt.show() flushes each figure so the
# next plot starts on a fresh one.
for column in ("start_time", "end_time"):
    sns.ecdfplot(data=df2, y=column, hue="source")
    plt.show()

# Deliberately left as the cell's final expression (no plt.show()), exactly as before.
sns.ecdfplot(data=df2, y="q", hue="source")
