In [None]:
import os
import pandas as pd

In [None]:
# Assumption: the STRO data files have been downloaded from
# https://doi.org/10.6084/m9.figshare.27221169.v1 and all zip files have been
# extracted into one and the same folder.
# The resulting layout is a bit peculiar: each extracted folder has the same name
# as the file it contains. For example, 'images.csv.zip' is extracted as a folder
# 'images.csv', which in turn contains a file 'images.csv'.
# This is taken into account when iterating over the files of the original
# STRO data model (STRO 1.0).
directory = r"C:\STRO20\data" # root folder of the extracted data; update if necessary

In [None]:
def compute_shape_density(directory, sep=';', encoding="utf-8"):
    """
    Compute the shape and density of the DataFrames read from the files below a directory.

    The directory tree is walked recursively and every file found is loaded with
    ``pandas.read_csv``. Directories whose path ends with ``_MACOSX`` (the
    metadata folders macOS adds to zip archives) are skipped together with
    everything below them. A file that cannot be read or processed is reported
    on stdout and skipped; the remaining files are still processed.

    Parameters:
        directory (str): The root directory containing the CSV files.
        sep (str): Field separator passed to ``pandas.read_csv``. Defaults to ';'.
        encoding (str): Text encoding passed to ``pandas.read_csv``. Defaults to 'utf-8'.
    Returns: A Pandas DataFrame with the columns 'File' (file name without path),
        'Shape' (the ``(rows, columns)`` tuple of the loaded data) and 'Density'
        (the density formatted as a string with two decimals), one row per
        successfully processed file.
    """
    shapeDensity = []
    for subdir, dirs, files in os.walk(directory):
        if subdir.endswith('_MACOSX'):
            # Emptying `dirs` in place tells os.walk not to descend any further,
            # so the metadata files nested inside this folder are skipped as well.
            dirs[:] = []
            continue
        for file in files:
            filepath = os.path.join(subdir, file)
            # The try block is per file on purpose: one unreadable file must not
            # prevent the other files in the same folder from being processed.
            try:
                # load data into a pandas dataframe
                df = pd.read_csv(filepath, sep=sep, quotechar='"', encoding=encoding, low_memory=False)
                # convert every column into a sparse array
                sparseArray = df.apply(pd.arrays.SparseArray)
                # ratio of non-sparse points to total (dense) data points,
                # as reported by pandas' DataFrame.sparse accessor
                density = sparseArray.sparse.density
            except Exception as e:
                # best-effort batch job: report the problem and carry on
                print(f"Error processing {file}: {e}")
                continue
            shapeDensity.append([file, df.shape, f"{density:.2f}"])
    return pd.DataFrame(shapeDensity, columns=['File', 'Shape', 'Density'])

In [None]:
# Walk the data folder configured above and collect the shape and density of
# every file that could be read into one summary DataFrame.
result = compute_shape_density(directory)

In [None]:
# Bare expression: in a notebook this renders the summary table as the cell output.
result