In [1]:
import os
import pandas as pd
from pathlib import Path
import requests
import json

In [2]:
# Add the name of a directory to download the re-engineered data STRO 2.0
# The re-engineered STRO data are found at: https://doi.org/10.6084/m9.figshare.27176202
# We will download it below using the figshare API
directory = r"C:\STRO200"

In [4]:
# This cell is based on info found at: https://help.figshare.com/article/how-to-use-the-figshare-api#api-interface
# Because of their size (1.52 GB), it takes a few minutes to download the files.
item_id = "27176202" # This is the item ID of the figshare project STRO 2.0 (re-engineered Sound Toll Registers Online data)
BASE_URL = 'https://api.figshare.com/v2'
file_info = [] # A blank list to hold all the file metadata
r = requests.get(BASE_URL + '/articles/' + item_id + '/files')
file_metadata = json.loads(r.text)
for j in file_metadata: # Add the item id to each file record- this is used later to name a folder to save the file to
    j['item_id'] = item_id
    file_info.append(j) # Add the file metadata to the list

# Download each file to the directory indicated above
for k in file_info:
    response = requests.get(BASE_URL + '/file/download/' + str(k['id']))
    Path(directory).mkdir(exist_ok=True)
    open(directory + '/' + k['name'], 'wb').write(response.content)
    
print('Done. All files are downloaded.')

Done. All files are downloaded.


In [7]:
def compute_shape_density(directory):
    """
    Compute the shape and density of DataFrames from CSV files in a given directory.
    Parameters: directory (str): The directory containing the CSV files.
    Returns: A Pandas DataFrame specifying the name of each CSV file, its shape, and density.
    """
    shapeDensity = []
    for subdir, dirs, files in os.walk(directory):
        if subdir.endswith('_MACOSX'):
            pass
        else:
            try:
                for file in files:
                    filepath = os.path.join(subdir, file)
                    # load data into a pandas dataframe
                    df = pd.read_csv(filepath, sep=';', quotechar='"', encoding="utf-8", low_memory=False)
                    # compute shape of the dataframe
                    shape = df.shape
                    # print(f"The shape of {file} is: ", df.shape)
                    # create a sparse array based on the dataframe
                    sparseArray = df.apply(pd.arrays.SparseArray)
                    # compute the ratio of non-sparse points to total (dense) data points
                    density = sparseArray.sparse.density
                    # print(f"The density of {file} is: ", sparr.sparse.density)
                    shapeDensity.append([file, shape, f"{density:.2f}"])
            except Exception as e:
                print(f"Error processing {file}: {e}")
    return pd.DataFrame(shapeDensity, columns=['File', 'Shape', 'Density'])               

In [8]:
# takes a while to execute for all dataframes in STRO 2.0
result = compute_shape_density(directory)

Error processing ERD_STRO20.jpg: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte


In [9]:
result

Unnamed: 0,File,Shape,Density
0,cargoes_measurement.csv,"(5732724, 6)",0.84
1,cargoes_regs.csv,"(5569030, 5)",1.0
2,customs_entries.csv,"(2152705, 7)",0.95
3,departure.csv,"(2132215, 4)",1.0
4,destination.csv,"(2149349, 4)",1.0
