In [4]:
import os
import pandas as pd
from pathlib import Path
import requests
import json

In [14]:
# Name of the local directory that the STRO 1.0 legacy data will be downloaded to.
# The legacy data are published at: https://doi.org/10.6084/m9.figshare.27221169.v2
# They are downloaded further down in this notebook using the figshare API.
# Replace the placeholder below with a directory name before running the following cells.
directory = r"[ADD_DIR_NAME]" # for example, 'STRO10'

In [15]:
# This cell is based on info found at: https://help.figshare.com/article/how-to-use-the-figshare-api#api-interface
# Because of their size (1.06 GB), it takes a few minutes to download the files.
item_id = "27221169" # This is the item ID of the figshare project STRO 1.0 (legacy data of Sound Toll Registers Online)
BASE_URL = 'https://api.figshare.com/v2'
REQUEST_TIMEOUT = 60 # seconds to wait for the server before giving up instead of hanging forever
CHUNK_SIZE = 1024 * 1024 # stream the downloads in 1 MiB pieces

# Fetch the metadata of all files attached to the figshare item.
r = requests.get(BASE_URL + '/articles/' + item_id + '/files', timeout=REQUEST_TIMEOUT)
r.raise_for_status() # fail loudly on an HTTP error instead of parsing an error page
file_metadata = r.json()

file_info = [] # A list holding the metadata record of every file
for j in file_metadata:
    j['item_id'] = item_id # tag each file record with the item ID it belongs to
    file_info.append(j)

# Create the target directory once, before any download starts.
download_dir = Path(directory)
download_dir.mkdir(parents=True, exist_ok=True)

# Download each file to the directory indicated above.
for k in file_info:
    target = download_dir / k['name']
    # stream=True avoids holding a whole (potentially very large) file in memory;
    # the `with` blocks make sure the connection and the file handle are closed.
    with requests.get(BASE_URL + '/file/download/' + str(k['id']), stream=True, timeout=REQUEST_TIMEOUT) as response:
        response.raise_for_status() # do not save an HTTP error page as if it were data
        with open(target, 'wb') as out_file:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                out_file.write(chunk)

print('Done. All files are downloaded.')

All done. If you are using Colab, you will find the downloaded files by clicking the small folder icon in the left sidebar.


In [18]:
def compute_shape_density(directory):
    """
    Compute the shape and density of DataFrames from CSV files in a given directory.

    The directory is walked recursively. Directories whose name ends with
    '_MACOSX' (macOS archive residue) are skipped together with everything
    below them. Every remaining file is read with pandas as a comma-separated,
    UTF-8 encoded CSV file. A file that cannot be processed is reported on
    standard output and skipped; it does not stop the processing of other files.
    Directories and files are visited in sorted order, so the result is reproducible.

    Parameters: directory (str): The directory containing the CSV files.
    Returns: A Pandas DataFrame specifying the name of each CSV file, its shape
        (a (rows, columns) tuple), and its density (a string with two decimals).
    """
    shapeDensity = []
    for subdir, dirs, files in os.walk(directory):
        # Prune in place so os.walk does not descend into '_MACOSX' directories at all;
        # sorting makes the traversal order independent of the file system.
        dirs[:] = sorted(d for d in dirs if not d.endswith('_MACOSX'))
        if subdir.endswith('_MACOSX'):
            # only reachable for the top-level directory itself; its files are skipped
            continue
        for file in sorted(files):
            filepath = os.path.join(subdir, file)
            # Handle errors per file, so that one bad file does not discard the rest.
            try:
                # load data into a pandas dataframe
                # specify separator if necessary (e.g. sep=';')
                df = pd.read_csv(filepath, sep=',', quotechar='"', encoding="utf-8", low_memory=False)
                # compute shape of the dataframe
                shape = df.shape
                # create sparse columns based on the dataframe
                # NOTE: SparseArray's default fill value depends on the column dtype
                # (NaN for float, 0 for int, False for bool according to the pandas
                # documentation), so zeros in integer columns count as sparse points.
                sparseArray = df.apply(pd.arrays.SparseArray)
                # compute the ratio of non-sparse points to total (dense) data points
                density = sparseArray.sparse.density
                shapeDensity.append([file, shape, f"{density:.2f}"])
            except Exception as e:
                print(f"Error processing {file}: {e}")
    return pd.DataFrame(shapeDensity, columns=['File', 'Shape', 'Density'])

In [19]:
# Compute the shape and density of every file found under `directory`.
# This takes a while to execute for all six large dataframes in STRO 1.0.
result = compute_shape_density(directory)

In [20]:
# Display the per-file summary (file name, shape, and density).
result

Unnamed: 0,File,Shape,Density
0,belastingen.csv,"(2919233, 11)",0.61
1,doorvaarten.csv,"(2152705, 53)",0.37
2,images.csv,"(2197401, 5)",0.74
3,ladingen.csv,"(5569030, 16)",0.55
4,registers_totaal.csv,"(1116, 8)",0.92
5,secties_totaal.csv,"(4621, 9)",0.89
