## Generating Descriptive Statistics for CSVs

1. use mimetype on path

### Worker Task

1. given a guid
2. generate pandas.describe() output

In [1]:
import os
import sys
import pathlib

testPaths = pathlib.Path(".").absolute()
sys.path.append(str(testPaths.parent.parent))

from fairscape_mds.core.config import appConfig
from fairscape_mds.crud.fairscape_response import FairscapeResponse
from fairscape_mds.models.identifier import StoredIdentifier, MetadataTypeEnum
from fairscape_mds.models.user import UserWriteModel 
from fairscape_mds.crud.dataset import FairscapeDatasetRequest

In [7]:
from fairscape_models import IdentifierValue

In [10]:
IdentifierValue.model_validate(
	{
		"@id": "test",
		"@type": "type",
		"name": "a guid"
	}
)

IdentifierValue(guid='test', @type='type', name='a guid')

In [4]:
from fairscape_models.rocrate import ROCrateMetadataElem

In [6]:
ROCrateMetadataElem.schema()

/tmp/ipykernel_53803/4122892190.py:1: PydanticDeprecatedSince20: The `schema` method is deprecated; use `model_json_schema` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  ROCrateMetadataElem.schema()


{'$defs': {'IdentifierValue': {'additionalProperties': True,
   'properties': {'@id': {'title': '@Id', 'type': 'string'}},
   'required': ['@id'],
   'title': 'IdentifierValue',
   'type': 'object'}},
 'additionalProperties': True,
 'description': 'Metadata Element of ROCrate that represents the crate as a whole\n\nExample\n    ```\n    {\n        \'@id\': \'https://fairscape.net/ark:59852/rocrate-2.cm4ai_chromatin_mda-mb-468_untreated_imageembedfold1_initialrun0.1alpha\',\n        \'@type\': [\'Dataset\', \'https://w3id.org/EVI#ROCrate\'],\n        \'name\': \'Initial integration run\',\n        \'description\': \'Ideker Lab CM4AI 0.1 alpha MDA-MB-468 untreated chromatin Initial integration run IF Image Embedding IF microscopy images embedding fold1\',\n        \'keywords\': [\'Ideker Lab\', \'fold1\'],\n        \'isPartOf\': [\n            {\'@id\': \'ark:/Ideker_Lab\'}, \n            {\'@id\': \'ark:/Ideker_Lab/CM4AI\'}\n            ],\n        \'version\': \'0.5alpha\',\n        \'

In [1]:
import mimetypes

In [6]:
mimetypes.guess_type("default/test/rocrates/paclitaxel.zip/paclitaxel/manifest.csv")

('text/csv', None)

In [9]:
rocrateGUID = "ark:59852/rocrate-example-premo-light-20250506201408"

cursor = appConfig.identifierCollection.find(
	{
		"metadata.isPartOf.@id": rocrateGUID,
		"@type": str(MetadataTypeEnum.DATASET.value)
	},
	projection={
		"_id": False
	}
)

for elem in cursor:
	identifier = StoredIdentifier.model_validate(elem)
	

In [20]:
import json

In [17]:
metadata = identifier.model_dump(mode='json')
stats = metadata['descriptiveStatistics']

In [21]:
json.dumps(stats['culture_neg_diag'])

'{"columnName": "culture_neg_diag", "statistics": {"count": 0.0, "mean": NaN, "std": NaN, "min": NaN, "first_quartile": NaN, "second_quartile": NaN, "third_quartile": NaN, "max": NaN}}'

In [1]:
import mimetypes

In [8]:
examplePath = "default/test/rocrates/Example.zip/Example/Demo PreMo_export_2025-05-06.csv"
datasetMimetype, _ = mimetypes.guess_type(examplePath)
datasetMimetype

'text/csv'

In [6]:
if datasetMimetype == 'text/csv':
	print("is a csv")

is a csv


In [2]:
datasetRequestFactory = FairscapeDatasetRequest(appConfig)

In [24]:
import pandas
import numpy

In [4]:
# given an ark
guid = "ark:59852/dataset-example-data export-55c2016c"


In [7]:
# get presigned link
getLink = datasetRequestFactory.getPresignedGetLink(guid)
# fix host for local testing

print(getLink)

http://localhost:9000/default/default/test/rocrates/Example.zip/Example/Demo%20PreMo_export_2025-05-06.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=miniotestadmin%2F20251105%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20251105T194501Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host%3Bx-minio-extract&X-Amz-Signature=c47fdc3e07d4acd58a07ce4f6d03fe88148a88ff0dc196a33e6d900399cbfde6


In [43]:
from io import BytesIO
from typing import Dict

In [None]:
df

Unnamed: 0,subject_number,demograph_site_num,demograph_birth_ga_weeks,demograph_birth_ga_days,demograph_gender,demograph_race,demograph_ethnicity,demographics_complete,delivery_birth_weight,delivery_apgar_1_min,...,death_cause___1,death_cause___2,death_cause___3,death_cause___4,death_cause___5,death_cause___6,death_cause___7,death_cause___8,death_cause___9,discharge_or_death_complete
0,1,1,24,4,1,1,0,2,2000,,...,1,0,0,0,0,0,0,0,0,2
1,2,2,30,1,0,2,1,2,2400,,...,0,0,0,0,0,0,0,0,0,2


In [None]:
pandas.DataFrame

In [None]:
def generateSummaryStatistics(guid: str)-> Dict[str, DescriptiveStatistics]:

	datasetContent = datasetRequestFactory.loadDatasetContent(guid)

	dataframe = pandas.read_csv(BytesIO(
		datasetContent
	))
	
	statistics = {}
	numColumns = dataframe.shape[1]

	for i in range(numColumns):
		series = dataframe.iloc[:, i]

		if pandas.api.types.is_numeric_dtype(series):
			summaryStats = generateNumericalStatistics(series)
		else:
			summaryStats = generateCategoricalStatistics(series)
		
		statistics[summaryStats.columnName] = summaryStats

	return statistics


0    1
1    2
Name: subject_number, dtype: int64

{'subject_number': DescriptiveStatistics(columnName='subject_number', statistics=NumericalStatistics(count=2.0, mean=1.5, std=0.7071067811865476, min=1.0, first_quartile=1.25, second_quartile=1.5, third_quartile=1.75, max=2.0)),
 'demograph_site_num': DescriptiveStatistics(columnName='demograph_site_num', statistics=NumericalStatistics(count=2.0, mean=1.5, std=0.7071067811865476, min=1.0, first_quartile=1.25, second_quartile=1.5, third_quartile=1.75, max=2.0)),
 'demograph_birth_ga_weeks': DescriptiveStatistics(columnName='demograph_birth_ga_weeks', statistics=NumericalStatistics(count=2.0, mean=27.0, std=4.242640687119285, min=24.0, first_quartile=25.5, second_quartile=27.0, third_quartile=28.5, max=30.0)),
 'demograph_birth_ga_days': DescriptiveStatistics(columnName='demograph_birth_ga_days', statistics=NumericalStatistics(count=2.0, mean=2.5, std=2.1213203435596424, min=1.0, first_quartile=1.75, second_quartile=2.5, third_quartile=3.25, max=4.0)),
 'demograph_gender': DescriptiveSta

In [30]:
# using dtype attribute of the series
isinstance(series.dtype, numpy.dtypes.Int64DType)

# using pandas.api.types.is_numeric_dtype
pandas.api.types.is_numeric_dtype(series)

True

True

numpy.dtypes.Int64DType

In [100]:
descriptiveStats

Unnamed: 0,subject_number,demograph_site_num,demograph_birth_ga_weeks,demograph_birth_ga_days,demograph_gender,demograph_race,demograph_ethnicity,demographics_complete,delivery_birth_weight,delivery_apgar_1_min,...,death_cause___1,death_cause___2,death_cause___3,death_cause___4,death_cause___5,death_cause___6,death_cause___7,death_cause___8,death_cause___9,discharge_or_death_complete
count,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
mean,1.5,1.5,27.0,2.5,0.5,1.5,0.5,2.0,2200.0,,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
std,0.707107,0.707107,4.242641,2.12132,0.707107,0.707107,0.707107,0.0,282.842712,,...,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,1.0,1.0,24.0,1.0,0.0,1.0,0.0,2.0,2000.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
25%,1.25,1.25,25.5,1.75,0.25,1.25,0.25,2.0,2100.0,,...,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
50%,1.5,1.5,27.0,2.5,0.5,1.5,0.5,2.0,2200.0,,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
75%,1.75,1.75,28.5,3.25,0.75,1.75,0.75,2.0,2300.0,,...,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
max,2.0,2.0,30.0,4.0,1.0,2.0,1.0,2.0,2400.0,,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [61]:
series = df.iloc[:,0]

In [28]:
series.dtype

dtype('int64')

In [26]:
series.name

'subject_number'

In [32]:
from pydantic import BaseModel, Field
from typing import Union, Optional

# context for STATO
statoContext = {
	"count": "",
	"mean": "http://purl.obolibrary.org/obo/STATO_0000573",
	"std": "http://purl.obolibrary.org/obo/STATO_0000684",
	"min":  "http://purl.obolibrary.org/obo/STATO_0000150",
	"first_quartile": "",
	"second_quartile": "",
	"third_quartile": "",
	"max": ""	
}

class NumericalStatistics(BaseModel):
	count: float
	mean: float
	std: float
	min: float
	first_quartile: float = Field(alias="25%")
	second_quartile: float = Field(alias="50%")
	third_quartile: float = Field(alias="75%")
	max: float

	def serializeStato(self):
		""" """ 
		pass

class CategoricalStatistics(BaseModel):
	count: int
	unique: int
	top: Optional[Union[str,bool]] = Field(default=None)
	freq: int

	def serializeStato(self):
		""" """ 
		pass

class DescriptiveStatistics(BaseModel):
	columnName: str
	statistics: Union[NumericalStatistics, CategoricalStatistics] 


def generateNumericalStatistics(series):

	descriptiveStats = series.describe()

	numericStats = NumericalStatistics.model_validate(descriptiveStats.to_dict(),by_alias=True)

	return DescriptiveStatistics.model_validate({
		'columnName': descriptiveStats.name,
		'statistics': numericStats
	})

def generateCategoricalStatistics(series) -> DescriptiveStatistics:
	describeSeries = series.describe()

	categoricalDict = describeSeries.to_dict()

	if categoricalDict.get('top') is None:
		categoricalDict['top'] = None

	categoricalStats = CategoricalStatistics.model_validate(categoricalDict)

	return DescriptiveStatistics.model_validate({
		'columnName': describeSeries.name,
		'statistics': categoricalStats
	})

In [70]:

descriptiveStats = series.describe()
numericStats = NumericalStatistics.model_validate(descriptiveStats.to_dict(),by_alias=True)

In [104]:
categorialDF = pandas.DataFrame({
	"categorical": ["A", "B", "C", "B", "A"],
	"boolean": [True, False, True, False, True]
	})

catSeries = categorialDF.iloc[:,0]
generateCategoricalStatistics(catSeries)

#catSeries = categorialDF.iloc[:,1]
#catSeries.describe()
generateCategoricalStatistics(catSeries)

DescriptiveStatistics(columnName='categorical', statistics=CategoricalStatistics(count=5, unique=3, top='A', freq=2))

In [105]:
catSeries.dtype

dtype('O')

In [None]:
{"descriptiveStatistics": {
	"columna": {
		""
	}
}}

In [None]:
pandas.dtype

pandas.core.dtypes.dtypes.CategoricalDtype

In [38]:
print(categorialDF.iloc[:, 0].dtype)

object


In [54]:
series = categorialDF.iloc[:, 0]

In [None]:
## Testing on 