# Data Organizer

A collection of code snippets that help organize and manage MIAPPE-compliant metadata tables.

In [56]:
# Example to map a PPEO Excel file into JSON-LD format

import pandas as pd
import json



PATH = r'/home/gryvity/Desktop/workstation/lab/MIAPPEx/scripts/notebooks/test_data/example_01_v1.1_ext.xlsx'

# Sheet names this cell treats as standard; every other sheet in the
# workbook is ignored below.
standard_sheet_names = ['Investigation',
                        'Datafile',
                        'Person',
                        'Study',
                        'Observation Unit',
                        'Observed Variable',
                        'Biological Material',
                        'Environment',
                        'Factor',
                        'Event',
                        'Sample']

# Parse the workbook exactly once: with sheet_name=None pandas returns a
# dictionary that maps every sheet name to its DataFrame.
workbook = pd.read_excel(PATH, sheet_name=None)
sheets = workbook.keys()

# For each standard sheet, collect its column names (the first column is
# skipped) keyed by sheet name.
column_names = {}
for sheet in sheets:
    if sheet in standard_sheet_names:
        # Reuse the frame that was already parsed instead of re-reading the file.
        df = workbook[sheet]
        column_names[sheet] = list(df.columns)[1:]

print(json.dumps(column_names, indent=4))





{
    "Investigation": [
        "Investigation unique ID",
        "Investigation title",
        "Investigation description",
        "Submission date",
        "Public release date",
        "License",
        "MIAFPE version",
        "Associated publication",
        "Metadata file version"
    ],
    "Person": [
        "Study unique ID",
        "Person name",
        "Person email",
        "Person ID",
        "Person role",
        "Person affiliation"
    ],
    "Study": [
        "Study unique ID",
        "Study title",
        "Study description",
        "Start date of study",
        "End date of study",
        "Contact institution",
        "Geographic location (country)",
        "Experimental site name",
        "Geographic location (latitude)",
        "Geographic location (longitude)",
        "Geographic location (altitude)",
        "Description of the experimental design",
        "Type of experimental design",
        "Observation unit level hierarchy",
      

In [60]:
import os 
import pandas as pd 
import re
from collections import defaultdict


class MIAPPExDataModel:
    """
    Static vocabulary used by the MIAPPEx readers.

    Every instance carries its own copy of:

    * ``_context`` -- namespace prefix to IRI mapping (ppeo, sosa, ssn).
    * ``_standard_sheet_names`` / ``_sosa_sheet_names`` -- sheet-name
      patterns; ``*`` is expanded to a wildcard by
      ``MIAPPExXLSX.standardize_sheet_name``.
    * ``std_sheets`` -- both pattern lists concatenated, MIAPPE first.
    * ``_classes_miappe`` / ``_classes_sosassn`` -- class name to
      prefixed term mapping.
    * ``_obj_properties_miappe`` / ``_obj_properties_sosassn`` and
      ``_Individuals`` -- currently hold only example entries
      (presumably templates to be filled in later).
    """
    def __init__(self):
        # Namespace prefixes and the IRIs they abbreviate.
        self._context = dict(
            ppeo='http://purl.org/ppeo/',
            sosa='http://www.w3.org/ns/sosa/',
            ssn='http://www.w3.org/ns/ssn/',
        )

        # Sheet-name patterns; '*' marks a position where extra characters
        # may appear in the real sheet name.
        miappe_patterns = [
            r'Investigation',
            r'Data*File',
            r'Person',
            r'Study',
            r'Observation*Unit',
            r'Observed*Variable',
            r'Biological*Material',
            r'Environment',
            r'*Factor',
            r'Event',
            r'Sample',
        ]
        sosa_patterns = [
            r'System',
            r'Sensor',
            r'Platform',
            r'Feature*Of*Interest',
            r'Procedure',
        ]
        self._standard_sheet_names = miappe_patterns
        self._sosa_sheet_names = sosa_patterns

        # Combined lookup list, kept as a separate list object.
        self.std_sheets = [*miappe_patterns, *sosa_patterns]

        # Class name -> prefixed ontology term.
        self._classes_miappe = dict(
            Investigation='ppeo:Investigation',
            DataFile='ppeo:Datafile',
            Person='ppeo:Person',
            Study='ppeo:Study',
            ObservationUnit='ppeo:ObservationUnit',
            ObservedVariable='ppeo:ObservedVariable',
            BiologicalMaterial='ppeo:BiologicalMaterial',
            Environment='ppeo:Environment',
            Factor='ppeo:Factor',
            Event='ppeo:Event',
            Sample='ppeo:Sample',
        )
        self._classes_sosassn = dict(
            System='ssn:System',
            Sensor='sosa:Sensor',
            Platform='sosa:Platform',
            FeatureOfInterest='sosa:FeatureOfInterest',
            Procedure='sosa:Procedure',
            Input='ssn:Input',
            Output='ssn:Output',
        )

        # Object properties: one example entry for MIAPPE, none for SOSA/SSN.
        self._obj_properties_miappe = {
            'name_01': dict(context='ppeo', accession='', domain=[], range=[]),
        }
        self._obj_properties_sosassn = dict()

        # Named individuals: a single example entry.
        self._Individuals = {
            "example_01": dict(
                type="exampleType",
                accession="http://example.org/individuals/example",
            ),
        }

    def load_ontology(self, path):
        """Not implemented yet: accepts *path* and does nothing."""
        return None
                                  

class MIAPPExXLSX(MIAPPExDataModel):
    """
    Reader for a MIAPPE(x) metadata workbook stored as an ``.xlsx`` file.

    Constructing an instance loads the workbook once, maps every sheet name
    to a standardized name, infers the MIAPPE version from the
    ``Investigation`` sheet and collects the column names of every sheet.

    Attributes:
        path: Path of the workbook as passed to the constructor.
        format: Lower-cased file extension of ``path`` (``'.xlsx'``).
        data: Dict mapping each sheet name to its ``DataFrame``.
        sheets: Mapping of each raw sheet name to its standardized name,
            ``'appendix'`` or ``'unknown'``.
        version: ``1.1`` or ``1.2`` (float), see ``get_version``.
        sheet_info: Empty dict; not populated by the current code.
        structure: Dict mapping each raw sheet name to its column names.
    """
    def __init__(self, path):
        """
        Load the workbook at *path* and derive its structure.

        Raises:
            ValueError: If the file extension is not ``.xlsx`` or the
                workbook has no sheet named ``Investigation``.
        """
        super().__init__()
        self.path = path
        self.validate_format()

        # Loading Data
        self.data = self.load()
        self.sheets = self.get_sheet_names()

        # Get MIAPPE Version
        self.version = self.get_version()

        # Collect Further Structure Information
        self.sheet_info = {}

        self.get_properties()

    # Loading File
    def validate_format(self):
        """
        Store the file extension in ``self.format`` and reject non-xlsx files.

        The extension is compared case-insensitively, so ``data.XLSX`` is
        accepted as well.

        Raises:
            ValueError: If the extension is anything other than ``.xlsx``.
        """
        self.format = os.path.splitext(self.path)[-1].lower()
        if self.format != '.xlsx':
            raise ValueError(f'Unsupported file format: {self.format}')

    def load(self):
        """
        Read every sheet of the workbook.

        Returns:
            Dict mapping sheet name to ``DataFrame`` (``sheet_name=None``).

        Raises:
            ValueError: If ``self.format`` is not ``.xlsx``; only Excel
                workbooks are supported so far.
        """
        if self.format != '.xlsx':
            raise ValueError(f'Unsupported file format: {self.format}')
        return pd.read_excel(self.path, sheet_name=None)

    # Get the Version
    def get_version(self):
        """
        Infer the MIAPPE version from the ``Investigation`` sheet.

        Returns:
            ``1.1`` if the sheet has a column named ``Field``, else ``1.2``.

        Raises:
            ValueError: If the workbook has no sheet named ``Investigation``.
        """
        if 'Investigation' not in self.data:
            raise ValueError('Investigation sheet not found in the data.')
        if 'Field' in self.data['Investigation'].columns:
            return 1.1
        return 1.2

    # Organizing On Sheet Level
    def get_sheet_names(self):
        """
        Map every sheet name of the workbook to its standardized name.

        Returns:
            ``defaultdict(str)`` keyed by the raw sheet name; sheets that
            match no known pattern are mapped to ``'unknown'``.
        """
        sheets = defaultdict(str)
        for sheet_name in self.data:
            # standardize_sheet_name returns False when nothing matches.
            sheets[sheet_name] = self.standardize_sheet_name(sheet_name) or 'unknown'
        return sheets

    def standardize_sheet_name(self, sheet_name):
        """
        Translate a raw sheet name into its standardized form.

        Any name containing ``appendix`` (case-insensitive) yields
        ``'appendix'``. Otherwise the name is matched, case-insensitively
        and over its full length, against each entry of ``self.std_sheets``
        with ``*`` acting as a wildcard; the first match wins.

        Returns:
            The matching pattern with the ``*`` characters removed,
            ``'appendix'``, or ``False`` when no pattern matches.
        """
        if 'appendix' in sheet_name.lower():
            return 'appendix'
        for std in self.std_sheets:
            pattern = re.compile(rf'^{std.replace("*", ".*")}$', re.IGNORECASE)
            if pattern.match(sheet_name):
                return std.replace("*", "")
        return False

    # Organizing on Property Level
    def get_properties(self):
        """
        Collect the column names of every sheet into ``self.structure``.

        For version 1.1 workbooks the first column of each sheet is skipped.
        The frames already held in ``self.data`` are reused, so the workbook
        is not read from disk again.
        """
        skip = 1 if self.version == 1.1 else 0
        self.structure = {
            sheet: list(self.data[sheet].columns)[skip:]
            for sheet in self.sheets
        }

        # Also get information about the Sheets
        
        


  
        
    

# if __name__ == "__main__":       
# Absolute base directory under which the example workbooks live.
base_path = "/home/gryvity/Desktop/workstation/lab/MIAPPEx/scripts/notebooks/"
example_files = [
    os.path.join(base_path, relative_path)
    for relative_path in (
        'test_data/example_01_v1.1_ext.xlsx',
        'test_data/example_02_v1.1_ext.xlsx',
        'test_data/example_01_v1.2.xlsx',
        'test_data/example_02_v1.2.xlsx',
    )
]

# Index 2 selects the first of the two v1.2 examples.
picked_example = example_files[2]

# Load the chosen workbook and show the column names found per sheet.
MIAPPEx_data = MIAPPExXLSX(picked_example)
MIAPPEx_data.structure

{'README': ['README',
  'MIAPPE Version 1.2',
  'Unnamed: 2',
  'Unnamed: 3',
  'Unnamed: 4',
  'Unnamed: 5',
  'Unnamed: 6',
  'Unnamed: 7',
  'Unnamed: 8',
  'Unnamed: 9',
  'Unnamed: 10',
  'Unnamed: 11',
  'Unnamed: 12',
  'Unnamed: 13',
  'Unnamed: 14',
  'Unnamed: 15',
  'Unnamed: 16',
  'Unnamed: 17',
  'Unnamed: 18',
  'Unnamed: 19',
  'Unnamed: 20',
  'Unnamed: 21',
  'Unnamed: 22',
  'Unnamed: 23',
  'Unnamed: 24'],
 'Investigation': ['investigationId',
  'investigationTitle',
  'investigationDescription',
  'submissionDate',
  'publicReleaseDate',
  'license',
  'miappeVersion',
  'associatedPublication',
  'variablesOntology'],
 'Study': ['studyId',
  'studyTitle',
  'studyDescription',
  'studyStartDate',
  'studyEndDate',
  'contactInst',
  'locationCountry',
  'siteName',
  'locationLatitude',
  'locationLongitude',
  'locationAltitude',
  'expeDesignDesc',
  'expeDesignType',
  'obsUnitLevelHierarchy',
  'obsUnitDesc',
  'growthFacilityDesc',
  'growthFacilityType',
  '

Matched standard sheet name: Factor with sheet name: Exp. Factor
