## Bioschemas JSON-LD and website page comparison

This notebook scans the Bioschemas `specifications` folder for JSON-LD files and compares them with the specification pages listed in the `_profiles` and `_types` folders of the Bioschemas website (`bioschemas.github.io/pages`). The goal is to identify specifications that are published on the website but have no corresponding JSON-LD file. The results are written to `results/missing_jsonld.tsv`.

In [1]:
import os
import requests
import pandas as pd

In [9]:
# All locations are resolved relative to the notebook's working directory:
# the repositories being compared are expected to sit next to it, inside
# the same parent folder.
tmp_dir = os.getcwd()
parent_dir = os.path.split(tmp_dir)[0]

# Entries of the parent folder (the sibling checkouts).
available = os.listdir(parent_dir)

# Folder holding one subfolder per specification, each with JSON-LD files.
spec_dir = os.path.join(parent_dir, 'specifications')

# Website page folders for profiles and types; both live under the same
# "bioschemas.github.io/pages" directory and differ only in the last part.
profile_dir, types_dir = (
    os.path.join(parent_dir, 'bioschemas.github.io', 'pages', leaf)
    for leaf in ('_profiles', '_types')
)


C:\Users\gtsueng\Anaconda3\envs\outbreak\bioschemas\bioschemas.github.io\pages\_types


In [24]:
def list_json_specs(spec_dir):
    """List the JSON-LD files found under each specification folder.

    Every entry of ``spec_dir`` is treated as a specification folder that is
    expected to contain a ``jsonld`` subfolder; each file in that subfolder
    becomes one row of the result.

    Parameters
    ----------
    spec_dir : str
        Path of the folder that holds one subfolder per specification.

    Returns
    -------
    spec_folder_df : pandas.DataFrame
        One row per file with the columns ``specification`` (name of the
        specification folder) and ``filename`` (file name without a trailing
        ``.jsonld`` / ``.json`` extension). Both columns are present even
        when no file was found.
    failurelist : list of str
        Entries of ``spec_dir`` whose ``jsonld`` subfolder could not be
        listed (missing subfolder, entry is a plain file, no permission...).

    Raises
    ------
    OSError
        If ``spec_dir`` itself cannot be listed.
    """
    spec_file_list = []
    failurelist = []
    # Sorted so the row order does not depend on the file system.
    for eachfolder in sorted(os.listdir(spec_dir)):
        jsonldpath = os.path.join(spec_dir, eachfolder, 'jsonld')
        try:
            jsonfiles = sorted(os.listdir(jsonldpath))
        except OSError:
            # Only file-system errors mark a folder as failed; anything else
            # (e.g. KeyboardInterrupt) is allowed to propagate.
            failurelist.append(eachfolder)
            continue
        for eachfile in jsonfiles:
            filename = eachfile
            # Strip the extension only when it is a real suffix, so that
            # '.json' appearing in the middle of a name is left untouched.
            for extension in ('.jsonld', '.json'):
                if eachfile.endswith(extension):
                    filename = eachfile[:-len(extension)]
                    break
            spec_file_list.append({'specification': eachfolder, 'filename': filename})
    # Explicit columns keep the frame usable when nothing was found.
    spec_folder_df = pd.DataFrame(spec_file_list, columns=['specification', 'filename'])
    return spec_folder_df, failurelist


def list_page_specs(spec_dir):
    """List the website pages found under each specification folder.

    Every entry of ``spec_dir`` is treated as a specification folder; each
    file directly inside it becomes one row of the result.

    Parameters
    ----------
    spec_dir : str
        Path of the pages folder that holds one subfolder per specification.

    Returns
    -------
    spec_folder_df : pandas.DataFrame
        One row per file with the columns ``specification`` (name of the
        specification folder) and ``filename`` (file name without a trailing
        ``.html`` / ``.HTML`` extension). Both columns are present even when
        no file was found.
    failurelist : list of str
        Entries of ``spec_dir`` that could not be listed as a folder (for
        example plain files sitting next to the specification folders).

    Raises
    ------
    OSError
        If ``spec_dir`` itself cannot be listed.
    """
    spec_file_list = []
    failurelist = []
    # Sorted so the row order does not depend on the file system.
    for eachfolder in sorted(os.listdir(spec_dir)):
        pagepath = os.path.join(spec_dir, eachfolder)
        try:
            htmlfiles = sorted(os.listdir(pagepath))
        except OSError:
            # Only file-system errors mark an entry as failed; anything else
            # (e.g. KeyboardInterrupt) is allowed to propagate.
            failurelist.append(eachfolder)
            continue
        for eachfile in htmlfiles:
            filename = eachfile
            # Strip the extension only when it is a real suffix, so that
            # '.html' appearing in the middle of a name is left untouched.
            for extension in ('.html', '.HTML'):
                if eachfile.endswith(extension):
                    filename = eachfile[:-len(extension)]
                    break
            spec_file_list.append({'specification': eachfolder, 'filename': filename})
    # Explicit columns keep the frame usable when nothing was found.
    spec_folder_df = pd.DataFrame(spec_file_list, columns=['specification', 'filename'])
    return spec_folder_df, failurelist

In [51]:
# Collect what exists on disk: JSON-LD files per specification, plus the
# website pages for profiles and for types. The *_fails lists hold the
# folder entries that could not be listed.
json_specs, jsonfails = list_json_specs(spec_dir)
profile_pages, profile_fails = list_page_specs(profile_dir)
types_pages, types_fails = list_page_specs(types_dir)

# Build a common key of the form "<Specification>_v<version>".
# JSON-LD file names already follow it once the "-DEPRECATED" marker is
# dropped; page names are assembled from the folder and the page file name.
json_specs['spec_name'] = [x.replace('-DEPRECATED','') for x in json_specs['filename'].tolist()]
profile_pages['spec_name'] = profile_pages['specification'].astype(str).str.cat(profile_pages['filename'].astype(str),sep='_v')
types_pages['spec_name'] = types_pages['specification'].astype(str).str.cat(types_pages['filename'].astype(str),sep='_v')

# Membership must be tested against the VALUES of the column. Using
# `x not in <Series>` checks the index labels instead, which would flag
# every page as missing. A set also makes each lookup constant-time.
known_spec_names = set(json_specs['spec_name'].tolist())
missing_profiles_list = [x for x in profile_pages['spec_name'].tolist() if x not in known_spec_names]
missing_types_list = [x for x in types_pages['spec_name'].tolist() if x not in known_spec_names]

missing_profiles = profile_pages.loc[profile_pages['spec_name'].isin(missing_profiles_list)].copy()
missing_profiles['spec_type'] = 'profile'

missing_types = types_pages.loc[types_pages['spec_name'].isin(missing_types_list)].copy()
missing_types['spec_type'] = 'types'

# One table with every page that has no JSON-LD counterpart.
missing = pd.concat((missing_profiles,missing_types),ignore_index=True)

# Make sure the output folder exists so the export works on a fresh checkout.
results_dir = os.path.join(tmp_dir,'results')
os.makedirs(results_dir, exist_ok=True)
missing.to_csv(os.path.join(results_dir,'missing_jsonld.tsv'),sep='\t',header=True)

In [29]:
# Sanity check: print the plain-text representation of the collected
# JSON-LD file table (specification folder and file name per row).
print(json_specs)

        specification                                 filename
0              Beacon  Beacon_v0.2-DRAFT-2018_04_23-DEPRECATED
1       BioChemEntity               BioChemEntity_v0.7-RELEASE
2       BioChemEntity                 BioChemEntity_v0.8-DRAFT
3    BioChemStructure   BioChemStructure_v0.1-DRAFT-2019_06_20
4           BioSample                   BioSample_v0.1-RELEASE
..                ...                                      ...
161  TrainingMaterial   TrainingMaterial_v0.4-DRAFT-2019_02_08
162  TrainingMaterial   TrainingMaterial_v0.5-DRAFT-2019_02_25
163  TrainingMaterial   TrainingMaterial_v0.6-DRAFT-2019_06_06
164  TrainingMaterial   TrainingMaterial_v0.8-DRAFT-2020_10_06
165  TrainingMaterial   TrainingMaterial_v0.9-DRAFT-2020_12_08

[166 rows x 2 columns]
