In [1]:
import cmip6_cv.PrePARE.PrePARE as pp
#import subprocess
import pandas as pd
import re
from os import listdir, walk
from os.path import isfile, join
import json

## 1. Environment definitions

In [2]:
# Root of the CMIP6 data tree (not referenced again in the cells shown here).
trunk = "/mnt/lustre02/work/ik1017/CMIP6/data/"

# Gzipped CSV catalogue of datasets; it is read into `dsetslist` further down.
dsetsinp = "/home/dkrz/k204210/catalog-to-manifest/catalogs/wg1subset-r1-datasets-pids-clean.csv.gz"

# Settings describing the PrePARE run whose log files are evaluated below.
prepareSetting = {
    # File of the imported PrePARE module.
    "exec": pp.__file__,
    # Top-level directory that is walked for PrePARE log files.
    "logChunk": "/mnt/lustre02/work/ik1017/CMIP6/meta/c34g-qc-prepare/logs/",
    # Presumably the CMOR tables PrePARE was run against -- TODO confirm.
    "cmip6-cmor-table-path": "/home/dkrz/k204210/cmip6-cmor-tables/Tables",
}

## 2. QC settings

In [3]:
# Labels for the severity levels. The integer severities stored in
# parsedict["errorDict"] presumably index into this list -- TODO confirm.
errorSeverity = ["Passed", "Minor Issue", "Major Issue"]

# Rules used to evaluate a PrePARE log file.
parsedict = {
    "meta": ["filename", "creation_date", "dset_id", "specs_version"],
    # Regular expressions whose first group captures a file count.
    # Raw strings are used so that "\d" and "\(" reach the regex engine as
    # written instead of relying on Python passing unknown string escapes
    # through unchanged (deprecated behaviour that emits a warning).
    "filenoDict": {
        "checked": r'files scanned: (\d+)',
        "failed": r'with error\(s\): (\d+)',
    },
    # Keyword searched for in every log line -> severity of a hit.
    "errorDict": {
        "filename": 2,
        "Warning": 1,
        "CV FAIL": 1,
        "Permission denied": 2,
        "not understood": 2,
        "SKIPPED": 2,
    },
}

## 3. Function definitions

In [4]:
def collect_errors(dset_entry) :
    """Scan a PrePARE log for known error keywords and record them.

    Every line of the file named by ``dset_entry["logfile_name"]`` is searched
    for each key of ``parsedict["errorDict"]`` (keys are applied as regular
    expressions). One entry is recorded per (line, keyword) hit, so a keyword
    that occurs on several lines appears several times.

    Parameters
    ----------
    dset_entry : dict
        Must contain ``"logfile_name"``. It is updated in place with
        ``"errors"`` (tuple of matched keywords, in order of occurrence) and
        ``"max_severity"`` (highest severity among the hits, 0 if none).

    Returns
    -------
    None
    """
    severity_by_keyword = parsedict["errorDict"]
    errors = []
    max_severity = 0
    # The context manager closes the log file even if parsing raises;
    # many log files are processed, so handles must not be left open.
    with open(dset_entry["logfile_name"]) as logfile:
        for line in logfile:
            for keyword, severity in severity_by_keyword.items():
                if re.search(keyword, line):
                    errors.append(keyword)
                    max_severity = max(max_severity, int(severity))
    dset_entry["errors"] = tuple(errors)
    dset_entry["max_severity"] = max_severity

In [5]:
def parse_file(dset_entry):
    """Extract the checked/failed file counts from a PrePARE log.

    The file named by ``dset_entry["logfile_name"]`` is searched line by line
    with the two patterns in ``parsedict["filenoDict"]``; the first match of
    each pattern supplies the count.

    Parameters
    ----------
    dset_entry : dict
        Must contain ``"logfile_name"``. It is updated in place with the
        integer keys ``"checked"``, ``"failed"`` and ``"passed"``
        (``checked - failed``). If ``failed`` is non-zero,
        ``collect_errors`` is called to add the error details as well.

    Raises
    ------
    ValueError
        If the log contains no match for the checked or the failed pattern.
        (Previously this case printed the file name and then failed with a
        bare ``IndexError``.)
    """
    count_patterns = parsedict["filenoDict"]
    checkedFiles = []
    failedFiles = []
    # The context manager guarantees the log file is closed again.
    with open(dset_entry["logfile_name"]) as logfile:
        for line in logfile:
            match = re.search(count_patterns["checked"], line)
            if match:
                checkedFiles.append(match.group(1))
            match = re.search(count_patterns["failed"], line)
            if match:
                failedFiles.append(match.group(1))
    if not checkedFiles or not failedFiles:
        raise ValueError(
            "no file counts found in log {!r} (checked={}, failed={})".format(
                dset_entry["logfile_name"], checkedFiles, failedFiles
            )
        )
    # Only the first reported count of each kind is used.
    dset_entry["checked"] = int(checkedFiles[0])
    dset_entry["failed"] = int(failedFiles[0])
    dset_entry["passed"] = dset_entry["checked"] - dset_entry["failed"]
    if dset_entry["failed"] != 0:
        collect_errors(dset_entry)

## 4. Create a Manifest file

In [6]:
# Manifest under construction: pid -> per-dataset QC entry (filled by the loop below).
c3s_qc_prepare_dict = dict()

In [7]:
# Dataset catalogue; the columns dataset_id and pid are used further down.
dsetslist = pd.read_csv(filepath_or_buffer=dsetsinp)

In [8]:
# Preview the first five catalogue rows.
dsetslist.head(5)

Unnamed: 0,dataset_id,data_specs_version,pid
0,CMIP6.OMIP.NOAA-GFDL.GFDL-OM4p5B.omip1.r1i1p1f...,['01.00.27'],hdl:21.14100/64206ff6-979d-3fd2-b337-8f1803ac0abf
1,CMIP6.OMIP.NOAA-GFDL.GFDL-OM4p5B.omip1.r1i1p1f...,['01.00.27'],hdl:21.14100/6af7045e-df4b-3728-89a7-277d911ea3e7
2,CMIP6.OMIP.NOAA-GFDL.GFDL-OM4p5B.omip1.r1i1p1f...,['01.00.27'],hdl:21.14100/fd3e752b-2b1d-31c0-b3d5-15bd8df6dae8
3,CMIP6.OMIP.NOAA-GFDL.GFDL-OM4p5B.omip1.r1i1p1f...,['01.00.27'],hdl:21.14100/7a45f846-91d1-3ed7-890a-bb2070a34a35
4,CMIP6.OMIP.NOAA-GFDL.GFDL-OM4p5B.omip1.r1i1p1f...,['01.00.27'],hdl:21.14100/1e3f80c4-9b1a-323f-8614-d70bd65734ce


In [9]:
# Map every dataset_id that occurs exactly once in the catalogue to its pid.
# Built once up front so that each log file costs a dictionary lookup instead
# of a boolean scan over the whole catalogue DataFrame.
unique_pid_by_dset = (
    dsetslist.drop_duplicates("dataset_id", keep=False)
    .set_index("dataset_id")["pid"]
    .to_dict()
)

unmatched_logfiles = 0
specs_paths = listdir(prepareSetting["logChunk"])
for specs_path in specs_paths:
    specs_root = join(prepareSetting["logChunk"], specs_path)
    for dirpath, dirnames, logfile_names in walk(specs_root):
        for logfile_name in logfile_names:
            dset_entry = {
                "logfile_name": join(dirpath, logfile_name),
                # Second "-"-separated field of the file name stem.
                "creation_date": logfile_name.split(".")[0].split("-")[1],
                # Directory path relative to the specs-version directory.
                "dset_id": dirpath[len(specs_root) + 1:],
                "specs_version": "01.00." + specs_path,
            }
            dset_id = dset_entry["dset_id"]
            if dset_id in unique_pid_by_dset:
                pid = unique_pid_by_dset[dset_id]
            else:
                # No pid, or more than one: fall back to the placeholder key.
                # NOTE(review): all such entries share the key "nan" and
                # overwrite each other in the manifest; the count is reported
                # below so this loss is at least visible.
                pid = "nan"
                unmatched_logfiles += 1
            parse_file(dset_entry)
            c3s_qc_prepare_dict[pid] = dset_entry

if unmatched_logfiles:
    print(unmatched_logfiles,
          "log file(s) had no unique pid and were stored under the shared key 'nan'")

In [35]:
# One row per manifest key (pid), one column per field of the dataset entries.
c3s_qc_prepare = pd.DataFrame.from_dict(
    data=c3s_qc_prepare_dict,
    orient="index",
)
c3s_qc_prepare.index.name = "pid"

## 5. Reformat and subset for first delivery

In [52]:
# Catalogue of the first delivery; the column names are supplied here via `names`.
FirstDelivery = "/home/dkrz/k204210/c3s_34g_qc_results/Catalogs/c3s34g-release1-datasets_v0.csv"
dsetslistFirstDelivery = pd.read_csv(
    FirstDelivery,
    names=["dataset_id", "pid"],
)

In [48]:
# Translate the number of failed files into a textual status:
# more than 0 failures -> "ERROR", exactly 0 -> "pass".
# The status column is built as an object column from the start; writing
# strings into the integer "failed" column via .loc is an incompatible-dtype
# assignment that recent pandas versions warn about.
failed_counts = c3s_qc_prepare["failed"]
qc_status = (
    failed_counts.astype(object)
    .mask(failed_counts > 0, "ERROR")
    .mask(failed_counts == 0, "pass")
)
c3s_qc_prepare_firstdelivery = c3s_qc_prepare[["dset_id"]].assign(qc_status=qc_status)

In [59]:
# Keep only the rows whose pid (the index) is listed in the first-delivery catalogue.
c3s_qc_prepare_firstdelivery = c3s_qc_prepare_firstdelivery.loc[
    c3s_qc_prepare_firstdelivery.index.isin(dsetslistFirstDelivery["pid"])
]

In [62]:
# Show every first-delivery dataset whose status is not "pass".
c3s_qc_prepare_firstdelivery.loc[c3s_qc_prepare_firstdelivery["qc_status"] != "pass"]

Unnamed: 0_level_0,dset_id,qc_status
pid,Unnamed: 1_level_1,Unnamed: 2_level_1
hdl:21.14100/9c721b03-8789-3e90-ad61-76045e5dcaf1,CMIP6.ScenarioMIP.IPSL.IPSL-CM6A-LR.ssp434.r1i...,ERROR
hdl:21.14100/9233828e-782f-328b-a3e4-38965e69eb08,CMIP6.ScenarioMIP.IPSL.IPSL-CM6A-LR.ssp245.r1i...,ERROR
hdl:21.14100/0a682a4a-e720-367c-957f-e82d13b2b304,CMIP6.ScenarioMIP.IPSL.IPSL-CM6A-LR.ssp245.r1i...,ERROR
hdl:21.14100/af5fc835-3047-32b5-8500-4a868e8660a0,CMIP6.ScenarioMIP.IPSL.IPSL-CM6A-LR.ssp534-ove...,ERROR
hdl:21.14100/37272d35-0a59-37f0-8f42-fdf80f7039ec,CMIP6.ScenarioMIP.IPSL.IPSL-CM6A-LR.ssp585.r1i...,ERROR
...,...,...
hdl:21.14100/fde1d1b4-23ed-378b-8e41-ac6b5330c18d,CMIP6.ScenarioMIP.FIO-QLNM.FIO-ESM-2-0.ssp126....,ERROR
hdl:21.14100/4b9b913e-71fd-3e68-b6c6-09407ec25ba3,CMIP6.ScenarioMIP.FIO-QLNM.FIO-ESM-2-0.ssp126....,ERROR
hdl:21.14100/b745ba92-cbf1-3612-8764-74fbabebc24a,CMIP6.ScenarioMIP.FIO-QLNM.FIO-ESM-2-0.ssp126....,ERROR
hdl:21.14100/7678da2b-d112-365d-8c69-af7c4a307c7c,CMIP6.ScenarioMIP.NCAR.CESM2-WACCM.ssp370.r1i1...,ERROR


In [66]:
# to_json() needs a file to write to; the original target was only the
# directory (trailing slash, no file name), which cannot be opened for writing.
# NOTE(review): the file name below is a suggestion -- adjust it to the name
# the delivery expects.
c3s_qc_prepare_firstdelivery.to_json(
    join("/home/dkrz/k204210/catalog-to-manifest/", "c3s34g-release1-qc-prepare.json"),
    orient="index",
)

In [None]:
# Datasets with at least one failed file (failed count different from zero).
c3serr = c3s_qc_prepare.loc[c3s_qc_prepare["failed"] != 0]

In [None]:
# Number of datasets with failed files.
print(c3serr.shape[0])

In [None]:
# Group the failing datasets by their tuple of matched error keywords.
c3serrgb = c3serr.groupby(by="errors")

In [None]:
# Log files of failing datasets for which no known error keyword was matched.
print(c3serr.loc[c3serr['errors'] == (), "logfile_name"].tolist())

In [None]:
# "errors" holds tuples of keywords, not strings: .str.contains() yields NaN
# for such elements and the resulting mask cannot be used for indexing.
# Test tuple membership instead and force a boolean mask.
has_skipped = c3serr["errors"].map(
    lambda errs: isinstance(errs, tuple) and "SKIPPED" in errs
).astype(bool)
print(c3serr[has_skipped])