In [162]:
import cmip6_cv.PrePARE.PrePARE as pp
#import subprocess
import pandas as pd
import re
from os import listdir, walk
from os.path import isfile, join
import json

## 1. Environment definitions

In [152]:
# Root directory of the CMIP6 data tree (not referenced by the cells shown here).
trunk = "/mnt/lustre02/work/ik1017/CMIP6/data/"

# Gzipped CSV catalog that is loaded with pandas further down; the manifest
# loop reads its "dataset_id" and "pid" columns.
dsetsinp = "/home/dkrz/k204210/catalog-to-manifest/catalogs/wg1subset-r1-datasets-pids-clean.csv.gz"

# Settings for the PrePARE run whose logs are evaluated in this notebook.
prepareSetting = {
    # Path of the imported PrePARE module file.
    "exec": pp.__file__,
    # Directory that is listed and walked to find the PrePARE logfiles.
    "logChunk": "/mnt/lustre02/work/ik1017/CMIP6/meta/tools/c34g-qc-prepare/logs/",
    # Presumably the CMOR tables PrePARE was run against -- TODO confirm.
    "cmip6-cmor-table-path": "/home/dkrz/k204210/cmip6-cmor-tables/Tables",
}

## 2. QC settings

In [134]:
# Human-readable severity labels.
# NOTE(review): presumably indexed by the severity values stored in
# parsedict["errorDict"] (0 = "Passed") -- confirm, it is not used in the cells shown.
errorSeverity = ["Passed", "Minor Issue", "Major Issue"]

# Parsing configuration for the PrePARE logfiles.
parsedict = {
    # Names of the metadata fields kept per logfile entry.
    "meta": ["filename", "creation_date", "dset_id", "specs_version"],
    # Regular expressions whose first group captures a file count.
    # Raw strings are required: "\d", "\(" and "\)" are not valid Python
    # string escapes and trigger a warning in non-raw literals.
    "filenoDict": {
        "checked": r'files scanned: (\d+)',
        "failed": r'with error\(s\): (\d+)',
    },
    # Keyword (searched as a regular expression in each log line) -> severity.
    "errorDict": {
        "filename": 2,
        "Warning": 1,
        "CV FAIL": 1,
    },
}

## 3. Function definitions

In [147]:
def collect_errors(dset_entry, error_severities=None):
    """Scan a logfile for known error keywords and record them on the entry.

    Every line of ``dset_entry["logfile_name"]`` is searched for each keyword
    (interpreted as a regular expression). A keyword is recorded once per
    matching line, so it can appear several times in the result.

    Parameters
    ----------
    dset_entry : dict
        Must contain the key ``"logfile_name"``. It is updated in place with
        ``"errors"`` (list of matched keywords, in order of appearance) and
        ``"max_severity"`` (highest severity among the matches, 0 if none).
    error_severities : dict, optional
        Mapping keyword -> severity. Defaults to ``parsedict["errorDict"]``.

    Returns
    -------
    None
    """
    if error_severities is None:
        error_severities = parsedict["errorDict"]

    errors = []
    max_severity = 0
    # The context manager closes the handle; this function runs once per
    # failed dataset, so unclosed files would pile up quickly.
    with open(dset_entry["logfile_name"]) as logfile:
        for line in logfile:
            for keyword, severity in error_severities.items():
                if re.search(keyword, line):
                    errors.append(keyword)
                    max_severity = max(max_severity, int(severity))

    dset_entry["errors"] = errors
    dset_entry["max_severity"] = max_severity

In [148]:
def parse_file(dset_entry, count_patterns=None):
    """Extract the checked/failed file counts from a logfile into the entry.

    The first line matching each pattern supplies the count. If at least one
    file failed, ``collect_errors`` is called to add the error details.

    Parameters
    ----------
    dset_entry : dict
        Must contain the key ``"logfile_name"``. It is updated in place with
        the integer keys ``"checked"``, ``"failed"`` and ``"passed"``
        (``checked - failed``). ``"errors"`` and ``"max_severity"`` are only
        added when ``failed`` is non-zero.
    count_patterns : dict, optional
        Mapping with the keys ``"checked"`` and ``"failed"`` whose values are
        regular expressions with the count in group 1. Defaults to
        ``parsedict["filenoDict"]``.

    Raises
    ------
    ValueError
        If the logfile contains no line matching one of the patterns, or the
        captured text is not an integer.
    """
    if count_patterns is None:
        count_patterns = parsedict["filenoDict"]

    counts = {"checked": None, "failed": None}
    # Context manager so the handle is released even for large batches of logs.
    with open(dset_entry["logfile_name"]) as logfile:
        for line in logfile:
            for key, value in counts.items():
                if value is None:
                    match = re.search(count_patterns[key], line)
                    if match:
                        counts[key] = int(match.group(1))
            # Only the first occurrence of each count is used, so stop early.
            if counts["checked"] is not None and counts["failed"] is not None:
                break

    missing = [key for key, value in counts.items() if value is None]
    if missing:
        # Fail with the offending logfile named instead of an IndexError on an empty list.
        raise ValueError(
            "Could not find file count(s) {} in logfile {}".format(
                missing, dset_entry["logfile_name"]))

    dset_entry["checked"] = counts["checked"]
    dset_entry["failed"] = counts["failed"]
    dset_entry["passed"] = counts["checked"] - counts["failed"]
    if dset_entry["failed"] != 0:
        collect_errors(dset_entry)

## 4. Create a Manifest file

In [176]:
# Manifest accumulator: the logfile loop below stores one entry dict per
# dataset PID here; it is later converted into a DataFrame.
c3s_qc_prepare_dict = {}

In [177]:
# Load the dataset catalog; the manifest loop uses its "dataset_id" and "pid" columns.
dsetslist = pd.read_csv(dsetsinp)

In [178]:
# Preview the first catalog rows (dataset_id, data_specs_version, pid).
dsetslist.head()

Unnamed: 0,dataset_id,data_specs_version,pid
0,CMIP6.OMIP.NOAA-GFDL.GFDL-OM4p5B.omip1.r1i1p1f...,['01.00.27'],hdl:21.14100/64206ff6-979d-3fd2-b337-8f1803ac0abf
1,CMIP6.OMIP.NOAA-GFDL.GFDL-OM4p5B.omip1.r1i1p1f...,['01.00.27'],hdl:21.14100/6af7045e-df4b-3728-89a7-277d911ea3e7
2,CMIP6.OMIP.NOAA-GFDL.GFDL-OM4p5B.omip1.r1i1p1f...,['01.00.27'],hdl:21.14100/fd3e752b-2b1d-31c0-b3d5-15bd8df6dae8
3,CMIP6.OMIP.NOAA-GFDL.GFDL-OM4p5B.omip1.r1i1p1f...,['01.00.27'],hdl:21.14100/7a45f846-91d1-3ed7-890a-bb2070a34a35
4,CMIP6.OMIP.NOAA-GFDL.GFDL-OM4p5B.omip1.r1i1p1f...,['01.00.27'],hdl:21.14100/1e3f80c4-9b1a-323f-8614-d70bd65734ce


In [179]:
# Index the catalog once (dataset_id -> list of PIDs) so that each logfile
# needs a dict lookup instead of a boolean scan over the whole DataFrame.
pids_by_dset = {}
for catalog_dset_id, catalog_pid in zip(dsetslist["dataset_id"], dsetslist["pid"]):
    pids_by_dset.setdefault(catalog_dset_id, []).append(catalog_pid)

# Layout walked here: <logChunk>/<specs_path>/<dataset id path>/<logfile>.
specs_paths = listdir(prepareSetting["logChunk"])
for specs_path in specs_paths:
    specs_root = join(prepareSetting["logChunk"], specs_path)
    for dirpath, dirnames, logfile_names in walk(specs_root):
        for logfile_name in logfile_names:
            dset_entry = {"logfile_name": join(dirpath, logfile_name),
                          # Logfile names are expected to look like "<prefix>-<date>.<ext>".
                          "creation_date": logfile_name.split(".")[0].split("-")[1],
                          # Directory path relative to the specs root is the dataset id.
                          "dset_id": dirpath[len(specs_root)+1:],
                          "specs_version": "01.00."+specs_path}
            pid = pids_by_dset.get(dset_entry["dset_id"], [])
            # A PID is only trusted when the catalog holds exactly one for the dataset.
            # NOTE(review): all other datasets share the key "nan", so each of
            # them overwrites the previous one in the manifest -- confirm intended.
            if not len(pid) == 1:
                pid = "nan"
            else:
                pid = pid[0]
            parse_file(dset_entry)
            c3s_qc_prepare_dict[pid] = dset_entry

In [181]:
# Turn the manifest dict into a table: one row per PID, one column per entry
# key, with the index labelled "dataset_pid".
c3s_qc_prepare = (
    pd.DataFrame
    .from_dict(c3s_qc_prepare_dict, orient="index")
    .rename_axis("dataset_pid")
)

In [183]:
# Preview the manifest; "errors" and "max_severity" are empty for datasets without failed files.
c3s_qc_prepare.head()

Unnamed: 0_level_0,logfile_name,creation_date,dset_id,specs_version,checked,failed,passed,errors,max_severity
dataset_pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
hdl:21.14100/9ca03ccf-3034-3733-86d2-a087ad617e88,/mnt/lustre02/work/ik1017/CMIP6/meta/tools/c34...,20200617,CMIP6.CMIP.UA.MCM-UA-1-0.historical.r1i1p1f1.A...,01.00.28,1,0,1,,
hdl:21.14100/f623f351-7e68-3c64-a831-8f9eedce1bc6,/mnt/lustre02/work/ik1017/CMIP6/meta/tools/c34...,20200617,CMIP6.CMIP.UA.MCM-UA-1-0.historical.r1i1p1f1.A...,01.00.28,1,0,1,,
hdl:21.14100/3a9da4ff-de67-3a98-bb0a-54675489289b,/mnt/lustre02/work/ik1017/CMIP6/meta/tools/c34...,20200617,CMIP6.CMIP.UA.MCM-UA-1-0.historical.r1i1p1f1.A...,01.00.28,1,0,1,,
hdl:21.14100/76d40cce-0d62-3d81-a9cb-8f6fb404a1f4,/mnt/lustre02/work/ik1017/CMIP6/meta/tools/c34...,20200617,CMIP6.CMIP.UA.MCM-UA-1-0.abrupt-4xCO2.r1i1p1f1...,01.00.28,5,0,5,,
hdl:21.14100/d24db66b-3995-37c8-b8be-aa414832d832,/mnt/lustre02/work/ik1017/CMIP6/meta/tools/c34...,20200617,CMIP6.CMIP.MIROC.MIROC6.historical.r1i1p1f1.Om...,01.00.28,2,0,2,,


In [184]:
# Subset of the manifest: datasets with at least one failed file.
has_failures = c3s_qc_prepare["failed"].ne(0)
c3serr = c3s_qc_prepare.loc[has_failures]

In [189]:
# Keep only rows that carry collected errors. Missing "errors" values are
# float NaN (the key is never set for datasets without failures), so comparing
# against the string "nan" never excluded anything; notna() tests for them.
c3serrcat = c3serr[c3serr["errors"].notna()]

In [190]:
# Preview the failed datasets together with their collected error keywords.
c3serrcat.head()

Unnamed: 0_level_0,logfile_name,creation_date,dset_id,specs_version,checked,failed,passed,errors,max_severity
dataset_pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
hdl:21.14100/12871e9a-a001-3209-8e65-e7cdc4dbbb7d,/mnt/lustre02/work/ik1017/CMIP6/meta/tools/c34...,20200617,CMIP6.CMIP.UA.MCM-UA-1-0.piControl.r1i1p1f1.Om...,01.00.28,5,5,0,"[CV FAIL, CV FAIL, CV FAIL, CV FAIL, CV FAIL]",1.0
hdl:21.14100/fd7bdc35-95fc-313d-9264-4b9adc8632bb,/mnt/lustre02/work/ik1017/CMIP6/meta/tools/c34...,20200617,CMIP6.CMIP.UA.MCM-UA-1-0.piControl.r1i1p1f1.Om...,01.00.28,5,5,0,"[CV FAIL, CV FAIL, CV FAIL, CV FAIL, CV FAIL]",1.0
hdl:21.14100/831a29f9-3be1-388d-8d78-1fc93f23f5b9,/mnt/lustre02/work/ik1017/CMIP6/meta/tools/c34...,20200617,CMIP6.AerChemMIP.BCC.BCC-ESM1.ssp370.r1i1p1f1....,01.00.27,1,1,0,[CV FAIL],1.0
hdl:21.14100/4ff1459f-5a24-37ce-a5a6-d4dbb80f7b8a,/mnt/lustre02/work/ik1017/CMIP6/meta/tools/c34...,20200617,CMIP6.AerChemMIP.BCC.BCC-ESM1.ssp370.r1i1p1f1....,01.00.27,1,1,0,[CV FAIL],1.0
hdl:21.14100/a5b95c41-0b86-3f4d-9f97-748e777a8d41,/mnt/lustre02/work/ik1017/CMIP6/meta/tools/c34...,20200617,CMIP6.AerChemMIP.BCC.BCC-ESM1.ssp370.r1i1p1f1....,01.00.27,1,1,0,[CV FAIL],1.0
