In [None]:
import cmip6_cv.PrePARE.PrePARE as pp
#import subprocess
import pandas as pd
import re
from os import listdir, walk
from os.path import isfile, join
import json

## 1. Environment definitions

In [None]:
# Data root path (presumably the CMIP6 data pool; not referenced by the cells visible below).
trunk = "/mnt/lustre02/work/ik1017/CMIP6/data/"

# Gzipped CSV catalog of datasets; read with pandas further down and queried
# through its "dataset_id" and "pid" columns.
dsetsinp = (
    "/home/dkrz/k204210/catalog-to-manifest/catalogs/"
    "wg1subset-r1-datasets-pids-clean.csv.gz"
)

# PrePARE-related settings:
#   exec                  - file path of the imported PrePARE module
#   logChunk              - root directory of the PrePARE log files that get parsed
#   cmip6-cmor-table-path - location of the CMIP6 CMOR tables
prepareSetting = {
    "exec": pp.__file__,
    "logChunk": "/mnt/lustre02/work/ik1017/CMIP6/meta/c34g-qc-prepare/logs/",
    "cmip6-cmor-table-path": "/home/dkrz/k204210/cmip6-cmor-tables/Tables",
}

## 2. QC settings

In [None]:
# Labels for the numeric severities used in parsedict["errorDict"]
# (presumably indexed by severity value: 0, 1, 2 - TODO confirm, the list is not used below).
errorSeverity = ["Passed", "Minor Issue", "Major Issue"]

# Settings that drive the log parsing:
#   meta       - names of the metadata fields kept per dataset entry
#   filenoDict - regular expressions whose first group captures the number of
#                scanned files / files with errors in a log line
#   errorDict  - keyword searched in every log line -> severity of that finding
# The patterns are raw strings so that "\d" and "\(" reach the regex engine
# unchanged without relying on Python tolerating invalid escape sequences.
parsedict = {
    "meta": ["filename", "creation_date", "dset_id", "specs_version"],
    "filenoDict": {
        "checked": r'files scanned: (\d+)',
        "failed": r'with error\(s\): (\d+)',
    },
    "errorDict": {
        "filename": 2,
        "Warning": 1,
        "CV FAIL": 1,
        "Permission denied": 2,
        "not understood": 2,
        "SKIPPED": 2,
    },
}

## 3. Function definitions

In [None]:
def collect_errors(dset_entry, error_dict=None):
    """Scan a log file for known error keywords and record them on ``dset_entry``.

    Every line of the log is searched for each keyword of ``error_dict``; the
    keywords are applied as regular expressions. One item is recorded per
    (line, keyword) hit, so a keyword found on several lines appears several
    times in the result.

    Parameters
    ----------
    dset_entry : dict
        Must contain ``"logfile_name"`` (path of the log file to read). It is
        updated in place with ``"errors"`` (tuple of matched keywords in order
        of appearance) and ``"max_severity"`` (highest severity among the
        matches, ``0`` when nothing matched).
    error_dict : dict, optional
        Mapping ``keyword -> severity``. Defaults to ``parsedict["errorDict"]``.

    Returns
    -------
    None
    """
    if error_dict is None:
        error_dict = parsedict["errorDict"]
    errors = []
    max_severity = 0
    # The context manager closes the file handle even if parsing raises;
    # this function is called once per failing log file.
    with open(dset_entry["logfile_name"]) as logfile:
        for line in logfile:
            for errorKeyword, severity in error_dict.items():
                if re.search(errorKeyword, line):
                    errors.append(errorKeyword)
                    max_severity = max(max_severity, int(severity))
    # A tuple is hashable, which allows grouping a DataFrame by this value.
    dset_entry["errors"] = tuple(errors)
    dset_entry["max_severity"] = max_severity

In [None]:
def parse_file(dset_entry, fileno_patterns=None):
    """Extract the file counters from a log file and store them on ``dset_entry``.

    The log is searched line by line for the "checked" and "failed" patterns;
    the first match of each provides the counter. When at least one file
    failed, ``collect_errors`` is called to record the error keywords as well.

    Parameters
    ----------
    dset_entry : dict
        Must contain ``"logfile_name"`` (path of the log file to read). It is
        updated in place with the integer fields ``"checked"``, ``"failed"``
        and ``"passed"`` (``checked - failed``), plus whatever
        ``collect_errors`` adds when ``failed`` is non-zero.
    fileno_patterns : dict, optional
        Regular expressions under the keys ``"checked"`` and ``"failed"``;
        group 1 of each must capture the number. Defaults to
        ``parsedict["filenoDict"]``.

    Raises
    ------
    ValueError
        If the log contains no "checked" or no "failed" summary line. The
        log file name and the partial matches are printed before raising.
    """
    if fileno_patterns is None:
        fileno_patterns = parsedict["filenoDict"]
    checked_re = re.compile(fileno_patterns["checked"])
    failed_re = re.compile(fileno_patterns["failed"])

    checkedFiles = []
    failedFiles = []
    # The context manager guarantees the handle is released for every log file.
    with open(dset_entry["logfile_name"]) as logfile:
        for line in logfile:
            match = checked_re.search(line)
            if match:
                checkedFiles.append(match.group(1))
            match = failed_re.search(line)
            if match:
                failedFiles.append(match.group(1))

    if not checkedFiles or not failedFiles:
        # Keep the diagnostic output, then fail with a message that names the
        # log instead of an anonymous IndexError from the list access below.
        print(dset_entry["logfile_name"], checkedFiles, failedFiles)
        raise ValueError(
            "No file-count summary found in log file: "
            + str(dset_entry["logfile_name"])
        )

    # Only the first occurrence of each counter is used.
    dset_entry["checked"] = int(checkedFiles[0])
    dset_entry["failed"] = int(failedFiles[0])
    dset_entry["passed"] = dset_entry["checked"] - dset_entry["failed"]
    if dset_entry["failed"] != 0:
        collect_errors(dset_entry)

## 4. Create a Manifest file

In [None]:
# Accumulator for the parsed log results; filled below as {pid: dataset entry dict}.
c3s_qc_prepare_dict = dict()

In [None]:
# Load the dataset catalog from the CSV path configured in `dsetsinp`.
dsetslist = pd.read_csv(filepath_or_buffer=dsetsinp)

In [None]:
# Preview the first rows of the catalog.
dsetslist.head(n=5)

In [None]:
# Walk the log tree <logChunk>/<specs_path>/<dataset directories>/<log file>,
# parse every log file and store the result in c3s_qc_prepare_dict under the
# dataset's PID.

# Build the dataset_id -> [pid, ...] lookup once, instead of filtering the
# whole catalog DataFrame again for every single log file.
pids_by_dataset_id = {}
for catalog_dataset_id, catalog_pid in zip(dsetslist["dataset_id"].tolist(),
                                           dsetslist["pid"].tolist()):
    pids_by_dataset_id.setdefault(catalog_dataset_id, []).append(catalog_pid)

specs_paths = listdir(prepareSetting["logChunk"])
for specs_path in specs_paths:
    specs_root = join(prepareSetting["logChunk"], specs_path)
    for dirpath, dirnames, logfile_names in walk(specs_root):
        for logfile_name in logfile_names:
            dset_entry = {
                "logfile_name": join(dirpath, logfile_name),
                # text before the first "." of the file name, second "-"-separated piece
                "creation_date": logfile_name.split(".")[0].split("-")[1],
                # directory path below specs_root, without the leading separator
                "dset_id": dirpath[len(specs_root) + 1:],
                "specs_version": "01.00." + specs_path,
            }
            # A PID is only accepted when the catalog holds exactly one match.
            # NOTE(review): all datasets without a unique match share the key
            # "nan", so each of them overwrites the previous one in
            # c3s_qc_prepare_dict - confirm that this is intended.
            pid_matches = pids_by_dataset_id.get(dset_entry["dset_id"], [])
            if len(pid_matches) == 1:
                pid = pid_matches[0]
            else:
                pid = "nan"
            parse_file(dset_entry)
            c3s_qc_prepare_dict[pid] = dset_entry

In [None]:
# One row per PID; the columns are the keys of the per-dataset entry dicts.
c3s_qc_prepare = pd.DataFrame.from_dict(
    data=c3s_qc_prepare_dict,
    orient="index",
)
c3s_qc_prepare.index.name = "pid"

## 5. Reformat and subset for first delivery

In [None]:
# Catalog of the datasets that belong to the first delivery.
FirstDelivery = (
    "/home/dkrz/k204210/c3s_34g_qc_results/Catalogs/"
    "c3s34g-release1-datasets_v0.csv"
)
# Column names are supplied explicitly, so the first line of the file is read as data.
dsetslistFirstDelivery = pd.read_csv(
    filepath_or_buffer=FirstDelivery,
    names=["dataset_id", "pid"],
)

In [None]:
# Reduce the QC table to "dset_id" plus a textual "qc_status":
#   failed > 0  -> "ERROR"
#   failed == 0 -> "pass"
# The status is built as an object-dtype column first: writing strings directly
# into the integer "failed" column is an incompatible-dtype assignment, which
# pandas has deprecated (FutureWarning since 2.1, slated to become an error).
failed_counts = c3s_qc_prepare["failed"]
qc_status = failed_counts.astype(object)
qc_status.loc[failed_counts > 0] = "ERROR"
qc_status.loc[failed_counts == 0] = "pass"

c3s_qc_prepare_firstdelivery = c3s_qc_prepare[["dset_id"]].copy()
c3s_qc_prepare_firstdelivery["qc_status"] = qc_status

In [None]:
# Keep only the rows whose PID (the index) is listed in the first-delivery catalog.
in_first_delivery = c3s_qc_prepare_firstdelivery.index.isin(dsetslistFirstDelivery["pid"])
c3s_qc_prepare_firstdelivery = c3s_qc_prepare_firstdelivery.loc[in_first_delivery]

In [None]:
# Show the first-delivery datasets whose status is anything other than "pass".
not_passed = c3s_qc_prepare_firstdelivery["qc_status"] != "pass"
c3s_qc_prepare_firstdelivery.loc[not_passed]

In [None]:
# Write the first-delivery QC status to disk as JSON keyed by the index (PID).
c3s_qc_prepare_firstdelivery.to_json(
    path_or_buf="/home/dkrz/k204210/c3s_34g_qc_results/QC_Results/QC_prepare.json",
    orient="index",
)

In [None]:
# All datasets for which at least one file failed.
has_failures = c3s_qc_prepare["failed"] != 0
c3serr = c3s_qc_prepare.loc[has_failures]

In [None]:
# Number of datasets with failures.
print(c3serr.shape[0])

In [None]:
# Group the failing datasets by their tuple of matched error keywords.
c3serrgb = c3serr.groupby(by="errors")

In [None]:
# Log files that report failures although none of the known error keywords matched.
without_keyword = c3serr['errors'] == ()
print(c3serr.loc[without_keyword, "logfile_name"].tolist())

In [None]:
# Failing datasets whose log contains the "SKIPPED" keyword.
# "errors" holds tuples of keywords, not strings: the .str accessor returns NaN
# for such elements and the result cannot be used as a boolean mask, so
# membership is tested element by element instead.
has_skipped = [
    isinstance(errs, tuple) and "SKIPPED" in errs
    for errs in c3serr['errors']
]
print(c3serr[has_skipped])