From a2b302d78416c8d214a2a72abc27f1062daf5ef3 Mon Sep 17 00:00:00 2001 From: Heshy Roskes Date: Tue, 13 Sep 2016 21:35:10 +0200 Subject: [PATCH 1/5] - split __createSnippet into 3 functions - filter the file list when possible to avoid unnecessary opening and closing - add function for HipPy file list --- .../python/TkAlAllInOneTool/dataset.py | 139 ++++++++++++------ 1 file changed, 95 insertions(+), 44 deletions(-) diff --git a/Alignment/OfflineValidation/python/TkAlAllInOneTool/dataset.py b/Alignment/OfflineValidation/python/TkAlAllInOneTool/dataset.py index a4b029262bd76..d9550f614e1ca 100644 --- a/Alignment/OfflineValidation/python/TkAlAllInOneTool/dataset.py +++ b/Alignment/OfflineValidation/python/TkAlAllInOneTool/dataset.py @@ -112,42 +112,9 @@ def __chunks( self, theList, n ): "input = cms.untracked.int32(%(nEvents)s) )\n" "%(skipEventsString)s\n") - def __createSnippet( self, jsonPath = None, begin = None, end = None, - firstRun = None, lastRun = None, repMap = None, - crab = False, parent = False ): - if firstRun: - firstRun = int( firstRun ) - if lastRun: - lastRun = int( lastRun ) - if ( begin and firstRun ) or ( end and lastRun ): - msg = ( "The Usage of " - + "'begin' & 'firstRun' " * int( bool( begin and - firstRun ) ) - + "and " * int( bool( ( begin and firstRun ) and - ( end and lastRun ) ) ) - + "'end' & 'lastRun' " * int( bool( end and lastRun ) ) - + "is ambigous." ) - raise AllInOneError( msg ) - if begin or end: - ( firstRun, lastRun ) = self.convertTimeToRun( - begin = begin, end = end, firstRun = firstRun, - lastRun = lastRun ) - if ( firstRun and lastRun ) and ( firstRun > lastRun ): - msg = ( "The lower time/runrange limit ('begin'/'firstRun') " - "chosen is greater than the upper time/runrange limit " - "('end'/'lastRun').") - raise AllInOneError( msg ) - if self.predefined() and (jsonPath or begin or end or firstRun or lastRun): - msg = ( "The parameters 'JSON', 'begin', 'end', 'firstRun', and 'lastRun'" - "only work for official datasets, not predefined _cff.py files" ) - raise AllInOneError( msg ) - goodLumiSecStr = "" - lumiStr = "" + def __lumiSelectionSnippet( self, jsonPath = None, firstRun = None, lastRun = None ): lumiSecExtend = "" if firstRun or lastRun or jsonPath: - goodLumiSecStr = ( "lumiSecs = cms.untracked." - "VLuminosityBlockRange()\n" ) - lumiStr = " lumisToProcess = lumiSecs,\n" if not jsonPath: selectedRunList = self.__getRunList() if firstRun: @@ -224,15 +191,11 @@ def __createSnippet( self, jsonPath = None, begin = None, end = None, else: msg = "You are trying to run a validation without any runs! Check that:" if firstRun or lastRun: - msg += "\n - firstRun and lastRun are correct for this dataset, and there are runs in between containing data" + msg += "\n - firstRun/begin and lastRun/end are correct for this dataset, and there are runs in between containing data" if jsonPath: msg += "\n - your JSON file is correct for this dataset, and the runs contain data" if (firstRun or lastRun) and jsonPath: - msg += "\n - firstRun and lastRun are consistent with your JSON file" - if begin: - msg = msg.replace("firstRun", "begin") - if end: - msg = msg.replace("lastRun", "end") + msg += "\n - firstRun/begin and lastRun/end are consistent with your JSON file" raise AllInOneError(msg) else: @@ -240,23 +203,66 @@ def __createSnippet( self, jsonPath = None, begin = None, end = None, self.__firstusedrun = int(self.__findInJson(self.__getRunList()[0],"run_number")) self.__lastusedrun = int(self.__findInJson(self.__getRunList()[-1],"run_number")) + return lumiSecExtend + + def __fileListSnippet(self, crab=False, parent=False, firstRun=None, lastRun=None, forcerunselection=False): if crab: files = "" else: - splitFileList = list( self.__chunks( self.fileList(), 255 ) ) + splitFileList = list( self.__chunks( self.fileList(firstRun=firstRun, lastRun=lastRun, forcerunselection=forcerunselection), 255 ) ) fileStr = [ "',\n'".join( files ) for files in splitFileList ] fileStr = [ "readFiles.extend( [\n'" + files + "'\n] )" \ for files in fileStr ] files = "\n".join( fileStr ) if parent: - splitParentFileList = list( self.__chunks( self.fileList(parent = True), 255 ) ) + splitParentFileList = list( self.__chunks( self.fileList(parent=True, firstRun=firstRun, lastRun=lastRun, forcerunselection=forcerunselection), 255 ) ) parentFileStr = [ "',\n'".join( parentFiles ) for parentFiles in splitParentFileList ] parentFileStr = [ "secFiles.extend( [\n'" + parentFiles + "'\n] )" \ for parentFiles in parentFileStr ] parentFiles = "\n".join( parentFileStr ) files += "\n\n" + parentFiles + return files + + def __createSnippet( self, jsonPath = None, begin = None, end = None, + firstRun = None, lastRun = None, repMap = None, + crab = False, parent = False ): + + if firstRun: + firstRun = int( firstRun ) + if lastRun: + lastRun = int( lastRun ) + if ( begin and firstRun ) or ( end and lastRun ): + msg = ( "The Usage of " + + "'begin' & 'firstRun' " * int( bool( begin and + firstRun ) ) + + "and " * int( bool( ( begin and firstRun ) and + ( end and lastRun ) ) ) + + "'end' & 'lastRun' " * int( bool( end and lastRun ) ) + + "is ambigous." ) + raise AllInOneError( msg ) + if begin or end: + ( firstRun, lastRun ) = self.convertTimeToRun( + begin = begin, end = end, firstRun = firstRun, + lastRun = lastRun ) + if ( firstRun and lastRun ) and ( firstRun > lastRun ): + msg = ( "The lower time/runrange limit ('begin'/'firstRun') " + "chosen is greater than the upper time/runrange limit " + "('end'/'lastRun').") + raise AllInOneError( msg ) + if self.predefined() and (jsonPath or begin or end or firstRun or lastRun): + msg = ( "The parameters 'JSON', 'begin', 'end', 'firstRun', and 'lastRun'" + "only work for official datasets, not predefined _cff.py files" ) + raise AllInOneError( msg ) + + lumiSecExtend = self.__lumiSelectionSnippet(jsonPath=jsonPath, firstRun=firstRun, lastRun=lastRun) + lumiStr = goodLumiSecStr = "" + if lumiSecExtend: + goodLumiSecStr = "lumiSecs = cms.untracked.VLuminosityBlockRange()\n" + lumiStr = " lumisToProcess = lumiSecs,\n" + + files = self.__fileListSnippet(crab=crab, parent=parent, firstRun=firstRun, lastRun=lastRun, forcerunselection=False) theMap = repMap theMap["files"] = files @@ -820,15 +826,60 @@ def dump_cff( self, outName = None, jsonPath = None, begin = None, theFile.close() return - def fileList( self, parent = False ): + def createdatasetfile_hippy(self, filename, filesperjob, firstrun, lastrun): + with open(filename, "w") as f: + for job in self.__chunks(self.fileList(firstRun=firstrun, lastRun=lastrun, forcerunselection=True), filesperjob): + f.write(",".join("'{}'".format(file) for file in job)) + + @staticmethod + def getrunnumberfromfilename(filename): + parts = filename.split("/") + result = error = None + if parts[0] != "" or parts[1] != "store": + error = "does not start with /store" + elif parts[2] in ["mc", "relval"]: + result = 1 + elif parts[-2] != "00000" or not parts[-1].endswith(".root"): + error = "does not end with 00000/something.root" + elif len(parts) != 12: + error = "should be exactly 11 slashes counting the first one" + else: + runnumberparts = parts[-5:-2] + if not all(len(part)==3 for part in runnumberparts): + error = "the 3 directories {} do not have length 3 each".format("/".join(runnumberparts)) + try: + result = int("".join(runnumberparts)) + except ValueError: + error = "the 3 directories {} do not form an integer".format("/".join(runnumberparts)) + + if error: + error = "could not figure out which run number this file is from:\n{}\n{}".format(filename, error) + raise AllInOneError(error) + + return result + + def fileList(self, parent=False, firstRun=None, lastRun=None, forcerunselection=False): if self.__fileList and not parent: return self.__fileList if self.__parentFileList and parent: return self.__parentFileList - fileList = [ self.__findInJson(fileInfo,"name") \ + fileList = [ self.__findInJson(fileInfo,"name") for fileInfo in self.fileInfoList(parent) ] + if firstRun is not None or lastRun is not None: + if firstRun is None: firstRun = -5 + if lastRun is None: lastrun = float('infinity') + e = None + for filename in fileList[:]: + try: + if not firstRun < self.getrunnumberfromfilename(filename) < lastRun: + fileList.remove(filename) + except AllInOneError as e: + if forcerunselection: raise + print e.message + if e is not None: + print "\nWill include those files. They will be filtered at the CMSSW level anyway." if not parent: self.__fileList = fileList else: From ce82fd536b72e560f1dd85ad8be5e3f0262d60ed Mon Sep 17 00:00:00 2001 From: Heshy Roskes Date: Tue, 13 Sep 2016 22:06:49 +0200 Subject: [PATCH 2/5] script to create file list for hippy --- .../scripts/createfilelist.py | 17 +++++++++++++++++ .../python/TkAlAllInOneTool/dataset.py | 6 +++--- 2 files changed, 20 insertions(+), 3 deletions(-) create mode 100755 Alignment/HIPAlignmentAlgorithm/scripts/createfilelist.py diff --git a/Alignment/HIPAlignmentAlgorithm/scripts/createfilelist.py b/Alignment/HIPAlignmentAlgorithm/scripts/createfilelist.py new file mode 100755 index 0000000000000..9f77df34c8232 --- /dev/null +++ b/Alignment/HIPAlignmentAlgorithm/scripts/createfilelist.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python +from Alignment.OfflineValidation.TkAlAllInOneTool.dataset import Dataset +import argparse +import os +import sys + +parser = argparse.ArgumentParser() +parser.add_argument("outputfilename", help="Goes into $CMSSW_BASE/src/Alignment/HIPAlignmentAlgorithm/data unless an absolute path starting with / is provided. example: ALCARECOTkAlMinBias.dat_example") +parser.add_argument("datasetname", help="example: /ZeroBias/Run2016G-TkAlMinBias-PromptReco-v1/ALCARECO") +parser.add_argument("filesperjob", type=int, help="max number of files in each job") +parser.add_argument("firstrun", type=int, nargs="?", help="first run to use") +parser.add_argument("lastrun", type=int, nargs="?", help="last run to use") +args = parser.parse_args() + +dataset = Dataset(args.datasetname, tryPredefinedFirst=False) +outputfilename = os.path.join(os.environ["CMSSW_BASE"], "src", "Alignment", "HIPAlignmentAlgorithm", "data", args.outputfilename) +dataset.createdatasetfile_hippy(outputfilename, args.filesperjob, args.firstrun, args.lastrun) diff --git a/Alignment/OfflineValidation/python/TkAlAllInOneTool/dataset.py b/Alignment/OfflineValidation/python/TkAlAllInOneTool/dataset.py index d9550f614e1ca..a1dd796d2c018 100644 --- a/Alignment/OfflineValidation/python/TkAlAllInOneTool/dataset.py +++ b/Alignment/OfflineValidation/python/TkAlAllInOneTool/dataset.py @@ -829,7 +829,7 @@ def dump_cff( self, outName = None, jsonPath = None, begin = None, def createdatasetfile_hippy(self, filename, filesperjob, firstrun, lastrun): with open(filename, "w") as f: for job in self.__chunks(self.fileList(firstRun=firstrun, lastRun=lastrun, forcerunselection=True), filesperjob): - f.write(",".join("'{}'".format(file) for file in job)) + f.write(",".join("'{}'".format(file) for file in job)+"\n") @staticmethod def getrunnumberfromfilename(filename): @@ -868,8 +868,8 @@ def fileList(self, parent=False, firstRun=None, lastRun=None, forcerunselection= for fileInfo in self.fileInfoList(parent) ] if firstRun is not None or lastRun is not None: - if firstRun is None: firstRun = -5 - if lastRun is None: lastrun = float('infinity') + if firstRun is None: firstRun = -1 + if lastRun is None: lastRun = float('infinity') e = None for filename in fileList[:]: try: From 2fe508a49fa9cb22a05e31c9c33b1762d9954cfe Mon Sep 17 00:00:00 2001 From: Heshy Roskes Date: Wed, 14 Sep 2016 16:11:02 +0200 Subject: [PATCH 3/5] remove nonsense line (the root file has never been called that as far back as I can tell, and it's copied later anyway https://github.com/cms-sw/cmssw/blob/813e5a/Alignment/OfflineValidation/python/TkAlAllInOneTool/configTemplates.py#L104) --- .../python/TkAlAllInOneTool/geometryComparison.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/Alignment/OfflineValidation/python/TkAlAllInOneTool/geometryComparison.py b/Alignment/OfflineValidation/python/TkAlAllInOneTool/geometryComparison.py index 486fab589b9d4..111ded37a5a34 100644 --- a/Alignment/OfflineValidation/python/TkAlAllInOneTool/geometryComparison.py +++ b/Alignment/OfflineValidation/python/TkAlAllInOneTool/geometryComparison.py @@ -252,9 +252,6 @@ def createScript(self, path): resultingFile = os.path.expandvars( resultingFile ) resultingFile = os.path.abspath( resultingFile ) resultingFile = "root://eoscms//eos/cms" + resultingFile #needs to be AFTER abspath so that it doesn't eat the // - repMap["runComparisonScripts"] += \ - ("xrdcp -f OUTPUT_comparison.root %s\n" - %resultingFile) self.filesToCompare[ name ] = resultingFile else: From ec8d82ac7f88e15b40a0d1c71491928af6a49fcd Mon Sep 17 00:00:00 2001 From: Heshy Roskes Date: Wed, 28 Sep 2016 21:47:41 +0200 Subject: [PATCH 4/5] fix parallel offline --- Alignment/OfflineValidation/scripts/validateAlignments.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Alignment/OfflineValidation/scripts/validateAlignments.py b/Alignment/OfflineValidation/scripts/validateAlignments.py index e447273411296..b42b66ca46cc6 100755 --- a/Alignment/OfflineValidation/scripts/validateAlignments.py +++ b/Alignment/OfflineValidation/scripts/validateAlignments.py @@ -350,6 +350,10 @@ def createMergeScript( path, validations ): repMap["mergeOfflineParJobsScriptPath"] ) repMap["copyMergeScripts"] += ("cp .oO[Alignment/OfflineValidation]Oo./scripts/merge_TrackerOfflineValidation.C .\n" "rfcp %s .\n" % repMap["mergeOfflineParJobsScriptPath"]) + repMap_offline = repMap.copy() + repMap_offline.update(PlottingOptions(config, "offline")) + repMap["copyMergeScripts"] = \ + replaceByMap(repMap["copyMergeScripts"], repMap_offline) if anythingToMerge: # DownloadData is the section which merges output files from parallel jobs From 2cba4fe3156e36fc9dce5ffb49f4b2c7a028b249 Mon Sep 17 00:00:00 2001 From: Heshy Roskes Date: Wed, 28 Sep 2016 21:50:24 +0200 Subject: [PATCH 5/5] move error message to the function that actually gets called --- .../OfflineValidation/python/TkAlAllInOneTool/dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Alignment/OfflineValidation/python/TkAlAllInOneTool/dataset.py b/Alignment/OfflineValidation/python/TkAlAllInOneTool/dataset.py index a1dd796d2c018..f08548e30a4b3 100644 --- a/Alignment/OfflineValidation/python/TkAlAllInOneTool/dataset.py +++ b/Alignment/OfflineValidation/python/TkAlAllInOneTool/dataset.py @@ -251,10 +251,6 @@ def __createSnippet( self, jsonPath = None, begin = None, end = None, "chosen is greater than the upper time/runrange limit " "('end'/'lastRun').") raise AllInOneError( msg ) - if self.predefined() and (jsonPath or begin or end or firstRun or lastRun): - msg = ( "The parameters 'JSON', 'begin', 'end', 'firstRun', and 'lastRun'" - "only work for official datasets, not predefined _cff.py files" ) - raise AllInOneError( msg ) lumiSecExtend = self.__lumiSelectionSnippet(jsonPath=jsonPath, firstRun=firstRun, lastRun=lastRun) lumiStr = goodLumiSecStr = "" @@ -710,6 +706,10 @@ def parentDataset( self ): def datasetSnippet( self, jsonPath = None, begin = None, end = None, firstRun = None, lastRun = None, crab = False, parent = False ): + if self.__predefined and (jsonPath or begin or end or firstRun or lastRun): + msg = ( "The parameters 'JSON', 'begin', 'end', 'firstRun', and 'lastRun' " + "only work for official datasets, not predefined _cff.py files" ) + raise AllInOneError( msg ) if self.__predefined and parent: with open(self.__filename) as f: if "secFiles.extend" not in f.read():