In [1]:
from commons import *

# Read the Data Source Files

In [2]:
# Dataset catalogue and the OpenML ids it lists.
dfDatasets = pd.read_csv(FILE_DATASETS)
datasets = list(dfDatasets["openmlid"].values)

# The meta-feature file is semicolon-separated.
dfMetafeatures = pd.read_csv(FILE_METAFEATURES, sep=";")

In [3]:
# Runtime observations of the atomic algorithms; all four files are semicolon-separated.
dfDefaultClassifiers = pd.read_csv(FILE_CLASSIFIERS_DEFAULT, sep=";")
dfConfiguredClassifiers = pd.read_csv(FILE_CLASSIFIERS_PARAMETRIZED, sep=";")

# The default preprocessor runs get an (empty) "algorithmoptions" column so that
# they can later be concatenated with the other tables.
dfDefaultPreprocessors = pd.read_csv(FILE_PREPROCESSORS_DEFAULT, sep=";").assign(algorithmoptions="")
dfConfiguredPreprocessors = pd.read_csv(FILE_PREPROCESSORS_PARAMETRIZED, sep=";")

evaluationGridPoints = getEvaluationGridPoints()

  interactivity=interactivity, compiler=compiler, result=result)


# Create a Single Runtime CSV for Atomic Algorithms

In [4]:
def resolveExceptions(df):
    """Replace the textual ``exception`` column by runtimes and a categorical ``error`` column.

    The frame is processed in three passes:

    1. Rows whose exception mentions ``AlgorithmTimeouted`` (and contains "timeout") get
       ``error = "timeout"``. If ``fittime`` is missing or at least 3600*10**3, the timeout
       is attributed to training and ``fittime`` is set to 3600*10**3. Otherwise it is
       attributed to prediction and ``applicationtimeperkinstances`` is set to 3600*10**3.
    2. Rows with an ``ExperimentFailurePredictionException`` that contains
       "which failed due to" are resolved if a timeout is known for the same openmlid,
       algorithm and options with at most as many instances and attributes.
       Such rows get 3601*10**3 as ``fittime`` (training) or as
       ``applicationtimeperkinstances`` (prediction, ``fittime`` is then copied from the
       dominating run). Unresolvable rows only trigger a printed warning.
    3. Every row that still has an exception gets ``fittime = -1``, a missing
       ``applicationtimeperkinstances`` and the label "smote", "memory" or "other".

    NOTE: ``df`` itself is modified in place (an ``error`` column is added and cells are
    overwritten); only the final removal of ``exception`` creates a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``exception``, ``fittime``, ``applicationtimeperkinstances``,
        ``openmlid``, ``algorithm``, ``algorithmoptions``, ``fitsize`` and ``fitattributes``.
        Its index must be unique because rows are addressed via ``df.at``.

    Returns
    -------
    pandas.DataFrame
        The frame without the ``exception`` column and with the new ``error`` column.

    Raises
    ------
    Exception
        If ``df.at`` does not return the exception seen while iterating, or if a timeout
        row has a short ``fittime`` and nevertheless an application time.
    """
    # presumably milliseconds (one hour, and one hour plus a second) - confirm against the data
    oneHour = 3600 * 10**3
    dominatedRuntime = 3601 * 10**3

    def readException(i, row):
        # iterrows() hands out copies while all writes go through df.at; make sure both agree
        exception = str(row["exception"])
        if exception != df.at[i, "exception"]:
            raise Exception("The access via at does not seem to work correctly at index " + str(i) + "! Expected " + str(exception) + " and found " + str(df.at[i, "exception"]) + "\n\n" + str(row) + "\n\n" + str(df.loc[i]) + "\n\nSize of df: " + str(len(df)))
        return exception

    def dominates(known, candidate):
        # same options, and the known timeout had at most as many instances and attributes
        return known[3] <= candidate[3] and known[4] <= candidate[4] and str(known[2]) == str(candidate[2])

    print("Len of df is " + str(len(df)))
    timeoutsTrain = {}  # openmlid -> algorithm -> list of (openmlid, algorithm, options, fitsize, fitattributes, index)
    timeoutsTest = {}
    numTimeoutsTrain = 0
    numTimeoutsTest = 0
    failedRuns = df[df["exception"].notnull()]
    # astype(str) keeps the .str accessor usable when no run failed at all
    # (an all-NaN exception column has a float dtype and no string accessor)
    isResourceFailure = failedRuns["exception"].astype(str).str.contains("AlgorithmTimeouted|OutOfMemory")
    failedRunsWithTimeout = failedRuns[isResourceFailure]
    df["error"] = None

    # pass 1: explicit timeouts
    with tqdm(total=len(failedRunsWithTimeout)) as pbar:
        for i, row in failedRunsWithTimeout.iterrows():
            pbar.update(1)
            exception = readException(i, row)

            # out-of-memory rows are not resolved here; they are labelled in the last pass
            if "timeout" not in exception.lower():
                continue

            df.at[i, "exception"] = np.nan
            df.at[i, "error"] = "timeout"
            entry = (row["openmlid"], row["algorithm"], row["algorithmoptions"], row["fitsize"], row["fitattributes"], i)

            # remember the timeout, depending on whether it occurred in the train or prediction phase
            isTimeoutDuringTraining = np.isnan(row["fittime"]) or row["fittime"] >= oneHour
            if isTimeoutDuringTraining:
                df.at[i, "fittime"] = oneHour
                timeoutsTrain.setdefault(row["openmlid"], {}).setdefault(row["algorithm"], []).append(entry)
                numTimeoutsTrain += 1
            else:
                if not np.isnan(row["applicationtimeperkinstances"]):
                    raise Exception("There is a timeout for row " + str(row) + " but fittime < 3600 and there is an application time. This should not happen!")
                timeoutsTest.setdefault(row["openmlid"], {}).setdefault(row["algorithm"], []).append(entry)
                df.at[i, "applicationtimeperkinstances"] = oneHour  # suppose that the test time for the given data is high (apprx by 1h)
                numTimeoutsTest += 1
    print("Found " + str(numTimeoutsTrain) + " timeouts during training and " + str(numTimeoutsTest) + " timeouts during prediction. Len of df is " + str(len(df)))

    # pass 2: runs that were skipped because a smaller run had already timed out
    failedRuns = df[df["exception"].notna()]
    with tqdm(total=len(failedRuns)) as pbar:
        for i, row in failedRuns.iterrows():
            exception = readException(i, row)
            pbar.update(1)

            # only predicted failures that refer to another failed run can be interpolated
            if "ExperimentFailurePredictionException" not in exception:
                continue
            if "which failed due to" not in exception:
                continue

            openmlid = row["openmlid"]
            algo = row["algorithm"]
            options = row["algorithmoptions"]
            t = (openmlid, algo, options, int(row["fitsize"]), int(row["fitattributes"]), i)
            resolved = False

            # check on dominance by train time
            for t2 in timeoutsTrain.get(openmlid, {}).get(algo, []):
                if dominates(t2, t):
                    df.at[i, "fittime"] = dominatedRuntime
                    df.at[i, "exception"] = np.nan
                    df.at[i, "error"] = "timeout"
                    # the resolved run can itself dominate later rows
                    timeoutsTrain[openmlid][algo].append(t)
                    resolved = True
                    break

            # check on dominance by prediction time
            if not resolved:
                for t2 in timeoutsTest.get(openmlid, {}).get(algo, []):
                    if dominates(t2, t):
                        # training itself worked for the dominating run, so reuse its fit time
                        df.at[i, "fittime"] = df.at[t2[5], "fittime"]
                        df.at[i, "applicationtimeperkinstances"] = dominatedRuntime
                        df.at[i, "exception"] = np.nan
                        df.at[i, "error"] = "timeout"
                        timeoutsTest[openmlid][algo].append(t)
                        resolved = True
                        break

            # unresolved rows are only reported; the last pass labels them
            if not resolved:
                print("WARNING: Could not resolve the prediction exception for algorithm " + algo + " on openmlid " + str(openmlid) + " and algorithm " + str(algo) + " with options '" + str(options) + "' for " + str(row["fitsize"]) + " instances and " + str(row["fitattributes"]) + " attributes. Here is the exception: " + str(exception))

    # pass 3: everything that is still unresolved
    print("Setting runtime of lines with remaining exceptions (domain-specific) to -1.")
    for i, row in tqdm(df[df["exception"].notna()].iterrows()):
        df.at[i, "fittime"] = -1
        df.at[i, "applicationtimeperkinstances"] = np.nan
        exception = df.at[i, "exception"].lower()
        if "smote" in exception:
            label = "smote"
        elif "memory" in exception:
            label = "memory"
        else:
            label = "other"
        df.at[i, "exception"] = np.nan
        df.at[i, "error"] = label
    df = df.drop(columns=["exception"])

    print("There are now " + str(np.count_nonzero(df["fittime"].isna())) + " lines where fittime is not available.")
    return df

In [5]:
# Meta-features of the training data that are kept for every runtime observation.
relevantTrainMetafeatures = [
    'fitsize', 'fitattributes', 'numattributes', 'numlabels',
    'numnumericattributes', 'numsymbolicattributes', 'numberofcategories',
    'numericattributesafterbinarization', 'totalvariance',
    'attributestocover50pctvariance', 'attributestocover90pctvariance',
    'attributestocover95pctvariance', 'attributestocover99pctvariance',
]

# prepare classifier data frame: stack the four runtime tables into one
dfAlgorithms = pd.concat(
    [dfDefaultClassifiers, dfConfiguredClassifiers, dfDefaultPreprocessors, dfConfiguredPreprocessors],
    ignore_index=True,
    sort=False,
)

# Columns of interest under their raw names (meta-features carry an "f1_" prefix).
allAttributes = (
    ['openmlid', 'totalsize', 'algorithm', 'algorithmoptions', 'seed']
    + ["f1_" + x for x in relevantTrainMetafeatures]
    + ['applicationsize', 'fittime', 'applicationtimeperkinstances', 'exception']
)

# Strip the "f1_" prefix from every meta-feature column except "f1_numattributes";
# the selection below therefore picks the column that is already called "numattributes".
renameDict = {"f1_" + name: name for name in relevantTrainMetafeatures if name != "numattributes"}

selectedColumns = [x.replace("f1_", "") for x in allAttributes]
dfClassifiersMod = dfAlgorithms.rename(columns=renameDict)
dfClassifiersMod = dfClassifiersMod[selectedColumns]
dfAtomic = dfClassifiersMod.copy()
print ("Data frame for atomic algorithms ready.")

Data frame for atomic algorithms ready.


### Resolve Exceptions

In [None]:
# Turn the exception texts into runtimes / error labels (this drops the "exception" column).
dfAtomic = resolveExceptions(dfAtomic)

print("Now removing lines where fitattributes is not available.")
# keep only rows in which both fitattributes and fittime are present
dfAtomic = dfAtomic.dropna(subset=["fitattributes", "fittime"])
print("Ready. Prepared clean dataset.")

# without missing values both columns can be stored as integers
dfAtomic = dfAtomic.astype({"fitattributes": int, "fittime": int})

Len of df is 1825559


HBox(children=(FloatProgress(value=0.0, max=92597.0), HTML(value='')))


Found 67238 timeouts during training and 5835 timeouts during prediction. Len of df is 1825559


HBox(children=(FloatProgress(value=0.0, max=234490.0), HTML(value='')))

	Error class: ai.libs.jaicore.experiments.exceptions.ExperimentFailurePredictionException
	Error message: Experiment will fail for the following reason: This has at least as much instances and attributes as 47893, which failed due to a timeout.
	Error trace:
		tpami.basealgorithmlearning.datagathering.ALearnerExperimentEvaluator.checkFail(ALearnerExperimentEvaluator.java:110)
		tpami.basealgorithmlearning.datagathering.AMLAlgorithmExperimentEvaluator.evaluate(AMLAlgorithmExperimentEvaluator.java:109)
		ai.libs.jaicore.experiments.ExperimentRunner.conductExperiment(ExperimentRunner.java:217)
		ai.libs.jaicore.experiments.ExperimentRunner.lambda$randomlyConductExperiments$0(ExperimentRunner.java:104)
		java.base/java.lang.Thread.run(Thread.java:834)
	Error class: ai.libs.jaicore.experiments.exceptions.ExperimentFailurePredictionException
	Error message: Experiment will fail for the following reason: This has at least as much instances and attributes as 47893, which failed due to a timeou

	Error class: ai.libs.jaicore.experiments.exceptions.ExperimentFailurePredictionException
	Error message: Experiment will fail for the following reason: This has at least as much instances and attributes as 16875, which failed due to a timeout.
	Error trace:
		tpami.basealgorithmlearning.datagathering.ALearnerExperimentEvaluator.checkFail(ALearnerExperimentEvaluator.java:110)
		tpami.basealgorithmlearning.datagathering.AMLAlgorithmExperimentEvaluator.evaluate(AMLAlgorithmExperimentEvaluator.java:109)
		ai.libs.jaicore.experiments.ExperimentRunner.conductExperiment(ExperimentRunner.java:217)
		ai.libs.jaicore.experiments.ExperimentRunner.lambda$randomlyConductExperiments$0(ExperimentRunner.java:104)
		java.base/java.lang.Thread.run(Thread.java:834)
	Error class: ai.libs.jaicore.experiments.exceptions.ExperimentFailurePredictionException
	Error message: Experiment will fail for the following reason: This has at least as much instances and attributes as 16875, which failed due to a timeou

### Write Runtime File

In [None]:
# Persist the cleaned runtime table; the row index is not written to the file.
dfAtomic.to_csv("data/runtimes/runtimes_atomic.csv", index=False)

# Files Relevant for Using the Approach in ML-Plan
The remainder of this notebook is not needed for the Python-based analysis and can be skipped in the first phase.

# Compile One CSV-File per Atomic Algorithm

In [9]:
# Load the runtime table from the same path the "Write Runtime File" cell writes to.
df = pd.read_csv("data/runtimes/runtimes_atomic.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [18]:
def explodeToFiles(df, binarize, outputDir="data/runtimes/atomic"):
    """Write one runtime CSV per algorithm for its default and its parametrized runs.

    Only rows with ``fittime > 0`` are used. Two size features are added to them:
    ``msize = fitsize * fitattributes`` and ``msize2 = fitsize * fitattributes**2``.
    Rows with a missing ``algorithmoptions`` value count as default runs, all other rows
    as parametrized runs.

    For every algorithm name occurring in ``df`` (before the runtime filter):

    * ``runtimes_<algorithm>_default.csv`` is always written and holds the default runs.
    * ``runtimes_<algorithm>_parametrized.csv`` is written if parametrized runs exist and
      can be matched with default runs on openmlid, algorithm, seed, fitsize and
      fitattributes. It holds the columns created by ``explodeAlgorithmOptions`` plus
      the default and the actual runtimes.

    Missing values are written as -1.

    Parameters
    ----------
    df : pandas.DataFrame
        Runtime table with the columns ``algorithm``, ``algorithmoptions``, ``openmlid``,
        ``seed``, ``fitsize``, ``fitattributes``, ``fittime`` and
        ``applicationtimeperkinstances``. The frame is not modified.
    binarize :
        Passed through to ``explodeAlgorithmOptions``.
    outputDir : str, optional
        Directory that receives the files; it is created if it does not exist yet.
    """
    import os

    # to_csv does not create directories, so make sure the target exists
    os.makedirs(outputDir, exist_ok=True)

    algorithms = pd.unique(df["algorithm"])
    df = df[df["fittime"] > 0].copy()
    df["msize"] = df["fitsize"] * df["fitattributes"]
    df["msize2"] = df["fitsize"] * (df["fitattributes"] ** 2)

    baseColumns = ["openmlid", "fitsize", "fitattributes", "msize", "msize2"]
    defaultRuntimeColumns = ["fittime_def", "applicationtimeperkinstances_def"]

    for a in tqdm(algorithms):
        print(a)
        dfAlg = df[df["algorithm"] == a]
        dfAlgDefaults = dfAlg[dfAlg["algorithmoptions"].isnull()]
        dfAlgParametrized = dfAlg[dfAlg["algorithmoptions"].notnull()]

        # create a file for the parametrized version (if applicable)
        if len(dfAlgParametrized) > 0:
            lenBefore = len(dfAlgParametrized)
            # attach the default runtimes (suffix "_def") of the same dataset portion to every parametrized run
            dfTarget = dfAlgParametrized.merge(dfAlgDefaults, on=["openmlid", "algorithm", "seed", "fitsize", "fitattributes"], suffixes=("", "_def"))
            if lenBefore != len(dfTarget):
                # NOTE(review): `warning` is not defined in this notebook; presumably it comes from commons - confirm
                warning("Size has changed from " + str(lenBefore) + " to " + str(len(dfTarget)) + "! This may come due to errored executions etc. but should be checked.")
            columnsbeforeExplosion = list(dfTarget.columns)
            if len(dfTarget) > 0:
                dfTarget = explodeAlgorithmOptions(dfTarget, binarize=binarize)
                # every column added by the explosion is a parameter, except for these two
                algoParams = [x for x in dfTarget.columns if x not in columnsbeforeExplosion and x not in ("searcheroptions", "evaloptions")]
                columns = baseColumns + algoParams + defaultRuntimeColumns + ["fittime", "applicationtimeperkinstances"]
                dfTarget[columns].fillna(-1).astype({"fitattributes": int, "applicationtimeperkinstances": int, "applicationtimeperkinstances_def": int}).to_csv(os.path.join(outputDir, "runtimes_" + a + "_parametrized.csv"), index=False)
            else:
                print("No file written, no entries for algorithm " + a)

        # create a file for the default version
        dfTarget = dfAlgDefaults.rename(columns={"fittime": "fittime_def", "applicationtimeperkinstances": "applicationtimeperkinstances_def"})
        dfTarget[baseColumns + defaultRuntimeColumns].fillna(-1).astype({"fitattributes": int, "applicationtimeperkinstances_def": int}).to_csv(os.path.join(outputDir, "runtimes_" + a + "_default.csv"), index=False)

In [None]:
# Write the per-algorithm runtime files; False is the `binarize` argument.
explodeToFiles(df, False)