In [1]:
%%html
<button type="button" class="btn btn-primary btn-md" onclick="initAEC();">Initialise</button>
<script>
// Runs the notebook cells at indices 1, 2 and 3, one call per cell and in that order.
function initAEC(){
    [1, 2, 3].forEach(function(cellIndex){
        Jupyter.notebook.execute_cells([cellIndex]);
    });
}
</script>

In [5]:
import json
import requests
from hops import hdfs
import pydoop.hdfs as phdfs
from  pyspark.sql.functions import year, ltrim, rtrim, coalesce, lit, udf
from pyspark.sql.types import StringType, ArrayType
from pyspark.ml.classification import OneVsRest, NaiveBayes, MultilayerPerceptronClassifier, LinearSVC, LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml.regression import DecisionTreeRegressor, RandomForestRegressor, LinearRegression, GBTRegressor, GeneralizedLinearRegression
from pyspark.ml.clustering import KMeans, GaussianMixture
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, NGram, RegexTokenizer, PCA, ChiSqSelector, StringIndexer, VectorIndexer, IndexToString, VectorAssembler, OneHotEncoderEstimator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator, RegressionEvaluator, ClusteringEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation, ChiSquareTest
import numpy as np
from pyspark.ml.recommendation import ALS
from pyspark.ml.fpm import FPGrowth

# pydoop HDFS client opened as the user reported by hops.
myhdfs = phdfs.hdfs(user = hdfs.project_user())

# Module-level state shared by the helper functions in this cell.
tempDF = None  # DataFrame most recently loaded by read_dataset()
masterDF = None  # not assigned anywhere in the visible part of this file
hdfspath = hdfs.project_path()  # project path as reported by hops
g=None  # scratch slot: helpers store caught exceptions / debug values here

# helper function to transform input column, used by FPGrowth
def colToArray(s):
    """Parse a bracketed, comma-separated list of integers into a list of strings.

    The first and last characters of ``s`` are dropped (the surrounding
    brackets), the remainder is split on commas and every item is normalised
    through ``int`` before being turned back into a string.

    Parameters
    ----------
    s : str or None
        Text such as ``"[1, 2, 3]"``.

    Returns
    -------
    list of str, or None
        ``None`` for a ``None`` input (null cells pass through unchanged),
        ``[]`` for an empty list such as ``"[]"``, otherwise the parsed items.

    Raises
    ------
    ValueError
        If an item is not an integer literal.
    """
    if s is None:
        return None
    body = s[1:-1].strip()
    # "".split(",") yields [""], and int("") raises; treat an empty body as an empty list.
    if not body:
        return []
    return [str(int(item)) for item in body.split(",")]
# Spark UDF built from colToArray as defined at this point; declared to return an array of strings.
toarray_udf = udf(colToArray,ArrayType(StringType()))


# Descriptors for every tunable algorithm parameter offered in the UI.
# Each entry describes one input widget: its type, numeric bounds / step,
# placeholder text, label and (optionally) a tooltip title.
# NOTE: the run_* functions look values up by the "label" text, so labels are
# part of the contract and must not be changed casually.
PARAMETERS = {
    "maxiter":{
        "type":"number",
        "min":10,
        "max":1000,
        "placeholder":"10-1000",
        "label":"Maximum Iterations",
        "title":"Maximum number of iterations the algorithm will perform."
    },
    "regParam":{
        "type":"number",
        "min":0,
        "max":10,
        "step":0.01,
        "placeholder":"e.g. 0.01",
        "label":"Regularisation Parameter",
        "title":"Configuring this parameter correctly is important to avoid over- and under-fitting."
    },
    "elasticnet":{
        "type":"number",
        "min":0.0,
        # numeric like every other "step" in this table (was the string "0.01")
        "step":0.01,
        "max":1.0,
        "placeholder":"0",
        "label":"Elastic Net Parameter",
        "title":"The elastic net linearly combines the L1 and L2 penalties of the lasso and ridge methods."
    },
    "max_depth":{
        "type":"number",
        "min":1,
        # Spark's tree learners reject maxDepth > 30, so do not offer more.
        "max":30,
        "placeholder":"1-30",
        "label":"Maximum Depth",
        "title":"The maximum tree depth."
    },
    "max_bins":{
        "type":"number",
        # Spark requires maxBins >= 2.
        "min":2,
        "max":64,
        "placeholder":"e.g. 32",
        "label":"Maximum Bins",
        "title":"Note that the number of bins cannot be greater than the number of instances N (a rare scenario since the default maxBins value is 32). The tree algorithm automatically reduces the number of bins if the condition is not satisfied"
    },
    "min_instances_per_node":{
        "type":"number",
        "min":1,
        "max":10,
        # placeholder now agrees with the min/max actually enforced
        "placeholder":"1-10",
        "label":"Minimum Instances per Node",
        "title":"Selecting a very small number here may result to major overfitting."
    },
    "min_info_gain":{
        "type":"number",
        "min":0,
        "max":10,
        "step":0.01,
        "placeholder":"e.g. 0.01",
        "label":"Minimum Info Gain",
        "title":"Configuring this parameter correctly is important to avoid over- and under-fitting."
    },
    "step":{
        "type":"number",
        "min":0,
        "max":10,
        "step":0.01,
        "placeholder":"e.g. 0.01",
        "label":"Step",
        "title":"Step"
    },
    "stepSize":{
        "type":"number",
        "min":0,
        "max":10,
        "step":0.01,
        "placeholder":"e.g. 0.01",
        "label":"Step Size",
        "title":"Step Size"
    },
    "convergence_tolerance":{
        "type":"number",
        "min":0,
        "max":10,
        "step":0.01,
        "placeholder":"e.g. 0.01",
        "label":"Convergence Tolerance"
    },
    "nodes_l1":{
        "type":"number",
        "min":1,
        "max":10,
        "placeholder":"e.g. 4",
        "label":"Nodes in 1st hidden layer"
    },
    "nodes_l2":{
        "type":"number",
        "min":1,
        "max":10,
        "placeholder":"e.g. 4",
        "label":"Nodes in 2nd hidden layer"
    },
    "seed":{
        "type":"number",
        "min":0,
        "max":10000,
        # the seed is parsed with int(); a fractional hint was misleading
        "placeholder":"e.g. 42",
        "label":"Seed"
    },
    "smoothing":{
        "type":"number",
        "min":0,
        "max":10,
        "step":0.01,
        "placeholder":"e.g. 0.01",
        "label":"Smoothing"
    },
    "num_trees":{
        "type":"number",
        # a forest needs at least one tree (Spark: numTrees >= 1)
        "min":1,
        "max":10,
        "placeholder":"e.g. 4",
        "label":"Number of Trees"
    },
    "distribution_family":{
        "type":"select",
        "options":["Gaussian","Binomial","Poisson","Gamma","Tweedie"],
        "label":"Distribution Family"
    },
    "k":{
        "type":"number",
        "min":2,
        "max":10,
        "placeholder":"e.g. 4",
        "label":"Number of Clusters (k)"
    },
    "pca_k":{
        "type":"number",
        "min":1,
        "max":20,
        "placeholder":"e.g. 4",
        "label":"Number of Principal Components (k)"
    },
    "numTopFeatures":{
        "type":"number",
        "min":1,
        "max":20,
        "placeholder":"e.g. 4",
        "label":"Number of Top Features to Select"
    },
    "ngrams_n":{
        "type":"number",
        "min":2,
        "max":5,
        "placeholder":"e.g. 2",
        "label":"N"
    },
    "tfidf_n":{
        "type":"number",
        "min":2,
        "max":30,
        "placeholder":"e.g. 2",
        "label":"Number of TF-IDF Features"
    },
    "minSupport":{
        "type":"number",
        "min":0,
        "max":1,
        "step":0.001,
        "placeholder":"e.g. 0.01",
        "label":"Minimal Support"
    },
    "minConfidence":{
        "type":"number",
        "min":0,
        "max":1,
        "step":0.001,
        "placeholder":"e.g. 0.01",
        "label":"Minimal Confidence"
    }
}

# Parameter groups shared by several algorithms (keys of PARAMETERS).
_LINEAR_PARAMS = ("maxiter", "regParam", "elasticnet")
_TREE_PARAMS = ("max_depth", "max_bins", "min_instances_per_node", "min_info_gain")
_CLUSTER_PARAMS = ("k", "seed")

# Maps every algorithm code to the ordered list of PARAMETERS keys it exposes.
# Each algorithm gets its own list object.
ALGORITHM_PARAMETERS = {
    "OLS": list(_LINEAR_PARAMS),
    "LOGRE": list(_LINEAR_PARAMS),
    "DTR": list(_TREE_PARAMS),
    "DTC": list(_TREE_PARAMS),
    # "convergence_tolerance" removed for now
    "MLP": ["stepSize", "seed", "maxiter", "nodes_l2", "nodes_l1"],
    "NB": ["smoothing"],
    "GLM": ["maxiter", "regParam", "distribution_family"],
    "RFR": list(_TREE_PARAMS) + ["num_trees"],
    "RFC": list(_TREE_PARAMS) + ["num_trees"],
    "GBTR": list(_TREE_PARAMS) + ["maxiter"],
    "GBTC": list(_TREE_PARAMS) + ["maxiter"],
    "KMEANS": list(_CLUSTER_PARAMS),
    "GAUSSMIX": list(_CLUSTER_PARAMS),
    "PCA": ["pca_k"],
    "ChiSquared": ["numTopFeatures"],
    "ALS": ["maxiter", "regParam"],
    "TOKENIZER": [],
    "N-GRAMS": ["ngrams_n"],
    "TF-IDF": ["tfidf_n"],
    "FPGrowth": ["minConfidence", "minSupport"],
}

#TODO check if possible to have the algorithm definitions only in Python - currently also in JS
# Algorithm codes grouped by family code.
ALGORITHMS = {
    "DIMRE_FE": ["PCA", "ChiSquared"],
    "NLP": ["TOKENIZER", "N-GRAMS", "TF-IDF"],
    "RECOM": ["ALS"],
    "CLUST": ["KMEANS", "GAUSSMIX"],
    "CL_REG": [
        "OLS", "DTR", "DTC", "MLP", "NB", "GLM",
        "RFR", "GBTR", "RFC", "GBTC", "LOGRE",
    ],
    "FREQPM": ["FPGrowth"],
}


# Placeholder: no algorithm descriptions are defined yet.
ALGORITHM_DESCRIPTIONS = dict()

# ----------- Input & Output -----------

def read_dataset(selectedFile,separator):
    """Load a CSV file into the module-level ``tempDF`` and return its column names.

    The file is read with a header row and schema inference.

    Parameters
    ----------
    selectedFile : str
        Path of the CSV file, passed straight to ``spark.read.csv``.
    separator : str
        Field separator.

    Returns
    -------
    list
        Column names of the freshly loaded DataFrame, or ``[]`` when the read
        failed. On failure the exception is kept in the global ``g`` and
        ``tempDF`` is left untouched.
    """
    global tempDF
    global g
    try:
        tempDF = spark.read.csv(selectedFile,header=True, inferSchema=True, sep=separator)
    except Exception as e:
        g=e
        # Do not fall through: tempDF is either None (AttributeError) or still
        # the previous dataset, whose columns would be reported as if the read
        # had succeeded.
        return []
    return tempDF.columns

def getTempColumns():
    """Return the column names of the DataFrame currently held in ``tempDF``."""
    current = tempDF
    return current.columns

# used to retrieve available Datasets also
def find_current_projectID(pDict):
    """Look up the entry for the current project in a JSON object keyed by project name.

    ``pDict`` is a JSON string; the key used is ``hdfs.project_name()``.
    A missing name raises ``KeyError`` (not expected to happen, hence no handling).
    """
    return json.loads(pDict)[hdfs.project_name()]

#used to find csv files in selected dataset
def walk_dataset(dataset_path):
    """List the distinct ``.csv`` entries found by walking ``dataset_path`` on HDFS.

    The walk starts at ``hdfs://<host>:<port>`` + ``dataset_path`` using the
    module-level ``myhdfs`` client. The order of the returned names is not
    defined (they pass through a set).
    """
    root = "hdfs://"+myhdfs.host+":"+str(myhdfs.port) + dataset_path
    unique_names = {entry['name'] for entry in myhdfs.walk(root)}
    return [name for name in unique_names if name.endswith(".csv")]

#used for preview
def get_tempdf_10first_lines_new():
    """Return the first ten rows of ``tempDF`` as plain lists, for previewing.

    Returns
    -------
    list of list
        One inner list per row (at most ten), with the values ordered like
        ``tempDF.columns``.
    """
    column_names = tempDF.columns
    preview = []
    for row in tempDF.head(10):
        # Build the dict once per row; it used to be rebuilt for every column.
        values = row.asDict()
        preview.append([values[name] for name in column_names])
    return preview


def saveDataframe(savepath, df):
    """Append ``df`` to ``savepath`` in parquet format.

    Parameters
    ----------
    savepath : str
        Destination path, handed to the DataFrame writer unchanged.
    df : DataFrame
        Object exposing ``df.write.save(...)``.

    Returns
    -------
    int or None
        ``200`` on success. On any failure the exception is stored in the
        global ``g`` and ``None`` is returned, so callers must check the
        result if they need to know whether the data was written.
    """
    global g
    # The fully qualified "hdfs://host:port" path used to be assembled here but
    # was never passed to the writer; that dead code has been removed.
    try:
        df.write.save(path=savepath, mode='append', format="parquet")
    except Exception as e:
        g=e
        return None
    return 200

# helper function to transform input column, used by FPGrowth
def colToArray(s):
    """Parse a bracketed, comma-separated list of integers into a list of strings.

    The first and last characters of ``s`` are dropped (the surrounding
    brackets), the remainder is split on commas and every item is normalised
    through ``int`` before being turned back into a string.

    Parameters
    ----------
    s : str or None
        Text such as ``"[1, 2, 3]"``.

    Returns
    -------
    list of str, or None
        ``None`` for a ``None`` input (null cells pass through unchanged),
        ``[]`` for an empty list such as ``"[]"``, otherwise the parsed items.

    Raises
    ------
    ValueError
        If an item is not an integer literal.
    """
    if s is None:
        return None
    body = s[1:-1].strip()
    # "".split(",") yields [""], and int("") raises; treat an empty body as an empty list.
    if not body:
        return []
    return [str(int(item)) for item in body.split(",")]

# ----------- Algorithms -----------

# Algo Select and Configure

def get_algorithm_parameters(algorithm):
    """Return ``{algorithm: {parameter_key: descriptor}}`` for the given algorithm code.

    Descriptors come from ``PARAMETERS``; the parameter keys come from
    ``ALGORITHM_PARAMETERS``. An unknown algorithm maps to an empty dict.
    """
    names = ALGORITHM_PARAMETERS.get(algorithm, ())
    descriptors = {name: PARAMETERS[name] for name in names}
    return {algorithm: descriptors}



def get_algos_in_family(family):
    """Return the algorithm codes registered in ``ALGORITHMS`` for ``family`` (KeyError if unknown)."""
    members = ALGORITHMS[family]
    return members

# Algo execution

def get_validator(apply_train_test_split,pipeline,paramGrid,evaluator,evalVal):
    """Build the model-selection wrapper for a pipeline.

    When ``apply_train_test_split`` is truthy a ``TrainValidationSplit`` is
    returned and ``evalVal`` is read as the training percentage (divided by
    100). Otherwise a ``CrossValidator`` is returned and ``evalVal`` is read as
    the number of folds.
    """
    if not apply_train_test_split:
        return CrossValidator(
            estimator=pipeline,
            estimatorParamMaps=paramGrid,
            evaluator=evaluator,
            numFolds=int(evalVal),
        )
    return TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator,
        trainRatio=float(evalVal)/100,
    )

#TODO add modelType param
def run_naive_bayes(algo_configuration):
    """Train a multinomial Naive Bayes classifier on ``tempDF`` and save the best model.

    Parameters
    ----------
    algo_configuration : mapping
        Reads the keys "Smoothing", "eval", "evalVal", "features", "label"
        and "output_path". "eval" equal to 'train-test' selects a
        train/validation split, anything else selects cross-validation
        (see ``get_validator`` for how "evalVal" is interpreted).

    Returns
    -------
    dict
        ``{"results": {"Theta": ..., "Output": ...}}``.
    """
    training = tempDF
    smoothing = float(algo_configuration["Smoothing"])
    apply_train_test_split = algo_configuration["eval"] == 'train-test'
    evalVal = algo_configuration["evalVal"]
    nb = NaiveBayes(modelType="multinomial")
    pipeline = Pipeline(stages=[nb])
    # A single candidate: listing the same value twice only made the validator
    # fit and score an identical model twice.
    paramGrid = ParamGridBuilder() \
    .addGrid(nb.smoothing, [smoothing]) \
    .build()
    evaluator=MulticlassClassificationEvaluator()
    val = get_validator(apply_train_test_split,pipeline,paramGrid,evaluator,evalVal)
    assembler = VectorAssembler(inputCols=algo_configuration["features"],outputCol="features")
    output = assembler.transform(training)
    output = output.withColumnRenamed(algo_configuration["label"],"label").select("label","features")
    cvModel = val.fit(output)
    final_model = cvModel.bestModel
    final_model.write().overwrite().save(algo_configuration["output_path"])
    resDict = {"Theta":str(final_model.stages[0].theta).encode("utf-8"),
              "Output":"The created model has been saved in the selected output folder."}
    return {"results":resDict}

def run_decision_tree_classifier(algo_configuration):
    """Train a decision tree classifier on ``tempDF`` and save the fitted pipeline.

    The label column is string-indexed into "indexedLabel" and the assembled
    feature vector is passed through a ``VectorIndexer`` (maxCategories=4).

    Parameters
    ----------
    algo_configuration : mapping
        Reads "Minimum Info Gain", "Maximum Depth", "Maximum Bins",
        "Minimum Instances per Node", "eval", "evalVal", "features", "label"
        and "output_path".

    Returns
    -------
    dict
        ``{"results": {"Tree": <debug string of the tree>, "Output": ...}}``.
    """
    training = tempDF
    minInfoGain = float(algo_configuration["Minimum Info Gain"])
    maxDepth = int(algo_configuration["Maximum Depth"])
    maxBins = int(algo_configuration["Maximum Bins"])
    minInstancesPerNode = int(algo_configuration["Minimum Instances per Node"])
    apply_train_test_split = algo_configuration["eval"] == 'train-test'
    evalVal = algo_configuration["evalVal"]
    assembler = VectorAssembler(inputCols=algo_configuration["features"],outputCol="features")
    output = assembler.transform(training)
    output = output.withColumnRenamed(algo_configuration["label"],"label").select("label","features")
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(output)
    featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(output)
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    # One value per parameter: duplicated values made the validator train
    # 2**4 identical candidates.
    paramGrid = ParamGridBuilder() \
    .addGrid(dt.minInfoGain, [minInfoGain]) \
    .addGrid(dt.maxDepth, [maxDepth]) \
    .addGrid(dt.maxBins, [maxBins]) \
    .addGrid(dt.minInstancesPerNode, [minInstancesPerNode]) \
    .build()
    # Predictions live in the indexed label space, so score against
    # "indexedLabel" rather than the raw (possibly non-numeric) "label".
    evaluator=MulticlassClassificationEvaluator(labelCol="indexedLabel")
    val = get_validator(apply_train_test_split,pipeline,paramGrid,evaluator,evalVal)
    cvModel = val.fit(output)
    final_model = cvModel.bestModel
    final_model.write().overwrite().save(algo_configuration["output_path"])
    treeDict = {"Tree":final_model.stages[2].toDebugString.encode("utf-8"),
               "Output":"The created model has been saved in the selected output folder."}
    return {"results":treeDict}

def run_decision_tree_regressor(algo_configuration):
    """Train a decision tree regressor on ``tempDF`` and save the fitted pipeline.

    The regressor is fitted on the raw numeric "label" column. Earlier the
    label was passed through a ``StringIndexer``, which replaces each target
    value by a frequency-ranked category index and therefore made the model
    (and the ``RegressionEvaluator``, which reads "label") meaningless for
    regression. The saved pipeline now has two stages: feature indexer, tree.

    Parameters
    ----------
    algo_configuration : mapping
        Reads "Minimum Info Gain", "Maximum Depth", "Maximum Bins",
        "Minimum Instances per Node", "eval", "evalVal", "features", "label"
        and "output_path".

    Returns
    -------
    dict
        ``{"results": {"Tree": <debug string of the tree>, "Output": ...}}``.
    """
    global g
    # Debugging aid kept from the original: exposes the last configuration received.
    g = algo_configuration
    training = tempDF
    minInfoGain = float(algo_configuration["Minimum Info Gain"])
    maxDepth = int(algo_configuration["Maximum Depth"])
    maxBins = int(algo_configuration["Maximum Bins"])
    minInstancesPerNode = int(algo_configuration["Minimum Instances per Node"])
    apply_train_test_split = algo_configuration["eval"] == 'train-test'
    evalVal = algo_configuration["evalVal"]
    assembler = VectorAssembler(inputCols=algo_configuration["features"],outputCol="features")
    output = assembler.transform(training)
    output = output.withColumnRenamed(algo_configuration["label"],"label").select("label","features")
    featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(output)
    dt = DecisionTreeRegressor(labelCol="label", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[featureIndexer, dt])
    # One value per parameter: duplicated values made the validator train
    # 2**4 identical candidates.
    paramGrid = ParamGridBuilder() \
    .addGrid(dt.minInfoGain, [minInfoGain]) \
    .addGrid(dt.maxDepth, [maxDepth]) \
    .addGrid(dt.maxBins, [maxBins]) \
    .addGrid(dt.minInstancesPerNode, [minInstancesPerNode]) \
    .build()
    evaluator=RegressionEvaluator()
    val = get_validator(apply_train_test_split,pipeline,paramGrid,evaluator,evalVal)
    cvModel = val.fit(output)
    final_model = cvModel.bestModel
    final_model.write().overwrite().save(algo_configuration["output_path"])
    # The tree model is the last pipeline stage.
    treeDict = {"Tree":final_model.stages[-1].toDebugString.encode("utf-8"),
               "Output":"The created model has been saved in the selected output folder."}
    return {"results":treeDict}

#TODO add the remaining parameters & give a "prettier" result
def run_random_forest_classifier(algo_configuration):
    """Train a random forest classifier on ``tempDF`` and save the fitted pipeline.

    The label column is string-indexed into "indexedLabel" and the assembled
    feature vector is passed through a ``VectorIndexer`` (maxCategories=4).

    Parameters
    ----------
    algo_configuration : mapping
        Reads "Maximum Depth", "Maximum Bins", "Minimum Instances per Node",
        "Number of Trees", "Minimum Info Gain", "eval", "evalVal", "features",
        "label" and "output_path".

    Returns
    -------
    dict
        ``{"results": {"Tree": <debug string of the forest>, "Output": ...}}``.
    """
    training = tempDF
    maxDepth = int(algo_configuration["Maximum Depth"])
    maxBins = int(algo_configuration["Maximum Bins"])
    minInstancesPerNode = int(algo_configuration["Minimum Instances per Node"])
    numTrees = int(algo_configuration["Number of Trees"])
    minInfoGain = float(algo_configuration["Minimum Info Gain"])
    apply_train_test_split = algo_configuration["eval"] == 'train-test'
    evalVal = algo_configuration["evalVal"]

    assembler = VectorAssembler(inputCols=algo_configuration["features"],outputCol="features")
    output = assembler.transform(training)
    output = output.withColumnRenamed(algo_configuration["label"],"label").select("label","features")
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(output)
    featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(output)

    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])

    # One value per parameter: duplicated values made the validator train
    # 2**5 identical forests.
    paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [numTrees]) \
    .addGrid(rf.maxDepth, [maxDepth]) \
    .addGrid(rf.maxBins, [maxBins]) \
    .addGrid(rf.minInfoGain, [minInfoGain]) \
    .addGrid(rf.minInstancesPerNode, [minInstancesPerNode]) \
    .build()
    # Predictions live in the indexed label space, so score against
    # "indexedLabel" rather than the raw (possibly non-numeric) "label".
    evaluator=MulticlassClassificationEvaluator(labelCol="indexedLabel")
    val = get_validator(apply_train_test_split,pipeline,paramGrid,evaluator,evalVal)
    cvModel = val.fit(output)
    final_model = cvModel.bestModel
    final_model.write().overwrite().save(algo_configuration["output_path"])
    treeDict = {"Tree":final_model.stages[2].toDebugString.encode("utf-8"),
               "Output":"The created model has been saved in the selected output folder."}
    return {"results":treeDict}

def run_random_forest_regressor(algo_configuration):
    """Train a random forest regressor on ``tempDF`` and save the fitted pipeline.

    The regressor is fitted on the raw numeric "label" column. Earlier the
    label was passed through a ``StringIndexer``, which replaces each target
    value by a frequency-ranked category index and therefore made the model
    (and the ``RegressionEvaluator``, which reads "label") meaningless for
    regression. The saved pipeline now has two stages: feature indexer, forest.

    Parameters
    ----------
    algo_configuration : mapping
        Reads "Maximum Depth", "Maximum Bins", "Minimum Instances per Node",
        "Number of Trees", "Minimum Info Gain", "eval", "evalVal", "features",
        "label" and "output_path".

    Returns
    -------
    dict
        ``{"results": {"Tree": <debug string of the forest>, "Output": ...}}``.
    """
    training = tempDF
    maxDepth = int(algo_configuration["Maximum Depth"])
    maxBins = int(algo_configuration["Maximum Bins"])
    minInstancesPerNode = int(algo_configuration["Minimum Instances per Node"])
    numTrees = int(algo_configuration["Number of Trees"])
    minInfoGain = float(algo_configuration["Minimum Info Gain"])
    apply_train_test_split = algo_configuration["eval"] == 'train-test'
    evalVal = algo_configuration["evalVal"]

    assembler = VectorAssembler(inputCols=algo_configuration["features"],outputCol="features")
    output = assembler.transform(training)
    output = output.withColumnRenamed(algo_configuration["label"],"label").select("label","features")
    featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(output)

    rf = RandomForestRegressor(labelCol="label", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[featureIndexer, rf])

    # One value per parameter: duplicated values made the validator train
    # 2**5 identical forests.
    paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [numTrees]) \
    .addGrid(rf.maxDepth, [maxDepth]) \
    .addGrid(rf.maxBins, [maxBins]) \
    .addGrid(rf.minInfoGain, [minInfoGain]) \
    .addGrid(rf.minInstancesPerNode, [minInstancesPerNode]) \
    .build()
    evaluator=RegressionEvaluator()
    val = get_validator(apply_train_test_split,pipeline,paramGrid,evaluator,evalVal)
    cvModel = val.fit(output)
    final_model = cvModel.bestModel
    final_model.write().overwrite().save(algo_configuration["output_path"])
    # The forest model is the last pipeline stage.
    treeDict = {"Tree":final_model.stages[-1].toDebugString.encode("utf-8"),
               "Output":"The created model has been saved in the selected output folder."}
    return {"results":treeDict}

# TODO: add note that it currently only supports binary classification
def run_gradient_boosted_tree_classifier(algo_configuration):
    """Train a gradient-boosted tree classifier on ``tempDF`` and save the fitted pipeline.

    The model is scored with a ``BinaryClassificationEvaluator``. The label
    column is string-indexed into "indexedLabel" and the assembled feature
    vector is passed through a ``VectorIndexer`` (maxCategories=4).

    Parameters
    ----------
    algo_configuration : mapping
        Reads "Maximum Depth", "Maximum Bins", "Minimum Instances per Node",
        "Maximum Iterations", "Minimum Info Gain", "eval", "evalVal",
        "features", "label" and "output_path".

    Returns
    -------
    dict
        ``{"results": {"Tree": <debug string of the ensemble>, "Output": ...}}``.
    """
    training = tempDF
    maxDepth = int(algo_configuration["Maximum Depth"])
    maxBins = int(algo_configuration["Maximum Bins"])
    minInstancesPerNode = int(algo_configuration["Minimum Instances per Node"])
    maxIter = int(algo_configuration["Maximum Iterations"])
    minInfoGain = float(algo_configuration["Minimum Info Gain"])
    apply_train_test_split = algo_configuration["eval"] == 'train-test'
    evalVal = algo_configuration["evalVal"]

    assembler = VectorAssembler(inputCols=algo_configuration["features"],outputCol="features")
    output = assembler.transform(training)
    output = output.withColumnRenamed(algo_configuration["label"],"label").select("label","features")
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(output)
    featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(output)

    gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt])

    # One value per parameter: duplicated values made the validator train
    # 2**5 identical ensembles.
    paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxIter, [maxIter]) \
    .addGrid(gbt.maxDepth, [maxDepth]) \
    .addGrid(gbt.maxBins, [maxBins]) \
    .addGrid(gbt.minInfoGain, [minInfoGain]) \
    .addGrid(gbt.minInstancesPerNode, [minInstancesPerNode]) \
    .build()
    # The classifier is trained on "indexedLabel"; evaluate against the same
    # column instead of the raw (possibly non-numeric) "label".
    evaluator=BinaryClassificationEvaluator(labelCol="indexedLabel")
    val = get_validator(apply_train_test_split,pipeline,paramGrid,evaluator,evalVal)
    cvModel = val.fit(output)
    final_model = cvModel.bestModel
    final_model.write().overwrite().save(algo_configuration["output_path"])
    treeDict = {"Tree":final_model.stages[2].toDebugString.encode("utf-8"),
               "Output":"The created model has been saved in the selected output folder."}
    return {"results":treeDict}

#TODO note that this is very demanding in time (and resources?)
def run_gradient_boosted_tree_regressor(algo_configuration):
    """Train a gradient-boosted tree regressor on ``tempDF`` and save the fitted pipeline.

    The regressor is fitted on the raw numeric "label" column. Earlier the
    label was passed through a ``StringIndexer``, which replaces each target
    value by a frequency-ranked category index and therefore made the model
    (and the ``RegressionEvaluator``, which reads "label") meaningless for
    regression. The saved pipeline now has two stages: feature indexer, ensemble.

    Parameters
    ----------
    algo_configuration : mapping
        Reads "Maximum Depth", "Maximum Bins", "Minimum Instances per Node",
        "Maximum Iterations", "Minimum Info Gain", "eval", "evalVal",
        "features", "label" and "output_path".

    Returns
    -------
    dict
        ``{"results": {"Tree": <debug string of the ensemble>, "Output": ...}}``.
    """
    training = tempDF
    maxDepth = int(algo_configuration["Maximum Depth"])
    maxBins = int(algo_configuration["Maximum Bins"])
    minInstancesPerNode = int(algo_configuration["Minimum Instances per Node"])
    maxIter = int(algo_configuration["Maximum Iterations"])
    minInfoGain = float(algo_configuration["Minimum Info Gain"])
    apply_train_test_split = algo_configuration["eval"] == 'train-test'
    evalVal = algo_configuration["evalVal"]

    assembler = VectorAssembler(inputCols=algo_configuration["features"],outputCol="features")
    output = assembler.transform(training)
    output = output.withColumnRenamed(algo_configuration["label"],"label").select("label","features")
    featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(output)

    gbt = GBTRegressor(labelCol="label", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[featureIndexer, gbt])

    # One value per parameter: duplicated values made the validator train
    # 2**5 identical ensembles.
    paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxIter, [maxIter]) \
    .addGrid(gbt.maxDepth, [maxDepth]) \
    .addGrid(gbt.maxBins, [maxBins]) \
    .addGrid(gbt.minInfoGain, [minInfoGain]) \
    .addGrid(gbt.minInstancesPerNode, [minInstancesPerNode]) \
    .build()
    evaluator=RegressionEvaluator()
    val = get_validator(apply_train_test_split,pipeline,paramGrid,evaluator,evalVal)
    cvModel = val.fit(output)
    final_model = cvModel.bestModel
    final_model.write().overwrite().save(algo_configuration["output_path"])
    # The ensemble model is the last pipeline stage.
    treeDict = {"Tree":final_model.stages[-1].toDebugString.encode("utf-8"),
               "Output":"The created model has been saved in the selected output folder."}
    return {"results":treeDict}

def run_logistic_regression(algo_configuration):
    """Train a logistic regression model on ``tempDF`` and save the best model.

    Parameters
    ----------
    algo_configuration : mapping
        Reads "Elastic Net Parameter", "Regularisation Parameter",
        "Maximum Iterations", "eval", "evalVal", "features", "label" and
        "output_path".

    Returns
    -------
    dict
        ``{"results": {...}}`` with accuracy, weighted false/true positive
        rate, F-measure, precision and recall taken from the model's training
        summary, plus a confirmation message.
    """
    training = tempDF
    elasticNetParam = float(algo_configuration["Elastic Net Parameter"])
    regParam = float(algo_configuration["Regularisation Parameter"])
    maxIter = int(algo_configuration["Maximum Iterations"])
    apply_train_test_split = algo_configuration["eval"] == 'train-test'
    evalVal = algo_configuration["evalVal"]
    lr = LogisticRegression()
    pipeline = Pipeline(stages=[lr])
    # One value per parameter: duplicated values made the validator train
    # 2**3 identical candidates.
    paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [regParam]) \
    .addGrid(lr.elasticNetParam, [elasticNetParam]) \
    .addGrid(lr.maxIter, [maxIter]) \
    .build()
    evaluator=MulticlassClassificationEvaluator()
    val = get_validator(apply_train_test_split,pipeline,paramGrid,evaluator,evalVal)
    assembler = VectorAssembler(inputCols=algo_configuration["features"],outputCol="features")
    output = assembler.transform(training)
    output = output.withColumnRenamed(algo_configuration["label"],"label").select("label","features")
    cvModel = val.fit(output)
    final_model = cvModel.bestModel
    final_model.write().overwrite().save(algo_configuration["output_path"])
    trainingSummary = final_model.stages[0].summary
    summaryDict = {}
    summaryDict["accuracy"] = trainingSummary.accuracy
    summaryDict["falsePositiveRate"] = trainingSummary.weightedFalsePositiveRate
    summaryDict["truePositiveRate"] = trainingSummary.weightedTruePositiveRate
    summaryDict["fMeasure"] = trainingSummary.weightedFMeasure()
    summaryDict["precision"] = trainingSummary.weightedPrecision
    summaryDict["recall"] = trainingSummary.weightedRecall
    summaryDict["Output"] = "The created model has been saved in the selected output folder."
    return {"results":summaryDict}

def run_linear_regression(algo_configuration):
    """Train a linear regression model on ``tempDF`` and save the best model.

    Parameters
    ----------
    algo_configuration : mapping
        Reads "Elastic Net Parameter", "Regularisation Parameter",
        "Maximum Iterations", "eval", "evalVal", "features", "label" and
        "output_path".

    Returns
    -------
    dict
        ``{"results": {...}}`` with iteration count, objective history, RMSE,
        r2 and the first ten residuals from the training summary, plus a
        confirmation message.
    """
    training = tempDF
    elasticNetParam = float(algo_configuration["Elastic Net Parameter"])
    regParam = float(algo_configuration["Regularisation Parameter"])
    maxIter = int(algo_configuration["Maximum Iterations"])
    apply_train_test_split = algo_configuration["eval"] == 'train-test'
    evalVal = algo_configuration["evalVal"]
    lr = LinearRegression()
    pipeline = Pipeline(stages=[lr])
    # One value per parameter: duplicated values made the validator train
    # 2**3 identical candidates.
    paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [regParam]) \
    .addGrid(lr.elasticNetParam, [elasticNetParam]) \
    .addGrid(lr.maxIter, [maxIter]) \
    .build()
    evaluator=RegressionEvaluator()
    val = get_validator(apply_train_test_split,pipeline,paramGrid,evaluator,evalVal)
    assembler = VectorAssembler(inputCols=algo_configuration["features"],outputCol="features")
    output = assembler.transform(training)
    output = output.withColumnRenamed(algo_configuration["label"],"label").select("label","features")
    cvModel = val.fit(output)
    final_model = cvModel.bestModel
    final_model.write().overwrite().save(algo_configuration["output_path"])
    trainingSummary = final_model.stages[0].summary
    summaryDict = {}
    summaryDict["Number of Iterations"] = trainingSummary.totalIterations
    summaryDict["Objective History"] = str(trainingSummary.objectiveHistory)
    summaryDict["RMSE"] = str(trainingSummary.rootMeanSquaredError)
    summaryDict["r2"] = str(trainingSummary.r2)
    summaryDict["Deviance Residuals (first 10)"] = str(np.array(trainingSummary.residuals.head(10)))
    summaryDict["Output"] = "The created model has been saved in the selected output folder."
    return {"results":summaryDict}

def run_generalized_linear_regression(algo_configuration):
    """Train a generalized linear regression model on ``tempDF`` and save the best model.

    Parameters
    ----------
    algo_configuration : mapping
        Reads "Distribution Family" (lower-cased before use),
        "Regularisation Parameter", "Maximum Iterations", "eval", "evalVal",
        "features", "label" and "output_path".

    Returns
    -------
    dict
        ``{"results": {...}}`` with the statistics taken from the model's
        training summary (standard errors, t/p values, dispersion, deviances,
        degrees of freedom, AIC, first ten residuals) and a confirmation
        message.
    """
    training = tempDF
    family = algo_configuration["Distribution Family"].lower()
    regParam = float(algo_configuration["Regularisation Parameter"])
    maxIter = int(algo_configuration["Maximum Iterations"])
    apply_train_test_split = algo_configuration["eval"] == 'train-test'
    evalVal = algo_configuration["evalVal"]
    gr = GeneralizedLinearRegression(family=family)
    pipeline = Pipeline(stages=[gr])
    # One value per parameter: duplicated values made the validator train
    # 2**2 identical candidates.
    paramGrid = ParamGridBuilder() \
    .addGrid(gr.regParam, [regParam]) \
    .addGrid(gr.maxIter, [maxIter]) \
    .build()
    evaluator=RegressionEvaluator()
    val = get_validator(apply_train_test_split,pipeline,paramGrid,evaluator,evalVal)
    assembler = VectorAssembler(inputCols=algo_configuration["features"],outputCol="features")
    output = assembler.transform(training)
    output = output.withColumnRenamed(algo_configuration["label"],"label").select("label","features")
    cvModel = val.fit(output)
    final_model = cvModel.bestModel
    final_model.write().overwrite().save(algo_configuration["output_path"])
    trainingSummary = final_model.stages[0].summary
    summaryDict = {}
    summaryDict["Coefficient Standard Errors"] = trainingSummary.coefficientStandardErrors
    summaryDict["T Values"] = trainingSummary.tValues
    summaryDict["P Values"] = trainingSummary.pValues
    summaryDict["Dispersion"] = trainingSummary.dispersion
    summaryDict["Null Deviance"] = trainingSummary.nullDeviance
    summaryDict["Residual Degree Of Freedom Null"] = str(trainingSummary.residualDegreeOfFreedomNull)
    summaryDict["Deviance"] = str(trainingSummary.deviance)
    summaryDict["Residual Degree Of Freedom"] = str(trainingSummary.residualDegreeOfFreedom)
    summaryDict["AIC"] = trainingSummary.aic
    summaryDict["Deviance Residuals (first 10)"] = str(np.array(trainingSummary.residuals().head(10)))
    summaryDict["Output"] = "The created model has been saved in the selected output folder."
    return {"results":summaryDict}

#TODO explain a bit what happens with hidden layers & notify about class labeling issue
def run_multilayer_perceptron_classifier(algo_configuration):
    """Train a multilayer perceptron classifier on ``tempDF`` and save the best model.

    The network has an input layer sized to the number of feature columns, two
    hidden layers, and an output layer sized to the number of distinct label
    values.

    Parameters
    ----------
    algo_configuration : mapping
        Reads "Step Size", "Seed", "Maximum Iterations", "eval", "evalVal",
        "features", "label" and "output_path". Optionally reads
        "Nodes in 1st hidden layer" and "Nodes in 2nd hidden layer"; when a
        key is absent or empty the previous fixed sizes (5 and 4) are used.
        Assumes the configuration is dict-like (supports ``.get``).

    Returns
    -------
    dict
        ``{"results": {"Weights": ..., "Output": ...}}``.
    """
    training = tempDF
    stepSize = float(algo_configuration["Step Size"])
    seed = int(algo_configuration["Seed"])
    maxIter = int(algo_configuration["Maximum Iterations"])
    apply_train_test_split = algo_configuration["eval"] == 'train-test'
    evalVal = algo_configuration["evalVal"]
    # The UI offers both hidden layer sizes, but they used to be ignored in
    # favour of hard-coded values; honour them, keeping the old values as defaults.
    hiddenNodes1 = int(algo_configuration.get("Nodes in 1st hidden layer") or 5)
    hiddenNodes2 = int(algo_configuration.get("Nodes in 2nd hidden layer") or 4)

    assembler = VectorAssembler(inputCols=algo_configuration["features"],outputCol="features")
    output = assembler.transform(training)
    output = output.withColumnRenamed(algo_configuration["label"],"label").select("label","features")

    lastLayerNodes = output.select("label").distinct().count()
    firstLayerNodes = len(algo_configuration["features"])
    layers = [firstLayerNodes, hiddenNodes1, hiddenNodes2, lastLayerNodes]

    # No seed here: the grid below always sets it from the configuration.
    trainer = MultilayerPerceptronClassifier(layers=layers,blockSize=128)
    pipeline = Pipeline(stages=[trainer])
    # One value per parameter: duplicated values made the validator train
    # 2**3 identical networks.
    paramGrid = ParamGridBuilder() \
    .addGrid(trainer.maxIter, [maxIter]) \
    .addGrid(trainer.seed, [seed]) \
    .addGrid(trainer.stepSize, [stepSize]) \
    .build()
    evaluator=MulticlassClassificationEvaluator()
    val = get_validator(apply_train_test_split,pipeline,paramGrid,evaluator,evalVal)
    cvModel = val.fit(output)
    final_model = cvModel.bestModel
    final_model.write().overwrite().save(algo_configuration["output_path"])
    resDict = {"Weights":str(final_model.stages[0].weights).encode("utf-8"),
              "Output":"The created model has been saved in the selected output folder."}
    return {"results":resDict}

def run_kmeans_clustering(algo_configuration):
    """Fit k-means on ``tempDF``, save model and predictions, and report quality.

    Reads "Number of Clusters (k)", "Seed", "features" and "output_path" from
    ``algo_configuration``. The model is written to "output_path" and the
    predictions to "<output_path>/kmeans_predictions". The result reports the
    silhouette score and the concatenated string forms of the cluster centers.
    """
    cluster_count = float(algo_configuration["Number of Clusters (k)"])
    random_seed = int(algo_configuration["Seed"])
    target_dir = algo_configuration["output_path"]

    feature_assembler = VectorAssembler(inputCols=algo_configuration["features"],outputCol="features")
    assembled = feature_assembler.transform(tempDF)

    estimator = KMeans().setK(cluster_count).setSeed(random_seed)
    model = estimator.fit(assembled)
    model.write().overwrite().save(target_dir)

    predictions = model.transform(assembled)
    saveDataframe(target_dir+"/kmeans_predictions",predictions)

    silhouette = ClusteringEvaluator().evaluate(predictions)
    # Centers are rendered back to back, without a separator.
    centers_string = "".join(str(center) for center in model.clusterCenters())
    return {"results":{
        "Silhouette with squared euclidean distance": str(silhouette),
        "Centers": centers_string,
        "Output": "The cluster predictions and the created model have been saved in the selected output folder.",
    }}

def run_gaussian_mixtures(algo_configuration):
    """Fit a Gaussian Mixture Model on the module-level ``tempDF``.

    The columns named in ``algo_configuration["features"]`` are assembled into
    a ``features`` vector column. The fitted model is written to
    ``algo_configuration["output_path"]`` and the component means are passed
    to ``saveDataframe`` under ``<output_path>/gaussian_mixtures``.

    Returns:
        dict: ``{"results": {...}}`` with the string form of the component
        means and a short message about what was saved.
    """
    training = tempDF
    k = float(algo_configuration["Number of Clusters (k)"])
    seed = int(algo_configuration["Seed"])
    assembler = VectorAssembler(inputCols=algo_configuration["features"], outputCol="features")
    output = assembler.transform(training)
    gmm = GaussianMixture().setK(k).setSeed(seed)
    model = gmm.fit(output)
    model.write().overwrite().save(algo_configuration["output_path"])

    # Persist the means as a Spark DataFrame. The previous code passed a numpy
    # array, while every other caller in this cell gives saveDataframe a
    # DataFrame.
    # NOTE(review): saveDataframe is defined outside this view - confirm it
    # expects a DataFrame.
    means_df = model.gaussiansDF.select('mean')
    saveDataframe(algo_configuration["output_path"] + "/gaussian_mixtures", means_df)

    # The collected numpy form is only used for the human-readable summary.
    gaussians_mean = np.array(means_df.collect())
    resultDict = {}
    resultDict["Gaussians Mean"] = str(gaussians_mean)
    resultDict["Output"] = "The created model and the Gaussians have been saved in the selected output folder."
    return {"results": resultDict}

def run_pca(algo_configuration):
    """Project the selected feature columns of ``tempDF`` with PCA.

    The columns named in ``algo_configuration["features"]`` are assembled into
    a ``features`` vector column, a PCA model is fitted on it, and the
    projected ``pcaFeatures`` column is passed to ``saveDataframe`` under
    ``<output_path>/pca_features``. The model itself is written to
    ``algo_configuration["output_path"]``.

    Returns:
        dict: ``{"results": {...}}`` with a preview of the first 10 projected
        rows (as a string) and a short message about what was saved.
    """
    training = tempDF
    k = float(algo_configuration["Number of Principal Components (k)"])
    assembler = VectorAssembler(inputCols=algo_configuration["features"], outputCol="features")
    output = assembler.transform(training)
    pca = PCA(k=k, inputCol="features", outputCol="pcaFeatures")
    model = pca.fit(output)
    result = model.transform(output).select("pcaFeatures")
    model.write().overwrite().save(algo_configuration["output_path"])
    saveDataframe(algo_configuration["output_path"] + "/pca_features", result)
    resultDict = {}
    # The preview size has to agree with the "first 10 rows" label below
    # (the code used to fetch 20 rows).
    resultDict["PCA Features (first 10 rows)"] = str(np.array(result.head(10)))
    resultDict["Output"] = "The created model and the PCA features have been saved in the selected output folder."
    return {"results": resultDict}

def run_chisquare(algo_configuration):
    """Select the top features of ``tempDF`` with a Chi-Squared selector.

    The columns named in ``algo_configuration["features"]`` are assembled into
    a ``features`` vector column and a ``ChiSqSelector`` is fitted against the
    column named in ``algo_configuration["label"]``. The fitted model is
    written to ``algo_configuration["output_path"]`` and the transformed frame
    is passed to ``saveDataframe`` under ``<output_path>/chisquare_result``.

    Returns:
        dict: ``{"results": {...}}`` with the names of the selected feature
        columns and a short message about what was saved.
    """
    training = tempDF
    numTopFeatures = float(algo_configuration["Number of Top Features to Select"])
    feature_columns = algo_configuration["features"]
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    output = assembler.transform(training)
    selector = ChiSqSelector(numTopFeatures=numTopFeatures, featuresCol="features",
                             outputCol="selectedFeatures", labelCol=algo_configuration["label"])
    model = selector.fit(output)
    # selectedFeatures holds positions inside the assembled feature vector, so
    # they must be mapped through the assembler's input columns. Indexing the
    # DataFrame's own column list (as before) reports unrelated columns.
    # assumes every feature column is a scalar, so vector slot i corresponds
    # to feature_columns[i] - TODO confirm no vector-typed inputs are used.
    feature_names = [feature_columns[x] for x in model.selectedFeatures]
    result = model.transform(output)
    model.write().overwrite().save(algo_configuration["output_path"])
    saveDataframe(algo_configuration["output_path"] + "/chisquare_result", result)
    resultDict = {}
    resultDict["Selected Features"] = feature_names
    resultDict["Output"] = "The created model and the Dataframe containing current result features have been saved in the selected output folder."
    return {"results": resultDict}

def run_als(algo_configuration):
    """Train an ALS recommender on the module-level ``tempDF``.

    The data is split 80/20 into training and test parts. The model is fitted
    on the training part, written to ``algo_configuration["output_path"]`` and
    scored with RMSE on the test part. Top-10 recommendations for every user
    and for every item are passed to ``saveDataframe`` below the output path.

    Returns:
        dict: ``{"results": {...}}`` with the RMSE and a message describing
        the saved artefacts.
    """
    train_part, test_part = tempDF.randomSplit([0.8, 0.2])

    reg_param = float(algo_configuration["Regularisation Parameter"])
    max_iter = int(algo_configuration["Maximum Iterations"])
    user_column = algo_configuration["user"]
    item_column = algo_configuration["item"]
    rating_column = algo_configuration["rating"]

    # coldStartStrategy="drop" keeps NaN predictions out of the evaluation metric.
    recommender = ALS(
        maxIter=max_iter,
        regParam=reg_param,
        userCol=user_column,
        itemCol=item_column,
        ratingCol=rating_column,
        coldStartStrategy="drop",
    )
    fitted = recommender.fit(train_part)
    fitted.write().overwrite().save(algo_configuration["output_path"])

    # RMSE on the held-out 20%.
    rmse_evaluator = RegressionEvaluator(
        metricName="rmse", labelCol=rating_column, predictionCol="prediction"
    )
    test_rmse = rmse_evaluator.evaluate(fitted.transform(test_part))

    # Top 10 items per user, then top 10 users per item.
    recs_per_user = fitted.recommendForAllUsers(10)
    recs_per_item = fitted.recommendForAllItems(10)
    saveDataframe(algo_configuration["output_path"] + "/als_recommendations_for_users", recs_per_user)
    saveDataframe(algo_configuration["output_path"] + "/als_recommendations_for_items", recs_per_item)

    return {"results": {
        "RMSE (from 80%-20% train-test split)": test_rmse,
        "Output": "The created model has been saved in the selected output folder. To facilitate its evaluation, two dataframes containing generated recommendations have been created and saved as parquet files: one containing the top 10 recommended items for all users and one containing the top 10 recommnded users for each item.",
    }}

def run_tokenizer(algo_configuration):
    """Tokenize the text column of ``tempDF`` and save the result.

    The column named in ``algo_configuration["textcol"]`` is run through a
    ``RegexTokenizer`` with pattern ``\\W``; the tokens land in a new
    ``result-words`` column. The resulting frame is passed to
    ``saveDataframe`` at ``algo_configuration["output_path"]``.

    Returns:
        dict: ``{"results": {"Result": <message>}}``.
    """
    text_column = algo_configuration["textcol"]
    splitter = RegexTokenizer(inputCol=text_column, outputCol="result-words", pattern="\\W")
    tokenized = splitter.transform(tempDF)
    saveDataframe(algo_configuration["output_path"], tokenized)
    outcome = {"Result": "The tokenized version of the input file has been saved in the specified folder."}
    return {"results": outcome}

def run_ngrams(algo_configuration):
    """Build n-grams from the text column of ``tempDF`` and save the result.

    The column named in ``algo_configuration["textcol"]`` is tokenized with a
    ``RegexTokenizer`` (pattern ``\\W``), then ``NGram`` with
    ``n = int(algo_configuration["N"])`` produces a ``result-ngrams`` column.
    The intermediate ``result-words`` column is dropped before the frame is
    passed to ``saveDataframe`` at ``algo_configuration["output_path"]``.

    Returns:
        dict: ``{"results": {"Result": <message>}}``.
    """
    splitter = RegexTokenizer(
        inputCol=algo_configuration["textcol"], outputCol="result-words", pattern="\\W"
    )
    tokenized = splitter.transform(tempDF)

    gram_size = int(algo_configuration["N"])
    gram_builder = NGram(n=gram_size, inputCol="result-words", outputCol="result-ngrams")
    with_ngrams = gram_builder.transform(tokenized)
    # Only the n-gram column is kept; the raw token column is an intermediate.
    ngram_frame = with_ngrams.drop("result-words")

    saveDataframe(algo_configuration["output_path"], ngram_frame)
    outcome = {"Result": "The result file with the n-grams has been saved in the specified folder."}
    return {"results": outcome}

def run_tfidf(algo_configuration):
    """Compute TF-IDF features for the text column of ``tempDF``.

    The column named in ``algo_configuration["textcol"]`` is tokenized with a
    ``RegexTokenizer`` (pattern ``\\W``), hashed into
    ``int(algo_configuration["Number of TF-IDF Features"])`` term-frequency
    buckets and rescaled with a fitted IDF model. The IDF model is written to
    ``algo_configuration["output_path"]`` and the rescaled frame is passed to
    ``saveDataframe`` under ``<output_path>/tfidf_features``.

    Returns:
        dict: ``{"results": {"Result": <message>}}``.
    """
    sentenceDF = tempDF
    regexTokenizer = RegexTokenizer(inputCol=algo_configuration["textcol"], outputCol="temp-words", pattern="\\W")
    regexTokenized = regexTokenizer.transform(sentenceDF)
    n = int(algo_configuration["Number of TF-IDF Features"])
    hashingTF = HashingTF(inputCol="temp-words", outputCol="rawFeatures", numFeatures=n)
    featurizedData = hashingTF.transform(regexTokenized)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    idfModel.write().overwrite().save(algo_configuration["output_path"])
    # Drop the intermediates this function created. The token column is named
    # "temp-words"; dropping "words" (as before) left the temporary column in
    # the output and would delete an unrelated input column called "words".
    rescaledData = idfModel.transform(featurizedData).drop("temp-words", "rawFeatures")
    saveDataframe(algo_configuration["output_path"] + "/tfidf_features", rescaledData)
    return {"results": {"Result": "The created model and the result file with the TF-IDF features have been saved in the specified folder."}}

def run_fpgrowth(algo_configuration):
    """Mine frequent patterns from the ``items`` column of ``tempDF``.

    The ``items`` column is converted by ``toarray_udf`` into a new
    ``fpinputcol`` column, on which FP-Growth is fitted with a fixed
    ``minSupport`` of 0.5 and ``minConfidence`` of 0.6. The model is written
    to ``algo_configuration["output_path"]``; the association rules and the
    per-row predictions are passed to ``saveDataframe`` below that path.

    Returns:
        dict: ``{"results": {"Result": <message>}}``.
    """
    transactions = tempDF
    transactions = transactions.withColumn("fpinputcol", toarray_udf(transactions.items))

    miner = FPGrowth(itemsCol="fpinputcol", minSupport=0.5, minConfidence=0.6)
    fitted = miner.fit(transactions)

    # Frequent itemsets ordered by descending frequency; built but not saved.
    frequent_itemsets = fitted.freqItemsets.orderBy("freq", ascending = [0])
    # Association rules derived from the frequent itemsets.
    rules = fitted.associationRules
    # Consequents of matching rules, attached to each input row as prediction.
    predicted = fitted.transform(transactions)

    fitted.write().overwrite().save(algo_configuration["output_path"])
    saveDataframe(algo_configuration["output_path"] + "/fpgrowth_associationRules", rules)
    saveDataframe(algo_configuration["output_path"] + "/fpgrowth_predictions", predicted)

    outcome = {"Result": "The model has been saved in the specified output folder. Also, two created Dataframes have been saved in the same folder as parquet files: one contains the generated association rules and the other contains item predictions."}
    return {"results": outcome}

#TODO: differentiate binary with multiclass classification!!!
#TODO: SOS classification needs classes from 2..#classes-1
def run_algo(algo,data):
    """Run the algorithm identified by ``algo`` with a JSON-encoded configuration.

    Args:
        algo: Algorithm code (for example ``"KMEANS"`` or ``"TF-IDF"``).
        data: JSON string; it is decoded and passed unchanged to the matching
            ``run_*`` function.

    Returns:
        dict: The runner's ``{"results": {...}}`` dictionary. An unknown code
        yields ``{"results": {"result": "No result"}}``. Any exception is
        caught and reported as ``{"results": {"ERROR": <message>}}``, with
        quote characters stripped from the message; the exception object is
        also stored in the module-level ``g`` for later inspection.
    """
    global g
    try:
        algo_configuration = json.loads(data)
        result = {"results":{"result":"No result"}}
        if algo == "LOGRE":
            result = run_logistic_regression(algo_configuration)
        elif algo == "DTC":
            result = run_decision_tree_classifier(algo_configuration)
        elif algo == "NB":
            result = run_naive_bayes(algo_configuration)
        elif algo == "RFC":
            result = run_random_forest_classifier(algo_configuration)
        elif algo == "GBTC":
            result = run_gradient_boosted_tree_classifier(algo_configuration)
        elif algo == "MLP":
            result = run_multilayer_perceptron_classifier(algo_configuration)
        elif algo == "DTR":
            result = run_decision_tree_regressor(algo_configuration)
        elif algo == "RFR":
            result = run_random_forest_regressor(algo_configuration)
        elif algo == "GBTR":
            result = run_gradient_boosted_tree_regressor(algo_configuration)
        elif algo == "OLS":
            result = run_linear_regression(algo_configuration)
        elif algo == "GLM":
            result = run_generalized_linear_regression(algo_configuration)
        elif algo == "KMEANS":
            result = run_kmeans_clustering(algo_configuration)
        elif algo == "GAUSSMIX":
            result = run_gaussian_mixtures(algo_configuration)
        elif algo == "PCA":
            result = run_pca(algo_configuration)
        elif algo == "ChiSquared":
            result = run_chisquare(algo_configuration)
        elif algo == "ALS":
            result = run_als(algo_configuration)
        elif algo == "TOKENIZER":
            result = run_tokenizer(algo_configuration)
        elif algo == "N-GRAMS":
            result = run_ngrams(algo_configuration)
        elif algo == "TF-IDF":
            # run_tfidf existed but was never dispatched, so a TF-IDF request
            # silently came back as "No result".
            result = run_tfidf(algo_configuration)
        elif algo == "FPGrowth":
            result = run_fpgrowth(algo_configuration)
    except Exception as e:
        # Keep the exception around so it can be examined from the notebook.
        g=e
        # Quotes are stripped from the message before it is returned.
        result = {"results":{"ERROR":str(e).replace("'","").replace("\"","").encode("utf-8")}}
    return result
    


# ----------- Error Handling -----------

# TODO for debugging, show in UI
def sanity_check():
    """Cross-check the algorithm and parameter registries.

    Every algorithm listed under any family in ``ALGORITHMS`` must have an
    entry in ``ALGORITHM_PARAMETERS``, and every parameter listed there must
    be described in ``PARAMETERS``.

    Returns:
        str or None: A message describing the first inconsistency found, or
        ``None`` when both checks pass.
    """
    listed_algorithms = (name for family in ALGORITHMS for name in ALGORITHMS[family])
    for algo in listed_algorithms:
        if algo in ALGORITHM_PARAMETERS:
            continue
        return "INIT ERROR: No parameters defined for {}".format(algo)

    used_parameters = (name for owner in ALGORITHM_PARAMETERS for name in ALGORITHM_PARAMETERS[owner])
    for parameter in used_parameters:
        if parameter in PARAMETERS:
            continue
        return "INIT ERROR: Parameter {} not specified in parameters dict".format(parameter)


# ----------- Overview -----------


        


In [6]:
%%html

<style>
.tagclass {
    background:grey;
    border:none;
}
.fieldcell{
    white-space:pre-wrap;
}
td{
  text-align:left;
}

table{
    table-layout: auto !important;
}

td{
    word-wrap:break-word
}


.paragraph .form-horizontal .form-group {
    margin-right: 0px !important;
    margin-left: 0px !important;
}

#list2 { width:320px; }
#list2 ol { font-style:italic; font-family:Georgia, Times, serif; font-size:18px; color:#337ab7;  }
#list2 ol li { }
#list2 ol li p { padding:8px; font-style:normal; font-family:Arial; font-size:13px; color:black; border-left: 1px solid #999; }
#list2 ol li.applied p em { color:lightgray;  }
#list2 ol li p em { display:block; }


.qblabel {
    font-style:italic; font-family:Georgia, Times, serif; font-size:18px; color:#337ab7; 
}

.aec-btn {
    background-color: Transparent;
    background-repeat:no-repeat;
    border: none;
    cursor:pointer;
    overflow: hidden;
    outline:none;
    font-style:italic; 
    font-family:Georgia, Times, serif; 
    font-size:20px; 
    color:#435066;
    padding: 10px 12px 14px 12px;
}

.aec-btn:hover {
    background-color: #ddd;
}

/* Create an active/current tablink class */
.aec-btn.active {
    background-color: #ccc;
}

#aec div{
    padding: 2px 4px 2px 16px;
}

</style>
<button type="button" class="btn aec-btn" onclick="openTab('aec-file-selector',this);">Input File</button>
<button type="button" class="btn aec-btn" onclick="openTab('aec-algo-selector',this);">Algorithm Selection & Configuration</button>
<button type="button" class="btn aec-btn" onclick="openTab('aec-save-conf',this);">Output File</button>
<button type="button" class="btn aec-btn" onclick="updateOverview();openTab('aec-overview',this);">Overview</button>
<!-- <button type="button" class="btn aec-btn" onclick="alert('Not yet ready!')">Test Model</button> -->

<div id="aec">
<div id="aec-overview" style="display:none;">
<p>
<label style="display: inline-block;width: 20%;" class="qblabel">Selected Input File:</label>
    <div id="overview-input" style="display: inline-block;width: 70%;"></div>
</p>
<p>
<label style="width: 20%;" class="qblabel">Selected Algorithm:</label>
    <div id="overview-algorithm" style="display: inline-block;width: 70%;"></div>
</p>
<p>
<label style="width: 20%;" class="qblabel">Selected Output File:</label>
    <div id="overview-output" style="display: inline-block;width: 70%;"></div>
</p>
<button class="btn btn-primary" id="filters-done" onclick="executeAlgo();"> Execute </button>
<div id="algo-results"></div>
</div>



<div id="aec-file-selector" style="display:none;">

   <div class="form-group">
      <label style="width: 20%;" class="qblabel">Available Datasets</label>
      <div style="display: inline-block;width: 60%;">
         <select style="margin-left:4px !important;" id="datasetSelect" class="form-control" >
         </select> 
      </div>
      <button type="button" class="btn btn-primary btn-md" onclick="selectDataset()"> Apply </button>
    <button type="button" class="btn btn-primary btn-md" onclick="updateBrowser();"> Refresh </button>
   </div>
   <div class="form-group">
      <label style="width: 20%;" class="qblabel">Available Files</label>
      <div style="display: inline-block;width: 60%;">
         <select style="margin-left:4px !important;" id="fileSelect" class="form-control" >
         </select> 
      </div>
   </div>
   <div class="form-group">
      <label style="width: 20%;" class="qblabel">CSV Separator</label>
      <div style="display: inline-block;width: 60%;">
         <select style="margin-left:4px !important;" id="sepSelect" class="form-control" >
            <option value=",">,</option>
            <option value=";">;</option>
         </select>
      </div>
      <button type="button" class="btn btn-primary btn-md" onclick="selectDatafile()"> Apply </button>
        <button type="button" class="btn btn-primary btn-md" onclick="getTempDFpreview();">Refresh Preview</button>
   </div>


<div id="data-preview"  style="display:none">
  <div class="panel-body">
  <table class="table">
    <tbody id="preview-table">
    </tbody>
  </table>
  </div>
</div>
<div class="form-group">
        <button type="button" class="btn btn-primary btn-md" onclick="openTab('aec-algo-selector',this);"> Next </button>
</div>
</div>
<hr><hr>
<div class="form-horizontal" id="aec-algo-selector" style="display:none;">
    <div class="form-group">
        <label style="width: 20%;text-align:left;" class="control-label col-sm-2 qblabel" >Algorithm Family</label>
        <div class="col-sm-10" style="display: inline-block;width: 70%;">
            <select style="margin-left:4px !important;" class="form-control" id="algoFamilySelect" onchange="loadAlgorithms();$('.algo_configuration').hide();" >
                <option value="" disabled selected></option>
                <option value="DIMRE_FE">DIMENSIONALITY REDUCTION/FEATURE EXTRACTION/SELECTION</option>
                <option value="NLP">NLP FUNCTIONS</option>
                <option value="RECOM">RECOMMENDERS</option>
                <option value="CLUST">CLUSTERING</option>
                <option value="CL_REG">CLASSIFICATION/REGRESSION</option>
                <option value="FREQPM">FREQUENT PATTERN MINING</option>
            </select>
        </div>
    </div>
    <div class="form-group">
        <label style="width: 20%;text-align:left;" class="control-label col-sm-2 qblabel" >Algorithm</label>
        <div class="col-sm-10" style="display: inline-block;width: 70%;">
            <select style="margin-left:4px !important;" class="form-control" id="algoSelect" onchange="populate_algo_conf_form()">
                    <option value="" disabled selected>-- select --</option>
            </select>
        </div>
    </div>

<div class="form-horizontal algo_configuration " id="algocommon_DIMRE_FE" style="display:none" >
<p class="algoDescription"></p>
<h3>Configuration</h3>

    <div class="form-group">
        <label class="control-label col-sm-2" >Feature Columns</label>
        <div class="col-sm-10">
            <select multiple class="form-control multiple-column-selector df-col" >
            </select>
        </div>
        </div>
<div class="form-horizontal algo_configuration" id="algo_PCA" style="display:none" ></div>
<div class="form-horizontal algo_configuration" id="algo_ChiSquared" style="display:none" >
<div class="form-group">
        <label class="control-label col-sm-2" >Label Column</label>
        <div class="col-sm-10">
            <select class="form-control single-column-selector df-col" >
            </select>
        </div>
    </div>
</div>
</div>
<div class="form-horizontal algo_configuration " id="algocommon_NLP" style="display:none" >
<p class="algoDescription"></p>
<div class="form-group">
        <label class="control-label col-sm-2">Text Column</label>
        <div class="col-sm-10">
            <select class="form-control single-column-selector df-col" id="nlp-text-col">
            </select>
        </div>
    </div>
<div class="form-horizontal algo_configuration" id="algo_TOKENIZER" style="display:none" ></div>
<div class="form-horizontal algo_configuration" id="algo_N-GRAMS" style="display:none" ></div>
<div class="form-horizontal algo_configuration" id="algo_TF-IDF" style="display:none" >TF-IDF</div>
</div>
<div class="form-horizontal algo_configuration " id="algocommon_RECOM" style="display:none" >
<p class="algoDescription">  </p>
<div class="form-horizontal algo_configuration" id="algo_ALS" style="display:none" >
<div class="form-group">
        <label class="control-label col-sm-2">User Column</label>
        <div class="col-sm-10">
            <select class="form-control single-column-selector df-col" id="als-user-id-col">
            </select>
        </div>
    </div>
    
    <div class="form-group">
        <label class="control-label col-sm-2">Item Column</label>
        <div class="col-sm-10">
            <select class="form-control single-column-selector df-col" id="als-item-id-col">
            </select>
        </div>
    </div>
    
    <div class="form-group">
        <label class="control-label col-sm-2">Rating Column</label>
        <div class="col-sm-10">
            <select class="form-control single-column-selector df-col" id="als-rating-col">
            </select>
        </div>
    </div>
</div>
</div>


<div class="form-horizontal algo_configuration " id="algocommon_FREQPM" style="display:none" >
<p class="algoDescription">  </p>

<h3>Configuration</h3>

<div class="form-horizontal algo_configuration" id="algo_FPGrowth" style="display:none" >
<div class="form-group">
        <label class="control-label col-sm-2" >Items Column</label>
        <div class="col-sm-10">
            <select class="form-control single-column-selector df-col" id="fpginput-col" >
            </select>
        </div>
    </div>
</div>
</div>

<div class="form-horizontal algo_configuration " id="algocommon_CLUST" style="display:none" >
<p class="algoDescription"></p>

<h3>Configuration</h3>

    <div class="form-group">
        <label class="control-label col-sm-2">Feature Columns</label>
        <div class="col-sm-10">
            <select multiple class="form-control multiple-column-selector df-col" >
            </select>
        </div>
        </div>
<div class="form-horizontal algo_configuration" id="algo_KMEANS" style="display:none" ></div>
<div class="form-horizontal algo_configuration" id="algo_GAUSSMIX" style="display:none" ></div>
</div>

    
<div class="form-horizontal algo_configuration " id="algocommon_CL_REG" style="display:none" >
<p> Note that for the classification algorithms, the label column should contain integers from 0 to N-1, where N the number of classes. </p>
<p class="algoDescription"></p>    
<h3>Configuration</h3>

<div class="form-group">
        <label class="control-label col-sm-2">Label Column</label>
        <div class="col-sm-10">
            <select class="form-control single-column-selector df-col" >
            </select>
        </div>
    </div>
    <div class="form-group">
        <label class="control-label col-sm-2" >Feature Columns</label>
        <div class="col-sm-10">
            <select multiple class="form-control multiple-column-selector df-col" >
            </select>
        </div>
        </div>
<div class="form-horizontal algo_configuration" id="algo_OLS" style="display:none" ></div>
<div class="form-horizontal algo_configuration" id="algo_DTR" style="display:none" ></div>
<div class="form-horizontal algo_configuration" id="algo_DTC" style="display:none" ></div>
<div class="form-horizontal algo_configuration" id="algo_MLP" style="display:none" > </div>
<div class="form-horizontal algo_configuration" id="algo_NB" style="display:none" ></div>
<div class="form-horizontal algo_configuration" id="algo_GLM" style="display:none" ></div>
<div class="form-horizontal algo_configuration" id="algo_RFR" style="display:none" ></div>
<div class="form-horizontal algo_configuration" id="algo_RFC" style="display:none" ></div>
<div class="form-horizontal algo_configuration" id="algo_GBTR" style="display:none" ></div>
<div class="form-horizontal algo_configuration" id="algo_GBTC" style="display:none" ></div>
<div class="form-horizontal algo_configuration" id="algo_LOGRE" style="display:none" ></div>

<div class="form-horizontal " id="CL_REG_EVAL" >

<div class="form-group">

<div class="btn-group btn-group-toggle" data-toggle="buttons">
  <label class="btn btn-secondary eval-label" onclick="selectEvalMethod('train-test-form',this);" style="font-size:16px">
    <input type="radio" name="validation-method" id="train-test" > 
    Train-Test Split
  </label>
  <label class="btn btn-secondary eval-label" onclick="selectEvalMethod('k-fold-form',this);" style="font-size:16px">
    <input type="radio" name="validation-method" > 
    K-Fold Cross Validation
  </label>
</div>

<div class="form-group eval-form" id="k-fold-form" style="display:none">
        <label class="control-label col-sm-2">Number of Folds for Cross-Validation</label>
    <div class="col-sm-10">
<input type="number" min="2" max="10" step="1" value="2"></input>
</div>
</div>

<div class="form-group eval-form" id="train-test-form" style="display:none">
        <label class="control-label col-sm-2" >Percentage % of input data to be used for training</label>
    <div class="col-sm-10">
<input type="number" min="10" max="100" step="5" value="80"></input>
</div>
</div>

</div>

</div>

</div>
<div class="form-group">
    <button type="button" class="btn btn-primary btn-md" onclick="openTab('aec-save-conf',this);"> Next </button>
</div>
</div>

  <div style="display:none;" id="aec-save-conf" >
  <div class="form-group" >
              <label style="width: 20%;" class="qblabel">Select Dataset</label>
              <div style="display: inline-block;width: 60%;">
                <select style="margin-left:4px !important;" id="datasetSaveSelect" class="form-control" >
                  
                </select> 

              </div>
                <button type="button" class="btn btn-primary btn-md" onclick="updateBrowser();"> Update Browser </button>

            </div>
             <div class="form-group">
              <label style="width: 20%;" class="qblabel">Result Folder Name</label>
              <div style="display: inline-block;width: 60%;">
                <input style="margin-left:4px !important;" id="folderNameSelect" class="form-control" >
                  
                </input> 
              </div>
            </div>
            <!--
            <div class="form-group">
              <label style="width: 20%;" class="qblabel">Include Header</label>
              <div style="display: inline-block;width: 60%;">
            <input type="checkbox" id="saveHeaderCheckbox" value="with" checked>                   
                </input> 
              </div>
            </div>
            -->
            <div class="form-group">
            <p>If the folder you specified exists, <b> it will be overwritten </b> with the results of the last algorithm execution.</p>
                <button type="button" class="btn btn-primary btn-md" onclick="updateOverview();openTab('aec-overview',this);"> Done </button>
</div>
</div>
</div>


<script>

// Page-level state shared by the handlers in this script.
// NOTE(review): presumably the column names of the loaded DataFrame and the
// selectable data files; both are filled outside this part of the script.
var tempDF_columns = []
var datafile_options = []

// Path of the data file most recently requested through selectDatafile().
var currentDFpath = ""
// NOTE(review): usage is not visible in this part of the script.
var lines

// Maps each algorithm code to the label shown for it in the algorithm
// dropdown (see algoOptionGenerator). The codes are the same identifiers used
// in the "algo_<CODE>" configuration div ids of this page.
var algorithm_list = {
    "PCA":"Principal Component Analysis (PCA)",
    "ChiSquared":"Chi-squared test",
    "TOKENIZER":"Tokenizer",
    "N-GRAMS":"N-Grams",
    "TF-IDF":"TF-IDF",
    "ALS":"Collaborative Filtering (ALS)",
    "KMEANS":"K-MEANS",
    "GAUSSMIX":"Gaussian Mixtures",
    "OLS":"Linear Regression",
    "DTR":"Decision Trees Regression",
    "DTC":"Decision Trees Classifier",
    "MLP":"Multi-Layer Perceptron",
    "NB":"Naive Bayes",
    "GLM":"Generalized Linear Models (GLM)",
    "RFR":"Random Forest Regressor (RFR)",
    "GBTR":"Gradient-boosted tree Regression (GBTR)",
    "RFC":"Random Forest Classifier (RFC)",
    "GBTC":"Gradient-boosted tree Classifier (GBTC)",
    "LOGRE":"Logistic Regression",
    "FPGrowth":"FP Growth"
}

// HTML description for each algorithm code, keyed like algorithm_list.
// Each text should describe what the matching Python runner actually does.
var algorithm_descriptions = {
    "PCA":"Principal Component Analysis (PCA) is a statistical procedure that uses an orthogonal transformation to \
        convert a set of observations of possibly correlated variables into a set of values of linearly uncorrelated variables\
        called principal components. A PCA class trains a model to project vectors to a low-dimensional space using PCA. \
        <a href=\"https://spark.apache.org/docs/2.3.2/ml-features.html#pca\" target=\"_blank\"> More info...</a>",
    "ChiSquared":"Chi-squared stands for Chi-Squared feature selection. It operates on labeled data with categorical \
        features. ChiSqSelector uses the Chi-Squared test of independence to decide which features to choose. \
        <a href=\"https://spark.apache.org/docs/2.3.2/ml-features.html#chisqselector\" target=\"_blank\">More info...</a>",
    "TOKENIZER":"Tokenization is the process of taking text (such as a sentence) and breaking it into individual terms \
        (usually words). Here we use RegexTokenizer that converts the input string to lowercase \
        and then splits it on non-word characters. \
        <a href=\"https://spark.apache.org/docs/2.3.2/ml-features.html#tokenizer\" target=\"_blank\"> More info...</a>",
    "N-GRAMS":"An n-gram is a sequence of n tokens (typically words) for some integer n. The NGram class can be used to \
        transform input features into n-grams. The parameter n is used to determine the number of terms in each n-gram. \
        The output will consist of a sequence of n-grams where each n-gram is represented by a space-delimited string \
        of n consecutive words. If the input sequence contains fewer than n strings, no output is produced. \
        <a href=\"https://spark.apache.org/docs/2.3.2/ml-features.html#n-gram\" target=\"_blank\"> More info...</a>",
    "TF-IDF":"Term frequency-inverse document frequency (TF-IDF) is a feature vectorization method widely used in \
        text mining to reflect the importance of a term to a document in the corpus. \
        <a href=\"https://spark.apache.org/docs/2.3.2/ml-features.html#tf-idf\" target=\"_blank\"> More info...</a>",
    "ALS":"Collaborative filtering is commonly used for recommender systems. These techniques aim to fill in the \
        missing entries of a user-item association matrix. spark.ml currently supports model-based collaborative filtering,\
        in which users and products are described by a small set of latent factors that can be used to predict missing \
        entries. spark.ml uses the alternating least squares (ALS) algorithm to learn these latent factors. \
        <a href=\"https://spark.apache.org/docs/2.3.2/ml-collaborative-filtering.html\" target=\"_blank\"> More info...</a>",
    "KMEANS":"K-means is one of the most commonly used clustering algorithms that clusters the data points into a \
        predefined number of clusters (K). \
        <a href=\"https://spark.apache.org/docs/2.3.2/ml-clustering.html#k-means\" target=\"_blank\"> More info...</a>",
    "GAUSSMIX":"A Gaussian Mixture Model represents a composite distribution whereby points are drawn from one of \
        k Gaussian sub-distributions, each with its own probability. The spark.ml implementation uses the \
        expectation-maximization algorithm to induce the maximum-likelihood model given a set of samples. \
        <a href=\"https://spark.apache.org/docs/2.3.2/ml-clustering.html#gaussian-mixture-model-gmm\" target=\"_blank\"> More info...</a>",
    "OLS":"Ordinary Least squares (OLS) is the simplest and most common linear regressor. The learning objective of OLS \
        is to minimize the sum of squared residuals, in order to estimate the coefficients of the linear regression \
        expression. \
        <a href=\"https://spark.apache.org/docs/2.3.2/ml-classification-regression.html#linear-regression\" target=\"_blank\">More info...</a>",
    "DTR":"Decision trees are a popular family of classification and regression methods. \
        <a href=\"https://spark.apache.org/docs/2.3.2/ml-classification-regression.html#decision-trees\" target=\"_blank\"> More info...</a>",
    "DTC":"Decision trees are a popular family of classification and regression methods. \
        <a href=\"https://spark.apache.org/docs/2.3.2/ml-classification-regression.html#decision-trees\" target=\"_blank\"> More info...</a>",
    "MLP":"Multilayer perceptron classifier (MLPC) is a classifier based on the feedforward artificial neural network. \
        MLPC consists of multiple layers of nodes. Each layer is fully connected to the next layer in the network. \
        Nodes in the input layer represent the input data. The number of nodes N in the output layer corresponds to the \
        number of classes. Here we use two inner layers with 5 and 4 nodes respectively. \
        <a href=\"https://spark.apache.org/docs/2.3.2/ml-classification-regression.html#multilayer-perceptron-classifier\" target=\"_blank\"> More info...</a>",
    "NB":"Naive Bayes classifiers are a family of simple probabilistic classifiers based on applying Bayes’ theorem \
        with strong (naive) independence assumptions between the features.  \
        <a href=\"https://spark.apache.org/docs/2.3.2/ml-classification-regression.html#naive-bayes\" target=\"_blank\"> More info...</a>",
    "GLM":"Contrasted with linear regression where the output is assumed to follow a Gaussian distribution, generalized linear models (GLM) are specifications of \
        linear models where the response variable follows some distribution from the exponential family of distributions. \
        <a href=\"https://spark.apache.org/docs/2.3.2/ml-classification-regression.html#generalized-linear-regression\" target=\"_blank\"> More info...</a>",
    "RFR":"Random forests are a popular family of classification and regression methods. \
        <a href=\"https://spark.apache.org/docs/2.3.2/ml-classification-regression.html#random-forests\" target=\"_blank\"> More info...</a>",
    "GBTR":"Gradient-boosted trees (GBTs) are a popular regression method using ensembles of decision trees. \
        Parameter configuration should be performed with caution, as parameters significantly impact the algorithm's execution time.\
        <a href=\"https://spark.apache.org/docs/2.3.2/ml-classification-regression.html#gradient-boosted-tree-regression\" target=\"_blank\"> More info...</a>",
    "RFC":"Random forests are a popular family of classification and regression methods. \
        <a href=\"https://spark.apache.org/docs/2.3.2/ml-classification-regression.html#random-forests\" target=\"_blank\"> More info...</a>",
    "GBTC":"Gradient-boosted trees (GBTs) are a popular classification and regression method using ensembles \
        of decision trees. <b> The current implementation supports only binary classification. </b>\
        <a href=\"https://spark.apache.org/docs/2.3.2/ml-classification-regression.html#gradient-boosted-tree-classifier\" target=\"_blank\"> More info...</a>",
    "LOGRE":"<p><b>Logistic regression</b> is a popular method to predict a categorical response. \
        It is a special case of Generalized Linear models that predicts the probability of the outcomes. \
        In spark.ml logistic regression can be used to predict a binary outcome by using binomial logistic regression, \
        or it can be used to predict a multiclass outcome by using multinomial logistic regression. \
        <a href=\"https://spark.apache.org/docs/2.3.2/ml-classification-regression.html#logistic-regression\" target=\"_blank\"> More info...</a></p>",
    "FPGrowth":"Mining frequent items, itemsets, subsequences, or other substructures is usually among the first steps to analyze a large-scale dataset, \
        which has been an active research topic in data mining for years. The <b>FP Growth</b> algorithm  is described in \
        the paper Han et al., Mining frequent patterns without candidate generation, where “FP” stands for frequent pattern.\
        Given a dataset of transactions, the first step of FP-growth is to calculate item frequencies and identify frequent \
        items. Different from Apriori-like algorithms designed for the same purpose, the second step of FP-growth uses \
        a suffix tree (FP-tree) structure to encode transactions without generating candidate sets explicitly, which are \
        usually expensive to generate. After the second step, the frequent itemsets can be extracted from the FP-tree. \
        <a href=\"https://spark.apache.org/docs/2.3.2/ml-frequent-pattern-mining.html#fp-growth\" target=\"_blank\"> More info...</a>"
}

// Page-level state shared by the functions below.
var parameters = [],              // starts empty; not touched by the functions in this part of the script
    selected_algorithm,           // value of #algoSelect, refreshed by populate_algo_conf_form() and executeAlgo()
    selected_eval = "train-test"; // evaluation mode; selectEvalMethod() overwrites it, executeAlgo() reads it

<!-- Helpers -->

<!-- https://www.jstips.co/en/javascript/picking-and-rejecting-object-properties/  -->
/**
 * Return a new object holding every own enumerable property of `obj`
 * except those whose name appears in `keys`. `obj` itself is not modified.
 */
function reject(obj, keys) {
    var kept = {};
    for (const name of Object.keys(obj)) {
        if (keys.includes(name)) {
            continue; // listed for removal
        }
        kept[name] = obj[name];
    }
    return kept;
}

/**
 * Toggle the panel with id `tab` inside #aec and keep the ".aec-btn"
 * buttons' "active" class in sync with the button that was clicked.
 */
function openTab(tab,btn){
    var panelSelector = "#" + tab;

    // Only the clicked button may carry the highlight; toggling lets a
    // second click on the same button clear it again.
    $(".aec-btn").not(btn).removeClass('active');
    $(btn).toggleClass('active');

    // Hide every other child of #aec, then slide the requested panel open or closed.
    $("#aec").children().not(panelSelector).hide();
    $(panelSelector).slideToggle("fast");
}

<!-- helper: adds options to help populate algo option dropdowns -->
/**
 * Append one option element to the jQuery dropdown `instance`.
 * `paramObj` is used as the option value; the visible text is looked up
 * in `algorithm_list` under that same key.
 */
function algoOptionGenerator(instance,paramObj){
    var $option = $("<option></option>");
    $option.attr("value", paramObj);
    $option.text(algorithm_list[paramObj]);
    instance.append($option);
}
                                           
<!-- helper: adds options to help populate column dropdowns -->
/**
 * Append one option element to the jQuery dropdown `instance`, using
 * `paramObj` both as the option value and as its visible text.
 */
function columnOptionGenerator(instance,paramObj){
    var $option = $("<option></option>");
    $option.attr("value", paramObj);
    $option.text(paramObj);
    instance.append($option);
}


<!-- Part 1 - Input -->

<!-- call python to read file from hdfs, reset filter dropdown and populate column dropdowns -->
<!-- aec required -->  
/**
 * Ask the kernel to run read_dataset() on the file chosen in #fileSelect
 * with the separator chosen in #sepSelect; the kernel's output is routed
 * to handle_output().
 *
 * Side effects: stores the selection in the page-level globals `dataset`,
 * `currentDFpath` and `separator`.
 */
function selectDatafile(){
    var kernel = IPython.notebook.kernel;
    var onKernelOutput = {iopub: {output: handle_output}};

    dataset = $('#fileSelect').val();
    currentDFpath = $('#fileSelect').val();
    separator = $('#sepSelect').val();

    // Both values are pasted verbatim between double quotes in the Python call.
    kernel.execute(`read_dataset("${dataset}","${separator}")`, onKernelOutput);
}

<!-- aec required -->  
/**
 * Ask the kernel to run walk_dataset() on the dataset chosen in
 * #datasetSelect; the kernel's output is routed to getCSVfilesFromFS().
 *
 * Side effect: stores the selection in the page-level global `dataset`.
 */
function selectDataset(){
    var kernel = IPython.notebook.kernel;
    var onKernelOutput = {iopub: {output: getCSVfilesFromFS}};

    dataset = $('#datasetSelect').val();

    kernel.execute(`walk_dataset("${dataset}")`, onKernelOutput);
}

<!-- aec required -->  
/**
 * Kernel output callback: turn the printed text into a JavaScript value
 * and hand it to datafilesObtained().
 *
 * The text is made JSON-parsable by rewriting every u' and every remaining '
 * to a double quote (presumably it is the printed repr of a Python list of
 * strings - confirm against walk_dataset on the Python side).
 *
 * Side effect: keeps the raw text in the page-level global `datafile_options`.
 */
function getCSVfilesFromFS(data){
    datafile_options = data.content.text;
    var asJson = datafile_options
        .replace(/u\'/g,'"')
        .replace(/\'/g,'"');
    var files = JSON.parse(asJson);
    datafilesObtained(files);
}
                                                          
<!-- aec required -->                                                              
/**
 * Kernel output callback: `data.content.text` is spliced into the Hopsworks
 * REST URL as the project id, the project's dataset listing is fetched, and
 * both #datasetSelect and #datasetSaveSelect are refilled from it.
 *
 * Entries named in `skipthem`, and entries whose name ends in ".db", are left
 * out. Each option shows the last path segment and carries the full path as
 * its value.
 *
 * Side effect: `skipthem` is assigned as a page-level global.
 */
function populateAvailableDatasetsList(data){
    var contentUrl = "http://bbc6.sics.se:8080/hopsworks-api/api/project/"+data.content.text+"/dataset/getContent";

    $.get(contentUrl, function(data,status){
        skipthem = ["Jupyter","Logs","Models","notebook","Resources"]

        var user_datasets = data
            .filter(function(entry){
                return !(skipthem.includes(entry.name) || entry.name.endsWith(".db"));
            })
            .map(function(entry){
                return entry.path;
            });

        // Same content for the input dropdown and the "save to" dropdown, filled in that order.
        ["datasetSelect", "datasetSaveSelect"].forEach(function(selectId){
            var dropdown = document.getElementById(selectId);
            dropdown.options.length = 0;
            user_datasets.forEach(function(path){
                var shortName = path.substring(path.lastIndexOf("/")+1);
                dropdown.options[dropdown.options.length] = new Option(shortName, path);
            });
        });
    })
}
                                                             
<!-- aec required -->                                                               
/**
 * Fetch the list of projects from the Hopsworks REST API, build a
 * {project name: project id} map and pass it, JSON-encoded, to the kernel's
 * find_current_projectID(); the kernel's output is routed to
 * populateAvailableDatasetsList().
 */
function updateBrowser() {
    $.get("http://bbc6.sics.se:8080/hopsworks-api/api/project", function(data,status){
        var pdict = {};
        data.forEach(function(e){
            pdict[e.project.name]=e.project.id;
        });

        var kernel = IPython.notebook.kernel;
        var callbacks = {
            iopub : {
                output : populateAvailableDatasetsList,
            }
        };

        // The JSON text is embedded in a single-quoted Python string literal.
        // Backslashes and single quotes must be escaped (backslashes first),
        // otherwise a project name containing ', " or \ would either end the
        // literal early or have its JSON escapes eaten by Python.
        var pdict_parameter = JSON.stringify(pdict)
            .replace(/\\/g, "\\\\")
            .replace(/'/g, "\\'");

        kernel.execute('find_current_projectID(\''+pdict_parameter+'\')', callbacks);
    });
}
                                                             
/**
 * Replace the options of #fileSelect with one option per entry of `options`
 * (an array, or an object whose values are the entries). A null/undefined
 * `options` just empties the dropdown.
 */
function datafilesObtained(options){
    var dropl = document.getElementById("fileSelect");
    dropl.options.length = 0;

    // Own keys only: a for-in loop would also visit enumerable properties
    // added to Array.prototype/Object.prototype and would leak its loop
    // variable as a global.
    var keys = (options === null || options === undefined) ? [] : Object.keys(options);
    keys.forEach(function(key){
        dropl.options[dropl.options.length] = new Option(options[key]);
    });
}

                                                          
<!-- Part 1.1 Preview -->
/**
 * Ask the kernel to run get_tempdf_10first_lines_new(); the kernel's output
 * is routed to show_preview_new().
 */
function getTempDFpreview(){
    var kernel = IPython.notebook.kernel;
    var onKernelOutput = {iopub: {output: show_preview_new}};
    kernel.execute('get_tempdf_10first_lines_new()', onKernelOutput);
}

/**
 * Kernel output callback: rebuild #preview-table from the printed preview
 * rows and make sure #data-preview is visible.
 *
 * The header row comes from the page-level `tempDF_columns` text; the body
 * rows come from `data.content.text`, split on "]," into rows and on ","
 * into cells after the quote/bracket clean-up below.
 *
 * Column names and cell values originate from the user's data file, so they
 * are HTML-escaped before being placed in the table markup.
 */
function show_preview_new(data){
    // Make a value safe to embed as element text inside an HTML string.
    function escapeHtml(value){
        return String(value)
            .replace(/&/g, "&amp;")
            .replace(/</g, "&lt;")
            .replace(/>/g, "&gt;")
            .replace(/"/g, "&quot;")
            .replace(/'/g, "&#39;");
    }

    // Drop the first and last character, normalise quoting, strip "[" and
    // cut the remainder into one string per row.
    var rows = data.content.text.slice(1,-1).replace(/ u\'/g,'"').replace(/\'/g,'"').replace(/\[/g,'').split('],');

    var $table = $('#preview-table');
    $table.empty();

    var headerRow = "<tr>";
    $.each(JSON.parse(tempDF_columns.replace(/\'/g,'"')), function(key, val){
        headerRow += "<th>" + escapeHtml(val) + "</th>";
    });
    $table.append(headerRow + "</tr>");

    $.each(rows, function(key, row){
        var rowHtml = "<tr>";
        $.each(row.split(','), function(k, cell){
            rowHtml += "<td>" + escapeHtml(cell) + "</td>";
        });
        $table.append(rowHtml + "</tr>");
    });

    if ($('#data-preview').is(":hidden")){
        $('#data-preview').show();
    }
}
 

<!-- Part 2 - Algo Population & Selection -->

<!-- Part 3 - Algo Configuration -->

                       
/**
 * Kernel output callback: build the parameter form of one algorithm.
 *
 * `data.content.text` is parsed (after turning ' into ") as an object whose
 * first key is the algorithm id and whose value maps parameter names to their
 * settings. For every parameter a ".form-group" row is appended to
 * "#algo_<algorithm id>": a label showing the parameter's "label" plus
 *   - a select filled from the parameter's `options` when its type is "select",
 *   - otherwise an input created with all remaining settings as attributes.
 *
 * Side effects: `dataDict` and `params` are kept as page-level globals.
 */
function add_conf_forms(data){
    dataDict = JSON.parse(data.content.text.replace(/'/g,'"'));
    var temp_algo = Object.keys(dataDict)[0];
    params = dataDict[temp_algo];
    var $algoForm = $("#algo_".concat(temp_algo));

    for(var p in params){
        // Everything except the label describes the control itself.
        var attributes = reject(params[p],["label"]);
        var $control;

        if(attributes["type"]!=="select"){
            $control = $("<input/>", attributes);
        }
        else{
            $control = $('<select>');
            $(attributes.options).each(function(index, choice){
                $control.append($("<option>").attr('value',choice).text(choice));
            });
        }

        // executeAlgo() reads the rows back as: first label child = parameter
        // name, first child of the first div child = the control.
        $algoForm.append(
            $("<div/>",{"class":"form-group"}).append(
                $("<label>",{"class":"control-label col-sm-2",text:params[p]["label"]}),
                $("<div/>",{"class":"col-sm-10"}).append($control)
            )
        );
    }
}

                                                    
<!-- add tempDF columns to the dropdown selectors -->
<!-- aec required -->  
/**
 * Kernel output callback: remember the printed column list in the page-level
 * global `tempDF_columns` and refresh every column dropdown from it.
 */
function handle_output(data){
    var columnsText = data.content.text;
    tempDF_columns = columnsText;
    addColumnFields(columnsText);
}


/**
 * For every key of `algorithm_list`, ask the kernel to run
 * get_algorithm_parameters(<key>); each output is routed to add_conf_forms().
 */
function initAlgoConfigurations(){
    for(var algoKey in algorithm_list){
        var kernel = IPython.notebook.kernel;
        var onKernelOutput = {iopub: {output: add_conf_forms}};
        kernel.execute(`get_algorithm_parameters("${algoKey}")`, onKernelOutput);
    }
}
                                                   
/**
 * Ask the kernel to run get_algos_in_family() for the family chosen in
 * #algoFamilySelect; the kernel's output is routed to showAlgorithmList().
 *
 * Side effect: stores the selection in the page-level global `family`.
 */
function loadAlgorithms(){
    var kernel = IPython.notebook.kernel;
    var onKernelOutput = {iopub: {output: showAlgorithmList}};

    family = $('#algoFamilySelect').val();

    kernel.execute(`get_algos_in_family("${family}")`, onKernelOutput);
}
                                                          
                                                          
/**
 * Kernel output callback: refill #algoSelect with the algorithms named in
 * `data.content.text` (parsed as JSON after turning ' into "), keeping the
 * dropdown's first option and selecting it afterwards.
 *
 * Side effect: the parsed list is kept in the page-level global `cols`.
 */
function showAlgorithmList(data){
    cols = JSON.parse(data.content.text.replace(/\'/g,'"'));

    var $algoSelect = $("#algoSelect");
    $algoSelect.children('option:not(:first)').remove();

    var position = 0;
    while (position < cols.length) {
        algoOptionGenerator($algoSelect, cols[position].trim());
        position += 1;
    }

    $("#algoSelect")[0].selectedIndex = 0;
}
                                                
<!-- populate column dropdowns -->
/**
 * Refill every ".df-col" dropdown with the column names in `columns`
 * (text parsed as JSON after turning ' into ").
 *
 * Dropdowns carrying the class "empty-option-allowed" keep their first
 * option; all others are emptied completely before the columns are added.
 *
 * Side effect: the parsed list is kept in the page-level global `arraycols`.
 */
function addColumnFields(columns){
    arraycols = JSON.parse(columns.replace(/\'/g,'"'));

    $('.df-col').each(function(index){
        var $dropdown = $(this);
        var staleOptions = $dropdown.hasClass("empty-option-allowed")
            ? 'option:not(:first)'
            : 'option';
        $dropdown.children(staleOptions).remove();

        for(var c = 0; c < arraycols.length; c += 1){
            columnOptionGenerator($dropdown, arraycols[c].trim());
        }
    });
}
                                           
/**
 * Show the configuration area of the currently selected algorithm: hide all
 * ".algo_configuration" blocks, put the algorithm's description into the
 * family block's ".algoDescription" element, then show the family block and
 * the algorithm's own block.
 *
 * Side effects: refreshes the page-level globals `selected_algorithm`,
 * `selected_algorithm_family`, `algoDiv` and `familyDiv`.
 */
function populate_algo_conf_form(){
    selected_algorithm = $("#algoSelect").val();
    selected_algorithm_family = $("#algoFamilySelect").val();
    algoDiv = `#algo_${selected_algorithm}`;
    familyDiv = `#algocommon_${selected_algorithm_family}`;

    $(".algo_configuration").hide();

    var description = algorithm_descriptions[selected_algorithm];
    $(`${familyDiv} .algoDescription`).html(description);

    $(familyDiv).show();
    $(algoDiv).show();
}
                                           
/**
 * Show the evaluation form with id `form`, hide every other ".eval-form",
 * and record the choice in `selected_eval` (the id without its "-form" part).
 * `label` is accepted but not used.
 */
function selectEvalMethod(form,label){
    var formSelector = "#" + form;
    $(".eval-form").not(formSelector).hide();
    $(formSelector).show();
    selected_eval = form.replace("-form","");
}


<!-- Part 4 - Algo Execution -->


/**
 * Collect the settings of the selected algorithm from the page and ask the
 * kernel to run run_algo(<algorithm>, <settings as JSON>); the kernel's
 * output is routed to algo_run_show().
 *
 * Collected settings:
 *   - ALS:        user / item / rating column selectors
 *   - NLP family: text column selector
 *   - FPGrowth:   items column selector
 *   - otherwise:  the feature columns, plus the label column and evaluation
 *                 mode/value for the CL_REG family, or the label column for
 *                 ChiSquared
 *   - every ".form-group" row of the algorithm's own form, keyed by the
 *     row's label text
 *   - "output_path": "<save dataset>/<folder>", the folder defaulting to
 *     "unknown_folder_AEC" when the folder field is empty
 *
 * Side effects: #algo-results is reset to "-- pending --"; the page-level
 * globals `selected_algorithm`, `selected_algorithm_family`, `algoDiv` and
 * `familyDiv` are refreshed.
 */
function executeAlgo(){
    // Wrap text as a single-quoted Python string literal. Backslashes are
    // escaped first, then single quotes, so that Python hands the exact same
    // text back to the JSON parser.
    function pySingleQuoted(text){
        return "'" + text.replace(/\\/g, "\\\\").replace(/'/g, "\\'") + "'";
    }

    var algoParamDict = {};

    $("#algo-results").html("-- pending --");

    selected_algorithm = $("#algoSelect").val();
    selected_algorithm_family = $("#algoFamilySelect").val();
    algoDiv = "#algo_".concat(selected_algorithm);
    familyDiv = "#algocommon_".concat(selected_algorithm_family);

    if(selected_algorithm === "ALS"){
        algoParamDict.user = $("#als-user-id-col").val();
        algoParamDict.item = $("#als-item-id-col").val();
        algoParamDict.rating = $("#als-rating-col").val();
    }
    else if(selected_algorithm_family === "NLP"){
        algoParamDict.textcol = $("#nlp-text-col").val();
    }
    else if(selected_algorithm === "FPGrowth"){
        algoParamDict.itemscol = $("#fpginput-col").val();
    }
    else{
        if(selected_algorithm_family === "CL_REG"){
            algoParamDict.label = $(familyDiv+" .single-column-selector.df-col").val();
            if(selected_eval === "k-fold"){
                algoParamDict.eval = "k-fold";
                algoParamDict.evalVal = $("#k-fold-form div input")[0].value;
            }
            else{
                algoParamDict.eval = "train-test";
                algoParamDict.evalVal = $("#train-test-form div input")[0].value;
            }
        }
        else if(selected_algorithm === "ChiSquared"){
            algoParamDict.label = $(algoDiv+" .single-column-selector.df-col").val();
        }

        algoParamDict.features = $(familyDiv+" .multiple-column-selector.df-col").val();
    }

    // Algorithm-specific rows: the label text is the parameter name, the
    // first control inside the row's first div holds the value.
    $(algoDiv+" .form-group").each(function(index){
        var label = $(this).children("label")[0];
        var value = $($(this).children("div")[0]).children()[0];
        algoParamDict[label.innerText] = value.value;
    });

    var out_dataset = $("#datasetSaveSelect").val();
    var folderName = $("#folderNameSelect").val();
    var out_folder = (folderName !== "" ? folderName : "unknown_folder_AEC");
    algoParamDict["output_path"] = out_dataset.concat("/",out_folder);

    var kernel = IPython.notebook.kernel;
    var callbacks = {
        iopub : {
            output : algo_run_show,
        }
    };

    // Folder and column names are free text, so the JSON must be escaped
    // before it is embedded in the Python source sent to the kernel.
    var pythonCall = 'run_algo("'+selected_algorithm+'",'+pySingleQuoted(JSON.stringify(algoParamDict))+')';
    kernel.execute(pythonCall, callbacks);
}

/**
 * Kernel output callback for run_algo(): render the outcome in #algo-results.
 *
 *   - stderr stream text is shown verbatim.
 *   - Any other text is parsed (after turning ' into ") as an object with a
 *     "results" member, which is listed as "key: value" paragraphs. Values
 *     are inserted as HTML, exactly as the kernel printed them.
 *   - Anything that cannot be parsed is shown after
 *     "Oups! Something went wrong: ". Messages that carry no text but an
 *     exception name/value (kernel `error` messages) are reported as
 *     "<ename>: <evalue>" instead of "undefined".
 */
function algo_run_show(data){
    var content = data.content;

    if(content.name === "stderr"){
        $("#algo-results").text(content.text);
        return;
    }

    // Stream messages have `text`; error messages have `ename`/`evalue`.
    var message = content.text;
    if(typeof message !== "string" && content.ename !== undefined){
        message = content.ename + ": " + content.evalue;
    }

    try{
        var results = JSON.parse(message.replace(/'/g,'"'))["results"];
        var newcontent = '<br/><label style="width: 20%;" class="qblabel">Results Summary</label>';
        $.each(results, function(key, val){
            newcontent = newcontent.concat("<p><b>",key,":</b>",val,"</p>");
        });
        $("#algo-results").html(newcontent);
    }
    catch(err){
        $("#algo-results").text("Oups! Something went wrong: ".concat(message));
    }
}

<!-- Part 5 - Output -->

<!-- Part 6 - Overview -->
/**
 * Refresh the overview panel: the selected algorithm's display text, the
 * input file (page-level `currentDFpath`) and the output dataset/folder.
 * Values that are empty or not available yet are shown as a "missing"
 * placeholder.
 */
function updateOverview(){
    // Make a value safe to embed as element text inside an HTML string.
    function escapeHtml(value){
        return String(value)
            .replace(/&/g, "&amp;")
            .replace(/</g, "&lt;")
            .replace(/>/g, "&gt;")
            .replace(/"/g, "&quot;")
            .replace(/'/g, "&#39;");
    }

    // .val() yields null for a select without options and undefined for a
    // missing element; both count as "nothing chosen", like the empty string.
    function orPlaceholder(value, placeholder){
        return (value === null || value === undefined || value === "") ? placeholder : value;
    }

    var out_dataset = orPlaceholder($("#datasetSaveSelect").val(), "--- missing ---");
    var out_folder = orPlaceholder($("#folderNameSelect").val(), "--- missing ---");

    // currentDFpath is only assigned by selectDatafile(); the typeof guard
    // avoids a ReferenceError when it was never assigned.
    var in_file = orPlaceholder(typeof currentDFpath === "undefined" ? "" : currentDFpath, "---missing---");

    $("#overview-algorithm").text($("#algoSelect :selected").text());
    $("#overview-input").text(in_file);

    // The folder name is free text typed by the user, so it is escaped before
    // being combined with the line-break markup.
    $("#overview-output").html("Dataset: " + escapeHtml(out_dataset) + " <br/>Folder: " + escapeHtml(out_folder));
}

                                                   
                                                   
<!-- Part 7 - Errors -->

                                                      
</script>

In [7]:
%%javascript
// Look up the current project and fill the dataset dropdowns
// (#datasetSelect and #datasetSaveSelect) with its datasets.
updateBrowser();
// Request the parameters of every algorithm in algorithm_list from the
// kernel and build one configuration form per algorithm.
initAlgoConfigurations();

<IPython.core.display.Javascript object>