***GENERATED CODE FOR randomforest PIPELINE.***

***DON'T EDIT THIS CODE.***

***CONNECTOR FUNCTIONS TO READ DATA.***

In [None]:
import os
import pandas as pd
from hdfs3 import HDFileSystem
import datetime
import logging
import warnings
warnings.filterwarnings('ignore')
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)


class HDFSConnector:
    """Read a CSV file from HDFS into Spark and write Spark DataFrames out as CSV.

    Both operations receive ``config`` as the text of a Python dict literal
    (an already-built dict is accepted as well).
    """

    @staticmethod
    def _parse_config(config):
        """Turn ``config`` into a dict without executing arbitrary code.

        Args:
            config: a dict, or a string holding a Python dict literal.

        Returns:
            The configuration as a dict.

        Raises:
            ValueError: if ``config`` cannot be read as a dict literal.
        """
        import ast  # local import keeps this fix self-contained in the class

        if isinstance(config, dict):
            return config
        try:
            # literal_eval only accepts literals, unlike eval() which would run
            # any expression embedded in the config string.
            parsed = ast.literal_eval(config)
        except SyntaxError as err:
            raise ValueError("connector config is not a valid dict literal") from err
        if not isinstance(parsed, dict):
            raise ValueError("connector config must describe a dict")
        return parsed

    @staticmethod
    def fetch(spark, config):
        """Load the CSV at ``config['url']`` from HDFS and return it as a Spark DataFrame.

        The HDFS endpoint is taken from the ``HDFS_SERVER`` and ``HDFS_PORT``
        environment variables. A two-row preview is displayed before returning.

        Raises:
            ValueError: if ``config`` is not a dict literal.
            KeyError: if an environment variable or the ``url`` key is missing.
        """
        settings = HDFSConnector._parse_config(config)
        ################### INPUT HADOOP HOST PORT TO CONNECT WITH ###############################
        hdfs_server = str(os.environ['HDFS_SERVER'])
        hdfs_port = int(os.environ['HDFS_PORT'])
        hdfs = HDFileSystem(host=hdfs_server, port=hdfs_port)
        with hdfs.open(settings['url']) as f:
            # NOTE(review): newer pandas releases replace `error_bad_lines` with
            # `on_bad_lines` - confirm the pinned pandas version before upgrading.
            dfPd = pd.read_csv(f, error_bad_lines=False)
        df = spark.createDataFrame(dfPd)
        display(df.limit(2).toPandas())
        return df

    @staticmethod
    def put(df, spark, config):
        """Write ``df`` as CSV to a timestamp-prefixed variant of ``config['url']``.

        ``config['is_header']`` equal to "Use Header Line" enables the header
        row; ``config['delimiter']`` is used as the field separator.

        Returns:
            Whatever the Spark writer's ``save`` call returns.

        Raises:
            ValueError: if ``config`` is not a dict literal.
        """
        # Parse once, up front, so a bad config fails before any write starts.
        settings = HDFSConnector._parse_config(config)
        header = 'true' if settings["is_header"] == "Use Header Line" else 'false'
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S")
        # NOTE(review): the format leaves a space between "<timestamp>_" and the
        # url; kept as-is because existing outputs are located under that name.
        target = ("%s %s") % (timestamp + "_", settings['url'])
        writer = df.write.format('csv').options(
            header=header, delimiter=settings["delimiter"])
        return writer.save(target)


***TRANSFORMATION FUNCTIONS THAT WILL BE APPLIED TO THE DATA***

In [None]:
import json
from pyspark.ml.feature import Binarizer
from pyspark.sql.functions import round
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col, when
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import mean, stddev, min, max, col


class CleanseData:
    """Fill null and blank (" ") values of individual Spark DataFrame columns."""

    def cleanValueForFE(self, value):
        """Normalise a computed fill value.

        Returns "" for None, "nan" for a value whose string form is 'nan',
        and the value itself otherwise.
        """
        if value is None:
            return ""
        if str(value) == 'nan':
            return "nan"
        return value

    def _replaceByAggregate(self, feature, df, aggregate, alias):
        """Fill nulls and " " entries of ``feature`` with ``aggregate`` of that column.

        Args:
            feature: schema field of the column to clean (its ``name`` is used).
            df: Spark DataFrame to clean.
            aggregate: column aggregate function applied to the feature column.
            alias: name given to the aggregate result column.

        Returns:
            The cleaned DataFrame.
        """
        # The statistic is computed on rows that have no null in any column.
        completeRows = df.dropna()
        fillValue = self.cleanValueForFE(
            completeRows.select(aggregate(col(feature.name)).alias(alias))
            .collect()[0][alias])
        df = df.fillna(fillValue, subset=[feature.name])
        return df.withColumn(
            feature.name,
            when(col(feature.name) == " ", fillValue).otherwise(col(feature.name)))

    def replaceByMean(self, feature, df, mean_=-1):
        """Replace null/blank values of ``feature`` with the column mean.

        The blank replacement used to be built and then thrown away (the
        ``withColumn`` result was never assigned); it is now applied the same
        way the max/min/stddev variants apply it. ``mean_`` is unused and kept
        only for signature compatibility.
        """
        return self._replaceByAggregate(feature, df, mean, 'mean')

    def replaceByMax(self, feature, df, max_=-1):
        """Replace null/blank values of ``feature`` with the column maximum.

        ``max_`` is unused and kept only for signature compatibility.
        """
        return self._replaceByAggregate(feature, df, max, 'max')

    def replaceByMin(self, feature, df, min_=-1):
        """Replace null/blank values of ``feature`` with the column minimum.

        ``min_`` is unused and kept only for signature compatibility.
        """
        return self._replaceByAggregate(feature, df, min, 'min')

    def replaceByStandardDeviation(self, feature, df, stddev_=-1):
        """Replace null/blank values of ``feature`` with the column standard deviation.

        ``stddev_`` is unused and kept only for signature compatibility.
        """
        return self._replaceByAggregate(feature, df, stddev, 'stddev')

    def replaceDateRandomly(self, feature, df):
        """Replace null/blank values of ``feature`` with an existing value of the column.

        Despite the name, the fill value is the first non-null value returned
        by Spark, not a random draw. When the column holds no non-null value
        there is nothing to fill with and ``df`` is returned unchanged.
        """
        firstNonNull = df.where(col(feature.name).isNotNull()).head(1)
        if not firstNonNull:
            return df
        fillValue = self.cleanValueForFE(firstNonNull[0][feature.name])
        df = df.fillna(str(fillValue), subset=[feature.name])
        return df.withColumn(
            feature.name,
            when(col(feature.name) == " ", fillValue).otherwise(col(feature.name)))

    def replaceNullValues(self, fList, df):
        """Apply the configured replacement strategy to every matching column.

        Args:
            fList: iterable of dicts with a "feature" name and a "replaceby"
                strategy containing one of mean/max/min/stddev/random.
            df: Spark DataFrame to clean.

        Returns:
            The DataFrame after all replacements.
        """
        # Checked in this order; the first keyword found in "replaceby" wins.
        strategies = (
            ("mean", self.replaceByMean),
            ("max", self.replaceByMax),
            ("min", self.replaceByMin),
            ("stddev", self.replaceByStandardDeviation),
            ("random", self.replaceDateRandomly),
        )
        # Schema is captured once, before any replacement is applied.
        featuresList = df.schema.fields
        for featureObj in fList:
            for feat in featuresList:
                # NOTE(review): substring match - a feature name also selects
                # every column whose name merely contains it; confirm intended.
                if featureObj["feature"] not in feat.name:
                    continue
                for keyword, replace in strategies:
                    if keyword in featureObj["replaceby"]:
                        df = replace(feat, df)
                        break
        return df


def StringIndexerTransform(df, params, transformationData=None):
    """Add a ``<feature>_stringindexer`` column holding the indexed form of a string column.

    Args:
        df: Spark DataFrame containing the column named by ``params["feature"]``.
        params: dict whose "feature" entry names the column to index.
        transformationData: accepted for signature compatibility; not used.

    Returns:
        The DataFrame with the index column appended. When the index column has
        at most four distinct values it is cast to an integer type.
    """
    feature = params["feature"]
    outcol = feature + "_stringindexer"

    # Nulls are replaced by '' so they are indexed like any other value.
    filled = df.fillna({feature: ''})
    indexer = StringIndexer(
        inputCol=feature, outputCol=outcol, handleInvalid="skip")
    indexed = indexer.fit(filled).transform(filled)

    # Count in Spark instead of collecting every distinct value to the driver
    # only to take the length of the list.
    distinct_count = indexed.select(outcol).distinct().count()
    if distinct_count <= 4:
        return indexed.withColumn(outcol, indexed[outcol].cast(IntegerType()))
    return indexed


def BinarizerTransform(df, params, transformationData={}):
    """Add a ``<feature>_binarizer`` column thresholding a numeric column.

    Args:
        df: Spark DataFrame containing the column named by ``params['feature']``.
        params: dict whose 'feature' entry names the column to binarize.
        transformationData: dict whose 'threshold' entry is the cut-off value.

    Returns:
        The DataFrame with the feature column cast to double, nulls set to 0.0
        and rounded to 2 decimals, plus the binarized output column.
    """
    feature = params['feature']
    outcol = feature + "_binarizer"

    # Cast through a temporary column, then swap it in under the original name.
    casted = df.withColumn("feature_cast", df[feature].cast("double"))
    casted = casted.drop(feature)
    casted = casted.withColumnRenamed("feature_cast", feature)
    casted = casted.fillna({feature: 0.0})

    threshold = float(transformationData['threshold'])
    binarizer = Binarizer(threshold=threshold, inputCol=feature, outputCol=outcol)
    binarized = binarizer.transform(casted)

    # `round` here is the pyspark.sql.functions column function.
    return binarized.withColumn(feature, round(binarized[feature], 2))


class TransformationMain:
    """Run the generated cleansing and feature-transformation steps in order."""

    # TODO: change df argument in run with following
    @staticmethod
    def run(transformationDF, config):
        """Cleanse ``transformationDF`` and apply the fixed list of transformations.

        Declared static so it works both as ``TransformationMain.run(df, cfg)``
        and on an instance (it never took ``self``).

        Args:
            transformationDF: Spark DataFrame to transform.
            config: JSON string whose "FE" entry lists the per-feature
                null-replacement settings.

        Returns:
            The transformed DataFrame; each transformed source column is dropped
            after its derived column has been added. A two-row preview is
            displayed before returning.
        """
        configObj = json.loads(config)
        featureData = configObj["FE"]
        transformationDF = CleanseData().replaceNullValues(featureData, transformationDF)

        # (transform, params, transformationData), applied top to bottom.
        steps = [
            (StringIndexerTransform,
             {'transformationsData': [{'feature_label': 'Surname', 'transformation_label': 'String Indexer'}],
              'feature': 'Surname', 'type': 'string', 'selected': 'True', 'replaceby': 'max',
              'stats': {'count': '500', 'mean': '', 'stddev': '', 'min': 'Ahmed', 'max': 'Zetticci',
                        'missing': '0', 'distinct': '418'},
              'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}],
              'updatedLabel': 'Surname'},
             {'feature_label': 'Surname', 'transformation_label': 'String Indexer'}),
            (StringIndexerTransform,
             {'transformationsData': [{'feature_label': 'Geography', 'transformation_label': 'String Indexer'}],
              'feature': 'Geography', 'type': 'string', 'selected': 'True', 'replaceby': 'max',
              'stats': {'count': '500', 'mean': '', 'stddev': '', 'min': 'France', 'max': 'Spain',
                        'missing': '0', 'distinct': '3'},
              'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}],
              'updatedLabel': 'Geography'},
             {'feature_label': 'Geography', 'transformation_label': 'String Indexer'}),
            (StringIndexerTransform,
             {'transformationsData': [{'feature_label': 'Gender', 'transformation_label': 'String Indexer'}],
              'feature': 'Gender', 'type': 'string', 'selected': 'True', 'replaceby': 'max',
              'stats': {'count': '500', 'mean': '', 'stddev': '', 'min': 'Female', 'max': 'Male',
                        'missing': '0', 'distinct': '2'},
              'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}],
              'updatedLabel': 'Gender'},
             {'feature_label': 'Gender', 'transformation_label': 'String Indexer'}),
            (BinarizerTransform,
             {'transformationsData': [{'feature_label': 'Balance', 'threshold': 74362.842,
                                       'transformation_label': 'Binarizer'}],
              'feature': 'Balance', 'type': 'real', 'selected': 'True', 'replaceby': 'mean',
              'stats': {'count': '500', 'mean': '74363.15', 'stddev': '62553.58', 'min': '0.0',
                        'max': '213146.2', 'missing': '0'},
              'transformation': [{'transformation': 'Binarizer', 'selectedAsDefault': 1}],
              'updatedLabel': 'Balance'},
             {'feature_label': 'Balance', 'threshold': 74362.842, 'transformation_label': 'Binarizer'}),
            (BinarizerTransform,
             {'transformationsData': [{'feature_label': 'EstimatedSalary', 'threshold': 103007.368,
                                       'transformation_label': 'Binarizer'}],
              'feature': 'EstimatedSalary', 'type': 'real', 'selected': 'True', 'replaceby': 'mean',
              'stats': {'count': '500', 'mean': '103007.86', 'stddev': '57581.16', 'min': '371.05',
                        'max': '199725.39', 'missing': '0'},
              'transformation': [{'transformation': 'Binarizer', 'selectedAsDefault': 1}],
              'updatedLabel': 'EstimatedSalary'},
             {'feature_label': 'EstimatedSalary', 'threshold': 103007.368,
              'transformation_label': 'Binarizer'}),
            (StringIndexerTransform,
             {'transformationsData': [{'feature_label': 'Card Type', 'transformation_label': 'String Indexer'}],
              'feature': 'Card Type', 'type': 'string', 'selected': 'True', 'replaceby': 'max',
              'stats': {'count': '500', 'mean': '', 'stddev': '', 'min': 'DIAMOND', 'max': 'SILVER',
                        'missing': '0', 'distinct': '4'},
              'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}],
              'updatedLabel': 'Card Type'},
             {'feature_label': 'Card Type', 'transformation_label': 'String Indexer'}),
        ]
        for transform, params, transformationData in steps:
            transformationDF = transform(transformationDF, params, transformationData)
            # The source column is replaced by its derived column.
            transformationDF = transformationDF.drop(params['feature'])

        display(transformationDF.limit(2).toPandas())
        return transformationDF


***AUTOML FUNCTIONS***

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import pyspark
from sklearn.metrics import accuracy_score
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
import xgboost as xgb
import numpy as np
import lightgbm as lgb
import optuna
from sklearn import preprocessing
lbl = preprocessing.LabelEncoder()


class GradientBoostingClassifierOptunaDriverClass:
    """Optuna search over GradientBoostingClassifier hyper-parameters."""

    def set_tag(trial, df_new, features, label):
        """Run a 10-trial (max 600 s) study and return the best parameters.

        Args:
            trial: first positional slot; it is bound to the instance when the
                method is called on one and is not used by the body (the inner
                objective receives its own Optuna trial).
            df_new: pandas DataFrame; columns that are not int/float are
                label-encoded IN PLACE.
            features: column name(s) used as model inputs.
            label: column name(s) used as the target.

        Returns:
            Dict of the best trial's parameters (max_depth, n_estimators,
            max_features, max_leaf_nodes).
        """
        # Encoding and the seeded split do not depend on the trial, so they are
        # done once here rather than repeated inside every objective call.
        cat_col = df_new.select_dtypes(
            exclude=['int', 'float']).columns.values
        for c in cat_col:
            df_new[c] = df_new[c].astype('category')
            df_new[c] = lbl.fit_transform(df_new[c].astype(str))
        X_train, X_test, y_train, y_test = train_test_split(
            df_new[features].values, df_new[label].values, random_state=42)

        def objective(trial):
            max_depth = trial.suggest_int("max_depth", 2, 32)
            n_est = trial.suggest_int('n_estimators', 1, 50)
            max_features = trial.suggest_categorical(
                'max_features', ['auto', 'sqrt', 'log2'])
            max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 2, 10)
            classifier_obj = GradientBoostingClassifier(
                max_depth=max_depth, n_estimators=n_est,
                max_features=max_features, max_leaf_nodes=max_leaf_nodes)

            classifier_obj.fit(X_train, y_train)
            preds = classifier_obj.predict(X_test)
            # accuracy_score already returns a scalar; arguments are
            # (y_true, y_pred).
            return accuracy_score(y_test, preds)

        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=10, timeout=600)
        return study.best_trial.params


class RandomForestClassifierOptunaDriverClass:
    """Optuna search over RandomForestClassifier hyper-parameters."""

    def set_tag(trial, df_new, features, label):
        """Run a 10-trial (max 600 s) study and return the best parameters.

        Args:
            trial: first positional slot; it is bound to the instance when the
                method is called on one and is not used by the body (the inner
                objective receives its own Optuna trial).
            df_new: pandas DataFrame; columns that are not int/float are
                label-encoded IN PLACE.
            features: column name(s) used as model inputs.
            label: column name(s) used as the target.

        Returns:
            Dict of the best trial's parameters (max_depth, n_estimators,
            max_leaf_nodes, max_features).
        """
        # Encoding and the seeded split do not depend on the trial, so they are
        # done once here rather than repeated inside every objective call.
        cat_col = df_new.select_dtypes(
            exclude=['int', 'float']).columns.values
        for c in cat_col:
            df_new[c] = df_new[c].astype('category')
            df_new[c] = lbl.fit_transform(df_new[c].astype(str))
        X_train, X_test, y_train, y_test = train_test_split(
            df_new[features].values, df_new[label].values, random_state=42)

        def objective(trial):
            rf_max_depth = trial.suggest_int("max_depth", 2, 15)
            n_est = trial.suggest_int('n_estimators', 1, 50)
            max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 2, 10)
            max_features = trial.suggest_categorical(
                'max_features', ['auto', 'sqrt', 'log2'])
            classifier_obj = RandomForestClassifier(
                max_depth=rf_max_depth, n_estimators=n_est,
                max_features=max_features, max_leaf_nodes=max_leaf_nodes)
            classifier_obj.fit(X_train, y_train)
            preds = classifier_obj.predict(X_test)
            # accuracy_score already returns a scalar; arguments are
            # (y_true, y_pred).
            return accuracy_score(y_test, preds)

        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=10, timeout=600)
        return study.best_trial.params


class LightGBMClassifierOptunaDriverClass:
    """Optuna search over LightGBM classifier hyper-parameters."""

    def set_tag(trial, df_new, features, label):
        """Run a 10-trial (max 600 s) study and return the best parameters.

        Args:
            trial: first positional slot; it is bound to the instance when the
                method is called on one and is not used by the body (the inner
                objective receives its own Optuna trial).
            df_new: pandas DataFrame; columns that are not int/float are
                label-encoded IN PLACE.
            features: column name(s) used as model inputs.
            label: column name(s) used as the target.

        Returns:
            Dict of the best trial's suggested parameters (the fixed entries
            such as objective/metric are not part of it).
        """
        # Encoding and the seeded split do not depend on the trial, so they are
        # done once here rather than repeated inside every objective call.
        cat_col = df_new.select_dtypes(
            exclude=['int', 'float']).columns.values
        for c in cat_col:
            df_new[c] = df_new[c].astype('category')
            df_new[c] = lbl.fit_transform(df_new[c].astype(str))
        X_train, X_test, y_train, y_test = train_test_split(
            df_new[features].values, df_new[label].values, random_state=42)

        # np.unique flattens its input, so this works for a 1-D label array as
        # well as a column-shaped one; the previous nested iteration failed on
        # 1-D labels because scalar entries are not iterable.
        n_classes = len(np.unique(y_train))

        def objective(trial):
            param = {
                'task': 'train',
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'binary_logloss',
                "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
                "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
                "num_leaves": trial.suggest_int("num_leaves", 2, 50),
                "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),
                "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
                "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
                "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
            }
            # More than two target values switches to the multiclass setup.
            if n_classes > 2:
                param['num_class'] = n_classes
                param['objective'] = 'multiclass'
                param['metric'] = 'multi_logloss'

            gbm = lgb.LGBMClassifier(**param)
            gbm.fit(X_train, y_train)

            preds = gbm.predict(X_test)
            # accuracy_score already returns a scalar; arguments are
            # (y_true, y_pred).
            return accuracy_score(y_test, preds)

        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=10, timeout=600)
        return study.best_trial.params


class xgboostClassifierOptunaDriverClass:
    """Optuna search over XGBClassifier hyper-parameters."""

    def set_tag(trial, df_new, features, label):
        """Run a 10-trial (max 600 s) study and return the best parameters.

        Args:
            trial: first positional slot; it is bound to the instance when the
                method is called on one and is not used by the body (the inner
                objective receives its own Optuna trial).
            df_new: pandas DataFrame; columns that are not int/float are
                label-encoded IN PLACE.
            features: column name(s) used as model inputs.
            label: column name(s) used as the target.

        Returns:
            Dict of the best trial's parameters (n_estimators, max_depth,
            min_child_weight, scale_pos_weight, subsample, colsample_bytree).
        """
        # Encoding and the seeded split do not depend on the trial, so they are
        # done once here rather than repeated inside every objective call.
        cat_col = df_new.select_dtypes(
            exclude=['int', 'float']).columns.values
        for c in cat_col:
            df_new[c] = df_new[c].astype('category')
            df_new[c] = lbl.fit_transform(df_new[c].astype(str))
        X_train, X_test, y_train, y_test = train_test_split(
            df_new[features].values, df_new[label].values, random_state=42)

        def objective(trial):
            n_estimators = trial.suggest_int('n_estimators', 1, 50)
            max_depth = trial.suggest_int('max_depth', 1, 10)
            min_child_weight = trial.suggest_int('min_child_weight', 1, 10)
            scale_pos_weight = trial.suggest_int('scale_pos_weight', 1, 50)
            subsample = trial.suggest_discrete_uniform(
                'subsample', 0.5, 0.9, 0.1)
            colsample_bytree = trial.suggest_discrete_uniform(
                'colsample_bytree', 0.5, 0.9, 0.1)

            xgboost_tuna = xgb.XGBClassifier(
                random_state=42,
                n_estimators=n_estimators,
                max_depth=max_depth,
                min_child_weight=min_child_weight,
                scale_pos_weight=scale_pos_weight,
                subsample=subsample,
                colsample_bytree=colsample_bytree,
            )
            xgboost_tuna.fit(X_train, y_train)
            tuna_pred_test = xgboost_tuna.predict(X_test)

            return accuracy_score(y_test, tuna_pred_test)

        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=10, timeout=600)
        return study.best_trial.params


class ElasticNetOptunaDriverClass:
    """Optuna search over ElasticNet hyper-parameters (minimises MAE)."""

    def set_tag(trial, df_new, features, label):
        """Run a 20-trial (max 600 s) study and return the best parameters.

        Args:
            trial: first positional slot; it is bound to the instance when the
                method is called on one and is not used by the body (the inner
                objective receives its own Optuna trial).
            df_new: pandas DataFrame; columns that are not int/float are
                label-encoded IN PLACE.
            features: column name(s) used as model inputs.
            label: column name(s) used as the target.

        Returns:
            Dict of the best trial's parameters (l1_ratio, alpha, selection).
        """
        # Encoding and the seeded split do not depend on the trial, so they are
        # done once here rather than repeated inside every objective call.
        cat_col = df_new.select_dtypes(
            exclude=['int', 'float']).columns.values
        for c in cat_col:
            df_new[c] = df_new[c].astype('category')
            df_new[c] = lbl.fit_transform(df_new[c].astype(str))
        X_train, X_test, y_train, y_test = train_test_split(
            df_new[features].values, df_new[label].values, random_state=42)

        def objective(trial):
            l1_ratio = trial.suggest_loguniform("l1_ratio", 0.1, 1)
            alpha = trial.suggest_loguniform("alpha", 0.1, 1)
            selection = trial.suggest_categorical(
                "selection", ['cyclic', 'random'])
            e_net = ElasticNet(
                alpha=alpha, l1_ratio=l1_ratio, selection=selection)
            e_net.fit(X_train, y_train)
            predictions = e_net.predict(X_test)
            # Arguments in the conventional (y_true, y_pred) order.
            return mean_absolute_error(y_test, predictions)

        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=20, timeout=600)
        return study.best_trial.params


class PolynomialregressionOptunaDriverClass:
    """Optuna search over the degree of a polynomial linear regression (minimises MAE)."""

    def set_tag(trial, XRaw, yRaw):
        """Run a 5-trial (max 600 s) study and return the best parameters.

        Args:
            trial: first positional slot; it is bound to the instance when the
                method is called on one and is not used by the body (the inner
                objective receives its own Optuna trial).
            XRaw: feature matrix.
            yRaw: target values.

        Returns:
            Dict of the best trial's parameters (degree).
        """
        # These two names were used without any visible import; importing them
        # here keeps the method runnable on its own.
        from sklearn.linear_model import LinearRegression
        from sklearn.preprocessing import PolynomialFeatures

        # The seeded split is identical for every trial, so it is done once.
        X_train, X_test, y_train, y_test = train_test_split(
            XRaw, yRaw, random_state=42)

        def objective(trial):
            degree = trial.suggest_int("degree", 2, 5)
            polynomial = PolynomialFeatures(degree=degree)
            x_train_transformed = polynomial.fit_transform(X_train)
            # Reuse the transformer fitted on the training data instead of
            # re-fitting it on the test data.
            x_test_transformed = polynomial.transform(X_test)
            clf = LinearRegression()
            clf.fit(x_train_transformed, y_train)
            predictions = clf.predict(x_test_transformed)
            # Arguments in the conventional (y_true, y_pred) order.
            return mean_absolute_error(y_test, predictions)

        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=5, timeout=600)
        return study.best_trial.params


def gradientboostingclassifier(sparkDF, features, labels):
    """Tune and train a GradientBoostingClassifier on a Spark DataFrame.

    The frame is converted to pandas, hyper-parameters are tuned with
    GradientBoostingClassifierOptunaDriverClass, and a model with the best
    parameters is fitted on a fresh (unseeded) train/test split. The test
    accuracy is displayed.

    Args:
        sparkDF: Spark DataFrame holding the feature and label columns.
        features: list of feature column names.
        labels: label column name.

    Returns:
        Dict with the fitted 'model', the held-out 'X_test' / 'y_test', the
        'label' name and the pandas 'columnNames'.
    """
    sparkDF.persist(pyspark.StorageLevel.MEMORY_AND_DISK)
    try:
        df = sparkDF.toPandas()
    finally:
        # Everything below works on the pandas copy, so the cached Spark
        # blocks are released instead of lingering for the whole session.
        sparkDF.unpersist()

    class_optuna = GradientBoostingClassifierOptunaDriverClass()
    # set_tag also label-encodes non-numeric columns of `df` in place.
    optuna_parameters = class_optuna.set_tag(df, features, labels)
    model_gbm_sklearn = GradientBoostingClassifier(**optuna_parameters)
    X_train, X_test, y_train, y_test = train_test_split(
        df[features], df[labels])
    model_gbm_sklearn.fit(X_train, y_train)
    display(" Accuracy of Model : %s" %
            model_gbm_sklearn.score(X_test, y_test))

    data = {'model': model_gbm_sklearn,
            'X_test': X_test,
            'y_test': y_test,
            'label': labels,
            'columnNames': df.columns}
    return data


***READING DATAFRAME***

In [None]:
############## CREATE SPARK SESSION ############################ ENTER YOUR SPARK MASTER IP AND PORT TO CONNECT TO SERVER ################
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[1]').getOrCreate()
#%run randomforestHooks.ipynb
try:
    #sourcePreExecutionHook()

    # Load the source CSV from HDFS into a Spark DataFrame.
    customerchurnrecords = HDFSConnector.fetch(spark, "{'url': '/FileStore/platform/uploadedSourceFiles/Customer-Churn-Records.csv', 'filename': 'Customer-Churn-Records.csv', 'delimiter': ',', 'file_type': 'Delimeted', 'dbfs_token': '', 'dbfs_domain': '', 'is_header': 'Use Header Line', 'server_url': '/numtraPlatform/NumtraPlatformV3/uploads/platform/', 'results_url': 'http://ml.colaberry.com:44040/api/read/hdfs'}")

except Exception as ex:
    # logging.exception records the traceback as well as the message, so a
    # failed read can be diagnosed; execution still continues as before.
    logging.exception(ex)
#spark.stop()


***TRANSFORMING DATAFRAME***

In [None]:
#%run randomforestHooks.ipynb
try:
    #transformationPreExecutionHook()

    # Cleanse and transform the loaded frame; the JSON "FE" list carries the
    # per-feature null-replacement settings.
    autofe = TransformationMain.run(customerchurnrecords,json.dumps( {"FE": [{"transformationsData": [{"transformation_label": "novalue"}], "feature": "RowNumber", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "250.5", "stddev": "144.48", "min": "1", "max": "500", "missing": "0"}, "updatedLabel": "RowNumber"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "CustomerId", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "15693572.05", "stddev": "70877.62", "min": "15566111", "max": "15813844", "missing": "0"}, "updatedLabel": "CustomerId"}, {"transformationsData": [{"feature_label": "Surname", "transformation_label": "String Indexer"}], "feature": "Surname", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "Ahmed", "max": "Zetticci", "missing": "0", "distinct": "418"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Surname"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "CreditScore", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "647.4", "stddev": "101.14", "min": "376", "max": "850", "missing": "0"}, "updatedLabel": "CreditScore"}, {"transformationsData": [{"feature_label": "Geography", "transformation_label": "String Indexer"}], "feature": "Geography", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "France", "max": "Spain", "missing": "0", "distinct": "3"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Geography"}, 
{"transformationsData": [{"feature_label": "Gender", "transformation_label": "String Indexer"}], "feature": "Gender", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "Female", "max": "Male", "missing": "0", "distinct": "2"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Gender"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Age", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "38.11", "stddev": "9.96", "min": "19", "max": "80", "missing": "0"}, "updatedLabel": "Age"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Tenure", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "5.13", "stddev": "2.99", "min": "0", "max": "10", "missing": "0"}, "updatedLabel": "Tenure"}, {"transformationsData": [{"feature_label": "Balance", "threshold": 74362.842, "transformation_label": "Binarizer"}], "feature": "Balance", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "500", "mean": "74363.15", "stddev": "62553.58", "min": "0.0", "max": "213146.2", "missing": "0"}, "transformation": [{"transformation": "Binarizer", "selectedAsDefault": 1}], "updatedLabel": "Balance"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "NumOfProducts", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "1.53", "stddev": "0.57", "min": "1", "max": "4", "missing": "0"}, "updatedLabel": "NumOfProducts"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "HasCrCard", "transformation": [{"transformation": 
"novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.71", "stddev": "0.46", "min": "0", "max": "1", "missing": "0"}, "updatedLabel": "HasCrCard"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "IsActiveMember", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.49", "stddev": "0.5", "min": "0", "max": "1", "missing": "0"}, "updatedLabel": "IsActiveMember"}, {"transformationsData": [{"feature_label": "EstimatedSalary", "threshold": 103007.368, "transformation_label": "Binarizer"}], "feature": "EstimatedSalary", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "500", "mean": "103007.86", "stddev": "57581.16", "min": "371.05", "max": "199725.39", "missing": "0"}, "transformation": [{"transformation": "Binarizer", "selectedAsDefault": 1}], "updatedLabel": "EstimatedSalary"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Exited", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.2", "stddev": "0.4", "min": "0", "max": "1", "missing": "0"}, "updatedLabel": "Exited"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Complain", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.21", "stddev": "0.41", "min": "0", "max": "1", "missing": "0"}, "updatedLabel": "Complain"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Satisfaction Score", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": 
"3.03", "stddev": "1.34", "min": "1", "max": "5", "missing": "0"}, "updatedLabel": "Satisfaction Score"}, {"transformationsData": [{"feature_label": "Card Type", "transformation_label": "String Indexer"}], "feature": "Card Type", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "DIAMOND", "max": "SILVER", "missing": "0", "distinct": "4"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Card Type"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Point Earned", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "595.12", "stddev": "227.38", "min": "119", "max": "994", "missing": "0"}, "updatedLabel": "Point Earned"}]}))

    #transformationPostExecutionHook(autofe)

except Exception as ex:
    # logging.exception records the traceback as well as the message, so a
    # failed transformation can be diagnosed; execution still continues as before.
    logging.exception(ex)


***TRAIN MODEL***

In [None]:
#%run randomforestHooks.ipynb
try:
    #mlPreExecutionHook()

    # Tune and train on the transformed frame; "Exited" is the label column.
    dataAutoML=gradientboostingclassifier(autofe, ["RowNumber", "CustomerId", "CreditScore", "Age", "Tenure", "NumOfProducts", "HasCrCard", "IsActiveMember", "Complain", "Satisfaction Score", "Point Earned", "Surname_stringindexer", "Geography_stringindexer", "Gender_stringindexer", "Balance_binarizer", "EstimatedSalary_binarizer", "Card Type_stringindexer"], "Exited")

    #mlPostExecutionHook(dataAutoML)

except Exception as ex:
    # logging.exception records the traceback as well as the message, so a
    # failed training run can be diagnosed; execution still continues as before.
    logging.exception(ex)
#spark.stop()
