***GENERATED CODE FOR ociancleanup PIPELINE.***

***DON'T EDIT THIS CODE.***

***CONNECTOR FUNCTIONS TO READ DATA.***

In [None]:
import os
import datetime
import logging
import warnings
# Silence every Python warning for the rest of the notebook run.
warnings.filterwarnings(action='ignore')
# Root logger: INFO and above, rendered as "LEVEL:message".
logging.basicConfig(
    level=logging.INFO,
    format='%(levelname)s:%(message)s',
)


class HDFSConnector:
    """Read CSV data from HDFS into Spark and write DataFrames back out as CSV.

    Both public methods are static and are called on the class itself, e.g.
    ``HDFSConnector.fetch(spark, config)``. ``config`` is either a dict or the
    text of a Python dict literal (the form the generated notebook passes).
    """

    @staticmethod
    def _parse_config(config):
        """Return ``config`` as a dict without executing any code.

        Args:
            config: A dict, or a string holding a Python dict literal.

        Returns:
            The configuration as a dict.

        Raises:
            ValueError, SyntaxError: If the string is not a plain literal.
            ValueError: If the literal is not a dict.
        """
        # Function-scope import keeps this class self-contained.
        import ast

        if isinstance(config, dict):
            return config
        # literal_eval only accepts literals, so a config string can no
        # longer run arbitrary expressions the way eval() allowed.
        parsed = ast.literal_eval(config)
        if not isinstance(parsed, dict):
            raise ValueError(
                "connector config must be a dict literal, got %s"
                % type(parsed).__name__)
        return parsed

    @staticmethod
    def fetch(spark, config):
        """Load the CSV at ``config['url']`` from HDFS and return it.

        The HDFS host and port come from the ``HDFS_SERVER`` and ``HDFS_PORT``
        environment variables. The first two rows are shown with the
        notebook's ``display`` before the DataFrame is returned.

        Args:
            spark: Spark session used for reading.
            config: Connector configuration; only ``'url'`` is read here.

        Returns:
            The DataFrame read with a header row and schema inference.

        Raises:
            KeyError: If an environment variable or ``'url'`` is missing.
        """
        # Environment is read first, matching the original evaluation order.
        hdfs_server = str(os.environ['HDFS_SERVER'])
        hdfs_port = int(os.environ['HDFS_PORT'])
        settings = HDFSConnector._parse_config(config)
        df = spark.read.options(header='true', inferschema='true').csv(
            f"hdfs://{hdfs_server}:{hdfs_port}{settings['url']}", header='true')
        display(df.limit(2).toPandas())
        return df

    @staticmethod
    def put(df, spark, config):
        """Write ``df`` as CSV and return whatever the writer's ``save`` returns.

        The target path is ``"<timestamp>_ <url>"``: the current local time
        formatted as ``%Y-%m-%d %H.%M.%S``, an underscore, a space, then
        ``config['url']``.
        NOTE(review): the embedded space looks unintended, but the path is
        kept as-is so existing outputs do not move - confirm before changing.

        Args:
            df: DataFrame to write.
            spark: Unused; kept for interface compatibility.
            config: Connector configuration; reads ``'is_header'``,
                ``'delimiter'`` and ``'url'``.
        """
        settings = HDFSConnector._parse_config(config)
        header = 'true' if settings["is_header"] == "Use Header Line" else 'false'
        delimiter = settings["delimiter"]
        target = ("%s %s") % (
            datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S") + "_",
            settings['url'])
        return df.write.format('csv').options(
            header=header, delimiter=delimiter).save(target)


***TRANSFORMATION FUNCTIONS THAT WILL BE APPLIED TO THE DATA***

In [None]:
import json
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col, when
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import mean, stddev, min, max, col


class CleanseData:
    """Replace null values in DataFrame columns with a per-column fill value.

    ``replaceNullValues`` is the entry point; the ``replaceBy*`` methods each
    handle one replacement strategy for a single schema field.

    Note: ``mean``, ``max``, ``min`` and ``stddev`` used below are the
    ``pyspark.sql.functions`` aggregates imported at file level, not the
    Python builtins.
    """

    def cleanValueForFE(self, value):
        """Normalise a collected value before it is used as a fill value.

        Returns ``""`` for ``None``, the string ``"nan"`` when ``str(value)``
        is ``'nan'``, and ``value`` unchanged otherwise.
        """
        if value is None:
            return ""
        if str(value) == 'nan':
            return "nan"
        return value

    def _replaceWithStatistic(self, feature, df, aggregate, alias, replace_blank=True):
        """Fill nulls in ``feature`` with ``aggregate`` of that column.

        Args:
            feature: Schema field whose ``name`` selects the column.
            df: DataFrame to clean.
            aggregate: Column aggregate function (e.g. ``mean``).
            alias: Name given to the aggregated column in the result row.
            replace_blank: When true, values equal to ``" "`` are also
                replaced with the statistic.

        Returns:
            The DataFrame with the column's nulls filled.
        """
        # NOTE(review): dropna() with no arguments removes rows that have a
        # null in ANY column, so the statistic is computed over fully
        # populated rows only. Kept as in the original - confirm intent.
        complete_rows = df.dropna()
        result_row = complete_rows.select(
            aggregate(col(feature.name)).alias(alias)).collect()[0]
        statistic = self.cleanValueForFE(result_row[alias])
        df = df.fillna(statistic, subset=[feature.name])
        if replace_blank:
            df = df.withColumn(
                feature.name,
                when(col(feature.name) == " ", statistic).otherwise(col(feature.name)))
        return df

    def replaceByMean(self, feature, df, mean_=-1):
        """Fill nulls in ``feature`` with the column mean.

        ``mean_`` is unused and kept for interface compatibility.
        """
        # The original also built a withColumn(...) expression here but threw
        # the result away, so it never had any effect. It is dropped rather
        # than activated: it cast the column to Integer, which would truncate
        # real-valued features.
        return self._replaceWithStatistic(
            feature, df, mean, 'mean', replace_blank=False)

    def replaceByMax(self, feature, df, max_=-1):
        """Fill nulls and ``" "`` values in ``feature`` with the column maximum.

        ``max_`` is unused and kept for interface compatibility.
        """
        return self._replaceWithStatistic(feature, df, max, 'max')

    def replaceByMin(self, feature, df, min_=-1):
        """Fill nulls and ``" "`` values in ``feature`` with the column minimum.

        ``min_`` is unused and kept for interface compatibility.
        """
        return self._replaceWithStatistic(feature, df, min, 'min')

    def replaceByStandardDeviation(self, feature, df, stddev_=-1):
        """Fill nulls and ``" "`` values in ``feature`` with the column stddev.

        ``stddev_`` is unused and kept for interface compatibility.
        """
        return self._replaceWithStatistic(feature, df, stddev, 'stddev')

    def replaceDateRandomly(self, feature, df):
        """Fill nulls and ``" "`` values in ``feature`` with an existing value.

        Despite the name, the fill value is not random: it is the value of
        the first row returned for which the column is not null. When the
        column has no non-null value there is nothing to fill with and the
        DataFrame is returned unchanged.
        """
        first_present = df.where(col(feature.name).isNotNull()).head(1)
        if not first_present:
            # Previously this indexed an empty list and raised IndexError.
            return df
        fillValue = self.cleanValueForFE(first_present[0][feature.name])
        df = df.fillna(str(fillValue), subset=[feature.name])
        df = df.withColumn(
            feature.name,
            when(col(feature.name) == " ", fillValue).otherwise(col(feature.name)))
        return df

    def replaceNullValues(self, fList, df):
        """Apply each feature's configured replacement strategy to ``df``.

        Args:
            fList: Iterable of dicts with a ``"feature"`` name and a
                ``"replaceby"`` strategy string.
            df: DataFrame to clean.

        Returns:
            The cleaned DataFrame.
        """
        # Order matters: the first keyword contained in "replaceby" wins,
        # mirroring the original if/elif chain.
        strategies = (
            ("mean", self.replaceByMean),
            ("max", self.replaceByMax),
            ("min", self.replaceByMin),
            ("stddev", self.replaceByStandardDeviation),
            ("random", self.replaceDateRandomly),
        )
        # The field list is captured once, before any replacement runs.
        schema_fields = df.schema.fields
        for featureObj in fList:
            for field in schema_fields:
                # NOTE(review): substring match - a feature name also selects
                # every column whose name merely contains it. Kept as-is.
                if featureObj["feature"] not in field.name:
                    continue
                for keyword, replace in strategies:
                    if keyword in featureObj["replaceby"]:
                        df = replace(field, df)
                        break
        return df


def StringIndexerTransform(df, params, transformationData=None):
    """Add a ``<feature>_stringindexer`` column produced by ``StringIndexer``.

    Args:
        df: Input DataFrame.
        params: Dict whose ``"feature"`` entry names the column to index.
            No other key is read.
        transformationData: Unused; kept for interface compatibility. The
            default is ``None`` rather than a shared mutable ``{}``.

    Returns:
        The indexed DataFrame. When the index column holds at most four
        distinct values it is cast to ``IntegerType``; otherwise it is left
        as produced by the indexer.
    """
    feature = params["feature"]
    outcol = feature + "_stringindexer"

    # Nulls in the feature column become empty strings before fitting.
    filled = df.fillna({feature: ''})
    indexer = StringIndexer(
        inputCol=feature, outputCol=outcol, handleInvalid="skip")
    indexed = indexer.fit(filled).transform(filled)

    # Only the number of distinct index values matters, so count them in
    # Spark instead of collecting every value to the driver.
    distinct_count = indexed.select(outcol).distinct().count()
    if distinct_count <= 4:
        return indexed.withColumn(outcol, indexed[outcol].cast(IntegerType()))
    return indexed


class TransformationMain:
    """Run the generated cleansing and string-indexing steps on a DataFrame."""

    # (feature name, profiling stats) for every column that is string-indexed
    # and then dropped, in execution order.
    _STRING_INDEXED_FEATURES = (
        ('Zone', {'count': '500', 'mean': '', 'stddev': '', 'min': 'Accomack County, VA, USA', 'max': 'Washington County, RI, USA', 'missing': '2', 'distinct': '38'}),
        ('State', {'count': '500', 'mean': '', 'stddev': '', 'min': 'Arizona, USA', 'max': 'Texas, USA', 'missing': '0', 'distinct': '28'}),
        ('Country', {'count': '500', 'mean': '', 'stddev': '', 'min': 'United States', 'max': 'United States', 'missing': '0', 'distinct': '1'}),
        ('Cleanup Type', {'count': '500', 'mean': '', 'stddev': '', 'min': 'Land (beach, shoreline and inland)', 'max': 'Watercraft (powerboat, sailboat, kayak or canoe)', 'missing': '0', 'distinct': '1'}),
        ('Cleanup Date', {'count': '500', 'mean': '', 'stddev': '', 'min': '1/10/2015 0:00', 'max': '8/8/2015 0:00', 'missing': '0', 'distinct': '14'}),
        ('Group Name', {'count': '341', 'mean': '', 'stddev': '', 'min': ' Jen', 'max': 'test group', 'missing': '159', 'distinct': '128'}),
    )

    @staticmethod
    def _indexerParams(feature, stats):
        """Build the parameter dict passed to ``StringIndexerTransform``."""
        return {
            'transformationsData': [{'feature_label': feature, 'transformation_label': 'String Indexer'}],
            'feature': feature,
            'type': 'string',
            'selected': 'True',
            'replaceby': 'max',
            'stats': dict(stats),
            'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}],
            'updatedLabel': feature,
        }

    # TODO: change df argument in run with following
    @staticmethod
    def run(transformationDF, config):
        """Cleanse nulls, index the string features and drop the originals.

        Args:
            transformationDF: Input DataFrame.
            config: JSON text whose ``"FE"`` entry is the feature list given
                to ``CleanseData.replaceNullValues``.

        Returns:
            The transformed DataFrame. Its first two rows are shown with the
            notebook's ``display`` before returning.
        """
        configObj = json.loads(config)
        featureData = configObj["FE"]
        transformationDF = CleanseData().replaceNullValues(featureData, transformationDF)
        for feature, stats in TransformationMain._STRING_INDEXED_FEATURES:
            transformationDF = StringIndexerTransform(
                transformationDF,
                TransformationMain._indexerParams(feature, stats),
                {'feature_label': feature, 'transformation_label': 'String Indexer'})
            # The indexed column replaces the raw string column.
            transformationDF = transformationDF.drop(feature)
        display(transformationDF.limit(2).toPandas())
        return transformationDF


***AUTOML FUNCTIONS***

In [None]:
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor
import pyspark


def functionRegression(sparkDF, listOfFeatures, label):
    """Fit a TPOT regressor on a Spark DataFrame converted to pandas.

    Args:
        sparkDF: Spark DataFrame holding the features and the label.
        listOfFeatures: Column names used as model inputs.
        label: Name of the target column.

    Returns:
        Dict with the fitted ``'model'``, the held-out ``'X_test'`` and
        ``'y_test'`` arrays, the ``'label'`` and the ``'columnNames'``.

    The value of ``tpotModel.score`` on the held-out split is shown with the
    notebook's ``display`` under the label "Error rate of Model".
    """
    sparkDF.persist(pyspark.StorageLevel.MEMORY_AND_DISK)
    pandas_frame = sparkDF.toPandas()

    # The label column is dropped before the feature columns are selected.
    feature_matrix = pandas_frame.drop(label, axis=1)[listOfFeatures].values
    target_vector = pandas_frame[label].values

    # 10% of the rows are held out for scoring.
    split = train_test_split(
        feature_matrix, target_vector, random_state=1, test_size=0.1)
    X_train, X_test, y_train, y_test = split

    tpot_settings = dict(
        verbosity=3,
        generations=10,
        max_time_mins=5,
        n_jobs=-1,
        random_state=25,
        population_size=15,
        use_dask=True,
    )
    tpotModel = TPOTRegressor(**tpot_settings)
    tpotModel.fit(X_train, y_train)
    display(" Error rate of Model : %s" % tpotModel.score(X_test, y_test))

    return {
        'model': tpotModel,
        'X_test': X_test,
        'y_test': y_test,
        'label': label,
        'columnNames': listOfFeatures,
    }


***READING DATAFRAME***

In [None]:
############## CREATE SPARK SESSION ############################ ENTER YOUR SPARK MASTER IP AND PORT TO CONNECT TO SERVER ################
from pyspark.sql import SparkSession
# Spark session with master 'local[1]'; getOrCreate() reuses an existing one.
spark = (
    SparkSession.builder
    .master('local[1]')
    .getOrCreate()
)
#%run ociancleanupHooks.ipynb
try:
    #sourcePreExecutionHook()

    # The source description is passed as the text of a Python dict literal.
    ocu = HDFSConnector.fetch(
        spark,
        "{'url': '/FileStore/platform/uploadedSourceFiles/OCU.csv', "
        "'filename': 'OCU.csv', "
        "'delimiter': ',', "
        "'file_type': 'Delimeted', "
        "'dbfs_token': '', "
        "'dbfs_domain': '', "
        "'is_header': 'Use Header Line', "
        "'server_url': '/numtraPlatform/NumtraPlatformV3/uploads/platform/', "
        "'results_url': 'http://ml.colaberry.com:44040/api/read/hdfs'}",
    )

except Exception as ex:
    # A failed read is logged and swallowed; `ocu` is then left undefined.
    logging.error(ex)
#spark.stop()


***TRANSFORMING DATAFRAME***

In [None]:
# Transformation cell: passes the DataFrame loaded into `ocu` together with the
# generated feature configuration to TransformationMain.run and keeps the
# result in `ociancleanupautofe`, which the training cell reads.
#%run ociancleanupHooks.ipynb
try:
	#transformationPreExecutionHook()

	# The second argument is the feature configuration serialised with
	# json.dumps. Its "FE" list has one entry per source column, carrying the
	# column name ("feature"), its "type", the "replaceby" rule, summary
	# "stats" and the selected "transformation": string columns select
	# "String Indexer", all other columns select "novalue".
	# NOTE(review): how TransformationMain.run interprets these fields is
	# defined outside this cell - confirm there before editing the literal.
	ociancleanupautofe = TransformationMain.run(ocu,json.dumps( {"FE": [{"transformationsData": [{"transformation_label": "novalue"}], "feature": "ID", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "1599.98", "stddev": "5716.98", "min": "92", "max": "48781", "missing": "0"}, "updatedLabel": "ID"}, {"transformationsData": [{"feature_label": "Zone", "transformation_label": "String Indexer"}], "feature": "Zone", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "Accomack County, VA, USA", "max": "Washington County, RI, USA", "missing": "2", "distinct": "38"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Zone"}, {"transformationsData": [{"feature_label": "State", "transformation_label": "String Indexer"}], "feature": "State", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "Arizona, USA", "max": "Texas, USA", "missing": "0", "distinct": "28"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "State"}, {"transformationsData": [{"feature_label": "Country", "transformation_label": "String Indexer"}], "feature": "Country", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "United States", "max": "United States", "missing": "0", "distinct": "1"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Country"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "GPS_Lat ", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "500", "mean": "32.83", "stddev": "3.08", "min": "26.88461", "max": "42.79918", "missing": "0"}, "transformation": [{"transformation": "novalue", 
"selectedAsDefault": 1}], "updatedLabel": "GPS_Lat "}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "GPS_Long", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "500", "mean": "-91.36", "stddev": "22.52", "min": "-158.11241", "max": "-70.37840605", "missing": "0"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "GPS_Long"}, {"transformationsData": [{"feature_label": "Cleanup Type", "transformation_label": "String Indexer"}], "feature": "Cleanup Type", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "Land (beach, shoreline and inland)", "max": "Watercraft (powerboat, sailboat, kayak or canoe)", "missing": "0", "distinct": "1"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Cleanup Type"}, {"transformationsData": [{"feature_label": "Cleanup Date", "transformation_label": "String Indexer"}], "feature": "Cleanup Date", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "1/10/2015 0:00", "max": "8/8/2015 0:00", "missing": "0", "distinct": "14"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Cleanup Date"}, {"transformationsData": [{"feature_label": "Group Name", "transformation_label": "String Indexer"}], "feature": "Group Name", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "341", "mean": "", "stddev": "", "min": " Jen", "max": "test group", "missing": "159", "distinct": "128"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Group Name"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Adults", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", 
"stats": {"count": "500", "mean": "2.81", "stddev": "12.82", "min": "0", "max": "215", "missing": "0"}, "updatedLabel": "Adults"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Children", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "6.58", "stddev": "33.04", "min": "0", "max": "583", "missing": "0"}, "updatedLabel": "Children"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "People", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "37.83", "stddev": "215.25", "min": "0", "max": "3800", "missing": "0"}, "updatedLabel": "People"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Pounds", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "500", "mean": "946.96", "stddev": "6321.74", "min": "0.0", "max": "110250.0", "missing": "0"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "Pounds"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Miles", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "500", "mean": "6.39", "stddev": "23.01", "min": "0.0", "max": "309.3185", "missing": "0"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "Miles"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "# of bags", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "23.79", "stddev": "194.16", "min": "0", "max": "3670", "missing": "0"}, "updatedLabel": "# of bags"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Cigarette Butts", 
"transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "79.05", "stddev": "473.31", "min": "0", "max": "8904", "missing": "0"}, "updatedLabel": "Cigarette Butts"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Food Wrappers (candy, chips, etc#)", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "4.82", "stddev": "6.97", "min": "0", "max": "52", "missing": "0"}, "updatedLabel": "Food Wrappers (candy, chi..."}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Take Out/Away Containers (Plastic)", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "4.76", "stddev": "18.66", "min": "0", "max": "244", "missing": "0"}, "updatedLabel": "Take Out/Away Containers ..."}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Take Out/Away Containers (Foam)", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "3.1", "stddev": "14.46", "min": "0", "max": "249", "missing": "0"}, "updatedLabel": "Take Out/Away Containers ..."}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Bottle Caps (Plastic)", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "35.82", "stddev": "309.01", "min": "0", "max": "6655", "missing": "0"}, "updatedLabel": "Bottle Caps (Plastic)"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Bottle Caps (Metal)", "transformation": [{"transformation": 
"novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "18.78", "stddev": "307.82", "min": "0", "max": "6862", "missing": "0"}, "updatedLabel": "Bottle Caps (Metal)"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Lids (Plastic)", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "5.41", "stddev": "21.81", "min": "0", "max": "362", "missing": "0"}, "updatedLabel": "Lids (Plastic)"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Straws, Stirrers", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "1.95", "stddev": "3.3", "min": "0", "max": "30", "missing": "0"}, "updatedLabel": "Straws, Stirrers"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Forks, Knives, Spoons", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "4.2", "stddev": "16.57", "min": "0", "max": "228", "missing": "0"}, "updatedLabel": "Forks, Knives, Spoons"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Beverage Bottles (Plastic)", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "22.37", "stddev": "95.6", "min": "0", "max": "1402", "missing": "0"}, "updatedLabel": "Beverage Bottles (Plastic..."}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Beverage Bottles (Glass)", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": 
{"count": "500", "mean": "1.57", "stddev": "4.43", "min": "0", "max": "54", "missing": "0"}, "updatedLabel": "Beverage Bottles (Glass)"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Beverage Cans", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "1.51", "stddev": "2.73", "min": "0", "max": "21", "missing": "0"}, "updatedLabel": "Beverage Cans"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Grocery Bags (Plastic)", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "1.58", "stddev": "2.68", "min": "0", "max": "30", "missing": "0"}, "updatedLabel": "Grocery Bags (Plastic)"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Other Plastic Bags", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.43", "stddev": "2.26", "min": "0", "max": "27", "missing": "0"}, "updatedLabel": "Other Plastic Bags"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Paper Bags", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.26", "stddev": "1.39", "min": "0", "max": "20", "missing": "0"}, "updatedLabel": "Paper Bags"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Cups, Plates (Paper)", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "3.19", "stddev": "26.32", "min": "0", "max": "550", "missing": "0"}, "updatedLabel": "Cups, Plates (Paper)"}, 
{"transformationsData": [{"transformation_label": "novalue"}], "feature": "Cups, Plates (Plastic)", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "5.83", "stddev": "23.6", "min": "0", "max": "363", "missing": "0"}, "updatedLabel": "Cups, Plates (Plastic)"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Cups, Plates (Foam)", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "3.84", "stddev": "25.18", "min": "0", "max": "469", "missing": "0"}, "updatedLabel": "Cups, Plates (Foam)"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Fishing Buoys, Pots & Traps", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.97", "stddev": "7.13", "min": "0", "max": "117", "missing": "0"}, "updatedLabel": "Fishing Buoys, Pots & Tra..."}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Fishing Net & Pieces", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.1", "stddev": "1.07", "min": "0", "max": "20", "missing": "0"}, "updatedLabel": "Fishing Net & Pieces"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Fishing Line (1 yard/meter = 1 piece)", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.85", "stddev": "10.03", "min": "0", "max": "164", "missing": "0"}, "updatedLabel": "Fishing Line (1 yard/mete..."}, {"transformationsData": [{"transformation_label": "novalue"}], 
"feature": "Rope (1 yard/meter = 1 piece)", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "2.34", "stddev": "15.39", "min": "0", "max": "264", "missing": "0"}, "updatedLabel": "Rope (1 yard/meter = 1 pi..."}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Fishing Gear (Clean Swell)", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "1.71", "stddev": "5.5", "min": "0", "max": "79", "missing": "0"}, "updatedLabel": "Fishing Gear (Clean Swell..."}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "6-Pack Holders", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.38", "stddev": "2.79", "min": "0", "max": "54", "missing": "0"}, "updatedLabel": "6-Pack Holders"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Other Plastic/Foam Packaging", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.57", "stddev": "3.22", "min": "0", "max": "39", "missing": "0"}, "updatedLabel": "Other Plastic/Foam Packag..."}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Other Plastic Bottles (oil, bleach, etc#)", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.06", "stddev": "0.51", "min": "0", "max": "7", "missing": "0"}, "updatedLabel": "Other Plastic Bottles (oi..."}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Strapping Bands", "transformation": 
[{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.09", "stddev": "0.7", "min": "0", "max": "11", "missing": "0"}, "updatedLabel": "Strapping Bands"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Tobacco Packaging/Wrap", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "3.56", "stddev": "28.51", "min": "0", "max": "528", "missing": "0"}, "updatedLabel": "Tobacco Packaging/Wrap"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Other Packaging (Clean Swell)", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "1.94", "stddev": "4.95", "min": "0", "max": "41", "missing": "0"}, "updatedLabel": "Other Packaging (Clean Sw..."}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Appliances (refrigerators, washers, etc#)", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.18", "stddev": "1.26", "min": "0", "max": "21", "missing": "0"}, "updatedLabel": "Appliances (refrigerators..."}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Balloons", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "1.67", "stddev": "9.5", "min": "0", "max": "175", "missing": "0"}, "updatedLabel": "Balloons"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Cigar Tips", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", 
"selected": "True", "stats": {"count": "500", "mean": "3.83", "stddev": "29.97", "min": "0", "max": "558", "missing": "0"}, "updatedLabel": "Cigar Tips"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Cigarette Lighters", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.06", "stddev": "0.52", "min": "0", "max": "7", "missing": "0"}, "updatedLabel": "Cigarette Lighters"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Construction Materials", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.22", "stddev": "1.44", "min": "0", "max": "20", "missing": "0"}, "updatedLabel": "Construction Materials"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Fireworks", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.31", "stddev": "4.36", "min": "0", "max": "95", "missing": "0"}, "updatedLabel": "Fireworks"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Tires", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "2.11", "stddev": "20.27", "min": "0", "max": "351", "missing": "0"}, "updatedLabel": "Tires"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Toys", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "1.78", "stddev": "2.75", "min": "0", "max": "22", "missing": "0"}, "updatedLabel": "Toys"}, {"transformationsData": 
[{"transformation_label": "novalue"}], "feature": "Other Trash (Clean Swell)", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "5.4", "stddev": "11.12", "min": "0", "max": "72", "missing": "0"}, "updatedLabel": "Other Trash (Clean Swell)"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Condoms", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.05", "stddev": "0.69", "min": "0", "max": "15", "missing": "0"}, "updatedLabel": "Condoms"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Diapers", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.24", "stddev": "1.55", "min": "0", "max": "21", "missing": "0"}, "updatedLabel": "Diapers"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Syringes", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.02", "stddev": "0.21", "min": "0", "max": "4", "missing": "0"}, "updatedLabel": "Syringes"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Tampons/Tampon Applicators", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.05", "stddev": "0.44", "min": "0", "max": "6", "missing": "0"}, "updatedLabel": "Tampons/Tampon Applicator..."}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Personal Hygiene (Clean Swell)", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": 
"numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.75", "stddev": "2.06", "min": "0", "max": "23", "missing": "0"}, "updatedLabel": "Personal Hygiene (Clean S..."}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Foam Pieces", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "23.29", "stddev": "122.25", "min": "0", "max": "1424", "missing": "0"}, "updatedLabel": "Foam Pieces"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Glass Pieces", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.78", "stddev": "6.11", "min": "0", "max": "118", "missing": "0"}, "updatedLabel": "Glass Pieces"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Plastic Pieces", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "54.33", "stddev": "236.47", "min": "0", "max": "3968", "missing": "0"}, "updatedLabel": "Plastic Pieces"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Total Items Collected", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "49.43", "stddev": "53.12", "min": "0", "max": "420", "missing": "0"}, "updatedLabel": "Total Items Collected"}]}))

	#transformationPostExecutionHook(ociancleanupautofe)

# Any failure is only logged; in that case `ociancleanupautofe` is never
# assigned, because the assignment sits inside the try body.
except Exception as ex: 
	logging.error(ex)


***TRAIN MODEL***

In [None]:
# %run ociancleanupHooks.ipynb
try:
    # mlPreExecutionHook()

    # Fit the AutoML regressor: the list names the model inputs and the final
    # argument ("ID") is the label column handed to functionRegression.
    dataAutoML = functionRegression(
        ociancleanupautofe,
        [
            "GPS_Lat ",
            "GPS_Long",
            "Adults",
            "Children",
            "People",
            "Pounds",
            "Miles",
            "# of bags",
            "Cigarette Butts",
            "Food Wrappers (candy, chips, etc#)",
            "Take Out/Away Containers (Plastic)",
            "Take Out/Away Containers (Foam)",
            "Bottle Caps (Plastic)",
            "Bottle Caps (Metal)",
            "Lids (Plastic)",
            "Straws, Stirrers",
            "Forks, Knives, Spoons",
            "Beverage Bottles (Plastic)",
            "Beverage Bottles (Glass)",
            "Beverage Cans",
            "Grocery Bags (Plastic)",
            "Other Plastic Bags",
            "Paper Bags",
            "Cups, Plates (Paper)",
            "Cups, Plates (Plastic)",
            "Cups, Plates (Foam)",
            "Fishing Buoys, Pots & Traps",
            "Fishing Net & Pieces",
            "Fishing Line (1 yard/meter = 1 piece)",
            "Rope (1 yard/meter = 1 piece)",
            "Fishing Gear (Clean Swell)",
            "6-Pack Holders",
            "Other Plastic/Foam Packaging",
            "Other Plastic Bottles (oil, bleach, etc#)",
            "Strapping Bands",
            "Tobacco Packaging/Wrap",
            "Other Packaging (Clean Swell)",
            "Appliances (refrigerators, washers, etc#)",
            "Balloons",
            "Cigar Tips",
            "Cigarette Lighters",
            "Construction Materials",
            "Fireworks",
            "Tires",
            "Toys",
            "Other Trash (Clean Swell)",
            "Condoms",
            "Diapers",
            "Syringes",
            "Tampons/Tampon Applicators",
            "Personal Hygiene (Clean Swell)",
            "Foam Pieces",
            "Glass Pieces",
            "Plastic Pieces",
            "Total Items Collected",
            "Zone_stringindexer",
            "State_stringindexer",
            "Country_stringindexer",
            "Cleanup Type_stringindexer",
            "Cleanup Date_stringindexer",
            "Group Name_stringindexer",
        ],
        "ID",
    )

    # mlPostExecutionHook(dataAutoML)
except Exception as err:
    # Failures are logged only; `dataAutoML` stays undefined when training fails.
    logging.error(err)
# spark.stop()


***PREDICT ON TRAINED MODEL***

In [None]:
import pandas as pd
import numpy as np
import sklearn.metrics

# Score the trained model on the held-out split and show the predictions next
# to the true label values. Any failure is logged and the cell carries on to
# stop the Spark session.
try:
    model = dataAutoML['model']
    X_test = dataAutoML['X_test']
    y_test = dataAutoML['y_test']
    label = dataAutoML['label']
    # Build a new list instead of editing dataAutoML['columnNames'] in place.
    # The in-place remove/insert left the label and prediction names inside the
    # shared list, so running this cell a second time handed pandas more column
    # names than X_test has columns and the DataFrame construction failed.
    columnNames = [name for name in dataAutoML['columnNames'] if name != label]
    predicted = label + "_predicted"
    y_predicted = model.predict(X_test)

    df = pd.DataFrame(X_test, columns=columnNames)
    df[label] = y_test
    df[predicted] = y_predicted
    # Show the true value and the prediction first, followed by the features.
    columnNames = [label, predicted] + columnNames
    df = df[columnNames]

    # All metrics are rounded to one decimal place for display.
    R2 = np.round(sklearn.metrics.r2_score(y_test, y_predicted), 1)
    Mean_Squared_Error = np.round(sklearn.metrics.mean_squared_error(y_test, y_predicted), 1)
    Mean_Absolute_Error = np.round(sklearn.metrics.mean_absolute_error(y_test, y_predicted), 1)
    display(" R2 score of Prediction on test data    : %s"%R2)
    display(" Mean Squared Error of Prediction on test data    : %s"%Mean_Squared_Error)
    display(" Mean Absolute Error of Prediction on test data   : %s"%Mean_Absolute_Error)
    display(df.head())
except Exception as ex:
    logging.error(ex)

# Runs whether or not the prediction step succeeded.
spark.stop()

