***GENERATED CODE FOR gettingfile PIPELINE.***

***DON'T EDIT THIS CODE.***

***CONNECTOR FUNCTIONS TO READ DATA.***

In [None]:
import os
import datetime
import logging
import warnings
# Suppress every Python warning (all categories) from this point on.
warnings.filterwarnings('ignore')
# Configure the root logger: INFO and above, rendered as "LEVEL:message".
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)


class HDFSConnector:
    """Read a CSV file from HDFS into Spark and write a DataFrame out as CSV.

    Both methods receive ``config`` as the *string form* of a Python dict
    literal.  They are static, so they can be called on the class
    (``HDFSConnector.fetch(spark, config)``) or on an instance.
    """

    @staticmethod
    def fetch(spark, config):
        """Load the CSV named by ``config['url']`` from HDFS.

        Args:
            spark: Spark session used for reading.
            config: String that evaluates to a dict containing at least ``'url'``
                (the path appended after ``hdfs://<host>:<port>``).

        Returns:
            The DataFrame read with a header row and schema inference enabled.

        Raises:
            KeyError: If ``HDFS_SERVER`` / ``HDFS_PORT`` are missing from the
                environment, or ``'url'`` is missing from the config.
            ValueError: If ``HDFS_PORT`` is not an integer.
        """
        ################### INPUT HADOOP HOST PORT TO CONNECT WITH ###############################
        hdfs_server = str(os.environ['HDFS_SERVER'])
        hdfs_port = int(os.environ['HDFS_PORT'])
        # NOTE(security): eval() executes arbitrary code contained in `config`;
        # only pass configuration strings that come from a trusted source.
        settings = eval(config)
        df = spark.read.options(header='true', inferschema='true').csv(
            f"hdfs://{hdfs_server}:{hdfs_port}{settings['url']}", header='true')
        # `display` is not defined in this file; presumably it is supplied by
        # the notebook environment -- TODO confirm.
        display(df.limit(2).toPandas())
        return df

    @staticmethod
    def put(df, spark, config):
        """Write ``df`` as CSV to a timestamp-prefixed path.

        Args:
            df: DataFrame to write.
            spark: Unused; kept for a uniform connector signature.
            config: String that evaluates to a dict with ``'is_header'``
                (a header is written only when it equals ``"Use Header Line"``),
                ``'delimiter'`` and ``'url'``.

        Returns:
            Whatever the writer's ``save`` call returns.
        """
        # NOTE(security): eval() executes arbitrary code contained in `config`;
        # only pass configuration strings that come from a trusted source.
        settings = eval(config)
        write_header = 'true' if settings["is_header"] == "Use Header Line" else 'false'
        stamp = datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S") + "_"
        # NOTE(review): this yields "<timestamp>_ <url>" with a space between
        # the two parts; kept exactly as generated -- confirm it is intended.
        target = ("%s %s") % (stamp, settings['url'])
        writer = df.write.format('csv').options(header=write_header,
                                                delimiter=settings["delimiter"])
        return writer.save(target)


***TRANSFORMATION FUNCTIONS THAT WILL BE APPLIED TO THE DATA***

In [None]:
from pyspark.sql.functions import dayofmonth, month, year, col
import json
from pyspark.ml.feature import Binarizer
from pyspark.sql.functions import round
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col, when
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import mean, stddev, min, max, col


class CleanseData:
    """Replace missing values in Spark DataFrame columns with a per-column fill value.

    Every ``replaceBy*`` method takes ``feature`` (an object exposing the
    column name as ``feature.name``, e.g. an entry of ``df.schema.fields``)
    together with the DataFrame, and returns a new DataFrame in which nulls
    and single-space (``" "``) cells of that column are replaced.
    """

    def cleanValueForFE(self, value):
        """Normalise a computed fill value.

        Returns ``""`` for ``None``, the string ``"nan"`` when ``str(value)``
        equals ``'nan'``, and ``value`` unchanged otherwise.
        """
        if value is None:
            return ""
        if str(value) == 'nan':
            return "nan"
        return value

    def _statOverCompleteRows(self, feature, df, aggregate, alias):
        """Apply ``aggregate`` to the column and return the cleaned result."""
        # dropna() without arguments discards a row when *any* column is null,
        # so the statistic is computed over fully populated rows only.
        row = df.dropna().select(
            aggregate(col(feature.name)).alias(alias)).collect()[0]
        return self.cleanValueForFE(row[alias])

    def _fillColumn(self, feature, df, naValue, blankValue):
        """Fill nulls with ``naValue`` and single-space cells with ``blankValue``."""
        df = df.fillna(naValue, subset=[feature.name])
        return df.withColumn(
            feature.name,
            when(col(feature.name) == " ", blankValue).otherwise(col(feature.name)))

    def replaceByMean(self, feature, df, mean_=-1):
        """Fill the column with its mean. ``mean_`` is unused."""
        meanValue = self._statOverCompleteRows(feature, df, mean, 'mean')
        # The generated code built this blank-cell replacement but discarded
        # the result.  It is now applied the same way as in the sibling
        # methods; the integer cast that appeared in the discarded statement
        # is deliberately not applied, as it would truncate real values.
        return self._fillColumn(feature, df, meanValue, meanValue)

    def replaceByMax(self, feature, df, max_=-1):
        """Fill the column with its maximum. ``max_`` is unused."""
        maxValue = self._statOverCompleteRows(feature, df, max, 'max')
        return self._fillColumn(feature, df, maxValue, maxValue)

    def replaceByMin(self, feature, df, min_=-1):
        """Fill the column with its minimum. ``min_`` is unused."""
        minValue = self._statOverCompleteRows(feature, df, min, 'min')
        return self._fillColumn(feature, df, minValue, minValue)

    def replaceByStandardDeviation(self, feature, df, stddev_=-1):
        """Fill the column with its standard deviation. ``stddev_`` is unused."""
        stddevValue = self._statOverCompleteRows(feature, df, stddev, 'stddev')
        return self._fillColumn(feature, df, stddevValue, stddevValue)

    def replaceDateRandomly(self, feature, df):
        """Fill the column with the first non-null value found in it.

        Despite the name, no randomness is involved: the value comes from
        ``head(1)`` of the rows where the column is not null.  When the
        column has no non-null value, ``df`` is returned unchanged.
        """
        firstRows = df.where(col(feature.name).isNotNull()).head(1)
        if not firstRows:
            # Nothing to copy from; previously this raised IndexError.
            return df
        fillValue = self.cleanValueForFE(firstRows[0][feature.name])
        # Nulls are filled with the string form, blanks with the raw value.
        return self._fillColumn(feature, df, str(fillValue), fillValue)

    def replaceNullValues(self, fList, df):
        """Apply the configured replacement strategy to each listed feature.

        Args:
            fList: Iterable of dicts with keys ``"feature"`` (column name)
                and ``"replaceby"`` (text containing one of ``mean``, ``max``,
                ``min``, ``stddev`` or ``random``).
            df: DataFrame to clean.

        Returns:
            The cleaned DataFrame (``df`` itself when nothing matched).
        """
        # First strategy whose keyword occurs in "replaceby" wins, in this order.
        strategies = (
            ("mean", self.replaceByMean),
            ("max", self.replaceByMax),
            ("min", self.replaceByMin),
            ("stddev", self.replaceByStandardDeviation),
            ("random", self.replaceDateRandomly),
        )
        # The schema is captured once, before any replacement is applied.
        featuresList = df.schema.fields
        for featureObj in fList:
            for feat in featuresList:
                # NOTE(review): substring test -- a feature called "name" also
                # matches a column such as "reviewsusername".  Kept as
                # generated; confirm whether an exact match is intended.
                if featureObj["feature"] not in feat.name:
                    continue
                replaceBy = featureObj["replaceby"]
                for keyword, strategy in strategies:
                    if keyword in replaceBy:
                        df = strategy(feat, df)
                        break
        return df


def StringIndexerTransform(df, params, transformationData={}):
    """Add a ``<feature>_stringindexer`` column holding the label index of ``feature``.

    Args:
        df: Input DataFrame.
        params: Dict whose ``"feature"`` entry names the column to index.
        transformationData: Unused here (never read or mutated); kept so all
            transform functions share one signature.

    Returns:
        ``df`` with nulls in ``feature`` replaced by ``''`` and the index
        column appended.  The index column is cast to integer when it holds
        at most four distinct values; otherwise it keeps the indexer's type.
    """
    feature = params["feature"]
    outcol = feature + "_stringindexer"

    filled = df.fillna({feature: ''})
    indexer = StringIndexer(
        inputCol=feature, outputCol=outcol, handleInvalid="skip")
    indexed = indexer.fit(filled).transform(filled)

    # Count the distinct indices inside Spark rather than collecting every
    # distinct value to the driver just to take its length.
    if indexed.select(outcol).distinct().count() <= 4:
        return indexed.withColumn(outcol, indexed[outcol].cast(IntegerType()))
    return indexed


def ExtractDateTransform(df, params, transformationData={}):
    """Derive day-of-month, month and year columns from a date column.

    Args:
        df: Input DataFrame.
        params: Dict whose ``'feature'`` entry names the date column.
        transformationData: Unused here; kept so all transform functions
            share one signature.

    Returns:
        ``df`` after ``fillna({feature: ''})``, with the columns
        ``<feature>dayofmonth``, ``<feature>month`` and ``<feature>year``
        appended in that order.
    """
    feature = params['feature']
    extractors = (
        ('dayofmonth', dayofmonth),
        ('month', month),
        ('year', year),
    )
    result = df.fillna({feature: ''})
    for suffix, extract in extractors:
        result = result.withColumn(feature + suffix, extract(col(feature)))
    return result


def BinarizerTransform(df, params, transformationData={}):
    """Binarize a numeric column against a threshold.

    Args:
        df: Input DataFrame.
        params: Dict whose ``'feature'`` entry names the column.
        transformationData: Dict whose ``'threshold'`` entry (converted with
            ``float``) is the binarization threshold.  The default empty
            dict therefore raises ``KeyError``.

    Returns:
        A DataFrame in which ``feature`` has been cast to double (the column
        is re-created via a temporary ``feature_cast`` column), nulls filled
        with ``0.0`` and values rounded to two decimals, plus a new
        ``<feature>_binarizer`` column produced by the Binarizer.
    """
    feature = params['feature']
    outcol = feature + "_binarizer"

    # Cast through a temporary column, then drop the original and rename.
    with_cast = df.withColumn("feature_cast", df[feature].cast("double"))
    as_double = with_cast.drop(feature).withColumnRenamed("feature_cast", feature)
    filled = as_double.fillna({feature: 0.0})

    threshold = float(transformationData['threshold'])
    binarizer = Binarizer(threshold=threshold, inputCol=feature, outputCol=outcol)
    binarized = binarizer.transform(filled)

    # `round` is the pyspark.sql.functions import here, not the builtin.
    return binarized.withColumn(feature, round(binarized[feature], 2))


class TransformationMain:
    """Run the generated feature-engineering pipeline for this dataset.

    The pipeline first fills missing values according to the ``"FE"`` list
    in the config, then applies one transformation per source column and
    drops that source column, in the fixed order given by ``_PLAN``.
    """

    # (source column, transformation label, extra transformationData entries)
    _PLAN = (
        ('id', 'String Indexer', {}),
        ('asins', 'String Indexer', {}),
        ('brand', 'String Indexer', {}),
        ('categories', 'String Indexer', {}),
        ('colors', 'String Indexer', {}),
        ('dateAdded', 'Extract Date', {}),
        ('dateUpdated', 'Extract Date', {}),
        ('dimension', 'String Indexer', {}),
        ('ean', 'Binarizer', {'threshold': 2147483647}),
        ('imageURLs', 'String Indexer', {}),
        ('keys', 'String Indexer', {}),
        ('manufacturer', 'String Indexer', {}),
        ('manufacturerNumber', 'String Indexer', {}),
        ('name', 'String Indexer', {}),
        ('primaryCategories', 'String Indexer', {}),
        ('reviewsdate', 'Extract Date', {}),
        ('reviewsdateSeen', 'String Indexer', {}),
        ('reviewssourceURLs', 'String Indexer', {}),
        ('reviewstext', 'String Indexer', {}),
        ('reviewstitle', 'String Indexer', {}),
        ('reviewsusername', 'String Indexer', {}),
        ('sourceURLs', 'String Indexer', {}),
        ('upc', 'String Indexer', {}),
        ('weight', 'String Indexer', {}),
    )

    # TODO: change df argument in run with following
    @staticmethod
    def run(transformationDF, config):
        """Clean and transform ``transformationDF``.

        Args:
            transformationDF: Source DataFrame.
            config: JSON string whose ``"FE"`` entry is the feature list
                handed to ``CleanseData.replaceNullValues``.

        Returns:
            The transformed DataFrame, with every column listed in ``_PLAN``
            replaced by its derived column(s).

        Note:
            The generator embedded a per-column ``'stats'`` profiling
            snapshot in each ``params`` dict.  The transform functions in
            this file read only ``params['feature']`` and
            ``transformationData['threshold']``, so that snapshot is omitted
            here; all other ``params`` entries are rebuilt as before.
        """
        # label -> (transform function, 'type' entry, 'replaceby' entry).
        # Resolved at call time so the functions only need to exist when
        # run() is invoked.
        transforms = {
            'String Indexer': (StringIndexerTransform, 'string', 'max'),
            'Extract Date': (ExtractDateTransform, 'date', 'random'),
            'Binarizer': (BinarizerTransform, 'real', 'mean'),
        }

        featureData = json.loads(config)["FE"]
        transformationDF = CleanseData().replaceNullValues(
            featureData, transformationDF)

        for feature, label, extra in TransformationMain._PLAN:
            transform, featureType, replaceBy = transforms[label]
            transformationData = {'feature_label': feature, **extra,
                                  'transformation_label': label}
            params = {
                'transformationsData': [dict(transformationData)],
                'feature': feature,
                'type': featureType,
                'selected': 'True',
                'replaceby': replaceBy,
                'transformation': [{'transformation': label,
                                    'selectedAsDefault': 1}],
                'updatedLabel': feature,
            }
            if label == 'Extract Date':
                params['generated'] = 'False'

            transformationDF = transform(
                transformationDF, params, transformationData)
            # The source column is dropped once its derived columns exist.
            transformationDF = transformationDF.drop(feature)

        # `display` is not defined in this file; presumably it is supplied by
        # the notebook environment -- TODO confirm.
        display(transformationDF.limit(2).toPandas())
        return transformationDF


***CONNECTOR FUNCTIONS TO WRITE DATA.***

In [None]:
import datetime
import requests
import datetime
import logging
import warnings
# Suppress every Python warning (all categories) from this point on.
warnings.filterwarnings('ignore')
# Configure the root logger: INFO and above, rendered as "LEVEL:message".
# Per the logging docs, basicConfig() does nothing when the root logger
# already has handlers configured.
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)


class NumtraConnector:
    """Report a pipeline result to the platform's results endpoint."""

    # Every key the generated code read from the config, in the order it
    # read them; a missing key raises KeyError exactly as before.
    _REQUIRED_KEYS = (
        'server_url', 'baseType', 'results_url', 'server', 'orignalKey',
        'pathOnly', 'filename', 'ser', 'user', 'password', 'authSource',
        'user_id', 'parent_id', 'project_id',
    )

    @staticmethod
    def put(inStages, inStagesData, stageId, spark, config):
        """POST the result descriptor to ``config['results_url']``.

        Args:
            inStages: Sequence of upstream stage ids; only the first is used.
            inStagesData: Mapping from stage id to that stage's data; the
                first stage's entry must exist but is not otherwise used.
            stageId: Unused.
            spark: Unused.
            config: String that evaluates to a dict containing every key in
                ``_REQUIRED_KEYS``.

        Returns:
            The response object returned by ``requests.post``.

        Raises:
            KeyError: If a required config key is missing, or the first
                stage id is absent from ``inStagesData``.
            IndexError: If ``inStages`` is empty.
        """
        # NOTE(security): eval() executes arbitrary code contained in `config`;
        # only pass configuration strings that come from a trusted source.
        settings = eval(config)
        for key in NumtraConnector._REQUIRED_KEYS:
            if key not in settings:
                raise KeyError(key)

        path = settings['server_url']
        baseType = settings['baseType']
        results_url = settings['results_url']
        server = settings['server']
        originalfile = settings['orignalKey']
        filename = settings['filename']
        # Whole-second Unix timestamp, prefixed to the reported file name.
        timestamp = str(int(datetime.datetime.now().timestamp()))

        # The value is not used; the lookup only fails fast when the first
        # upstream stage has no entry.
        _ = inStagesData[inStages[0]]

        for value in (path, baseType, results_url, server, originalfile, filename):
            print(value)

        args = {
            'url': path,
            'baseType': baseType,
            'originalfile': originalfile,
            'filename': timestamp + filename,
        }

        # Second positional argument of requests.post is the form-encoded body.
        # NOTE(review): no timeout is set, so this call can block indefinitely.
        return requests.post(results_url, args)


***READING DATAFRAME***

In [None]:
############## CREATE SPARK SESSION ############################ ENTER YOUR SPARK MASTER IP AND PORT TO CONNECT TO SERVER ################
from pyspark.sql import SparkSession
# Local Spark master with a single worker thread ('local[1]').
spark = SparkSession.builder.master('local[1]').getOrCreate()
#%run gettingfileHooks.ipynb
try:
	#sourcePreExecutionHook()

	# Read the source CSV; the config is passed as the string form of a dict
	# literal, of which HDFSConnector.fetch uses the 'url' entry.
	electronicsproductdatanewupdate = HDFSConnector.fetch(spark, "{'url': '/FileStore/platform/uploadedSourceFiles/ElectronicsProductDataNewUpdate.csv', 'filename': 'ElectronicsProductDataNewUpdate.csv', 'delimiter': ',', 'file_type': 'Delimeted', 'dbfs_token': '', 'dbfs_domain': '', 'is_header': 'Use Header Line', 'server_url': '/numtraPlatform/NumtraPlatformV3/uploads/platform/', 'results_url': 'http://ml.colaberry.com:44040/api/read/hdfs'}")

except Exception as ex: 
	# The failure is only logged (message, no traceback). In that case
	# `electronicsproductdatanewupdate` is never assigned here, so code that
	# uses it afterwards will raise NameError.
	logging.error(ex)
#spark.stop()


***TRANSFORMING DATAFRAME***

In [None]:
#%run gettingfileHooks.ipynb
try:
	#transformationPreExecutionHook()

	autofe = TransformationMain.run(electronicsproductdatanewupdate,json.dumps( {"FE": [{"transformationsData": [{"feature_label": "id", "transformation_label": "String Indexer"}], "feature": "id", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "5839", "mean": "", "stddev": "", "min": "AV13V_i2glJLPUi8PFgb", "max": "AWIm0C3TYSSHbkXwx3S6", "missing": "0", "distinct": "45"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "id"}, {"transformationsData": [{"feature_label": "asins", "transformation_label": "String Indexer"}], "feature": "asins", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "5839", "mean": "", "stddev": "", "min": "B00009WCBT", "max": "B075WKS4D8", "missing": "0", "distinct": "45"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "asins"}, {"transformationsData": [{"feature_label": "brand", "transformation_label": "String Indexer"}], "feature": "brand", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "5839", "mean": "", "stddev": "", "min": "Alpine", "max": "Yamaha", "missing": "0", "distinct": "35"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "brand"}, {"transformationsData": [{"feature_label": "categories", "transformation_label": "String Indexer"}], "feature": "categories", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "5839", "mean": "", "stddev": "", "min": "Audio & Video Accessories,TV Mounts,TV Accessories & Parts,Electronics,A/V Presentation,Accessories & Supplies,TV Ceiling & Wall Mounts", "max": "stone products,electronics,Parts & Accessories,brick manufacturing,landmark stone,Car Electronics,brick manufacturing process,eBay Motors,natural stone,brick designs,Digital Media Receivers,Car Stereo Receivers,Vehicle Electronics & GPS,brick,Electronics Features,Video In-Dash Units 
w/o GPS,Car Video,thin brick,Car Audio In-Dash Units,Car Audio,Consumer Electronics,brick sizes,glen gery,manufactured stone,Car Video Units W/out GPS/Nav,Apple CarPlay Receivers,In-Dash with GPS,brick colors,Car Electronics & GPS", "missing": "0", "distinct": "44"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "categories"}, {"transformationsData": [{"feature_label": "colors", "transformation_label": "String Indexer"}], "feature": "colors", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "5839", "mean": "", "stddev": "", "min": "Black", "max": "White", "missing": "0", "distinct": "16"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "colors"}, {"transformationsData": [{"feature_label": "dateAdded", "transformation_label": "Extract Date"}], "feature": "dateAdded", "type": "date", "selected": "True", "replaceby": "random", "stats": {"count": "", "mean": "", "stddev": "", "min": "", "max": "", "missing": "0"}, "transformation": [{"transformation": "Extract Date", "selectedAsDefault": 1}], "generated": "False", "updatedLabel": "dateAdded"}, {"transformationsData": [{"feature_label": "dateUpdated", "transformation_label": "Extract Date"}], "feature": "dateUpdated", "type": "date", "selected": "True", "replaceby": "random", "stats": {"count": "", "mean": "", "stddev": "", "min": "", "max": "", "missing": "0"}, "transformation": [{"transformation": "Extract Date", "selectedAsDefault": 1}], "generated": "False", "updatedLabel": "dateUpdated"}, {"transformationsData": [{"feature_label": "dimension", "transformation_label": "String Indexer"}], "feature": "dimension", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "5839", "mean": "", "stddev": "", "min": "0.75 in x 3.25 in x 2 in", "max": "9.2 x 7.6 x 5.2 inches", "missing": "0", "distinct": "30"}, "transformation": [{"transformation": "String Indexer", 
"selectedAsDefault": 1}], "updatedLabel": "dimension"}, {"transformationsData": [{"feature_label": "ean", "threshold": 2147483647, "transformation_label": "Binarizer"}], "feature": "ean", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "5839", "mean": "2147483647.0", "stddev": "0.0", "min": "2.147483647E9", "max": "2.147483647E9", "missing": "0"}, "transformation": [{"transformation": "Binarizer", "selectedAsDefault": 1}], "updatedLabel": "ean"}, {"transformationsData": [{"feature_label": "imageURLs", "transformation_label": "String Indexer"}], "feature": "imageURLs", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "5839", "mean": "", "stddev": "", "min": "http://i.ebayimg.com/00/$T2eC16dHJGQE9noMbUGIBRFRjWkmIQ~~_10.JPG?set_11.JPG?set_id=807,http://static.bhphoto.com/images/multiple_images/thumbnails/1398431703000_IMG_387337.jpg,https://i.ebayimg.com/images/g/M3MAAOSw4GVYPE-5/s-l300.jpg,http://i.ebayimg.com/images/g/M3MAAOSw4GVYPE-5/s-l64.jpg,http://static.bhphoto.com/images/multiple_images/thumbnails/1398431703000_IMG_387335.jpg,http://i.ebayimg.com/images/g/HWwAAOSwrklU~zA3/s-l64.jpg,http://i.ebayimg.com/images/g/POQAAOSweW5U~zA2/s-l64.jpg,http://i.ebayimg.com/images/g/CBwAAOSwEeFU~zAw/s-l64.jpg,http://i.ebayimg.com/images/g/11wAAOSweW5VJsJ2/s-l64.jpg,http://static.bhphoto.com/images/images500x500/samsung_eb_p310siwesta_universal_battery_pack_3100mah_1398432017000_1046376.jpg,http://i.ebayimg.com/images/g/X2gAAOSwFe5X0F9F/s-l64.jpg,http://i.ebayimg.com/images/g/-FEAAOSwqu9U~zAy/s-l64.jpg,http://pisces.bbystatic.com/image2/BestBuy_US/images/products/5573/5573022_sa.jpg,https://images-na.ssl-images-amazon.com/images/I/61sGC6ThWGL._SL1500_.jpg,https://pisces.bbystatic.com/image2/BestBuy_US/images/products/5573/5573022_sa.jpg,http://static.bhphoto.com/images/smallimages/1398432017000_1046376.jpg,http://static.bhphoto.com/images/multiple_images/thumbnails/1398431703000_IMG_387336.jpg,http://i.ebayimg.com/images/g
/JzcAAOSwBahU~zAz/s-l64.jpg,http://static.bhphoto.com/images/multiple_images/thumbnails/1398431703000_IMG_387334.jpg,http://i.ebayimg.com/images/g/2QoAAOSw-kdX0F8q/s-l64.jpg", "max": "https://static.bhphoto.com/images/itemImgPlaceholder.jpg,https://static.bhphoto.com/images/images500x500/jbl_v700nxtwht_everest_elite_700_around_ear_1462290298000_1240681.jpg,https://images-na.ssl-images-amazon.com/images/I/61KvF2V1psL._SL1182_.jpg,https://images-na.ssl-images-amazon.com/images/I/6182Y0cLFzL._SL1216_.jpg,https://images-na.ssl-images-amazon.com/images/I/61BLgM5ZrNL._SL1000_.jpg,https://images-na.ssl-images-amazon.com/images/I/61iRIvyc4UL._SL1043_.jpg,https://images-na.ssl-images-amazon.com/images/I/6145Bk%2BCA7L._SL1500_.jpg,https://images-na.ssl-images-amazon.com/images/I/41I8t3Q1zJL._SL1000_.jpg,https://images-na.ssl-images-amazon.com/images/I/61c9cmVRlLL._SL1345_.jpg,https://images-na.ssl-images-amazon.com/images/I/51l%2BeB2GuSL._SL1000_.jpg,https://pisces.bbystatic.com/image2/BestBuy_US/images/products/4403/4403709_ra.jpg,https://pisces.bbystatic.com/image2/BestBuy_US/images/products/4403/4403709_rd.jpg,https://i5.walmartimages.com/asr/28c0277a-eea5-453f-8224-761b97bb9e24_1.edd2d096d9e81bc238c44aaf0edb88c4.jpeg?odnHeight=450&odnWidth=450&odnBg=FFFFFF,https://i5.walmartimages.com/asr/0d8f3a61-da23-407a-8e53-c49192431cec_1.af197ae7fdc037da8174e56520d8371a.jpeg?odnHeight=450&odnWidth=450&odnBg=FFFFFF,https://i5.walmartimages.com/asr/78b820b7-3241-4516-b505-054e20b05304_1.f32e62172cf6257b3be39510dd959103.jpeg?odnHeight=450&odnWidth=450&odnBg=FFFFFF,http://pisces.bbystatic.com/image2/BestBuy_US/images/products/4403/4403709_ra.jpg,http://pisces.bbystatic.com/image2/BestBuy_US/images/products/4403/4403709_rd.jpg,http://images.frys.com/art/product/box_shots/8808212.box.GIF", "missing": "0", "distinct": "38"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "imageURLs"}, {"transformationsData": [{"feature_label": "keys", 
"transformation_label": "String Indexer"}], "feature": "keys", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "5839", "mean": "", "stddev": "", "min": "0027242896000,sony/mexm100bt,sonymexm100bt160wrmsmarinecdreceiverwithbluetoothblacksiriusxmready/b01ceat9zu,027242896000,sonyindashcddmreceiverbuiltinbluetoothsatelliteradioreadywithdetachablefaceplateblack/5495318,sonyindashcddmreceiverbuiltinbluetoothsatelliteradioreadywithdetachablefaceplateblack/b01ceat9zu", "max": "yamaha40wmicrocomponentsystemblack/yacrx332bl,yamaha40wmicrocomponentsystemblack/2877396,yamaha40wmicrocomponentsystemblack/162508611472,yamaha40wmicrocomponentsystemblack/b005yxxs4i,yamaha/crx332bl,027108939599,crx322cdreceiver/yacrx332bl", "missing": "0", "distinct": "37"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "keys"}, {"transformationsData": [{"feature_label": "manufacturer", "transformation_label": "String Indexer"}], "feature": "manufacturer", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "3768", "mean": "", "stddev": "", "min": "Allround Software", "max": "iHome", "missing": "2071", "distinct": "18"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "manufacturer"}, {"transformationsData": [{"feature_label": "manufacturerNumber", "transformation_label": "String Indexer"}], "feature": "manufacturerNumber", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "5839", "mean": "", "stddev": "", "min": "45W SLIM AC ADAPTER - 88801419", "max": "XPS8920-7529SLV-PUS", "missing": "0", "distinct": "44"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "manufacturerNumber"}, {"transformationsData": [{"feature_label": "name", "transformation_label": "String Indexer"}], "feature": "name", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": 
"5839", "mean": "", "stddev": "", "min": "\"Alpine - 6-1/2\"\" 2-Way Coaxial Car Speakers with Polypropylene Cones (Pair) - Black\"", "max": "iHome Rechargeable Splash Proof Stereo Bluetooth Speaker - Black (IBT33BC)", "missing": "0", "distinct": "37"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "name"}, {"transformationsData": [{"feature_label": "primaryCategories", "transformation_label": "String Indexer"}], "feature": "primaryCategories", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "5839", "mean": "", "stddev": "", "min": "Electronics", "max": "Electronics", "missing": "0", "distinct": "1"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "primaryCategories"}, {"transformationsData": [{"feature_label": "reviewsdate", "transformation_label": "Extract Date"}], "feature": "reviewsdate", "type": "date", "selected": "True", "replaceby": "random", "stats": {"count": "", "mean": "", "stddev": "", "min": "", "max": "", "missing": "57"}, "transformation": [{"transformation": "Extract Date", "selectedAsDefault": 1}], "generated": "False", "updatedLabel": "reviewsdate"}, {"transformationsData": [{"feature_label": "reviewsdateSeen", "transformation_label": "String Indexer"}], "feature": "reviewsdateSeen", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "5839", "mean": "", "stddev": "", "min": "2014-10-22T00:00:00Z", "max": "2018-05-27T00:00:00Z,2018-05-26T00:00:00Z", "missing": "0", "distinct": "811"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "reviewsdateSeen"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "reviewsdoRecommend", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "5839", "mean": "0.72", "stddev": "0.45", 
"min": "0", "max": "1", "missing": "0"}, "updatedLabel": "reviewsdoRecommend"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "reviewsnumHelpful", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "5839", "mean": "0.6", "stddev": "3.22", "min": "0", "max": "128", "missing": "0"}, "updatedLabel": "reviewsnumHelpful"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "reviewsrating", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "5839", "mean": "4.43", "stddev": "0.96", "min": "1", "max": "5", "missing": "0"}, "updatedLabel": "reviewsrating"}, {"transformationsData": [{"feature_label": "reviewssourceURLs", "transformation_label": "String Indexer"}], "feature": "reviewssourceURLs", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "5839", "mean": "", "stddev": "", "min": "http://reviews.bestbuy.com/3545/1416147/reviews.htm?format=embedded", "max": "https://www.walmart.com/reviews/product/51933274", "missing": "0", "distinct": "1962"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "reviewssourceURLs"}, {"transformationsData": [{"feature_label": "reviewstext", "transformation_label": "String Indexer"}], "feature": "reviewstext", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "5839", "mean": "", "stddev": "", "min": "\"...power supply is a little low for an XPS with a 7th gen i7. 
All the specs are great, and video card is more that adequate, but if the XPS series is \"\"built\"\" to allow for upgrades", "max": "yamaha 5.1 speaker sistema sound is very good, all in te box, I buy a little more wire.", "missing": "0", "distinct": "5760"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "reviewstext"}, {"transformationsData": [{"feature_label": "reviewstitle", "transformation_label": "String Indexer"}], "feature": "reviewstitle", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "5839", "mean": "", "stddev": "", "min": " \"\"Whatch DVD\"\" etc. You can also have custom settings like; \"\" \"\"Play Pandora\"\" click your choice and it does all the work.If something doesn't come on right. Like if you don't point it right", "max": "you need to hold it out like the statue of liberty while all the steps happen ...", "missing": "0", "distinct": "4460"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "reviewstitle"}, {"transformationsData": [{"feature_label": "reviewsusername", "transformation_label": "String Indexer"}], "feature": "reviewsusername", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "5839", "mean": "2004474.55", "stddev": "6415011.39", "min": " \"\"Hey", "max": "zznj", "missing": "0", "distinct": "5277"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "reviewsusername"}, {"transformationsData": [{"feature_label": "sourceURLs", "transformation_label": "String Indexer"}], "feature": "sourceURLs", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "5839", "mean": "", "stddev": "", "min": " \"\"Battery high/low\"\"", "max": "weatherworrier", "missing": "0", "distinct": "141"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "sourceURLs"}, {"transformationsData": 
[{"feature_label": "upc", "transformation_label": "String Indexer"}], "feature": "upc", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "5839", "mean": "399675369461.03", "stddev": "367815926707.34", "min": " BluRay Player", "max": "wNg11", "missing": "0", "distinct": "121"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "upc"}, {"transformationsData": [{"feature_label": "weight", "transformation_label": "String Indexer"}], "feature": "weight", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "5839", "mean": "464461852205.3", "stddev": "374729125974.19", "min": " & decided on the 40\"\" HD...PIC is RAZOR", "max": "scapula", "missing": "0", "distinct": "127"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "weight"}]}))

	#transformationPostExecutionHook(autofe)

except Exception as ex: 
	logging.error(ex)


***WRITING DATAFRAME***

In [None]:
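#%run gettingfileHooks.ipynb
# NOTE(review): the NumtraConnector class defined in this file only provides
# `put(inStages, inStagesData, stageId, spark, config)`. The `fetch` call below
# therefore raises AttributeError, which the except clause merely logs, so this
# sink stage never runs - confirm which connector call was intended.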
try:
	#sinkPreExecutionHook()

	amazonandbestbuyelectronicsoutput = NumtraConnector.fetch(spark, "{'samplefile': '/FileStore/platform/sampleData/65d57f872731429b05e8d16e/part-00000-feb0e2f3-62d3-44bc-bab7-260176a6a7bb-c000.csv', 'samplecount': 734, 'originalcount': 7299, 'orignalKey': '/FileStore/platform/uploadedSourceFiles/ElectronicsProductDataNewUpdate.csv', 'pathOnly': '/Amazon and Best Buy Electronics', 'project_id': '65d42b557d79ea9151569822', 'parent_id': '65d42b557d79ea9151569822', 'original_schema': [{'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2aa', 'field': 'reviewsdoRecommend', 'alias': 'reviewsdoRecommend', 'type': 'numeric', 'position': '0'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2ab', 'field': 'reviewsnumHelpful', 'alias': 'reviewsnumHelpful', 'type': 'numeric', 'position': '1'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2ac', 'field': 'reviewsrating', 'alias': 'reviewsrating', 'type': 'numeric', 'position': '2'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2ad', 'field': 'id_stringindexer', 'alias': 'id_stringindexer', 'type': 'real', 'position': '3'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2ae', 'field': 'asins_stringindexer', 'alias': 'asins_stringindexer', 'type': 'numeric', 'position': '4'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2af', 'field': 'brand_stringindexer', 'alias': 'brand_stringindexer', 'type': 'numeric', 'position': '5'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': 
'65d584c62731429b05e8d2b0', 'field': 'categories_stringindexer', 'alias': 'categories_stringindexer', 'type': 'real', 'position': '6'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2b1', 'field': 'colors_stringindexer', 'alias': 'colors_stringindexer', 'type': 'numeric', 'position': '7'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2b2', 'field': 'dateAdded_dayofmonth', 'alias': 'dateAdded_dayofmonth', 'type': 'numeric', 'position': '8', 'generated': 'true'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2b3', 'field': 'dateAdded_month', 'alias': 'dateAdded_month', 'type': 'numeric', 'position': '9', 'generated': 'true'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2b4', 'field': 'dateAdded_year', 'alias': 'dateAdded_year', 'type': 'numeric', 'position': '10', 'generated': 'true'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2b5', 'field': 'dateUpdated_dayofmonth', 'alias': 'dateUpdated_dayofmonth', 'type': 'numeric', 'position': '11', 'generated': 'true'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2b6', 'field': 'dateUpdated_month', 'alias': 'dateUpdated_month', 'type': 'numeric', 'position': '12', 'generated': 'true'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2b7', 'field': 'dateUpdated_year', 'alias': 'dateUpdated_year', 'type': 'numeric', 'position': '13', 'generated': 'true'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', 
'_id': '65d584c62731429b05e8d2b8', 'field': 'dimension_stringindexer', 'alias': 'dimension_stringindexer', 'type': 'numeric', 'position': '14'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2b9', 'field': 'ean_binarizer', 'alias': 'ean_binarizer', 'type': 'real', 'position': '15'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2ba', 'field': 'imageURLs_stringindexer', 'alias': 'imageURLs_stringindexer', 'type': 'numeric', 'position': '16'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2bb', 'field': 'keys_stringindexer', 'alias': 'keys_stringindexer', 'type': 'real', 'position': '17'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2bc', 'field': 'manufacturer_stringindexer', 'alias': 'manufacturer_stringindexer', 'type': 'real', 'position': '18'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2bd', 'field': 'manufacturerNumber_stringindexer', 'alias': 'manufacturerNumber_stringindexer', 'type': 'numeric', 'position': '19'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2be', 'field': 'name_stringindexer', 'alias': 'name_stringindexer', 'type': 'real', 'position': '20'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2bf', 'field': 'primaryCategories_stringindexer', 'alias': 'primaryCategories_stringindexer', 'type': 'numeric', 'position': '21'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2c0', 'field': 
'reviewsdate_dayofmonth', 'alias': 'reviewsdate_dayofmonth', 'type': 'numeric', 'position': '22', 'generated': 'true'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2c1', 'field': 'reviewsdate_month', 'alias': 'reviewsdate_month', 'type': 'numeric', 'position': '23', 'generated': 'true'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2c2', 'field': 'reviewsdate_year', 'alias': 'reviewsdate_year', 'type': 'numeric', 'position': '24', 'generated': 'true'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2c3', 'field': 'reviewsdateSeen_stringindexer', 'alias': 'reviewsdateSeen_stringindexer', 'type': 'real', 'position': '25'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2c4', 'field': 'reviewssourceURLs_stringindexer', 'alias': 'reviewssourceURLs_stringindexer', 'type': 'real', 'position': '26'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2c5', 'field': 'reviewstext_stringindexer', 'alias': 'reviewstext_stringindexer', 'type': 'real', 'position': '27'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2c6', 'field': 'reviewstitle_stringindexer', 'alias': 'reviewstitle_stringindexer', 'type': 'real', 'position': '28'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2c7', 'field': 'reviewsusername_stringindexer', 'alias': 'reviewsusername_stringindexer', 'type': 'real', 'position': '29'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': 
'65d584c62731429b05e8d2c8', 'field': 'sourceURLs_stringindexer', 'alias': 'sourceURLs_stringindexer', 'type': 'real', 'position': '30'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2c9', 'field': 'upc_stringindexer', 'alias': 'upc_stringindexer', 'type': 'real', 'position': '31'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2ca', 'field': 'weight_stringindexer', 'alias': 'weight_stringindexer', 'type': 'real', 'position': '32'}], 'actual_schema': [{'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d28f', 'field': 'id', 'alias': 'id', 'type': 'String', 'position': '0'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d290', 'field': 'asins', 'alias': 'asins', 'type': 'String', 'position': '1'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d291', 'field': 'brand', 'alias': 'brand', 'type': 'String', 'position': '2'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d292', 'field': 'categories', 'alias': 'categories', 'type': 'String', 'position': '3'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d293', 'field': 'colors', 'alias': 'colors', 'type': 'String', 'position': '4'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d294', 'field': 'dateAdded', 'alias': 'dateAdded', 'type': 'numeric', 'position': '5'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': 
'65d584c62731429b05e8d295', 'field': 'dateUpdated', 'alias': 'dateUpdated', 'type': 'numeric', 'position': '6'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d296', 'field': 'dimension', 'alias': 'dimension', 'type': 'String', 'position': '7'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d297', 'field': 'ean', 'alias': 'ean', 'type': 'real', 'position': '8'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d298', 'field': 'imageURLs', 'alias': 'imageURLs', 'type': 'String', 'position': '9'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d299', 'field': 'keys', 'alias': 'keys', 'type': 'String', 'position': '10'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d29a', 'field': 'manufacturer', 'alias': 'manufacturer', 'type': 'String', 'position': '11'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d29b', 'field': 'manufacturerNumber', 'alias': 'manufacturerNumber', 'type': 'String', 'position': '12'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d29c', 'field': 'name', 'alias': 'name', 'type': 'String', 'position': '13'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d29d', 'field': 'primaryCategories', 'alias': 'primaryCategories', 'type': 'String', 'position': '14'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d29e', 'field': 
'reviewsdate', 'alias': 'reviewsdate', 'type': 'real', 'position': '15'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d29f', 'field': 'reviewsdateSeen', 'alias': 'reviewsdateSeen', 'type': 'String', 'position': '16'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2a0', 'field': 'reviewsdoRecommend', 'alias': 'reviewsdoRecommend', 'type': 'String', 'position': '17'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2a1', 'field': 'reviewsnumHelpful', 'alias': 'reviewsnumHelpful', 'type': 'real', 'position': '18'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2a2', 'field': 'reviewsrating', 'alias': 'reviewsrating', 'type': 'real', 'position': '19'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2a3', 'field': 'reviewssourceURLs', 'alias': 'reviewssourceURLs', 'type': 'String', 'position': '20'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2a4', 'field': 'reviewstext', 'alias': 'reviewstext', 'type': 'String', 'position': '21'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2a5', 'field': 'reviewstitle', 'alias': 'reviewstitle', 'type': 'String', 'position': '22'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2a6', 'field': 'reviewsusername', 'alias': 'reviewsusername', 'type': 'String', 'position': '23'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': 
'65d584c62731429b05e8d2a7', 'field': 'sourceURLs', 'alias': 'sourceURLs', 'type': 'String', 'position': '24'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2a8', 'field': 'upc', 'alias': 'upc', 'type': 'String', 'position': '25'}, {'checked': True, 'inherited': True, 'is_categorical': False, 'bad_values': '', 'nullable': 'true', '_id': '65d584c62731429b05e8d2a9', 'field': 'weight', 'alias': 'weight', 'type': 'String', 'position': '26'}], 'server': 'https://ml.colaberry.com:443', 'server_url': '/numtraPlatform/NumtraPlatformV3/uploads/platform/', 'delimiter': ',', 'file_type': 'Delimeted', 'filename': 'AmazonFile.csv', 'token': '', 'domain': '', 'is_header': 'Use Header Line', 'url': '/FileStore/platform/uploadedSourceFiles/part-00000-ae43c572-e9a7-4e58-a2a5-57a585e4e046-c000.csv', 'results_url': 'http://ml.colaberry.com:44040/api/read/hdfs'}")

except Exception as ex: 
	logging.error(ex)
