***GENERATED CODE FOR forcastingpropertyvaluesregression PIPELINE.***

***DON'T EDIT THIS CODE.***

***CONNECTOR FUNCTIONS TO READ DATA.***

In [None]:
import os
import datetime
import logging
import warnings
warnings.filterwarnings('ignore')
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)


class HDFSConnector:
    """Helpers that read a CSV from HDFS into Spark and write a DataFrame out as CSV.

    Both entry points are called on the class itself
    (``HDFSConnector.fetch(spark, config)``); ``config`` is a string holding
    a Python dict literal.
    """

    @staticmethod
    def _parse_config(config):
        """Turn the dict-literal ``config`` string into a dict.

        ``ast.literal_eval`` is used instead of ``eval``: it accepts the same
        literal syntax but cannot execute arbitrary expressions that might be
        smuggled into the config string.
        """
        import ast
        return ast.literal_eval(config)

    @staticmethod
    def fetch(spark, config):
        """Read the CSV named by ``config['url']`` from HDFS and return it.

        The HDFS host and port come from the ``HDFS_SERVER`` and ``HDFS_PORT``
        environment variables (``KeyError`` if either is unset). The first two
        rows are shown through the notebook's ``display`` as a preview.
        """
        ################### INPUT HADOOP HOST PORT TO CONNECT WITH ###############################
        hdfs_server = str(os.environ['HDFS_SERVER'])
        hdfs_port = int(os.environ['HDFS_PORT'])
        settings = HDFSConnector._parse_config(config)
        df = spark.read.options(header='true', inferschema='true').csv(
            f"hdfs://{hdfs_server}:{hdfs_port}{settings['url']}", header='true')
        display(df.limit(2).toPandas())
        return df

    @staticmethod
    def put(df, spark, config):
        """Write ``df`` as CSV and return whatever the writer's ``save`` returns.

        The target path is ``"<timestamp>_ <url>"`` (timestamp formatted as
        ``%Y-%m-%d %H.%M.%S``). ``spark`` is accepted for signature parity
        with ``fetch`` but is not used.
        """
        settings = HDFSConnector._parse_config(config)
        header_flag = 'true' if settings["is_header"] == "Use Header Line" else 'false'
        stamp = datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S") + "_"
        target = ("%s %s") % (stamp, settings['url'])
        return df.write.format('csv').options(
            header=header_flag, delimiter=settings["delimiter"]).save(target)


***TRANSFORMATION FUNCTIONS THAT WILL BE APPLIED TO THE DATA***

In [None]:
import json
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col, when
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import mean, stddev, min, max, col


class CleanseData:
    """Imputation helpers that fill null / blank cells of Spark DataFrame columns.

    The ``replaceBy*`` methods take a schema field (anything with a ``name``
    attribute) plus a DataFrame and return a new DataFrame; the input
    DataFrame is never modified in place.
    """

    def cleanValueForFE(self, value):
        """Normalise an aggregate result so it can be used as a fill value.

        Returns ``""`` for ``None``, the string ``"nan"`` for any value whose
        ``str()`` is ``"nan"``, and the value unchanged otherwise.
        """
        if value is None:
            return ""
        if str(value) == 'nan':
            return "nan"
        return value

    def _replaceByAggregate(self, feature, df, aggregate, alias):
        """Fill nulls and single-space cells of one column with an aggregate.

        ``aggregate`` is a column function (``mean``, ``max``, ...) evaluated
        over ``df.dropna()``, i.e. only over rows that have no null in any
        column. The cleaned result replaces nulls (``fillna``) and cells equal
        to ``" "`` in ``feature.name``.
        """
        completeRows = df.dropna()
        fillValue = self.cleanValueForFE(completeRows.select(
            aggregate(col(feature.name)).alias(alias)).collect()[0][alias])
        df = df.fillna(fillValue, subset=[feature.name])
        # DataFrames are immutable: the withColumn result must be returned,
        # otherwise the " " replacement is silently lost.
        return df.withColumn(
            feature.name,
            when(col(feature.name) == " ", fillValue).otherwise(col(feature.name)))

    def replaceByMean(self, feature, df, mean_=-1):
        """Fill missing values of ``feature`` with the column mean.

        ``mean_`` is unused and kept only for signature compatibility.
        """
        return self._replaceByAggregate(feature, df, mean, 'mean')

    def replaceByMax(self, feature, df, max_=-1):
        """Fill missing values of ``feature`` with the column maximum.

        ``max_`` is unused and kept only for signature compatibility.
        """
        return self._replaceByAggregate(feature, df, max, 'max')

    def replaceByMin(self, feature, df, min_=-1):
        """Fill missing values of ``feature`` with the column minimum.

        ``min_`` is unused and kept only for signature compatibility.
        """
        return self._replaceByAggregate(feature, df, min, 'min')

    def replaceByStandardDeviation(self, feature, df, stddev_=-1):
        """Fill missing values of ``feature`` with the column standard deviation.

        ``stddev_`` is unused and kept only for signature compatibility.
        """
        return self._replaceByAggregate(feature, df, stddev, 'stddev')

    def replaceDateRandomly(self, feature, df):
        """Fill missing values of ``feature`` with the first non-null value found.

        If the column has no non-null value there is nothing to fill with, so
        ``df`` is returned unchanged instead of raising ``IndexError``.
        """
        firstRows = df.where(col(feature.name).isNotNull()).head(1)
        if not firstRows:
            return df
        fillValue = self.cleanValueForFE(firstRows[0][feature.name])
        df = df.fillna(str(fillValue), subset=[feature.name])
        return df.withColumn(
            feature.name,
            when(col(feature.name) == " ", fillValue).otherwise(col(feature.name)))

    def replaceNullValues(self, fList, df):
        """Apply each feature's configured ``replaceby`` strategy to ``df``.

        ``fList`` is an iterable of dicts with at least ``"feature"`` (column
        name) and ``"replaceby"`` (text containing one of ``mean``, ``max``,
        ``min``, ``stddev`` or ``random``). Features that are not columns of
        ``df`` are skipped. Returns the resulting DataFrame.
        """
        # Exact name lookup: a substring test would also apply e.g. the
        # "Mentor" strategy to "MentorInterviewsCount".
        fieldsByName = {field.name: field for field in df.schema.fields}
        # Checked in this order; the first keyword found in "replaceby" wins.
        strategies = (
            ("mean", self.replaceByMean),
            ("max", self.replaceByMax),
            ("min", self.replaceByMin),
            ("stddev", self.replaceByStandardDeviation),
            ("random", self.replaceDateRandomly),
        )
        for featureObj in fList:
            field = fieldsByName.get(featureObj["feature"])
            if field is None:
                continue
            for keyword, handler in strategies:
                if keyword in featureObj["replaceby"]:
                    df = handler(field, df)
                    break
        return df


def StringIndexerTransform(df, params, transformationData=None):
    """Add a ``<feature>_stringindexer`` column holding StringIndexer labels.

    Args:
        df: Spark DataFrame containing the column named ``params["feature"]``.
        params: dict; only the ``"feature"`` key is read.
        transformationData: unused; accepted for call compatibility.

    Returns:
        A new DataFrame with the indexed column appended. When the indexed
        column has at most 4 distinct values it is cast to ``IntegerType``.
    """
    feature = params["feature"]
    outcol = feature + "_stringindexer"

    # Nulls are replaced by '' before fitting, so they are indexed like any
    # other value.
    filled = df.fillna({feature: ''})
    indexer = StringIndexer(
        inputCol=feature, outputCol=outcol, handleInvalid="skip")
    indexed = indexer.fit(filled).transform(filled)

    # Count on the cluster; collecting every distinct value to the driver
    # just to take len() of the list is unnecessary.
    distinctCount = indexed.select(outcol).distinct().count()
    if distinctCount <= 4:
        return indexed.withColumn(outcol, indexed[outcol].cast(IntegerType()))
    return indexed


class TransformationMain:
    """Entry point that applies the pipeline's feature-engineering steps."""

    # Columns that are string-indexed and then dropped, in the order the
    # steps are applied. Only the column name is passed on: the profiling
    # statistics that used to be embedded per call (sample job descriptions,
    # e-mail addresses, ...) were never read by StringIndexerTransform.
    STRING_INDEXED_FEATURES = (
        'InterviewDate',
        'Job_Title',
        'Job_Description',
        'Recruiter_First_Name',
        'Recruiter_Last_Name',
        'Recruiter_Email_Address',
        'Company_name',
        'SetName',
        'CandidateID',
        'SurveyResponse',
        'InterviewType',
        'Preparationscore',
        'AutoInterviewsCount',
        'AutoInterviewsDuration',
        'MentorInterviewsCount',
        'MentorInterviewsDuration',
        'Mentor',
        'Applicant',
        'Recruiter_InterviewCount',
        'Technical_InterviewCount',
        'InterviewScore',
    )

    # TODO: change df argument in run with following
    @staticmethod
    def run(transformationDF, config):
        """Impute missing values, then string-index and drop the listed columns.

        Args:
            transformationDF: Spark DataFrame to transform.
            config: JSON string whose ``"FE"`` entry is the feature list
                handed to ``CleanseData.replaceNullValues``.

        Returns:
            The transformed DataFrame; each listed source column is replaced
            by its ``<name>_stringindexer`` counterpart. The first two rows
            are shown through the notebook's ``display``.
        """
        configObj = json.loads(config)
        featureData = configObj["FE"]
        transformationDF = CleanseData().replaceNullValues(featureData, transformationDF)
        for feature in TransformationMain.STRING_INDEXED_FEATURES:
            transformationDF = StringIndexerTransform(
                transformationDF,
                {'feature': feature},
                {'feature_label': feature, 'transformation_label': 'String Indexer'})
            # The raw column is no longer needed once its index exists.
            transformationDF = transformationDF.drop(feature)
        display(transformationDF.limit(2).toPandas())
        return transformationDF


***AUTOML FUNCTIONS***

In [None]:
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor
import pyspark


def functionRegression(sparkDF, listOfFeatures, label):
    """Fit a TPOT regressor on a Spark DataFrame converted to pandas.

    Args:
        sparkDF: Spark DataFrame holding the feature columns and ``label``.
        listOfFeatures: column names used as model inputs.
        label: name of the target column.

    Returns:
        dict with the fitted ``model``, the held-out ``X_test`` / ``y_test``
        arrays, the ``label`` name and ``columnNames`` (``listOfFeatures``).
    """
    sparkDF.persist(pyspark.StorageLevel.MEMORY_AND_DISK)
    pandasFrame = sparkDF.toPandas()

    # Drop the target first so it can never be selected as a feature.
    withoutLabel = pandasFrame.drop(label, axis=1)
    featureMatrix = withoutLabel[listOfFeatures].values
    targetVector = pandasFrame[label].values

    # 10% of the rows are held out for scoring.
    X_train, X_test, y_train, y_test = train_test_split(
        featureMatrix, targetVector, random_state=1, test_size=0.1)

    tpotModel = TPOTRegressor(
        verbosity=3,
        generations=10,
        max_time_mins=5,
        n_jobs=-1,
        random_state=25,
        population_size=15,
        use_dask=True,
    )
    tpotModel.fit(X_train, y_train)

    # Show whatever the fitted model's score() reports on the held-out split.
    heldOutScore = tpotModel.score(X_test, y_test)
    display(" Error rate of Model : %s" % heldOutScore)

    return {
        'model': tpotModel,
        'X_test': X_test,
        'y_test': y_test,
        'label': label,
        'columnNames': listOfFeatures,
    }


***READING DATAFRAME***

In [None]:
############## CREATE SPARK SESSION ############################ ENTER YOUR SPARK MASTER IP AND PORT TO CONNECT TO SERVER ################
from pyspark.sql import SparkSession
# Local single-threaded Spark session shared by the cells below.
spark = SparkSession.builder.master('local[1]').getOrCreate()
#%run forcastingpropertyvaluesregressionHooks.ipynb
try:
    #sourcePreExecutionHook()

    # Load the source CSV from HDFS; the config is a Python dict literal passed as a string.
    colaberryinterviewprep = HDFSConnector.fetch(spark, "{'url': '/FileStore/platform/uploadedSourceFiles/Colaberry Interview Prep.csv', 'filename': 'Colaberry Interview Prep.csv', 'delimiter': ',', 'file_type': 'Delimeted', 'is_header': 'Use Header Line', 'domain': 'http://172.31.59.158', 'port': '40070', 'dirPath': '/FileStore/platform', 'server_url': '/numtraPlatform/NumtraPlatformV3/uploads/platform/'}")
    #sourcePostExecutionHook(colaberryinterviewprep)

except Exception as ex:
    # logging.exception keeps the traceback, which logging.error(ex) drops.
    # NOTE: the failure is still swallowed here, so `colaberryinterviewprep`
    # stays undefined and any later use of it will raise NameError.
    logging.exception(ex)
#spark.stop()


***TRANSFORMING DATAFRAME***

In [None]:
# Apply the generated feature-engineering spec to the source DataFrame.
# TransformationMain.run receives the DataFrame loaded in the previous cell
# and a JSON string whose "FE" list holds one entry per feature: the feature
# name, its declared type, the transformation to apply ("String Indexer" or
# "novalue" in this spec) and a "stats" snapshot stored as strings.
# Any failure is logged and not re-raised, so
# `forcastingpropertyvaluesregressionautofe` stays undefined if the run fails.
# NOTE(review): several "stats" values look like fragments of other columns
# (e.g. CandidateID has min " client focus", and Recruiter_Email_Address has
# a job-description sentence as min). This suggests the CSV rows were split on
# embedded delimiters or newlines - confirm the reader handles quoted
# multi-line fields before trusting these features.
#%run forcastingpropertyvaluesregressionHooks.ipynb
try:
	#transformationPreExecutionHook()

	forcastingpropertyvaluesregressionautofe = TransformationMain.run(colaberryinterviewprep,json.dumps( {"FE": [{"transformationsData": [{"feature_label": "InterviewDate", "transformation_label": "String Indexer"}], "feature": "InterviewDate", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "143", "mean": "", "stddev": "", "min": "10/10/2023", "max": "9/8/2023", "missing": "0", "distinct": "71"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "InterviewDate"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "NoofDays", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "143", "mean": "-58.34", "stddev": "34.75", "min": "-111", "max": "6", "missing": "0"}, "updatedLabel": "NoofDays"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "LogInterviewID", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "143", "mean": "155.01", "stddev": "67.12", "min": "42", "max": "288", "missing": "0"}, "updatedLabel": "LogInterviewID"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "QID", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "143", "mean": "2614.15", "stddev": "228.25", "min": "0", "max": "2745", "missing": "0"}, "updatedLabel": "QID"}, {"transformationsData": [{"feature_label": "Job_Title", "transformation_label": "String Indexer"}], "feature": "Job_Title", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "143", "mean": "", "stddev": "", "min": "Analytics Consultant", "max": "siness intelligence developer", "missing": "0", "distinct": "89"}, "transformation": [{"transformation": 
"String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Job_Title"}, {"transformationsData": [{"feature_label": "Job_Description", "transformation_label": "String Indexer"}], "feature": "Job_Description", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "143", "mean": "", "stddev": "", "min": "\"Description: Leading with our people, Digital Consultants' mission is to deliver the highest level of professional solutions while being a trusted partner and advisor to our customers. With a culture of practicality, opportunity, and creativity we remain dedicated to being honest, trustworthy, respectful, and ethical in everything we do. We are a certified SBA 8(a) small-disadvantaged business that supports multiple IT customers within the Federal, civilian, and private sectors. Digital Consultants also offers our employees growth opportunities, competitive wages, and a full benefits package. Our founding principles, Fairness and Common Sense make working here more than a job, it is the Digital family.  Digital Consultants is seeking a Power BI Reports Developer to join our team! As the Power BI Reports Developer, you will be designing ETL processes and working directly with developers, testers, and other roles active to translate product roadmap features into well-defined product requirements including features, user stories, and acceptance test criteria.  Duties to include:  Maintain and support data analytics platforms Develop and update relevant technical documentation Collaborate directly with analysts and end users to integrate systems and create effective reports Develop and execute database queries and conduct analyses as needed Maintain status updates for all assigned ad hoc report requests Knowledge of SQL queries, SQL Server Reporting Services (SSRS), and MS SQL Server An expert that understands the Power BI platform and its tools very well. 
Takes complete responsibility for the development and administration of BI tools including transforming raw data into valuable and meaningful insights in the form of appealing dashboards and reports. Recreating existing reports from the current platform into Power BI.\"\" Build complex dimensional data models and reports from the bottom up Audit existing Power BI efforts with a focus on improvement Visualize compelling data stories on the report canvas Produce KPIs with user-friendly display dashboards Collaborate with client's data team Familiarity with data warehouse design (e.g. dimensional modeling) and data mining Requirements: Active DOD Secret Clearance IAT level II Certification: GSEC", "max": "ob Description:   BlueCross, as a federal contractor, may be required to implement a COVID-19 vaccine mandate.   Job Responsibilities   Managing the planning, scheduling and deployment of all hardware facilities and associated software for major or multiple sites. Trains, develops, and mentors staff by providing subject matter direction and guidance. Assigns, monitors, and reviews progress and accuracy of work.Provides staff management oversight including hiring, promoting, or making recommendations for staff.Conducts performance reviews and assessments, manages performance, engages in skills assessment, and promotes professional development of staff. Directing, motivating and developing staff, maximizing their individual contribution, professional growth and ability to function effectively with their colleagues as a team. Ensuring adherence to policies, plans, objectives, budgets and staffing allocations for the assigned site. Optimizes the utilization of resources and adheres to Departmental budgeting and fiscal responsibilities. Actively participates in project teams, manages and tracks project details, and provides leadership during periods of transition. Ensures compliance with corporate and industry standards, policies, procedures, and regulations.   
Job Qualifications   Education   Bachelor's Degree in a Computer Sciences related field or equivalent work experience   Experience   5 years - Experience in Information Technology required   Skills\\Certifications   Knowledge of information technology concepts, methodology, terminology and standards. Demonstrated ability to interpret and translate technical and/or or complex concepts into information meaningful to project team members and/or business personnel. Strong interpersonal and organizational skills PC Skills required (Basic Microsoft Office and E-Mail) Adaptive to high pace and changing environment Must be able to communicate effectively with both technical and non-technical co-workers Must be willing to adjust schedule to perform maintenance and upgrades outside of business hours and be part of an on-call rotation.   Job Specific Requirements:    Working knowledge with regards to End User Experience (Business user experience) Prior experience with Agile is preferred.   Preferred Skills:   Number of Openings Available:   1   Worker Type:   Employee   Worker Sub-Type:   Employee   Company:   BCBST BlueCross BlueShield of Tennessee, Inc.   Applying for this job indicates your acknowledgement and understanding of the following statements:   BCBST is an Equal Opportunity employer (EEO), and all employees and applicants will be entitled to equal employment opportunities when employment decisions are made. BCBST will take affirmative action to recruit, hire, train and promote individuals in all job classifications without regard to race, religion, color, age, sex, national origin, citizenship, pregnancy, veteran status, sexual orientation, physical or mental disability, gender identity, or any other characteristic protected by applicable law.   
Further information regarding BCBST's EEO Policies/Notices may be found by reviewing the following page:   BCBST's EEO Policies/Notices   BlueCross BlueShield of Tennessee is not accepting unsolicited assistance from search firms for this employment opportunity. All resumes submitted by search firms to any employee at BlueCross BlueShield of Tennessee via-email, the Internet or any other method without a valid, written Direct Placement Agreement in place for this position from BlueCross BlueShield of Tennessee HR/Talent Acquisition will not be considered. No fee will be paid in the event the applicant is hired by BlueCross BlueShield of Tennessee as a result of the referral or through other means.    All applicants will be advised that BlueCross, as a federal contractor, may be required to implement a COVID-19 vaccine mandate.", "missing": "0", "distinct": "138"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Job_Description"}, {"transformationsData": [{"feature_label": "Recruiter_First_Name", "transformation_label": "String Indexer"}], "feature": "Recruiter_First_Name", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "143", "mean": "", "stddev": "", "min": " NC", "max": "prashant", "missing": "0", "distinct": "104"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Recruiter_First_Name"}, {"transformationsData": [{"feature_label": "Recruiter_Last_Name", "transformation_label": "String Indexer"}], "feature": "Recruiter_Last_Name", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "143", "mean": "", "stddev": "", "min": " or", "max": "tolia", "missing": "0", "distinct": "108"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Recruiter_Last_Name"}, {"transformationsData": [{"feature_label": "Recruiter_Email_Address", "transformation_label": "String Indexer"}], 
"feature": "Recruiter_Email_Address", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "143", "mean": "", "stddev": "", "min": " SSCP Power BI Data Analyst or associate degree in a related field 5+ Power BI specific focus Experience w/ DevOps Framework Experience visualizing data in Power BI Desktop and Power BI Service Extensive experience with building dashboards and custom reports in Power BI Strong analytical orientation and problem-solving skills Focusing on backend work with very large and intricate data models to create high-performing dashboards and paginated reports Rebuild existing SAP Business Intelligence-based reports", "max": "wesley.mercer@matrixres.com", "missing": "0", "distinct": "110"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Recruiter_Email_Address"}, {"transformationsData": [{"feature_label": "Company_name", "transformation_label": "String Indexer"}], "feature": "Company_name", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "143", "mean": "", "stddev": "", "min": " our focus has always been on being the best", "max": "unifyconsulting.com", "missing": "0", "distinct": "111"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Company_name"}, {"transformationsData": [{"feature_label": "SetName", "transformation_label": "String Indexer"}], "feature": "SetName", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "143", "mean": "", "stddev": "", "min": " not the biggest.  We believe that trust matters and relationships are the foundation of good business. 
We put people first with flexible methodologies", "max": "abebe woldeargay_08/31/2023_CompTIA_Data Specialist", "missing": "0", "distinct": "138"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "SetName"}, {"transformationsData": [{"feature_label": "CandidateID", "transformation_label": "String Indexer"}], "feature": "CandidateID", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "143", "mean": "36821.76", "stddev": "2032.31", "min": " client focus", "max": "OLUKAYODE AKINRIMISI_11/28/2023_Medasource_BI Developer", "missing": "0", "distinct": "23"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "CandidateID"}, {"transformationsData": [{"feature_label": "SurveyResponse", "transformation_label": "String Indexer"}], "feature": "SurveyResponse", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "143", "mean": "31661.0", "stddev": "nan", "min": " and forward-thinking. Our seasoned team applies deep industry knowledge to create practical solutions.  Company Description StoneLaurel is a management consulting firm with the capabilities of a large provider and the personal touch of a boutique partner. 
Headquartered in Charlotte", "max": "rufus.benhur@digitalconsultants.org", "missing": "0", "distinct": "8"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "SurveyResponse"}, {"transformationsData": [{"feature_label": "InterviewType", "transformation_label": "String Indexer"}], "feature": "InterviewType", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "143", "mean": "", "stddev": "", "min": " NC", "max": "Technical", "missing": "0", "distinct": "8"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "InterviewType"}, {"transformationsData": [{"feature_label": "Preparationscore", "transformation_label": "String Indexer"}], "feature": "Preparationscore", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "143", "mean": "16.68", "stddev": "22.91", "min": " and relevant technical aspects. Strong performance in algorithm design and data structures", "max": "Avworo  OGHENEKEVWE _10/03/2023_Digital Consultants, LLC_Power BI Reports Developer", "missing": "0", "distinct": "17"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Preparationscore"}, {"transformationsData": [{"feature_label": "AutoInterviewsCount", "transformation_label": "String Indexer"}], "feature": "AutoInterviewsCount", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "143", "mean": "323.84", "stddev": "3525.29", "min": " and relevant technical aspects. 
Strong performance in algorithm design and data structures", "max": "6", "missing": "0", "distinct": "12"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "AutoInterviewsCount"}, {"transformationsData": [{"feature_label": "AutoInterviewsDuration", "transformation_label": "String Indexer"}], "feature": "AutoInterviewsDuration", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "143", "mean": "743.87", "stddev": "610.7", "min": "      \"\"ScaleQuestions\"\": {     \"\"ClearFeedbackOnNextSteps\"\": 4", "max": "NULL", "missing": "0", "distinct": "51"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "AutoInterviewsDuration"}, {"transformationsData": [{"feature_label": "MentorInterviewsCount", "transformation_label": "String Indexer"}], "feature": "MentorInterviewsCount", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "143", "mean": "0.19", "stddev": "0.43", "min": "      \"\"ScaleQuestions\"\": {     \"\"ClearFeedbackOnNextSteps\"\": 1", "max": "Recruiter", "missing": "0", "distinct": "13"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "MentorInterviewsCount"}, {"transformationsData": [{"feature_label": "MentorInterviewsDuration", "transformation_label": "String Indexer"}], "feature": "MentorInterviewsDuration", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "143", "mean": "606.8", "stddev": "1035.23", "min": "      \"\"ScaleQuestions\"\": {     \"\"ClearFeedbackOnNextSteps\"\": 2", "max": "NULL", "missing": "0", "distinct": "29"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "MentorInterviewsDuration"}, {"transformationsData": [{"feature_label": "Mentor", "transformation_label": "String Indexer"}], "feature": "Mentor", "type": "string", "selected": "True", "replaceby": "max", 
"stats": {"count": "143", "mean": "0.0", "stddev": "nan", "min": "     \"\"ComfortWhileAnsweringQuestions\"\": 5", "max": "Sowmya Attur", "missing": "0", "distinct": "21"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Mentor"}, {"transformationsData": [{"feature_label": "Applicant", "transformation_label": "String Indexer"}], "feature": "Applicant", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "143", "mean": "", "stddev": "", "min": "     \"\"AlignmentWithExpectations\"\": 4", "max": "Yama Touray", "missing": "0", "distinct": "30"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Applicant"}, {"transformationsData": [{"feature_label": "Recruiter_InterviewCount", "transformation_label": "String Indexer"}], "feature": "Recruiter_InterviewCount", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "143", "mean": "5.0", "stddev": "5.24", "min": "     \"\"AlignmentWithExpectations\"\": 1", "max": "Williams", "missing": "0", "distinct": "32"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Recruiter_InterviewCount"}, {"transformationsData": [{"feature_label": "Technical_InterviewCount", "transformation_label": "String Indexer"}], "feature": "Technical_InterviewCount", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "143", "mean": "14.31", "stddev": "136.43", "min": "     \"\"AlignmentWithExpectations\"\": 3", "max": "gwilliams@stonelaurel.com", "missing": "0", "distinct": "15"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Technical_InterviewCount"}, {"transformationsData": [{"feature_label": "InterviewScore", "transformation_label": "String Indexer"}], "feature": "InterviewScore", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "143", "mean": "", 
"stddev": "", "min": "     \"\"InterviewerEngagement\"\": 3", "max": "StoneLaurel", "missing": "0", "distinct": "10"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "InterviewScore"}]}))

	#transformationPostExecutionHook(forcastingpropertyvaluesregressionautofe)

except Exception as ex: 
	logging.error(ex)


***TRAIN MODEL***

In [None]:
#%run forcastingpropertyvaluesregressionHooks.ipynb
# Train the AutoML regressor on the transformed DataFrame: three numeric
# columns plus the string-indexed columns as inputs, and the string-indexed
# InterviewScore as the target. Any failure is logged and not re-raised, so
# `dataAutoML` stays undefined if training fails.
try:
    #mlPreExecutionHook()

    dataAutoML = functionRegression(
        forcastingpropertyvaluesregressionautofe,
        [
            "NoofDays",
            "LogInterviewID",
            "QID",
            "InterviewDate_stringindexer",
            "Job_Title_stringindexer",
            "Job_Description_stringindexer",
            "Recruiter_First_Name_stringindexer",
            "Recruiter_Last_Name_stringindexer",
            "Recruiter_Email_Address_stringindexer",
            "Company_name_stringindexer",
            "SetName_stringindexer",
            "CandidateID_stringindexer",
            "SurveyResponse_stringindexer",
            "InterviewType_stringindexer",
            "Preparationscore_stringindexer",
            "AutoInterviewsCount_stringindexer",
            "AutoInterviewsDuration_stringindexer",
            "MentorInterviewsCount_stringindexer",
            "MentorInterviewsDuration_stringindexer",
            "Mentor_stringindexer",
            "Applicant_stringindexer",
            "Recruiter_InterviewCount_stringindexer",
            "Technical_InterviewCount_stringindexer",
        ],
        "InterviewScore_stringindexer",
    )

    #mlPostExecutionHook(dataAutoML)

except Exception as ex:
    logging.error(ex)
#spark.stop()


***PREDICT ON TRAINED MODEL***

In [None]:
import pandas as pd
import numpy as np
import sklearn.metrics

# Predict on the held-out split returned by the training cell, report R2 /
# MSE / MAE (each rounded to one decimal) and show the first rows of a frame
# holding the actual value, the prediction and the feature columns.
# Any failure is logged and not re-raised.
try:
    model = dataAutoML['model']
    X_test = dataAutoML['X_test']
    y_test = dataAutoML['y_test']
    label = dataAutoML['label']

    # Work on a copy of the stored feature list. Mutating the list held in
    # dataAutoML would leave the label/prediction names in it, so re-running
    # this cell would fail with a column-count mismatch against X_test.
    feature_columns = list(dataAutoML['columnNames'])
    if label in feature_columns:
        feature_columns.remove(label)

    predicted = label + "_predicted"
    y_predicted = model.predict(X_test)

    df = pd.DataFrame(X_test, columns=feature_columns)
    df[label] = y_test
    df[predicted] = y_predicted

    # Display order: actual value, prediction, then the features.
    columnNames = [label, predicted] + feature_columns
    df = df[columnNames]

    R2 = np.round(sklearn.metrics.r2_score(y_test, y_predicted), 1)
    Mean_Squared_Error = np.round(sklearn.metrics.mean_squared_error(y_test, y_predicted), 1)
    Mean_Absolute_Error = np.round(sklearn.metrics.mean_absolute_error(y_test, y_predicted), 1)
    display(" R2 score of Prediction on test data    : %s"%R2)
    display(" Mean Squared Error of Prediction on test data    : %s"%Mean_Squared_Error)
    display(" Mean Absolute Error of Prediction on test data   : %s"%Mean_Absolute_Error)
    display(df.head())
except Exception as ex:
    logging.error(ex)

# Release the Spark session now that the pipeline has finished.
spark.stop()

