In [37]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
warnings.filterwarnings("ignore")


"""
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.model_selection import TimeSùiesSplit
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
#from prettytable import PrettyTable
"""

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import *


In [38]:
#create version without iceberg extension options for CDE
spark = SparkSession.builder\
  .appName("0.2 - Batch Load into Icerberg Table") \
  .config("spark.hadoop.fs.s3a.s3guard.ddb.region", "us-west-2")\
  .config("spark.kerberos.access.hadoopFileSystems", "s3a://ps-uat2")\
  .config("spark.jars","/home/cdsw/lib/iceberg-spark-runtime-3.2_2.12-0.13.2.jar") \
  .config("spark.sql.extensions","org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
  .config("spark.sql.catalog.spark_catalog","org.apache.iceberg.spark.SparkSessionCatalog") \
  .config("spark.sql.catalog.spark_catalog.type","hive") \
  .getOrCreate()

In [39]:
## Load data from Iceberg
df_raw = spark.sql("SELECT * FROM spark_catalog.default.pump_raw")

In [40]:
#df_raw_pandas =  df_raw.toPandas()
#df_raw_pandas

In [42]:
# NORMAL = 1 and RECOVERING & BROKEN = 0
df = df_raw.withColumn('machine_status_tmp', when(df_raw.machine_status == "NORMAL", 1)\
                                    .when(df_raw.machine_status == "RECOVERING", 0)\
                                    .when(df_raw.machine_status == "BROKEN", 0)\
                                    .otherwise('Unknown'))\
                                    .drop(df_raw.machine_status)\
                                    .withColumnRenamed("machine_status_tmp", "machine_status")

In [43]:
print("Distributions of target class: ")
print(df.groupBy('machine_status').count().orderBy('count').show())

Distributions of target class: 


[Stage 8:>                                                          (0 + 1) / 1]

+--------------+------+
|machine_status| count|
+--------------+------+
|             0| 21757|
|             1|308214|
+--------------+------+

None


                                                                                

In [44]:
# Drops sensors with nan values
cols = ("sensor_15","sensor_50","sensor_00", "sensor_06", "sensor_07", "sensor_08", "sensor_09")
df = df.drop(*cols)

In [47]:
# fill nan with
#df = df.fillna(method="pad", limit=30)

TypeError: fillna() got an unexpected keyword argument 'method'

In [48]:
# drops rows that contains nan values
df = df.na.drop("any")

In [52]:
# Reset index
#df.reset_index(drop=True)
#df.show()

AttributeError: 'DataFrame' object has no attribute 'reset_index'

In [54]:
# Select the relevant Feature
final_sensors = ['sensor_04', 'sensor_19', 'sensor_20', 'sensor_21', 
                 'sensor_38', 'sensor_39', 'sensor_40', 'sensor_41', 
                 'sensor_42']
df = df.select(final_sensors)

In [60]:
df.writeTo("spark_catalog.default.pump_processed").using("iceberg").overwrite()

TypeError: overwrite() missing 1 required positional argument: 'condition'

In [1]:
import boto
data = pd.read_csv('s3a://ps-uat2/user/dciciani/pump_sensor.csv')

NameError: name 'pd' is not defined

In [26]:
import mlflow
import pandas as pd
import cdsw, numpy, sklearn
from cmlapi.utils import Cursor

logged_model = '/home/cdsw/.experiments/n03k-3b0z-nbdp-wz8l/94k7-jivb-s3j1-g7r9/artifacts/model'


@cdsw.model_metrics
def predict(data):
    
    df = pd.DataFrame(data, index=[0])
    df.columns = ['sensor_04', 'sensor_19', 'sensor_20', 'sensor_21', 'sensor_38', 'sensor_39', 'sensor_40', 'sensor_41', 'sensor_42']

    #data = args.get('input')
    # Load model as a PyFuncModel.
    loaded_model = mlflow.pyfunc.load_model(logged_model)
 
    # Predict on a Pandas DataFrame.
    pred = loaded_model.predict(df)
    
    cdsw.track_metric("prediction", str(pred))
    cdsw.track_metric("data", data)
   
    return {'input_data': str(data), 'pred': str(pred[0])}



Not running in a model replica, so using a local development
version of the model metrics service. Please use the following
CRN's to consume metrics:
   model_crn: "crn:cdp:ml:::workspace:dev/model" (cdsw.dev_model_crn)
   model_build_crn: "crn:cdp:ml:::workspace:dev/model-build" (cdsw.dev_model_build_crn)
   model_deployment_crn: "crn:cdp:ml:::workspace:dev/model-deployment" (cdsw.dev_model_deployment_crn)



In [28]:
data = {
    
  'sensor_04' : '4219',
  'sensor_19' : '31294',
  'sensor_20' : '421',
  'sensor_21' : '645',
  'sensor_38' : '664',
  'sensor_39' : '7654',
  'sensor_40' : '12',
  'sensor_41' : '1321',
  'sensor_42' : '3124',
}

predict(data)

{'prediction': {'input_data': "{'sensor_04': '4219', 'sensor_19': '31294', 'sensor_20': '421', 'sensor_21': '645', 'sensor_38': '664', 'sensor_39': '7654', 'sensor_40': '12', 'sensor_41': '1321', 'sensor_42': '3124'}",
  'pred': '0'},
 'model_deployment_crn': 'crn:cdp:ml:::workspace:dev/model-deployment',
 'uuid': '0e5235cc-a485-45b2-bf86-dcb76c6783ef'}