In [2]:
%matplotlib inline
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import seaborn as sn
import pandas as pd
import random as rd
import datetime as dt



from bigdl.dataset.transformer import *
from bigdl.dataset.base import *
from bigdl.nn.layer import *
from bigdl.nn.criterion import *
from bigdl.optim.optimizer import *
from bigdl.util.common import *
from utils import *
from bigdl.models.ml_pipeline.dl_classifier import *

from pyspark.sql.types import DoubleType
from pyspark.sql.functions import col, udf
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
# pyspark names this class "MulticlassClassificationEvaluator" (lower-case
# "c" in "class"). The previous spelling "MultiClassClassificationEvaluator"
# does not exist in pyspark.ml.evaluation and made this whole cell fail with
# an ImportError on a fresh kernel.
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator


# init_engine is brought in by one of the star imports above (presumably
# bigdl.util.common -- TODO confirm). It is called once here, before any
# BigDL layer or classifier is created further down.
init_engine()

In [4]:
# Optimisation settings.
# NOTE(review): in the cells visible here only n_input and n_classes are
# actually referenced; the classifier cell sets its own epoch count and batch
# size directly -- confirm whether these values are still meant to be used.
learning_rate, training_epochs = 0.2, 25
batch_size, display_step = 16, 1

# Network shape: number of inputs, number of output classes, and the widths
# of the 1st / 2nd / 3rd hidden layers.
n_input, n_classes = 4, 3
n_hidden_1, n_hidden_2, n_hidden_3 = 10, 20, 30

In [5]:
# Both CSV files are read with the same reader options, so keep them in one
# place and reuse them for the training and the test file.
# NOTE(review): `spark` is not created in any cell visible here; presumably
# the kernel provides the SparkSession -- confirm.
csv_options = dict(header=True, inferSchema="true", mode="DROPMALFORMED")
iris_training = spark.read.csv("../data/iris/iris_training.csv", **csv_options)
iris_test = spark.read.csv("../data/iris/iris_test.csv", **csv_options)

In [6]:
def _all_columns_as_double(frame):
    """Return `frame` with every one of its columns cast to "double"."""
    return frame.select([col(name).cast("double") for name in frame.columns])


# Apply the same cast to the training and the test frame.
iris_training = _all_columns_as_double(iris_training)
iris_test = _all_columns_as_double(iris_test)



In [7]:
# Stage 1 packs the four input columns into a single vector column
# ("assembled"); stage 2 scales that vector into the "features" column.
feature_columns = ['c1', 'c2', 'c3', 'c4']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="assembled")
scaler = StandardScaler(inputCol="assembled", outputCol="features")
pipeline = Pipeline(stages=[assembler, scaler])

# Fit the pipeline once per data set.
# NOTE(review): pipelineTest is fitted on the test set but is not used by any
# cell visible here -- both transforms below go through pipelineTraining.
# Confirm whether this second fit is still needed.
pipelineTraining = pipeline.fit(iris_training)
pipelineTest = pipeline.fit(iris_test)

# The training-fitted pipeline is applied to both frames.
iris_data_training = pipelineTraining.transform(iris_training)
iris_data_test = pipelineTraining.transform(iris_test)

In [8]:
# Peek at the first 10 assembled/scaled rows without truncating the vectors.
iris_data_training.select('features', 'label').show(n=10, truncate=False)

+----------------------------------------------------------------------------+-----+
|features                                                                    |label|
+----------------------------------------------------------------------------+-----+
|[7.3683612551017434,6.554983394668502,3.07337626655598,2.813157930275381]   |3.0  |
|[5.7565322305482365,5.384450645620555,1.8110967285062027,1.2787081501251731]|2.0  |
|[5.641401585937273,5.8526637452397345,2.469677357053913,2.1738038552127943] |3.0  |
|[5.641401585937273,7.257303044097271,0.8232257856846377,0.12787081501251732]|1.0  |
|[6.56244674282499,8.896048892764396,0.9329892237759226,0.38361244503755193] |1.0  |
|[5.065748362882449,7.491409593906861,0.7134623475933526,0.25574163002503464]|1.0  |
|[6.2170548089920965,7.959622693526039,0.8232257856846377,0.5114832600500693]|1.0  |
|[7.944014478156568,7.257303044097271,2.798967671327768,2.941028745287898]   |3.0  |
|[7.713753188934637,7.257303044097271,2.4147956380082705,1.790191

In [20]:

# Build the network one layer at a time: n_input -> 10 -> n_classes, followed
# by a log-softmax output.
bigDLModel = Sequential()
bigDLModel.add(Linear(n_input, 10))
bigDLModel.add(Linear(10, n_classes))
bigDLModel.add(LogSoftMax())

classnll_criterion = ClassNLLCriterion()

# Wrap the network and criterion in a DLClassifier and configure it for the
# "label" column, 100 epochs and a batch size of 8.
dlClassifier = DLClassifier(
    model=bigDLModel,
    criterion=classnll_criterion,
    feature_size=[n_input],
)
dlClassifier.setLabelCol("label")
dlClassifier.setMaxEpoch(100)
dlClassifier.setBatchSize(8)

# Fit on the assembled/scaled training frame.
model = dlClassifier.fit(iris_data_training)
print("\ninitial model training finished.")

creating: createSequential
creating: createLinear
creating: createLinear
creating: createLogSoftMax
creating: createClassNLLCriterion
creating: createDLClassifier

initial model training finished.


In [22]:
# Run the fitted classifier (`model`, produced by dlClassifier.fit above) over
# the transformed test frame; the resulting DataFrame is kept as predictionDF
# for the evaluation step.
predictionDF = model.transform(iris_data_test)

In [None]:
def evaluateModel(predictionDF, labelCol="label", predictionCol="prediction"):
    """Print multiclass quality metrics for a DataFrame of predictions.

    This cell was previously written in Scala (``val``, ``new``, ``println``)
    and could not run in this Python notebook. It also evaluated an undefined
    ``finalData`` frame and a ``"Class"`` label column, whereas the data in
    this notebook carries its ground truth in ``label``.

    The binary area-under-precision-recall metric was dropped: it is only
    meaningful for two-class problems and this notebook trains with
    ``n_classes = 3``. Overall accuracy is reported in its place.

    Parameters
    ----------
    predictionDF : pyspark.sql.DataFrame
        Output of ``model.transform(...)``; must contain ``labelCol`` and
        ``predictionCol``.
    labelCol : str
        Name of the ground-truth column (default ``"label"``).
    predictionCol : str
        Name of the predicted-class column (default ``"prediction"``).

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # Imported locally under its correct spelling so this cell does not depend
    # on how the evaluator was imported at the top of the notebook.
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    def _metric(metric_name):
        # One evaluator per metric, always scoring the frame passed in.
        evaluator = MulticlassClassificationEvaluator(
            labelCol=labelCol,
            predictionCol=predictionCol,
            metricName=metric_name,
        )
        return evaluator.evaluate(predictionDF)

    # The frame is scanned once per metric, so cache it for the duration and
    # always release it again, even if an evaluation fails.
    predictionDF.cache()
    try:
        accuracy = _metric("accuracy")
        print("\naccuracy = " + str(accuracy))

        recall = _metric("weightedRecall")
        print("\nrecall = " + str(recall))

        precision = _metric("weightedPrecision")
        print("\nPrecision = " + str(precision))
    finally:
        predictionDF.unpersist()


evaluateModel(predictionDF)

In [None]:
# NOTE(review): train_summary and val_summary are not created in any cell
# visible here (the DLClassifier cell above does not define them) -- confirm
# where they are supposed to come from before running this cell.
loss = np.array(train_summary.read_scalar("Loss"))
top1 = np.array(val_summary.read_scalar("Top1Accuracy"))

# Two stacked panels: loss on top, top-1 accuracy below. Column 0 of each
# array is used as x and column 1 as y.
fig, (loss_ax, top1_ax) = plt.subplots(2, 1, figsize=(12, 12))

# Both panels share the same x-range, derived from the number of loss records.
x_upper = loss.shape[0] + 10

loss_ax.plot(loss[:, 0], loss[:, 1], label='loss')
loss_ax.set_xlim(0, x_upper)
loss_ax.grid(True)
loss_ax.set_title("loss")

top1_ax.plot(top1[:, 0], top1[:, 1], label='top1')
top1_ax.set_xlim(0, x_upper)
top1_ax.set_title("top1 accuracy")
top1_ax.grid(True)


In [None]:
# Collect the per-sample model outputs for the test RDD onto the driver.
# NOTE(review): neither trained_model nor iris_rdd_test is defined in any cell
# visible here (the cells above produce `model` and `iris_data_test`
# instead) -- confirm where these names come from before running this cell.
predictions = trained_model.predict(iris_rdd_test).collect()

def map_predict_label(l):
    """Return the index of the largest entry of `l` (argmax over the flattened array)."""
    scores = np.array(l)
    return np.argmax(scores)
def map_groundtruth_label(l):
    """Return the first element of `l.to_ndarray()` minus one.

    `l` must expose a `to_ndarray()` method. Subtracting one lines the label
    up with the zero-based index returned by `map_predict_label`.
    """
    first_entry = l.to_ndarray()[0]
    return first_entry - 1

# Predicted class index for every collected prediction.
y_pred = np.array(list(map(map_predict_label, predictions)))

# Ground-truth class index for every sample of the test RDD, read from each
# sample's `label` attribute.
test_samples = iris_rdd_test.collect()
y_true = np.array([map_groundtruth_label(sample.label) for sample in test_samples])

In [None]:
# Overall accuracy of the predictions, reported as a percentage.
acc = accuracy_score(y_true, y_pred)
print("The prediction accuracy is %.2f%%"%(acc*100))

# Confusion matrix drawn as an annotated heat map with integer cell labels.
cm = confusion_matrix(y_true, y_pred)
df_cm = pd.DataFrame(cm)
fig, ax = plt.subplots(figsize=(10, 8))
sn.heatmap(df_cm, annot=True, fmt='d', ax=ax);