In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, StringIndexerModel
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Transformer

from pyspark.sql.types import StringType

import mlflow
from mlflow.tracking import MlflowClient

In [2]:
spark = SparkSession.builder.appName("PySparkTitanikJob")\
    .getOrCreate()

24/05/12 22:30:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
spark

In [4]:
mlflow.start_run()

<ActiveRun: >

In [10]:
from pyspark.sql.functions import split, col
df = spark.read.parquet('../part.0.parquet')
#df2 = df.withColumn("dangerous_commands", split(col("dangerous_commands"), ",").cast("array<string>"))
tag_index = StringIndexer(inputCol='tag', outputCol="image_tag_index")
user_index = StringIndexer(inputCol='user', outputCol="user_index")
image_index = StringIndexer(inputCol='image', outputCol="image_index")
qos_index = StringIndexer(inputCol='qos', outputCol="qos_index")

df = tag_index.fit(df).transform(df)
df = user_index.fit(df).transform(df)
df = image_index.fit(df).transform(df)

df = qos_index.fit(df).transform(df)


In [11]:
df.show()

+-----------+-------+-------+-------------------------------+-----------------------+--------------------+---+-------------------+--------------------------------+-------------------------+----------------------+--------------------+-------------+--------------------+--------------------+--------------------+----+----------+------+---------------+----------+-----------+---------+
|hostNetwork|hostPID|hostIPC|has_privileged_security_context|count_of_dangerous_caps|                user|UID|has_mounted_secrets|has_secret_environment_variables|read_only_root_fs_checker|has_dangerous_commands|count_dangerous_dirs|exposed_ports|               image|                 tag|has_wide_permissions| qos|has_probes|result|image_tag_index|user_index|image_index|qos_index|
+-----------+-------+-------+-------------------------------+-----------------------+--------------------+---+-------------------+--------------------------------+-------------------------+----------------------+--------------------+-

In [13]:
feature = VectorAssembler(
    inputCols=["hostNetwork", 
                             "hostPID", 
                             "hostIPC", 
                             "has_privileged_security_context",
                             "count_of_dangerous_caps",
                             "user_index",
                             "UID",
                             "has_mounted_secrets",
                             "has_secret_environment_variables",
                             "read_only_root_fs_checker",
                             "has_dangerous_commands",
                             "count_dangerous_dirs",
                             "exposed_ports",
                             "image_index",
                             "image_tag_index",
                             "has_wide_permissions",
                             "qos_index",
                             "has_probes"],
    outputCol="features")
feature_vector= feature.transform(df)
feature_vector.show()

+-----------+-------+-------+-------------------------------+-----------------------+--------------------+---+-------------------+--------------------------------+-------------------------+----------------------+--------------------+-------------+--------------------+--------------------+--------------------+----+----------+------+---------------+----------+-----------+---------+--------------------+
|hostNetwork|hostPID|hostIPC|has_privileged_security_context|count_of_dangerous_caps|                user|UID|has_mounted_secrets|has_secret_environment_variables|read_only_root_fs_checker|has_dangerous_commands|count_dangerous_dirs|exposed_ports|               image|                 tag|has_wide_permissions| qos|has_probes|result|image_tag_index|user_index|image_index|qos_index|            features|
+-----------+-------+-------+-------------------------------+-----------------------+--------------------+---+-------------------+--------------------------------+-------------------------+---

24/05/12 22:35:15 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [14]:
(training_data, test_data) = feature_vector.randomSplit([0.8, 0.2],seed = 42)
training_data.show()

+-----------+-------+-------+-------------------------------+-----------------------+---------------------+---+-------------------+--------------------------------+-------------------------+----------------------+--------------------+-------------+--------------------+------+--------------------+----+----------+------+---------------+----------+-----------+---------+--------------------+
|hostNetwork|hostPID|hostIPC|has_privileged_security_context|count_of_dangerous_caps|                 user|UID|has_mounted_secrets|has_secret_environment_variables|read_only_root_fs_checker|has_dangerous_commands|count_dangerous_dirs|exposed_ports|               image|   tag|has_wide_permissions| qos|has_probes|result|image_tag_index|user_index|image_index|qos_index|            features|
+-----------+-------+-------+-------------------------------+-----------------------+---------------------+---+-------------------+--------------------------------+-------------------------+----------------------+-----

In [15]:
#LogisticRegression

evaluator = MulticlassClassificationEvaluator(
    labelCol="result", predictionCol="prediction", metricName="accuracy"
)
from pyspark.ml.classification import LogisticRegression
# создаем модель и указываем целевую колонку и колонку с эмбэддингами
lr = LogisticRegression(labelCol="result", featuresCol="features")

# обучаем модель на тренировочных данных
lrModel = lr.fit(training_data)     # lr - эстиматор, lrModel - трансформер
# применим модель на тестовых данных, получим предсказания
lr_prediction = lrModel.transform(test_data)
lr_prediction.select("prediction", "result", "features").show(5)

lr_accuracy = evaluator.evaluate(lr_prediction)
print("LogisticRegression [Accuracy] = %g"% (lr_accuracy))
print("LogisticRegression [Error] = %g " % (1.0 - lr_accuracy))


24/05/12 22:41:27 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/05/12 22:41:27 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


+----------+------+--------------------+
|prediction|result|            features|
+----------+------+--------------------+
|       1.0|     1|(18,[13,14,15],[2...|
|       1.0|     0|     (18,[15],[1.0])|
|       1.0|     0|     (18,[15],[1.0])|
|       1.0|     0|     (18,[15],[1.0])|
|       1.0|     0|     (18,[15],[1.0])|
+----------+------+--------------------+
only showing top 5 rows

LogisticRegression [Accuracy] = 0.792683
LogisticRegression [Error] = 0.207317 


In [16]:
#DecisionTreeClassifier

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="result", featuresCol="features", maxBins=16000)
dt_model = dt.fit(training_data)
dt_prediction = dt_model.transform(training_data)

dt_prediction.select("prediction", "result", "features").show(5)


# Create a MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="result", predictionCol="prediction")

# Calculate the accuracy of the model
accuracy = evaluator.evaluate(dt_prediction)

# Calculate the error of the model
error = 1.0 - accuracy

print("DecisionTreeClassifier [Accuracy] = %g" % (accuracy))
print("DecisionTreeClassifier [Error] = %g" % (error))

24/05/12 22:41:33 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 16000 to 840 (= number of training instances)


+----------+------+--------------------+
|prediction|result|            features|
+----------+------+--------------------+
|       1.0|     1|(18,[5,15],[1.0,1...|
|       0.0|     0|(18,[5,12,13,14,1...|
|       0.0|     0|(18,[13,14,15,17]...|
|       1.0|     1|(18,[13,14,15],[3...|
|       1.0|     0|     (18,[15],[1.0])|
+----------+------+--------------------+
only showing top 5 rows

DecisionTreeClassifier [Accuracy] = 0.914551
DecisionTreeClassifier [Error] = 0.0854494


In [17]:
#RandomForestClassifier

from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="result", featuresCol="features", maxBins=16000)
rf_model = rf.fit(training_data)
rf_prediction = rf_model.transform(test_data)
rf_prediction.select("prediction", "result", "features").show(5)


rf_accuracy = evaluator.evaluate(rf_prediction)
print("RandomForestClassifier [Accuracy] = %g"% (rf_accuracy))
print("RandomForestClassifier [Error] = %g" % (1.0 - rf_accuracy))


24/05/12 22:41:39 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 16000 to 840 (= number of training instances)


+----------+------+--------------------+
|prediction|result|            features|
+----------+------+--------------------+
|       1.0|     1|(18,[13,14,15],[2...|
|       1.0|     0|     (18,[15],[1.0])|
|       1.0|     0|     (18,[15],[1.0])|
|       1.0|     0|     (18,[15],[1.0])|
|       1.0|     0|     (18,[15],[1.0])|
+----------+------+--------------------+
only showing top 5 rows

RandomForestClassifier [Accuracy] = 0.770975
RandomForestClassifier [Error] = 0.229025


In [18]:
#Gradient-boosted tree classifier

from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(labelCol="result", featuresCol="features",maxIter=10,maxBins=16000)
gbt_model = gbt.fit(training_data)
gbt_prediction = gbt_model.transform(test_data)
gbt_prediction.select("prediction", "result", "features").show(5)

gbt_accuracy = evaluator.evaluate(gbt_prediction)
print("Gradient-boosted [Accuracy] = %g"% (gbt_accuracy))
print("Gradient-boosted [Error] = %g"% (1.0 - gbt_accuracy))


24/05/12 22:41:42 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 16000 to 840 (= number of training instances)


+----------+------+--------------------+
|prediction|result|            features|
+----------+------+--------------------+
|       1.0|     1|(18,[13,14,15],[2...|
|       1.0|     0|     (18,[15],[1.0])|
|       1.0|     0|     (18,[15],[1.0])|
|       1.0|     0|     (18,[15],[1.0])|
|       1.0|     0|     (18,[15],[1.0])|
+----------+------+--------------------+
only showing top 5 rows

Gradient-boosted [Accuracy] = 0.713836
Gradient-boosted [Error] = 0.286164


In [19]:
#Save model

dt_model.write().overwrite().save('dt_model')


24/05/12 22:41:50 WARN MemoryManager: Total allocation exceeds 95,00% (1 020 054 720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers


In [23]:
#Pipeline

from pyspark.ml.pipeline import PipelineModel

df_pipe = spark.read.parquet('../part.0.parquet')

train, test = df_pipe.randomSplit([0.8, 0.2])


tag_index = StringIndexer(inputCol='tag', outputCol="image_tag_index", handleInvalid="skip")
user_index = StringIndexer(inputCol='user', outputCol="user_index", handleInvalid="skip")
image_index = StringIndexer(inputCol='image', outputCol="image_index", handleInvalid="skip")
qos_index = StringIndexer(inputCol='qos', outputCol="qos_index", handleInvalid="skip")

feature = VectorAssembler(
    inputCols=["hostNetwork", 
                             "hostPID", 
                             "hostIPC", 
                             "has_privileged_security_context",
                             "count_of_dangerous_caps",
                             "user_index",
                             "UID",
                             "has_mounted_secrets",
                             "has_secret_environment_variables",
                             "read_only_root_fs_checker",
                             "has_dangerous_commands",
                             "count_dangerous_dirs",
                             "exposed_ports",
                             "image_index",
                             "image_tag_index",
                             "has_wide_permissions",
                             "qos_index",
                             "has_probes"],
    outputCol="features")


dt_classifier = DecisionTreeClassifier(labelCol="result", featuresCol="features", maxBins=200)

pipeline = Pipeline(stages=[tag_index, user_index, image_index, qos_index, feature, dt_classifier])

p_model = pipeline.fit(train)

p_model.write().overwrite().save('p_model')
model = PipelineModel.load('p_model')


evaluator = MulticlassClassificationEvaluator(
    labelCol="result", predictionCol="prediction", metricName="accuracy"
)

prediction = p_model.transform(test)
# test.show(10)
p_accuracy = evaluator.evaluate(prediction)
print("Pipeline model [Accuracy] = %g"% (p_accuracy))
print("Pipeline model [Error] = %g " % (1.0 - p_accuracy))


Pipeline model [Accuracy] = 0.88
Pipeline model [Error] = 0.12 


In [24]:
#Hyperparams

from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

paramGrid = ParamGridBuilder() \
   .addGrid(dt_classifier.maxDepth, [2, 3, 4]) \
   .addGrid(dt_classifier.minInfoGain, [0.05, 0.1, 0.15]) \
   .build()

tvs = TrainValidationSplit(estimator=pipeline,
                            estimatorParamMaps=paramGrid,
                            evaluator=evaluator,
                            trainRatio=0.8)
model = tvs.fit(train)

best_model = model.bestModel
print("Best model hyperparameters:")
print("maxDepth:", best_model.stages[-1].getOrDefault("maxDepth"))
print("maxBins:", best_model.stages[-1].getOrDefault("maxBins"))
print("minInfoGain:", best_model.stages[-1].getOrDefault("minInfoGain"))

prediction = best_model.transform(test)
accuracy = evaluator.evaluate(prediction)
error = 1.0 - accuracy
print("Best model accuracy:", accuracy)
print("Best model error:", error)

Best model hyperparameters:
maxDepth: 2
maxBins: 200
minInfoGain: 0.05
Best model accuracy: 0.8866666666666667
Best model error: 0.11333333333333329
