In [1]:
%pip install findspark pymongo

Note: you may need to restart the kernel to use updated packages.


# Crate Directory and Output File

In [2]:
#Use '/resource' to syncronize folder with host

!mkdir -p ~/output/spark-model/decission-tree/feature-importance

In [3]:
input_path  = '/home/jovyan/output/renamed-data/multi/TrainDataUnderSampling.csv'
output_path = '/home/jovyan/output/spark-model/decission-tree/feature-importance'

# Import Package

In [4]:
import os
import time
import json
import pyspark
import findspark
from gridfs import GridFS
from datetime import datetime
from pymongo import MongoClient

from pyspark.ml.classification import DecisionTreeClassifier

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from sklearn.metrics import confusion_matrix
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

findspark.init()

# DB Connections

In [5]:
client = MongoClient("mongodb://mongodb:27017")
db = client['mataelanglab']

result_col = db['spark_result']
cv_col = db['spark_cv']
model_col = GridFS(db, 'spark_model')

# Spark session & context (Local / Cluster)

## Local Version

In [6]:
spark = (SparkSession
         .builder
         .master('local[2]')
         .appName('DecissionTree-FeatureImportance')
         .config("spark.executor.memory", "4g") #optional
         .config("spark.executor.cores","1") #optional
         .getOrCreate())
sc = spark.sparkContext

## Cluster Version

<b>Note : </b> \
Jika menggunakan mode ini pastikan data dapat diakses oleh masing worker yang ada. \
Apabila menggunakan cluster pada MataElangLab (1 master dan 1 worker), taruh data pada '/resource'

In [None]:
spark_uri = os.environ['SPARK_MASTER']
print(spark_uri) #MataElangLab Spark Cluster URL

In [None]:
spark = (SparkSession
         .builder
         .master(spark_uri)
         .appName('DecissionTree-FeatureImportance')
         .config("spark.executor.memory", "2g") #optional
         .config("spark.executor.cores","1") #optional
         .getOrCreate())
sc = spark.sparkContext

# Read Data

In [7]:
df = spark.read.csv(input_path, header="true", inferSchema =True)

# Normalize

In [8]:
df = df.select([F.col(column).cast('double') for column in df.columns])

# Classification Using Feature Importance

In [9]:
feature = [
    'idle_max',
    'packet_length_min',
    'syn_flag_count',
    'flow_iat_mean',
    'idle_mean',
    'flow_duration',
    'down_per_up_ratio',
    'bwd_packets_per_s',
    'fwd_seg_size_min',
    'total_fwd_packet',
]

# Pre-Processing

In [10]:
#Vector Assembler
vector_assembler = VectorAssembler(inputCols=feature, outputCol="SS_features")
df = vector_assembler.transform(df)

#Standard Scaler
scaler = StandardScaler(inputCol="SS_features", outputCol="scaledFeatures", withStd=True, withMean=False).fit(df)
df = scaler.transform(df)

# Processing

In [11]:
# Split the data
(training_data, test_data) = df.randomSplit([0.7,0.3], 42)
print("Training Dataset Count: " + str(training_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

Training Dataset Count: 6208
Test Dataset Count: 2532


In [12]:
start_time = time.time()

# Create Decision Tree classifer object
dt = DecisionTreeClassifier(featuresCol = 'scaledFeatures', labelCol = 'label')

# Train Decision Tree Classifer
dtModel = dt.fit(training_data)

fit_duration = (time.time() - start_time)
print("--- %s seconds ---" % fit_duration)

--- 7.585740327835083 seconds ---


In [13]:
start_time = time.time()

#Predict the response for test dataset
dt_predictions = dtModel.transform(test_data)

pred_duration = (time.time() - start_time)
print("--- %s seconds ---" % pred_duration)

--- 0.2120521068572998 seconds ---


In [14]:
# Calculate Confusion matrix
ypred = dt_predictions.select("prediction").collect()
ytest = dt_predictions.select("label").collect()

acc_eval    = MulticlassClassificationEvaluator(predictionCol="prediction", metricName="accuracy")
recall_eval = MulticlassClassificationEvaluator(predictionCol="prediction", metricName="weightedRecall")
prec_eval   = MulticlassClassificationEvaluator(predictionCol="prediction", metricName="weightedPrecision")
f1_eval     = MulticlassClassificationEvaluator(predictionCol="prediction", metricName="f1")

accuracy    = acc_eval.evaluate(dt_predictions)
recall      = recall_eval.evaluate(dt_predictions)
precision   = prec_eval.evaluate(dt_predictions)
f1_score    = f1_eval.evaluate(dt_predictions)

print("confusion matrix :",confusion_matrix(ytest, ypred).ravel())
print(str('accuracy :  %0.4f' % accuracy) +"\n" + str('recall : %0.4f' % recall) +
      "\n" + str('precision :  %0.4f' %precision) + "\n" + str('f1-measure : %0.4f' %f1_score))

confusion matrix : [431   0   0   2  45  12 501   1   0   0   1   0 510   4   0   0   0   0
 485   0  12   0   0   0 528]
accuracy :  0.9696
recall : 0.9696
precision :  0.9702
f1-measure : 0.9696


In [15]:
# Save Model
ml_path = output_path+"/model"
ss_path = output_path+"/standard-scaler"

!rm -rf $ml_path $ss_path

In [16]:
dtModel.save(ml_path)
scaler.save(ss_path)

In [17]:
# Store to MongoDB

# with open(path, "rb") as f:
#     model_col.put(f, filename="spark-all-feature")
    
result_col.insert_one({
    'machine_learning': "Decission Tree",
    'feature': "Feature Importance",
    'label': "Multi-Label",
    'fit_duration': fit_duration,
    'pred_duration': pred_duration,
    'accuracy': accuracy,
    'recall': recall,
    'precision': precision,
    'f1_score': f1_score,
    'created_at': datetime.fromtimestamp(time.time())
})

<pymongo.results.InsertOneResult at 0x7f35b7fa2040>

In [18]:
sc.stop()