In [1]:
%pip install findspark pymongo

Note: you may need to restart the kernel to use updated packages.


# Crate Directory and Output File

In [2]:
#Use '/resource' to syncronize folder with host

!mkdir -p ~/output/spark-model/random-forest/feature-importance

In [3]:
input_path  = '/home/jovyan/output/renamed-data/multi/TrainDataUnderSampling.csv'
output_path = '/home/jovyan/output/spark-model/random-forest/feature-importance'

# Import Package

In [4]:
import time
import json
import pyspark
import findspark
from gridfs import GridFS
from datetime import datetime
from pymongo import MongoClient

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from sklearn.metrics import confusion_matrix
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

findspark.init()

# DB Connections

In [5]:
client = MongoClient("mongodb://mongodb:27017")
db = client['mataelanglab']

result_col = db['spark_result']
cv_col = db['spark_cv']
model_col = GridFS(db, 'spark_model')

# Spark session & context

## Local Version

In [6]:
spark = (SparkSession
         .builder
         .master('local[2]')
         .appName('RandomForest-FeatureImportance')
         .config("spark.executor.memory", "4g") #optional
         .config("spark.executor.cores","1") #optional
         .getOrCreate())
sc = spark.sparkContext

## Cluster Version

<b>Note : </b> \
Jika menggunakan mode ini pastikan data dapat diakses oleh masing worker yang ada. \
Apabila menggunakan cluster pada MataElangLab (1 master dan 1 worker), taruh data pada '/resource'

In [None]:
spark_uri = os.environ['SPARK_MASTER']
print(spark_uri) #MataElangLab Spark Cluster URL

In [None]:
spark = (SparkSession
         .builder
         .master(spark_uri)
         .appName('RandomForest-FeatureImportance')
         .config("spark.executor.memory", "2g") #optional
         .config("spark.executor.cores","1") #optional
         .getOrCreate())
sc = spark.sparkContext

# Read Data

In [7]:
df = spark.read.csv(input_path, header="true", inferSchema =True)

# Normalize

In [8]:
df = df.select([F.col(column).cast('double') for column in df.columns])

# Classification Using Feature Importance

In [9]:
feature = [
    'idle_max',
    'idle_mean',
    'idle_min',
    'fwd_packet_length_min',
    'idle_std',
    'packet_length_min',
    'fwd_init_win_bytes',
    'fwd_iat_min',
    'fin_flag_count',
    'fwd_iat_total'
]

# Pre-Processing

In [10]:
#Vector Assembler
vector_assembler = VectorAssembler(inputCols=feature, outputCol="SS_features")
df = vector_assembler.transform(df)

#Standard Scaler
scaler = StandardScaler(inputCol="SS_features", outputCol="scaledFeatures", withStd=True, withMean=False)
df = scaler.fit(df).transform(df)

# Processing

In [11]:
start_time = time.time()
# Split the data
(training_data, test_data) = df.randomSplit([0.7,0.3], 42)
print("Training Dataset Count: " + str(training_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

# Create Random Forest Classifier object
rf = RandomForestClassifier(labelCol="label", featuresCol="scaledFeatures")

# Train Random Forest Classifier
rfModel = rf.fit(training_data)

#Predict the response for test dataset
rf_predictions = rfModel.transform(test_data)
duration = (time.time() - start_time)
print("--- %s seconds ---" % duration)

Training Dataset Count: 6208
Test Dataset Count: 2532
--- 17.23194146156311 seconds ---


In [12]:
# Calculate Confusion matrix
ypred = rf_predictions.select("prediction").collect()
ytest = rf_predictions.select("label").collect()

acc_eval    = MulticlassClassificationEvaluator(predictionCol="prediction", metricName="accuracy")
recall_eval = MulticlassClassificationEvaluator(predictionCol="prediction", metricName="weightedRecall")
prec_eval   = MulticlassClassificationEvaluator(predictionCol="prediction", metricName="weightedPrecision")
f1_eval     = MulticlassClassificationEvaluator(predictionCol="prediction", metricName="f1")

accuracy    = acc_eval.evaluate(rf_predictions)
recall      = recall_eval.evaluate(rf_predictions)
precision   = prec_eval.evaluate(rf_predictions)
f1_score    = f1_eval.evaluate(rf_predictions)

print("confusion matrix :",confusion_matrix(ytest, ypred).ravel())
print(str('accuracy :  %0.4f' % accuracy) +"\n" + str('recall : %0.4f' % recall) +
      "\n" + str('precision :  %0.4f' %precision) + "\n" + str('f1-measure : %0.4f' %f1_score))

confusion matrix : [458   0   1   2  17   1 513   0   0   0   2   0 513   0   0   0   2   7
 476   0  62   0   0   0 478]
accuracy :  0.9629
recall : 0.9629
precision :  0.9645
f1-measure : 0.9629


In [13]:
# Save Model
ml_path = output_path+"/model"
ss_path = output_path+"/standard-scaler"

!rm -rf $ml_path $ss_path

In [14]:
rfModel.save(ml_path)
scaler.save(ss_path)

In [15]:
# Store to MongoDB

# with open(path, "rb") as f:
#     model_col.put(f, filename="spark-all-feature")
    
result_col.insert_one({
    'machine_learning': "Random Forest",
    'feature': "Feature Importance",
    'label': "Multi-Label",
    'duration': duration,
    'accuracy': accuracy,
    'recall': recall,
    'precision': precision,
    'f1_score': f1_score,
    'created_at': datetime.fromtimestamp(time.time())
})

<pymongo.results.InsertOneResult at 0x7f18d0070a00>

In [16]:
sc.stop()