In [None]:
!pip install findspark
!pip install confluent-kafka
# Downloaded from https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka-0-8-assembly_2.11
!wget https://repo1.maven.org/maven2/org/apache/spark/spark-streaming-kafka-0-8-assembly_2.11/2.4.0/spark-streaming-kafka-0-8-assembly_2.11-2.4.0.jar

In [None]:

# # !wget https://raw.githubusercontent.com/grananqvist/Machine-Learning-Web-Application-Firewall-and-Dataset/master/data/payloads.csv
!wget https://raw.githubusercontent.com/faizann24/Fwaf-Machine-Learning-driven-Web-Application-Firewall/master/badqueries.txt
!wget https://raw.githubusercontent.com/faizann24/Fwaf-Machine-Learning-driven-Web-Application-Firewall/master/goodqueries.txt

In [None]:
import os
import findspark
findspark.init('/usr/local/spark/spark-2.4.0-bin-hadoop2.7')
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars spark-streaming-kafka-0-8-assembly_2.11-2.4.0.jar pyspark-shell'

In [None]:
import pyspark
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType
from pyspark.sql.functions import udf
from pyspark.ml.feature import CountVectorizerModel, IDFModel, StandardScalerModel, Tokenizer
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.streaming.kafka import KafkaUtils

from urllib.parse import unquote

APP_NAME = "BigData"
conf = pyspark.SparkConf().setAll([ ('spark.app.name', APP_NAME),
                                    ('spark.executor.memory', '8g'),
                                    ('spark.cores.max', '2'),
                                    ('spark.driver.memory','8g'),
                                   ('spark.master', 'local[2]')])
sc = SparkContext(conf=conf)
sqlc = SQLContext(sc)
sc

In [None]:
good = sc.textFile("goodqueries.txt").map(lambda line: Row(is_malicious=0.0,payload=str(unquote(line))[1:])).distinct()
bad = sc.textFile("badqueries.txt").map(lambda line: Row(is_malicious=1.0,payload=str(unquote(line))[1:])).distinct()
mySchema = StructType([StructField("is_malicious", DoubleType(), True),StructField("payload", StringType(), True)])

df = sqlc.createDataFrame(good.union(bad), mySchema).cache()

In [None]:
def to_ngram(payload_obj):
    n=2
    payload = str(payload_obj)
    ngrams = ''
    for i in range(0,len(payload)-n + 1):
        ngrams += payload[i:i+n]+ ' '
    return ngrams[:-1]


print('EXAMPLE: bigram of the word <script>:')
to_ngram("MARAT")
to_ngram("DMITRI")

In [None]:
#to_ngrams
ngrams = udf(to_ngram, StringType())
df = df.withColumn('ngrams', ngrams(df['payload']))

# tokenize
tokenizer = Tokenizer().setInputCol("ngrams").setOutputCol("tokens")
wordsData = tokenizer.transform(df)

# vectorize
vectorizer = CountVectorizer(inputCol='tokens', outputCol='vectorizer').fit(wordsData)
wordsData = vectorizer.transform(wordsData)

# calculate scores
idf = IDF(minDocFreq=1,inputCol="vectorizer", outputCol="tfidf_features")

idf_model = idf.fit(wordsData)
wordsData = idf_model.transform(wordsData)

wordsData = wordsData.select("is_malicious","tfidf_features")

In [1]:
scaler = StandardScaler(inputCol="tfidf_features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)

# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(wordsData)

# Normalize each feature to have unit standard deviation.
wordsData = scalerModel.transform(wordsData).cache()

pca = PCA(k=2, inputCol="scaledFeatures", outputCol="pcaFeatures")
model = pca.fit(wordsData.select('is_malicious','scaledFeatures'))
result = model.transform(wordsData).select('is_malicious',"pcaFeatures").cache()
# result.show(truncate=False)

NameError: name 'StandardScaler' is not defined

In [None]:
anom = np.array(result.filter(result['is_malicious'] == 1.0).rdd.map(lambda x: x['pcaFeatures']).collect()).T
norm = np.array(result.filter(result['is_malicious'] == 0.0).rdd.map(lambda x: x['pcaFeatures']).collect()).T

fig = plt.figure(figsize = [8,8])
ax = fig.add_subplot(1,1,1) 
ax.set_xlabel('PC 1', fontsize = 15)
ax.set_ylabel('PC 2', fontsize = 15)
ax.set_title('PCA', fontsize = 20)
ax.axis([-20, 150, -80, 50])
targets = ['Attack','Normal']
ax.scatter(anom[0]
               , anom[1]
               , c = 'r',alpha=0.4)
ax.scatter(norm[0]
               , norm[1]
               , c = 'b'
              ,alpha=0.4)

ax.legend(targets)
ax.grid()
plt.show()


In [None]:
train, test = wordsData.randomSplit([0.8, 0.2], seed = 2018)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

lr = LogisticRegression(featuresCol = 'scaledFeatures', labelCol = 'is_malicious')


paramGrid = ParamGridBuilder().addGrid(lr.maxIter,[10,100,1000]).addGrid(lr.regParam, [0.1, 0.01,0.001])\
    .addGrid(lr.fitIntercept, [False, True]).addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]).build()

tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=RegressionEvaluator(labelCol='is_malicious'),
                           trainRatio=0.8)
model = tvs.fit(train)


In [None]:
predictions = model.bestModel.transform(test)
y_true = predictions.select(['is_malicious']).collect()
y_pred = predictions.select(['prediction']).collect()

cf_matrix = confusion_matrix(y_true, y_pred)
print(classification_report(y_true, y_pred))


group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in cf_matrix.flatten()/np.sum(cf_matrix)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)

sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues');
