In [92]:
import itertools
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, FloatType, Row
from pyspark.ml import Pipeline,Transformer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.feature import Bucketizer,StringIndexer
from pyspark.ml.feature import VectorAssembler
# from pyspark.ml.stat import Correlation
from pyspark.mllib.evaluation import MulticlassMetrics

StatementMeta(sparkPool01, 10, 123, Finished, Available)

In [93]:
%run "Common"

StatementMeta(, , -1, Finished, Available)

In [94]:
# --- Input location -------------------------------------------------------
# Storage endpoint string handed to the load/save helpers.
filesystem_endpoint = "smtcrbsynfs@smtcrbsyndl.dfs.core.windows.net"
# Folder that holds the input CSV; the results file is written here as well.
data_folder = "/data/uwf"
# Name of the input CSV inside `data_folder`.
csv_file_name = "StarReconNoneEdgesWithHops.csv"

StatementMeta(sparkPool01, 10, 127, Finished, Available)

In [95]:
# Full path of the input CSV below the storage endpoint.
file_path = f"{data_folder}/{csv_file_name}"

# Load the input data. `load_csv` is not defined in this notebook; presumably
# it comes from the `%run "Common"` cell and returns a Spark DataFrame
# (the sweep cell calls `.drop` / `.na.drop` on the result) — TODO confirm.
conn_df = load_csv(filesystem_endpoint, file_path)

# Last segment of the path (the file name, extension included); it is used
# when the results file name is built in the final cell.
dataset = file_path.rsplit("/", 1)[-1]

StatementMeta(sparkPool01, 10, 128, Finished, Available)

In [96]:
# --- Experiment parameters -----------------------------------------------
# Fraction of the rows meant for training; forwarded to `run_ml_algorithm`.
train_pct = 0.84
# Classifier to evaluate. `MLAlgorithm` is not defined in this notebook;
# presumably it is an enum from the "Common" notebook (its `.value` is used
# in the results file name) — TODO confirm.
algorithm = MLAlgorithm.DT

StatementMeta(sparkPool01, 10, 129, Finished, Available)

In [97]:
# Exhaustive feature-subset sweep.
#
# For every non-empty combination of the candidate columns in `all_cols`, the
# loop below prepares the data, runs `algorithm` through `run_ml_algorithm`,
# and records one row of evaluation metrics. The rows are turned into
# `result_df` once, after the loop.
#
# `spark`, `schema`, `conn_df`, `algorithm`, `train_pct` and `run_ml_algorithm`
# are not defined in this cell; they must exist from earlier cells / the
# "Common" notebook before this cell runs.

target_cols = ["Tactic"]
# Name of the indexed/bucketed label column produced by the preprocessing
# stages (every processed column gets the "_processed" suffix).
label_col = target_cols[0] + "_processed"

all_cols = [
    "From",
    "To",
    "Avg_Duration",
    "Total_Duration",
    "Avg_Bytes",
    "Total_Bytes",
    "Count",
    "Hop_Count",
]

# Columns that are dropped in every iteration, on top of the unused candidates.
additional_drop_cols = ["Id"]

# Bucket boundaries applied to every numeric column, whatever its scale:
# (-inf, 10), [10, 100), [100, +inf).
bucket_splits = [-float("inf"), 10, 100, float("inf")]

# All non-empty subsets of `all_cols`, ordered by size (2**8 - 1 = 255 tuples).
cols_combinations = []
for r in range(1, len(all_cols) + 1):
    cols_combinations.extend(itertools.combinations(all_cols, r))

cols_combinations_len = len(cols_combinations)

# The evaluators do not depend on the feature subset, so build them once.
# metricName must be given explicitly: the evaluator's default metric is
# "f1", which would be stored under the name "accuracy" otherwise.
mc_evaluator = MulticlassClassificationEvaluator(
    labelCol=label_col, predictionCol="prediction", metricName="accuracy"
)
# NOTE(review): rawPredictionCol points at the hard `prediction` column, so
# the ROC curve is built from predicted class labels rather than scores.
# Whether `predictions` carries a raw-score column depends on
# `run_ml_algorithm` — confirm before changing.
bin_evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="prediction", labelCol=label_col, metricName="areaUnderROC"
)

# One tuple per iteration. Collecting plain rows and creating the DataFrame
# once avoids chaining hundreds of single-row unions.
result_rows = []

for i, combination in enumerate(cols_combinations):
    print(f"Iteration {i+1} of {cols_combinations_len}")

    drop_cols = [c for c in all_cols if c not in combination] + additional_drop_cols

    print(f"Feature columns: {combination}")
    print(f"Dropped columns: {drop_cols}")

    # Keep only this iteration's columns and discard rows with any null.
    iter_df = conn_df.drop(*drop_cols).na.drop(how="any")

    numeric_cols = [
        name
        for name, dtype in iter_df.dtypes
        if dtype in ("int", "double", "bigint")
    ]
    string_cols = [name for name, dtype in iter_df.dtypes if dtype == "string"]

    # String columns are index-encoded, numeric columns are bucketed; both
    # write their output to "<column>_processed".
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_processed").fit(iter_df)
        for column in string_cols
    ]
    numeric_bucketing = [
        Bucketizer(
            splits=bucket_splits,
            inputCol=column,
            outputCol=column + "_processed",
        )
        for column in numeric_cols
    ]

    stages_ = indexers + numeric_bucketing
    iter_df = Pipeline(stages=stages_).fit(iter_df).transform(iter_df)

    # Every processed column except the label is a feature. A comprehension
    # is used so that no top-level loop variable named `col` overwrites
    # pyspark.sql.functions.col from the wildcard import.
    feature_cols = [
        name
        for name, _ in iter_df.dtypes
        if "_processed" in name and target_cols[0] not in name
    ]

    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    iter_df = assembler.transform(iter_df)

    # No train/test split is made here: `run_ml_algorithm` is handed the full
    # frame together with `train_pct` (presumably it splits internally —
    # TODO confirm).
    predictions = run_ml_algorithm(algorithm, feature_cols, label_col, iter_df, train_pct)

    predictions_and_labels = predictions.select(["prediction", label_col])
    metrics = MulticlassMetrics(predictions_and_labels.rdd.map(tuple))

    accuracy = mc_evaluator.evaluate(predictions)
    auc_roc = bin_evaluator.evaluate(predictions)

    feature_col_str = "&".join(combination)

    # NOTE(review): precision and false-positive rate are taken for label 1.0
    # while recall and the F-measure (beta = 2.0) are taken for label 0.0.
    # Kept as found; confirm which label is meant to be the positive class.
    try:
        precision = metrics.precision(1.0)
    except Exception:
        # Fall back to the weighted value when the per-label lookup fails.
        precision = metrics.weightedPrecision

    recall = metrics.recall(0.0)
    fmeasure = metrics.fMeasure(0.0, 2.0)

    try:
        fprate = metrics.falsePositiveRate(1.0)
    except Exception:
        # Fall back to the weighted value when the per-label lookup fails.
        fprate = metrics.weightedFalsePositiveRate

    # Row-major cells of the confusion matrix as plain Python floats.
    confusion_matrix = metrics.confusionMatrix().toArray()
    confusion_matrix_values = tuple(float(x) for x in confusion_matrix.flatten())

    # A 1x1 matrix is padded to four cells so the row has the same width as
    # in the 2x2 case.
    if len(confusion_matrix_values) == 1:
        confusion_matrix_values = (confusion_matrix_values[0], 0.0, 0.0, 0.0)

    result_metrics = (
        feature_col_str,
        accuracy,
        precision,
        recall,
        fmeasure,
        fprate,
        auc_roc,
    ) + confusion_matrix_values

    result_rows.append(result_metrics)

# Build the results DataFrame in one go (an empty list yields an empty frame).
result_df = spark.createDataFrame(result_rows, schema=schema)


StatementMeta(sparkPool01, 10, 130, Finished, Available)

Iteration 1 of 255
Feature columns: ('From',)
Dropped columns: ['To', 'Avg_Duration', 'Total_Duration', 'Avg_Bytes', 'Total_Bytes', 'Count', 'Hop_Count', 'Id']
Iteration 2 of 255
Feature columns: ('To',)
Dropped columns: ['From', 'Avg_Duration', 'Total_Duration', 'Avg_Bytes', 'Total_Bytes', 'Count', 'Hop_Count', 'Id']
Iteration 3 of 255
Feature columns: ('Avg_Duration',)
Dropped columns: ['From', 'To', 'Total_Duration', 'Avg_Bytes', 'Total_Bytes', 'Count', 'Hop_Count', 'Id']
Iteration 4 of 255
Feature columns: ('Total_Duration',)
Dropped columns: ['From', 'To', 'Avg_Duration', 'Avg_Bytes', 'Total_Bytes', 'Count', 'Hop_Count', 'Id']
Iteration 5 of 255
Feature columns: ('Avg_Bytes',)
Dropped columns: ['From', 'To', 'Avg_Duration', 'Total_Duration', 'Total_Bytes', 'Count', 'Hop_Count', 'Id']
Iteration 6 of 255
Feature columns: ('Total_Bytes',)
Dropped columns: ['From', 'To', 'Avg_Duration', 'Total_Duration', 'Avg_Bytes', 'Count', 'Hop_Count', 'Id']
Iteration 7 of 255
Feature columns: ('Co

In [98]:
# Destination of the results file: "<algorithm value>_results_<dataset>.csv"
# inside the data folder.
# NOTE(review): `dataset` is derived from the input file name, which already
# ends in ".csv", so the resulting name ends in ".csv.csv" — confirm whether
# that is intended before changing it.
results_path = f"{data_folder}/{algorithm.value}_results_{dataset}.csv"
save_df_to_csv(result_df, filesystem_endpoint, results_path)

StatementMeta(sparkPool01, 10, 131, Finished, Available)