# Project context:
## A startup has been hacked and there are 3 potential attackers. The startup is certain that 2 of them were involved, but is not sure whether the third one took part.
## Key fact: the attackers split the work evenly, so each attacker should account for roughly the same number of attacks

# Imports

In [2]:
import os

import findspark

# Prefer the SPARK_HOME environment variable so the notebook is not tied to a
# single machine's directory layout; fall back to the original local install
# path when the variable is not set.
findspark.init(os.environ.get('SPARK_HOME', 'C:/spark'))

In [7]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml import Pipeline

# Create a session

In [4]:
# Reuse the running session if one exists, otherwise start one named 'clustering'.
spark = (
    SparkSession.builder
    .appName('clustering')
    .getOrCreate()
)

# Read data

In [5]:
# Load the hack-session log: the first row is the header and column types
# are inferred from the values.
csv_path = '../../data/hack_data.csv'
data = spark.read.csv(csv_path, header=True, inferSchema=True)

In [6]:
# Show the inferred column names and types to confirm the CSV was parsed as expected.
data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



# Preprocess the data

In [16]:
# Use every column except the string-typed 'Location' as a clustering feature.
# `.index` raises ValueError if the column is missing, same as `list.remove` would.
all_columns = data.columns
location_at = all_columns.index('Location')
featureCols = all_columns[:location_at] + all_columns[location_at + 1:]
featureCols

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'WPM_Typing_Speed']

In [17]:
# Stage 1: pack the selected columns into a single vector column.
assembler = VectorAssembler(outputCol='features', inputCols=featureCols)

# Stage 2: rescale the assembled vector so no single feature dominates the
# distance computation used by KMeans.
scaler = StandardScaler(outputCol='scaledFeatures', inputCol='features')

In [19]:
# Chain assembling and scaling so both are fitted and applied in one pass.
preprocess_pipeline = Pipeline(stages=[assembler, scaler])

# Fit the scaler statistics on the full dataset, then add the
# 'features' and 'scaledFeatures' columns to it.
preprocess_model = preprocess_pipeline.fit(data)
final_data = preprocess_model.transform(data)

In [20]:
# Peek at one row to check that 'features' and 'scaledFeatures' were added.
final_data.head(1)

[Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37, features=DenseVector([8.0, 391.09, 1.0, 2.96, 7.0, 72.37]), scaledFeatures=DenseVector([0.5679, 1.3658, 1.9976, 1.2859, 2.2849, 5.3963]))]

# Modeling

In [23]:
# KMeans initialisation is randomised; pin the seed so the cluster sizes the
# conclusion relies on are reproducible across kernel restarts.
# NOTE: with a fixed seed the cluster labels (0/1/2) may be assigned differently
# from earlier unseeded runs; only the cluster sizes matter for the analysis.
SEED = 42

# One estimator per hypothesis: 2 attackers vs. 3 attackers.
kmeans2 = KMeans(featuresCol='scaledFeatures', k=2, seed=SEED)
kmeans3 = KMeans(featuresCol='scaledFeatures', k=3, seed=SEED)

In [25]:
# Train both candidate models on the same scaled feature set (k=2 first, then k=3).
model_k2, model_k3 = [estimator.fit(final_data) for estimator in (kmeans2, kmeans3)]

In [27]:
# Number of sessions assigned to each cluster under the 2-attacker hypothesis.
clusters_k2 = model_k2.transform(final_data)
clusters_k2.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



In [28]:
# Number of sessions assigned to each cluster under the 3-attacker hypothesis.
clusters_k3 = model_k3.transform(final_data)
clusters_k3.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   88|
|         2|   79|
|         0|  167|
+----------+-----+



# Conclusion

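## Since KMeans with K=2 splits the sessions into two equally sized clusters (167/167), while K=3 produces uneven clusters (88/79/167), and each attacker should have roughly the same number of attacks, there were most likely 2 hackers.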