In [2]:
import findspark

In [3]:
import os

# Locate the Spark installation. SPARK_HOME takes precedence so the notebook
# runs on other machines; the original local install path is kept as the
# fallback so existing behaviour is unchanged when the variable is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/adeola/spark-2.4.2-bin-hadoop2.7'))

In [4]:
from pyspark.sql import SparkSession

In [5]:
# Obtain the Spark session for this notebook, registered under the app name 'hack'.
spark = (
    SparkSession.builder
    .appName('hack')
    .getOrCreate()
)

In [6]:
# Load the session log: the first row supplies the column names and the
# column types are inferred from the values.
data = spark.read.csv(
    'hack_data.csv',
    header=True,
    inferSchema=True,
)

In [7]:
# Preview the loaded rows to check that the header and values were read as expected.
data.show()

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58|
|                    2.0|           228.08|              1|             2.48|            8.0|             Bolivia|            70.8|
|                   20.0|            408.5|              0|             3.57

In [8]:
# List the column names available for feature selection.
data.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [9]:
from pyspark.ml.feature import VectorAssembler

In [10]:
# Pack the clustering inputs into a single 'features' vector column.
# 'Location' is the only loaded column that is not included.
assembler = VectorAssembler(
    inputCols=[
        'Session_Connection_Time',
        'Bytes Transferred',
        'Kali_Trace_Used',
        'Servers_Corrupted',
        'Pages_Corrupted',
        'WPM_Typing_Speed',
    ],
    outputCol='features',
)

In [11]:
# Run the assembler over the raw data, adding the 'features' vector column.
final_data = assembler.transform(data)

In [12]:
# Confirm that 'features' was added alongside the original columns.
final_data.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed',
 'features']

In [13]:
#scaling the dataset using StandardScaler
from pyspark.ml.feature import StandardScaler

In [14]:
# Scaler that reads the assembled 'features' vector and writes 'scaled_features'.
# No other options are passed, so StandardScaler's default settings apply.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaled_features',
)

In [15]:
# Fit the scaler on the assembled features.
# NOTE: despite its name, scaler_data holds the fitted scaler (the result of
# fit), not a DataFrame; it is used to transform final_data in the next cell.
scaler_data = scaler.fit(final_data)

In [16]:
# Apply the fitted scaler, adding the 'scaled_features' column that the
# clustering cells below consume.
df = scaler_data.transform(final_data)

In [17]:
# Confirm that 'scaled_features' is now present.
df.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed',
 'features',
 'scaled_features']

In [18]:
from pyspark.ml.clustering import KMeans

In [19]:
# k = 3 candidate, clustering on the standardised feature vector.
# The seed is pinned so centroid initialisation is repeatable across kernel
# restarts. Cluster ids are arbitrary labels, so after a re-run they may be
# permuted relative to previously recorded outputs.
kmeans = KMeans(k=3, featuresCol='scaled_features', seed=42)

In [20]:
# Fit the k = 3 model on the scaled data.
model = kmeans.fit(df)

In [21]:
# Within-set sum of squared errors of the k = 3 model on the training data.
# KMeansModel.computeCost is deprecated as of Spark 2.4 and removed in 3.0, so
# fall back to the training summary's cost when the method is not available.
if hasattr(model, 'computeCost'):
    wssse = model.computeCost(df)
else:
    wssse = model.summary.trainingCost

In [22]:
# Report the k = 3 clustering cost.
print(wssse)

434.75507308487647


In [23]:
# Centroids of the k = 3 model. The model was fit on 'scaled_features', so the
# coordinates are in the scaled feature space, not the raw units.
centers = model.clusterCenters()

In [24]:
# Display the k = 3 centroids: one array per cluster, one value per assembled feature.
centers

[array([1.26023837, 1.31829808, 0.99280765, 1.36491885, 2.5625043 ,
        5.26676612]),
 array([3.05623261, 2.95754486, 1.99757683, 3.2079628 , 4.49941976,
        3.26738378]),
 array([2.93719177, 2.88492202, 0.        , 3.19938371, 4.52857793,
        3.30407351])]

In [25]:
# Assign every row to one of the k = 3 clusters; the cluster id lands in 'prediction'.
results = model.transform(df)

In [48]:
# Number of sessions assigned to each cluster by the k = 3 model.
(
    results
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   88|
|         2|   79|
|         0|  167|
+----------+-----+



In [27]:
# Summary statistics of the assigned cluster ids.
# NOTE(review): 'prediction' is a categorical cluster label, so only count/min/max
# carry meaning here; mean and stddev of label ids are not interpretable.
results.select('prediction').describe().show()

+-------+------------------+
|summary|        prediction|
+-------+------------------+
|  count|               334|
|   mean|0.7365269461077845|
| stddev|0.8179928031458795|
|    min|                 0|
|    max|                 2|
+-------+------------------+



In [28]:
# k = 2 candidate, clustering on the same standardised feature vector.
# The seed is pinned so centroid initialisation is repeatable across kernel
# restarts. Cluster ids are arbitrary labels, so after a re-run they may be
# permuted relative to previously recorded outputs.
kmeans1 = KMeans(k=2, featuresCol='scaled_features', seed=42)

In [29]:
# Fit the k = 2 model on the scaled data.
model1 = kmeans1.fit(df)

In [30]:
# Centroids of the k = 2 model, in the scaled feature space it was fit on.
centers1 = model1.clusterCenters()

In [31]:
# Display the k = 2 centroids: one array per cluster, one value per assembled feature.
centers1

[array([1.26023837, 1.31829808, 0.99280765, 1.36491885, 2.5625043 ,
        5.26676612]),
 array([2.99991988, 2.92319035, 1.05261534, 3.20390443, 4.51321315,
        3.28474   ])]

In [32]:
# Assign every row to one of the k = 2 clusters; the cluster id lands in 'prediction'.
result1 = model1.transform(df)

In [38]:
# Keep only each session's location and its k = 2 cluster assignment.
predict_2 = result1.select(
    'Location',
    'prediction',
)

In [47]:
# Number of sessions assigned to each cluster by the k = 2 model.
(
    predict_2
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



In [None]:
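# There are definitely 2 hackers: k = 2 splits the sessions evenly (167 / 167),
# whereas k = 3 gives an uneven split (167 / 88 / 79).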