Data source: KDD Cup 1999 Data (UCI Machine Learning Repository) — https://archive.ics.uci.edu/ml/datasets/KDD+Cup+1999+Data

In [None]:
# Read the gzipped 10% KDD Cup data file as lines of text.
# NOTE(review): assumes `sc` is the SparkContext provided by the notebook session — confirm.
raw_data = sc.textFile('data/kddcup.data_10_percent.gz')

In [None]:
# Column descriptions for the 41 feature columns, one `name: type.` entry per
# line; the column names are later taken from the text before each colon.
kdd_names = """duration: continuous.
protocol_type: symbolic.
service: symbolic.
flag: symbolic.
src_bytes: continuous.
dst_bytes: continuous.
land: symbolic.
wrong_fragment: continuous.
urgent: continuous.
hot: continuous.
num_failed_logins: continuous.
logged_in: symbolic.
num_compromised: continuous.
root_shell: continuous.
su_attempted: continuous.
num_root: continuous.
num_file_creations: continuous.
num_shells: continuous.
num_access_files: continuous.
num_outbound_cmds: continuous.
is_host_login: symbolic.
is_guest_login: symbolic.
count: continuous.
srv_count: continuous.
serror_rate: continuous.
srv_serror_rate: continuous.
rerror_rate: continuous.
srv_rerror_rate: continuous.
same_srv_rate: continuous.
diff_srv_rate: continuous.
srv_diff_host_rate: continuous.
dst_host_count: continuous.
dst_host_srv_count: continuous.
dst_host_same_srv_rate: continuous.
dst_host_diff_srv_rate: continuous.
dst_host_same_src_port_rate: continuous.
dst_host_srv_diff_host_rate: continuous.
dst_host_serror_rate: continuous.
dst_host_srv_serror_rate: continuous.
dst_host_rerror_rate: continuous.
dst_host_srv_rerror_rate: continuous."""

In [None]:
# Column names: the part of each description line before the colon, followed
# by a trailing 'label' column.
feature_names = [entry.partition(':')[0] for entry in kdd_names.split('\n')]
names = feature_names + ['label']

In [None]:
# Number of column names, including the appended 'label'.
len(names)

In [None]:
# Peek at the first five raw lines to see what a record looks like.
raw_data.take(5)

In [None]:
# Total number of lines in the raw data.
raw_data.count()

In [None]:
def floatOrStr(x):
    """Convert ``x`` to a float when it parses as one; otherwise return it as-is.

    Args:
        x: Value to convert (the notebook passes comma-separated string fields).

    Returns:
        ``float(x)`` if that succeeds, or ``x`` unchanged when ``float``
        raises ``ValueError``.
    """
    try:
        parsed = float(x)
    except ValueError:
        # Not numeric text: keep the original value.
        return x
    else:
        return parsed

In [None]:
from pyspark.sql import Row

In [None]:
# Split each line on commas, converting numeric-looking fields to float and
# leaving the rest as strings.
parsed_rows = raw_data.map(lambda line: [floatOrStr(field) for field in line.split(',')])
# Pair each value with its column name and wrap the result in a Row.
named_rows = parsed_rows.map(lambda values: Row(**dict(zip(names, values))))
df = spark.createDataFrame(named_rows)

In [None]:
# Inspect the schema of the DataFrame built from the parsed rows.
df.printSchema()

In [None]:
# Show the first five rows of three columns as a quick sanity check.
df.select(['duration', 'protocol_type', 'service']).show(5)

In [None]:
from pyspark.sql.types import StringType

In [None]:
# Names of all columns whose schema type is StringType.
string_cols = [field.name for field in df.schema.fields if field.dataType == StringType()]

In [None]:
# Names of all columns whose schema type is anything other than StringType.
feature_cols = [field.name for field in df.schema.fields if field.dataType != StringType()]

In [None]:
# Exclude 'label' from the string columns that get indexed and one-hot encoded.
# Filtering into a new list (instead of list.remove) keeps this cell safe to
# re-run: a second execution is a no-op rather than a ValueError.
string_cols = [name for name in string_cols if name != 'label']

In [None]:
# Display the string columns that remain after dropping 'label'.
string_cols

In [None]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler

In [None]:
# For every string column, add a StringIndexer writing '<column>_index' and a
# OneHotEncoder reading that column and writing '<column>_vec'.
stages = []
for column in string_cols:
    indexer = StringIndexer(inputCol=column, outputCol=column + "_index")
    one_hot_encoder = OneHotEncoder(inputCol=column + "_index", outputCol=column + "_vec")
    stages.extend([indexer, one_hot_encoder])

In [None]:
# Assemble the non-string columns and the one-hot '<column>_vec' columns into
# 'raw_features', then standardise that vector into 'features'.
assembler_inputs = feature_cols + [name + "_vec" for name in string_cols]
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol='raw_features')
scaler = StandardScaler(withMean=True, inputCol='raw_features', outputCol='features')
stages.extend([assembler, scaler])

In [None]:
from pyspark.ml import Pipeline

In [None]:
# Fit all pre-processing stages on the full DataFrame.
pipeline = Pipeline(stages=stages)
pre_processing = pipeline.fit(df)

In [None]:
from pyspark.ml.clustering import KMeans

In [None]:
# Transform a 25% sample (without replacement) of the data with the fitted
# pre-processing pipeline. The seed is fixed so the same rows are drawn on
# every run and the clustering results below are reproducible.
sampled_df = df.sample(withReplacement=False, fraction=0.25, seed=42)
processd = pre_processing.transform(sampled_df)

In [None]:
# Cache the transformed sample: it is the input to every KMeans fit and cost
# computation in the cells that follow.
processd.cache()

In [None]:
# Unfinished sweep over k = 10, 20, ..., 100. The loop had no body, which is a
# syntax error and stops the notebook from running top to bottom; `pass` keeps
# the cell valid. The sweep that is actually used is the `ks` / `costs` cell.
for k in range(10, 110, 10):
    pass

In [None]:
# Fit a 10-cluster KMeans on the cached sample. Despite its name, this
# variable holds a k-means model, not a k-nearest-neighbours one.
knn_model = KMeans(k=10).fit(processd)

In [None]:
# Cost of the 10-cluster model evaluated on the same sample it was fitted on.
# NOTE(review): computeCost was deprecated in Spark 2.4 and removed in 3.0 —
# confirm the Spark version this notebook targets.
knn_model.computeCost(processd)

In [None]:
# Sweep k from 20 to 200 in steps of 20: fit a fresh KMeans on the cached
# sample for each k and record the cost computeCost reports on that sample.
ks = range(20, 220, 20)
costs = []
for k in ks:
    fitted = KMeans(k=k).fit(processd)
    costs.append(fitted.computeCost(processd))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Plot the cost recorded for each candidate k. Axis labels and a title are
# set so the figure can be read on its own.
plt.plot(ks, costs)
plt.xlabel('k (number of clusters)')
plt.ylabel('KMeans cost')
plt.title('KMeans cost vs. number of clusters')

In [None]:
# Fit the model used for the rest of the notebook, with 110 clusters.
model = KMeans(k=110).fit(processd)

In [None]:
# Check that a training summary is available before reading model.summary.
model.hasSummary

In [None]:
# Display the predictions attribute of the model's training summary.
model.summary.predictions

In [None]:
# Apply the fitted model to the cached sample; the 'prediction' and 'features'
# columns of the result are used in the final cell.
pred = model.transform(processd)

In [None]:
from pyspark.ml.linalg import VectorUDT

In [None]:
from pyspark.sql.functions import UserDefinedFunction

In [None]:
from pyspark.ml.linalg import Vectors

In [None]:
# Wrap each cluster centre in a dense ML vector so it can be subtracted from
# feature vectors inside the UDF.
centers = list(map(Vectors.dense, model.clusterCenters()))

In [None]:
# UDF returning (assigned cluster centre - feature vector) for one row.
# The first argument indexes into `centers`; the second is the row's vector.
# Kept as a lambda so the generated column name stays the same.
center = UserDefinedFunction(
    lambda cluster_idx, features: centers[cluster_idx] - features,
    VectorUDT(),
)

In [None]:
# Compute each row's offset from its assigned cluster centre and look at the
# first ten results.
offsets = center(pred['prediction'], pred['features'])
pred.select(offsets).take(10)