# Lab 04 - Optional part

In [22]:
inputPath = '/data/students/bigdata_internet/lab4/log_tcp_complete_classes.txt'

in_DF = spark.read.load(inputPath, format='csv', \
            sep=' ', header=True, inferSchema=True)

                                                                                

## 3.1 - Number of clients

In [23]:
in_DF.select(in_DF.columns[0]).distinct().count()

3844

## 3.2 - Average number of connections

In [24]:
# Isolate useful columns:
indices = [0, 8, 22, 24, 51, 32]
features = indices[1:]

cols = [in_DF.columns[i] for i in indices]
print(cols)
in_DF.createOrReplaceTempView("input_table")

['#31#c_ip:1', 'c_bytes_all:9', 's_bytes_all:23', 's_bytes_retx:25', 's_rtt_avg:52', 's_first:33']


In [25]:
# At the same time, also isolate the features that will be needed after
start_DF = in_DF.select(cols).groupBy(cols[0])\
        .agg({"*": "count",f"{cols[1]}": "sum", 
    f"{cols[2]}": "sum", f"{cols[3]}": "sum",
    f"{cols[4]}": "avg", f"{cols[5]}": "avg"
}).withColumnRenamed(f'{cols[0]}', 'src_ip')\
.withColumnRenamed('count(1)', 'n_conn')\
.withColumnRenamed(f'sum({cols[1]})', 'sum_upl_b')\
.withColumnRenamed(f'sum({cols[2]})', 'sum_dwn_b')\
.withColumnRenamed(f'sum({cols[3]})', 'sum_ret_b')\
.withColumnRenamed(f'avg({cols[4]})', 'avg_rtt')\
.withColumnRenamed(f'avg({cols[5]})', 'avg_proc')

new_cols = ['src_ip', 'n_conn', 'sum_upl_b', 'sum_dwn_b', 'sum_ret_b', 'avg_rtt', 'avg_proc']

In [26]:
avg_conn_DF = start_DF.agg({"n_conn": 'avg'})\
            .withColumnRenamed('avg(count(1))', 'avg_n_conn')

avg_conn_DF.show(4)

+------------------+
|       avg(n_conn)|
+------------------+
|26.014568158168576|
+------------------+



## 3.3 - Top 5 active users

In [27]:
start_DF.select('src_ip', 'n_conn').sort('n_conn', ascending=False).show(5)

+--------------+------+
|        src_ip|n_conn|
+--------------+------+
| 246.25.63.193|  1175|
|246.25.221.106|   620|
|  180.102.5.86|   528|
|  246.25.63.82|   419|
|  180.102.5.42|   403|
+--------------+------+
only showing top 5 rows



## Clustering

### Pre-processing

- Vector assembler: put the features together
- StandardScaler - it could improve the performance of clustering

In [28]:
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans, GaussianMixture
from pyspark.ml.evaluation import ClusteringEvaluator

feat_cols = new_cols[1:]
print(feat_cols)
VA = VectorAssembler(inputCols=feat_cols, outputCol='features')
# Removing the mean does not change anything because of how clustering works
SS = StandardScaler(inputCol='features', outputCol='features_norm', withStd=True, withMean=True)

['n_conn', 'sum_upl_b', 'sum_dwn_b', 'sum_ret_b', 'avg_rtt', 'avg_proc']


In [29]:
# Instantiate a performance evaluator
eval_clust = ClusteringEvaluator(featuresCol='features_norm')

### K-means clustering

In [30]:
kmeans_obj = KMeans(featuresCol='features_norm', k=6, initMode='k-means||', maxIter=500)
pipeline_kmeans = Pipeline(stages=[VA, SS, kmeans_obj]).fit(start_DF)

                                                                                

In [31]:
kmeans_DF = pipeline_kmeans.transform(start_DF)
# Performance
sil_kmeans = eval_clust.evaluate(kmeans_DF)
print("Silhouette measure - K-means: ", sil_kmeans)
#0.42501770247509

Silhouette measure - K-means:  0.8761586097734072


### Gaussian Mixture Model clustering

In [33]:
gmm_obj = GaussianMixture(featuresCol='features_norm', k=6)
pipeline_gmm = Pipeline(stages=[VA, SS, gmm_obj]).fit(start_DF)

                                                                                

In [34]:
gmm_DF = pipeline_gmm.transform(start_DF)
# Performance
sil_gmm = eval_clust.evaluate(gmm_DF)
print("Silhouette measure - gmm: ", sil_gmm)

Silhouette measure - gmm:  0.1833907975267755
