# Machine learning

## Cluster Analysis

* Divide the question tags into groups (clusters).

In [None]:
# findspark.init() has to run before pyspark is imported in the next cell; it is
# expected to make the local Spark installation importable (see findspark docs).
import findspark
findspark.init()

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc, count, explode, split, regexp_replace, collect_list, avg, when, sum, lit

from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType

from pyspark.ml.feature import VectorAssembler, StandardScaler, MinMaxScaler
from pyspark.ml.clustering import KMeans

from pyspark.ml import Pipeline

import os

In [None]:
# Obtain the SparkSession for this notebook under the application name
# 'Cluster Analysis I'; getOrCreate() reuses an already running session if any.
spark = SparkSession.builder.appName('Cluster Analysis I').getOrCreate()

In [None]:
# Resolve the project root as the directory two levels above the notebook's
# working directory. os.path.dirname is used instead of splitting on '/' so the
# result stays an absolute path on any OS and for shallow working directories.
base_path = os.getcwd()

project_path = os.path.dirname(os.path.dirname(base_path))

# Location of the transformed questions dataset that the next cell loads.
data_input_path = os.path.join(project_path, 'output', 'questions-transformed')

In [None]:
# Load the transformed questions from data_input_path. No format is given, so
# Spark's configured default data source is used. The DataFrame is marked for
# caching because several later cells reuse it.
questionsDF = spark.read.option('path', data_input_path).load().cache()

In [None]:
# Inspect the column names and types of the loaded questions data.
questionsDF.printSchema()

In [None]:
# Number of question rows. This is the first action run on questionsDF, so it
# is also the point at which the cache() requested at load time gets populated.
questionsDF.count()

In [None]:
# Peek at the raw 'tags' column without truncating long values.
questionsDF.select('tags').show(truncate=False)

## Compute tag basic features

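* tag_frequency – number of questions carrying the tag
* avg_views – mean of `views` over those questions
* avg_comments – mean of `comments` over those questions
* accepted_answers – number of those questions that have an accepted answer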

In [None]:
# Build one row per tag with its basic numeric features.
# explode() turns each question into one row per tag it carries, so the
# aggregates below are computed over all questions that use a given tag.
tag_basic_features = (
    questionsDF
    .withColumn('tag', explode(col('tags')))
    .groupBy(col('tag'))
    .agg(
        count('*').alias('tag_frequency'),
        avg(col('views')).alias('avg_views'),
        avg(col('comments')).alias('avg_comments'),
        # isNotNull() is never NULL itself, so casting it to int gives a 0/1
        # flag per question; summing the flags counts accepted answers.
        sum(col('accepted_answer_id').isNotNull().cast('int')).alias('accepted_answers'),
    )
    .sort(col('tag_frequency').desc())
)

In [None]:
# Preview the per-tag feature table, most frequent tags first.
tag_basic_features.show()

## Construct the pipeline

Hint:
* Use `VectorAssembler` and `MinMaxScaler`.

In [None]:
# Columns of tag_basic_features used as clustering inputs.
features_array = [
    'tag_frequency',
    'avg_views',
    'avg_comments',
    'accepted_answers',
]

# Stage 1: pack the numeric columns into a single vector column.
assembler = VectorAssembler(outputCol='features', inputCols=features_array)

# Stage 2: rescale every feature to a common range before clustering.
scaler = MinMaxScaler(outputCol='scaled_features', inputCol='features')

# Stage 3: k-means with 4 clusters on the scaled vectors; the seed is fixed so
# repeated runs start from the same initialisation.
kmeans = KMeans(
    k=4,
    seed=1,
    featuresCol='scaled_features',
    predictionCol='prediction',
)

pipeline = Pipeline(stages=[assembler, scaler, kmeans])

# Fit all three stages on the per-tag feature table.
model = pipeline.fit(tag_basic_features)

<b>Apply the model on the data:</b>

In [None]:
# Run the fitted pipeline over the same per-tag table it was trained on; the
# KMeans stage writes each tag's cluster id to the 'prediction' column.
predictions = model.transform(tag_basic_features)

<b>See cluster label for each tag:</b>

In [None]:
# Print each tag next to its assigned cluster id (show() prints only the first
# rows of the result).
predictions.select('tag', 'prediction').show()

<b>See how many tags are in each cluster:</b>

In [None]:
# Cluster sizes: number of tags assigned to each cluster id.
predictions.groupBy(col('prediction')).count().show()

<b>See what tags are in each cluster:</b>

In [None]:
# Tags assigned to cluster 1, shown with their raw (unscaled) features.
# The filter runs before the select because 'prediction' is not among the
# selected columns; filtering afterwards would depend on Spark implicitly
# re-resolving a column that has already been projected away.
(
    predictions
    .filter(col('prediction') == 1)
    .select('tag', *features_array)
).show(truncate=False)

In [None]:
# Tags assigned to cluster 2, shown with their raw (unscaled) features.
# The filter runs before the select because 'prediction' is not among the
# selected columns; filtering afterwards would depend on Spark implicitly
# re-resolving a column that has already been projected away.
(
    predictions
    .filter(col('prediction') == 2)
    .select('tag', *features_array)
).show(truncate=False)

In [None]:
# Tags assigned to cluster 3, shown with their raw (unscaled) features.
# The filter runs before the select because 'prediction' is not among the
# selected columns; filtering afterwards would depend on Spark implicitly
# re-resolving a column that has already been projected away.
(
    predictions
    .filter(col('prediction') == 3)
    .select('tag', *features_array)
).show(truncate=False)

In [None]:
# Tags assigned to cluster 0, shown with their raw (unscaled) features.
# The filter runs before the select because 'prediction' is not among the
# selected columns; filtering afterwards would depend on Spark implicitly
# re-resolving a column that has already been projected away.
(
    predictions
    .filter(col('prediction') == 0)
    .select('tag', *features_array)
).show(truncate=False)