# bigdata-LA3 tutorial

## 1. Frequent itemsets
### pyspark

In [None]:
# findspark.init() is called before `import pyspark` so that the pyspark package
# can be located (NOTE(review): assumes a local Spark installation that
# findspark can discover, e.g. via SPARK_HOME — confirm for your environment).
import findspark
findspark.init()
import pyspark
# Start a SparkContext with master 'local' and application name 'bigdata-LA3'.
sc = pyspark.SparkContext('local','bigdata-LA3')

### Create RDD

In [None]:
import sys

# Toy market-basket data: each inner list is one basket of item ids (1..5).
datalist = [
    [1, 2, 5],
    [1, 2, 3, 5],
    [1, 2],
    [1, 4, 5],
    [1, 3, 5],
    [2, 3, 4],
    [2, 4],
    [2, 3],
]
# Distribute the baskets and pair each one with its position, giving
# (basket, index) tuples; the index is used later as the basket id.
rdd = sc.parallelize(datalist).zipWithIndex()
rdd.collect()

### Create dataframe

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) a SparkSession named "bigdata-LA3" for the
# DataFrame API used in the following cells.
spark = (
    SparkSession.builder
    .appName("bigdata-LA3")
    .getOrCreate()
)

In [None]:
def get_row(row):
    """Convert one ``(items, index)`` pair into a Spark ``Row``.

    Args:
        row: A two-element sequence whose first element is the basket's item
            list and whose second element is the basket index.

    Returns:
        A ``Row`` with fields ``id`` (the index as ``int``) and ``items``.
    """
    from pyspark.sql import Row

    basket_id = int(row[1])
    return Row(id=basket_id, items=row[0])

In [None]:
# Turn every (items, index) pair of the RDD into a Row(id, items) ...
dfRDD = rdd.map(get_row)
# ... and build a DataFrame from those rows (columns come from the Row fields).
df = spark.createDataFrame(dfRDD)
# Print the resulting table for inspection.
df.show()

### FPGrowth algorithm
[FP-Growth](http://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html)

In [None]:
from pyspark.ml.fpm import FPGrowth

# Configure FP-Growth on the "items" column with a minimum support and a
# minimum confidence of 0.5 each, then fit it on the basket DataFrame.
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.5)
model = fpGrowth.fit(df)

# Display frequent itemsets.
freq = model.freqItemsets
freq.show()

# Display generated association rules.
asso = model.associationRules
asso.show()
# transform() checks each basket's items against all the association rules and
# summarizes the matching consequents as the prediction.
model.transform(df).show()

### Interest

In [None]:
# `size` and `abs` must be the Spark SQL column functions: `size` was never
# imported (NameError) and the Python builtin `abs` is not the column function.
# Importing the module under an alias avoids shadowing the builtin `abs`.
from pyspark.sql import functions as F

# Number of baskets (rows of df); used to turn a frequency into a support.
totalItems = df.count()
# Keep only the frequent itemsets that contain a single item, so they can be
# matched against rule consequents.
freq_df = freq.filter(F.size(freq["items"]) == 1)
# Attach to every rule the frequency of its consequent.
frequent = asso.join(freq_df, asso["consequent"] == freq_df["items"])
# interest = |confidence(rule) - support(consequent)|
interset = frequent.withColumn(
    "interest",
    F.abs(frequent["confidence"] - frequent["freq"] / totalItems),
).select("antecedent", "consequent", "confidence", "items", "freq", "interest")
interset.show()

## 2. Clustering

[K-means](https://github.com/glatard/big-data-analytics-course/tree/master/kmeans)

### data preparation

In [None]:
def getItemsVector(rdd):
    """Expand one basket into per-item membership records.

    Args:
        rdd: A pair ``(items, basket)`` where ``items`` is the collection of
            item ids in the basket and ``basket`` is the basket's index.

    Returns:
        A list with one ``(item, (basket, flag))`` tuple for each item id
        1 through 5, where ``flag`` is 1 if the item is in the basket, else 0.
    """
    basket_items, basket_index = rdd[0], rdd[1]
    return [
        (candidate, (basket_index, int(candidate in basket_items)))
        for candidate in range(1, 6)
    ]


In [None]:
#items and baskets
print(rdd.collect())
print("\nitems in baskets\n(item, [(basket, T/F)...])")
# groupByKey does not guarantee the order of the values inside a group, but the
# coordinate vectors built later rely on every item listing its baskets in the
# same order. Sorting the (basket, flag) pairs orders them by basket index.
data = rdd.flatMap(getItemsVector).groupByKey().map(lambda x: [x[0], sorted(x[1])])
data.collect()

### distance

In [None]:
def getCoordinate(data):
    """Return the coordinate tuple of one item record.

    Args:
        data: A sequence whose element at index 1 is an iterable of pairs;
            the second element of each pair is one coordinate value.

    Returns:
        A tuple with the second element of every pair, in iteration order.
    """
    return tuple(entry[1] for entry in data[1])

def getDistance(t1, t2):
    """Return the squared Euclidean distance between two equal-length vectors.

    Note that no square root is taken: the result is the sum of squared
    component differences.

    Args:
        t1: First coordinate sequence.
        t2: Second coordinate sequence; must have the same length as ``t1``
            (checked with an ``assert``).

    Returns:
        The sum of ``(t1[i] - t2[i]) ** 2`` over all components (``0`` for
        empty inputs).
    """
    import math
    assert len(t1) == len(t2)

    return sum(math.pow(a - b, 2) for a, b in zip(t1, t2))

# Collect the item records once: each data.collect() is a separate Spark job,
# and the same snapshot should be used for the listing and for the distance.
collected_items = data.collect()
for d in collected_items:
    print(getCoordinate(d))

print("0-1 distance:",getDistance(getCoordinate(collected_items[0]), getCoordinate(collected_items[1])))

### initialization

In [None]:
def getInit(s, n):
    """Pick ``n`` distinct item ids from 1..5 reproducibly.

    Args:
        s: Seed passed to ``random.seed`` (this reseeds the module-level
            random generator).
        n: Number of ids to draw.

    Returns:
        A list of ``n`` distinct values sampled from ``[1, 2, 3, 4, 5]``.
    """
    from random import sample, seed

    candidates = [1, 2, 3, 4, 5]
    seed(s)
    # Draw n elements without replacement.
    return sample(candidates, n)
getInit(1,3)

### init centroids

In [None]:
# Choose two item ids (seed 1) to act as the initial centroids.
inits = getInit(1, 2)
print("inits", inits)
# Keep only the item records whose id was selected.
centroids = data.filter(lambda record: record[0] in inits)
print("centroids", centroids.collect())

# Reduce each selected record to its coordinate tuple.
centroids_coordinate = centroids.map(getCoordinate).collect()
print("centroids coordinate", centroids_coordinate)

### First iteration

In [None]:
def getCluster(coordinate, centroids):
    """Return the index of the centroid closest to ``coordinate``.

    Distances come from ``getDistance``. On ties the earliest centroid wins,
    because a later one must be strictly closer to replace it.

    Args:
        coordinate: Coordinate tuple of the point to assign.
        centroids: Sequence of centroid coordinate tuples.

    Returns:
        The index into ``centroids`` of the nearest centroid, or ``-1`` if no
        centroid has a distance below infinity (e.g. ``centroids`` is empty).
    """
    best_index = -1
    best_distance = float("inf")
    for index, centroid in enumerate(centroids):
        candidate = getDistance(coordinate, centroid)
        if candidate < best_distance:
            best_index, best_distance = index, candidate
    return best_index

def getNewCentroid(clusterRDD):
    """Compute the mean coordinate of every cluster.

    Args:
        clusterRDD: A list of clusters, each a list of item records accepted
            by ``getCoordinate``.

    Returns:
        A list with one tuple per cluster holding the component-wise mean of
        its members' coordinates. A cluster without members yields an empty
        tuple.
    """
    from numpy import mean

    new_centroids = []
    for members in clusterRDD:
        points = [getCoordinate(member) for member in members]
        # zip(*points) regroups the coordinates by dimension.
        new_centroids.append(tuple(mean(axis) for axis in zip(*points)))
    return new_centroids

def iterate_kmeans(items, centroids):
    """Assign every item to its nearest centroid (one k-means assignment step).

    Args:
        items: Iterable of item records; ``record[0]`` is the item id and the
            record is accepted by ``getCoordinate``.
        centroids: Sequence of centroid coordinate tuples.

    Returns:
        A pair ``(cluster, clusterRDD)``. Both are lists with one entry per
        centroid: ``cluster[i]`` holds the ids of the items assigned to
        centroid ``i`` and ``clusterRDD[i]`` holds the full records.
    """
    k = len(centroids)
    cluster = [[] for _ in range(k)]
    clusterRDD = [[] for _ in range(k)]

    for record in items:
        target = getCluster(getCoordinate(record), centroids)
        cluster[target].append(record[0])
        clusterRDD[target].append(record)

    return cluster, clusterRDD

### K-means

In [None]:
# The item records do not change between iterations, so run the Spark job once
# instead of re-collecting (and recomputing) the RDD on every pass.
kmeans_items = data.collect()

preCluster = None
while True:
    # Assignment step: put every item into the cluster of its nearest centroid.
    cluster, clusterRDD = iterate_kmeans(kmeans_items, centroids_coordinate)
    if cluster == preCluster:
        # Assignments did not change since the previous pass: converged.
        break
    print("iterate", cluster)
    preCluster = cluster
    # Update step: move each centroid to the mean of its members.
    centroids_coordinate = getNewCentroid(clusterRDD)

print("finish", cluster)

In [None]:
# Release Spark resources at the end of the notebook: stop the SparkContext
# created at the top, then the SparkSession.
sc.stop()
spark.stop()