In [1]:
import os
import pickle
from math import log

import numpy as np
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark.sql import Row, SparkSession
from pyspark.sql.types import *

In [2]:
# Reuse the active Spark session if there is one, otherwise create it,
# and keep a handle on its SparkContext for building RDDs below.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

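# BisectingKMeans.train — new in version 2.0.0. (Reference only: the cells below call KMeans.train, which takes different parameters.)
classmethod train(rdd, k=4, maxIterations=20, minDivisibleClusterSize=1.0, seed=-1888008604)
     # Runs the bisecting k-means algorithm and returns the model.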
     Parameters:	
               rdd – Training points as an RDD of Vector or convertible sequence types.
               k – The desired number of leaf clusters. The actual number could be smaller if there are no divisible leaf clusters. (default: 4)
               maxIterations – Maximum number of iterations allowed to split clusters. (default: 20)
               minDivisibleClusterSize – Minimum number of points (if >= 1.0) or the minimum proportion of points (if < 1.0) of a divisible cluster. (default: 1)
               seed – Random seed value for cluster initialization. (default: -1888008604 from classOf[BisectingKMeans].getName.##)


In [3]:
# Four 2-D points, written out row by row (shape 4 x 2).
data = np.array([
    [0.0, 0.0],
    [1.0, 1.0],
    [9.0, 8.0],
    [8.0, 9.0],
])

In [4]:
# Fit a 2-cluster k-means model on the toy points, using random
# initialization with a fixed seed.
model = KMeans.train(
    sc.parallelize(data),
    k=2,
    maxIterations=10,
    initializationMode="random",
    seed=50,
    initializationSteps=5,
    epsilon=1e-4,
)

In [5]:
# Check whether the points (0, 0) and (1, 1) are assigned to the same cluster.
model.predict(np.array([0.0, 0.0])) == model.predict(np.array([1.0, 1.0]))

True

In [6]:
# Number of clusters held by the trained model.
model.k

2

In [7]:
# K-means cost of the model on the training points (per the MLlib docs:
# the sum of squared distances of points to their nearest center).
model.computeCost(sc.parallelize(data))

2.0000000000000004

In [8]:
# Five points on the diagonal: (1, 1) through (5, 5).
rdd = sc.parallelize([(i, i) for i in range(1, 6)])

In [9]:
def runKmeans(data, sample_dataset, k, count, maxIterations=1, seed=None):
    """Train k-means on a sample and return the mean cost per point on ``data``.

    Parameters
    ----------
    data : RDD
        Points the fitted model is evaluated on (passed to ``computeCost``).
    sample_dataset : RDD
        Points the model is trained on.
    k : int
        Number of clusters requested from ``KMeans.train``.
    count : int
        Number of points in ``data``. Supplying it avoids an extra Spark job;
        if it is falsy (``None`` or 0) it is recomputed with ``data.count()``.
    maxIterations : int, optional
        Iteration cap passed to ``KMeans.train`` (default 1, the value that
        was previously hard-coded).
    seed : int, optional
        Seed passed to ``KMeans.train``; ``None`` leaves it unseeded.

    Returns
    -------
    float
        ``computeCost(data)`` of the fitted model divided by the point count.
    """
    # MLlib documents two initialization modes, "random" and "k-means||"
    # (the k-means++ variant). The previous value "kmean++" was neither.
    clusters = KMeans.train(
        sample_dataset,
        k,
        maxIterations=maxIterations,
        initializationMode="k-means||",
        seed=seed,
    )
    # Prefer the caller-supplied size so the data set is not counted again.
    n_points = count if count else data.count()
    return clusters.computeCost(data) / n_points
# Smoke test: train on the first three diagonal points, evaluate on all five.
runKmeans(
    data=rdd,
    sample_dataset=sc.parallelize([(1, 1), (2, 2), (3, 3)]),
    k=3,
    count=rdd.count(),
)

2.0

In [10]:
# Example inputs for computeIntrinsicDimension: two sizes (n1, n2) and
# the two matching error values (e1, e2).
n1, n2 = 10, 100
e1, e2 = 10000, 100

In [11]:
def computeIntrinsicDimension(n1, e1, n2, e2):
    """Return ``2 * (ln n2 - ln n1) / (ln e1 - ln e2)``.

    Parameters
    ----------
    n1, n2 : number
        The two sizes being compared; each must be positive for ``log``.
    e1, e2 : number
        The error values measured at ``n1`` and ``n2``; each must be
        positive, and they must differ or the division fails.

    Returns
    -------
    float
        Twice the ratio of the log size increase to the log error decrease.
    """
    return 2 * (log(n2) - log(n1)) / (log(e1) - log(e2))
# Worked example using the n1/e1/n2/e2 constants defined earlier.
print(computeIntrinsicDimension(n1, e1, n2, e2))



1.0


# Convert a DataFrame to an RDD (either form below works)
rdd = df.rdd.map(tuple)
or

rdd = df.rdd.map(list)

In [12]:
# Location of the homework parquet file. Set the HW5_PARQUET_PATH environment
# variable to point at your own copy; the default is the original local path.
file_path = os.environ.get(
    "HW5_PARQUET_PATH",
    "/Users/user/Desktop/PSDS/edX/UCSandiago_MicroMasters/BigDataAUSpark/wk5_6/hw5-small.parquet",
)
df = spark.read.parquet(file_path)

In [14]:
def run(df, sampleSizes=(10000, 20000), ks=(10, 200, 700, 2000)):
    """Estimate intrinsic dimension from k-means costs at several cluster counts.

    Every column of ``df`` is cast to float and the rows become an RDD of
    lists. For each sample size, a sample is drawn without replacement,
    ``runKmeans`` is evaluated for every value in ``ks`` against the full
    data set, and ``computeIntrinsicDimension`` is applied to each pair of
    consecutive ``k`` values.

    Parameters
    ----------
    df : DataFrame
        Input data; all columns must be castable to ``FloatType``.
    sampleSizes : sequence of int, optional
        Training sample sizes (default: 10000 and 20000).
    ks : sequence of int, optional
        Cluster counts to train with, in increasing order
        (default: 10, 200, 700, 2000).

    Returns
    -------
    dict
        Maps ``'ID_<sampleSize>_<kLow>_<kHigh>'`` to the estimate for that
        sample size and pair of consecutive cluster counts.
    """
    # Cast all columns in one projection instead of chaining withColumn calls.
    floatDf = df.select([df[name].cast(FloatType()).alias(name) for name in df.columns])

    # The full data set is scanned by every sample/cost action below, so keep
    # it cached for the duration of the run and release it afterwards.
    points = floatDf.rdd.map(list).cache()
    try:
        # Count once; previously this Spark action was repeated for every k.
        totalCount = points.count()

        results = {}
        for size in sampleSizes:
            # takeSample's first argument is withReplacement (a boolean).
            sample = sc.parallelize(points.takeSample(False, size))
            # Mean squared distance on the full data for each cluster count.
            msdList = [runKmeans(points, sample, perK, totalCount) for perK in ks]

            measured = list(zip(ks, msdList))
            for (kLow, dLow), (kHigh, dHigh) in zip(measured, measured[1:]):
                key = 'ID_{0}_{1}_{2}'.format(size, kLow, kHigh)
                results[key] = computeIntrinsicDimension(kLow, dLow, kHigh, dHigh)
        return results
    finally:
        points.unpersist()
#print(run(df))
   

{'ID_10000_10_200': 1.5255694332344738, 'ID_10000_200_700': 1.2442099315834783, 'ID_10000_700_2000': 1.2585590236134818, 'ID_20000_10_200': 1.577912839175332, 'ID_20000_200_700': 1.2111828964759064, 'ID_20000_700_2000': 1.152710266127354}
