# KMeans Streaming Consumer 

Kafka utilities — query the current offsets of the `Throughput` topic from the command line:

    kafka-run-class.sh kafka.tools.GetOffsetShell --topic Throughput --broker-list c251-132:9092

In [24]:
from pykafka import KafkaClient
import numpy as np
import time
import datetime
import dateutil.parser
import ast
import sklearn.cluster
import threading
from threading import Thread
import datetime
import pickle

# ZooKeeper endpoint handed to pykafka (hard-coded for this cluster; adjust per deployment).
zkKafka='c251-137.wrangler.tacc.utexas.edu:2181'
client = KafkaClient(zookeeper_hosts=zkKafka)
# Alternative kept for reference: name a broker host directly instead of going through ZooKeeper.
#client = KafkaClient(hosts='c251-138.wrangler.tacc.utexas.edu:9092')
# Every cell below works against the 'Throughput' topic.
topic = client.topics['Throughput']
producer = topic.get_sync_producer()
# Default consumer; the benchmark cells later replace it with one created
# with reset_offset_on_start=True.
consumer = topic.get_simple_consumer()

In [25]:
# Display the brokers known to the client (sanity check of the connection).
client.brokers

{0: <pykafka.broker.Broker at 0x2b187df55350 (host=c251-137, port=9092, id=0)>}

# KMeans
## Deserialization

In [26]:
# Fetch a single message from the topic; block=True presumably waits until one is available.
message = consumer.consume(block=True)

NoBrokersAvailableError: Unable to connect to a broker to fetch metadata. See logs.

In [5]:
%%time
# The message value is a Python-literal string; parse it with ast.literal_eval
# and convert the result to a NumPy array.
data_np = np.array(ast.literal_eval(message.value))

CPU times: user 79.6 ms, sys: 23.8 ms, total: 103 ms
Wall time: 97.3 ms


In [6]:
# Display the shape of the parsed batch.
data_np.shape

(5000, 3)

In [7]:
# Generate initial centroids: one row per centroid, one column per dimension,
# drawn from a standard normal distribution. No seed is set, so the values
# differ between runs.
number_centroids = 100
number_dimensions = 3
centroids = np.random.randn(number_centroids, number_dimensions)

In [8]:
# Display the number of generated centroids.
len(centroids)

100

## Scikit

In [20]:
%%time
# Fit a full-batch KMeans on the parsed message, starting from the
# pre-generated centroids (n_init=1: a single run from that initialisation).
kmeans = sklearn.cluster.KMeans(n_clusters=len(centroids), init=centroids, n_init=1).fit(data_np)

CPU times: user 417 ms, sys: 0 ns, total: 417 ms
Wall time: 412 ms


In [18]:
# Display the initial centroids (this prints the whole array).
centroids

array([[-0.7002028 ,  0.77576458, -0.17041676],
       [-1.82605451, -0.42699323, -1.52393022],
       [-1.17083184, -0.23171123, -0.45316833],
       [ 0.91608726,  0.77776993, -0.18034497],
       [-0.83603306,  0.79606075,  1.23327943],
       [ 0.3435504 , -0.83231553,  0.54099276],
       [-0.13133628,  0.97030142,  0.62037197],
       [-0.0217872 , -0.95542082,  2.0665584 ],
       [ 1.89659217,  1.4582982 , -0.74867488],
       [-1.00632922,  0.08732205, -1.3864677 ],
       [-0.46760615,  0.90589473,  0.22941262],
       [ 0.05311013,  0.19882665, -0.54920606],
       [ 0.95847111,  2.20862734,  0.4518484 ],
       [-0.23262562, -0.74994096, -0.90978214],
       [ 0.28979571, -1.34760853, -0.99600084],
       [ 2.00307908,  0.02691777, -0.58832343],
       [-0.93352221,  0.12301137,  1.7616828 ],
       [-0.46733317, -1.0277732 , -0.92056329],
       [ 0.3911268 , -1.16388057, -0.23015355],
       [-1.32049963, -0.64727141,  0.24639691],
       [-0.44779005, -0.91229342,  0.821

In [14]:
# Display the cluster centres of the fitted model.
kmeans.cluster_centers_

array([[-1.42000834, -2.85080644, -1.08248241],
       [-3.01945235, -3.44440102, -2.01832067],
       [-1.69100832, -2.76633513, -3.21336799],
       [-1.34000764, -1.42502401, -2.99535839],
       [-3.32349688, -1.84917654, -1.85779039],
       [-1.49467397, -3.3369184 , -2.15419978],
       [-3.11494622, -2.41095018, -3.21619598],
       [-1.36614757, -1.71809147, -0.90242641],
       [-2.66814605, -2.6442657 , -1.04970949],
       [-1.93742978, -1.79181426, -1.86237772],
       [-0.83769497, -2.23131365, -2.11030812],
       [-2.64334721, -0.75015289, -2.32554955],
       [-2.55181604, -1.20061773, -1.02884268],
       [-2.33934223, -1.71091359, -2.945366  ],
       [-1.24558403, -0.85574291, -1.82340672],
       [-2.28062948, -2.54097752, -2.14290006]])

In [None]:
# Display the shape of the label array produced by the fit.
kmeans.labels_.shape

## Mini Batch KMeans

In [21]:
%%time
# Feed the same batch to MiniBatchKMeans through a single partial_fit call,
# starting from the same pre-generated centroids.
kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=len(centroids), init=centroids, n_init=1).partial_fit(data_np)

CPU times: user 12.9 ms, sys: 28 µs, total: 12.9 ms
Wall time: 11.8 ms


In [None]:
# Display the cluster centres after the mini-batch update.
kmeans.cluster_centers_

# Benchmark

Streaming Data from Kafka

## Multi-threading

In [22]:
# Serializes appends to the shared ``result`` log across worker threads.
_result_lock = threading.Lock()


def process_messages(number_messages=1, threadid=0):
    """Consume Kafka messages and feed each one to the shared k-means model.

    Intended as a ``Thread`` target. For every message the function measures
    three phases and records one CSV row per phase:

    * ``kmeans-kafka``   - time spent in ``consumer.consume``
    * ``kmeans-parsing`` - time to parse the message value into a NumPy array
    * ``kmeans-model``   - time spent in ``kmeans.partial_fit``

    Rows are buffered locally and appended to the module-level ``result``
    string once, under a lock, when the thread is done. Appending to the
    shared string directly from many threads is an unsynchronised
    read-modify-write and can silently drop rows.

    Args:
        number_messages: how many messages this thread should consume.
        threadid: identifier used only in the start-up log line.

    Reads the module-level ``consumer``, ``number_dimensions``,
    ``number_centroids`` and ``number_threads``; rebinds the module-level
    ``kmeans`` and ``number_points_per_message``; appends to ``result``.

    NOTE(review): ``kmeans.partial_fit`` is still called concurrently on one
    shared model object by all worker threads; whether that is safe depends
    on the estimator and is not addressed here.
    """
    global kmeans
    global result
    global number_points_per_message

    # Call form of print works identically on Python 2 and Python 3.
    print("Thread: %d, Process %d messages" % (threadid, number_messages))
    rows = []
    count = 0
    while count < number_messages:
        start = time.time()
        message = consumer.consume(block=True)
        end_kafka = time.time()
        data_np = np.array(ast.literal_eval(message.value))
        num_points = len(data_np)
        # Remembered globally so the driver can label its summary row.
        number_points_per_message = num_points
        end_parsing = time.time()
        kmeans = kmeans.partial_fit(data_np)
        end_kmeans = time.time()
        rows.append("kmeans-kafka,   %d, %d, %d, %d, %.5f\n" % (num_points, number_dimensions, number_centroids, number_threads, end_kafka - start))
        rows.append("kmeans-parsing, %d, %d, %d, %d, %.5f\n" % (num_points, number_dimensions, number_centroids, number_threads, end_parsing - end_kafka))
        rows.append("kmeans-model,   %d, %d, %d, %d, %.5f\n" % (num_points, number_dimensions, number_centroids, number_threads, end_kmeans - end_parsing))
        if count % 100 == 0:
            print("Messages processed: %d" % count)
        count += 1

    # Publish this thread's rows atomically with respect to the other workers.
    with _result_lock:
        result += "".join(rows)

In [None]:
%%time

# Benchmark driver: for every repeat, start from a fresh model, let
# `number_threads` workers consume `number_messages` messages in total, then
# append all timing rows to the results CSV.
#
# NOTE: `output_file` must already be open; it is created by the cell that
# builds RESULT_FILE, so run that cell before this one.

# configuration
number_centroids = 10
number_dimensions = 3
number_messages = 1000
number_threads = 24
repeats = 3

for repeat in range(repeats):
    # Generate initial centroids
    centroids = np.random.randn(number_centroids, number_dimensions)
    kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=len(centroids), init=centroids, n_init=1)
    consumer = topic.get_simple_consumer(reset_offset_on_start=True)
    result = ""
    global_start = time.time()
    # Integer split that also works on Python 3; the remainder is handed out
    # one message at a time so that exactly `number_messages` are consumed
    # (plain division silently dropped it: 1000 / 24 -> 24 * 41 = 984).
    per_thread_messages, leftover = divmod(number_messages, number_threads)
    threads = []
    for thread_id in range(number_threads):
        share = per_thread_messages + (1 if thread_id < leftover else 0)
        t = Thread(target=process_messages, kwargs={"number_messages": share,
                                                    "threadid": thread_id})
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    global_end = time.time()

    # Summary row: wall-clock time of the whole repeat.
    result += "kmeans-run,   %d, %d, %d, %d, %.5f\n"%(number_points_per_message, number_dimensions, number_centroids, number_threads, global_end-global_start)

    output_file.write(result)
    output_file.flush()

## Redis Parameter Server

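Start a Redis server (e.g. the one installed with Anaconda) with the command below. Note that `--protected-mode no` lets other hosts connect without authentication, so only use it on a trusted network: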

    redis-server --protected-mode no

In [6]:
import redis
# Connection to the Redis instance that stores the shared model
# (host and port are hard-coded for this cluster).
r = redis.StrictRedis(host='c251-132', port=6379, db=0)

def put_model(model):
    """Pickle ``model`` and store it in Redis under the key ``'kmeans'``."""
    r.set('kmeans', pickle.dumps(model))


def get_model():
    """Load and return the model stored in Redis under the key ``'kmeans'``.

    Raises:
        KeyError: if no value is stored under ``'kmeans'`` (previously this
            surfaced as an opaque ``TypeError`` from ``pickle.loads(None)``).
    """
    payload = r.get("kmeans")
    if payload is None:
        raise KeyError("no 'kmeans' model stored in Redis")
    # SECURITY: unpickling executes arbitrary code embedded in the payload.
    # Only use this against a Redis instance that untrusted hosts cannot
    # write to.
    return pickle.loads(payload)

In [7]:
# Serializes appends to the shared ``result`` log across worker threads.
_result_lock = threading.Lock()


def process_messages_kmeans_redis(number_messages=1, threadid=0):
    """Consume Kafka messages and update a k-means model kept in Redis.

    Intended as a ``Thread`` target. For every message the model is fetched
    with ``get_model()``, updated with ``partial_fit`` and written back with
    ``put_model()``. Five CSV rows are recorded per message:

    * ``kmeans-kafka``     - time spent in ``consumer.consume``
    * ``kmeans-parsing``   - time to parse the message value into a NumPy array
    * ``kmeans-model-get`` - time to fetch the model
    * ``kmeans-model``     - time spent in ``partial_fit``
    * ``kmeans-model-put`` - time to store the model

    Rows for *all* messages are buffered and appended to the module-level
    ``result`` once at the end. The previous version re-assigned the buffer
    on every iteration, so only the last message was recorded, and it failed
    when ``number_messages`` was 0.

    Args:
        number_messages: how many messages this thread should consume.
        threadid: identifier used only in the start-up log line.

    Reads the module-level ``consumer``, ``number_dimensions``,
    ``number_centroids`` and ``number_threads``; rebinds the module-level
    ``kmeans`` and ``number_points_per_message``; appends to ``result``.

    NOTE(review): with more than one worker the get/fit/put cycle is not
    atomic, so concurrent updates of the stored model can overwrite each
    other.
    """
    global result
    global number_points_per_message
    global kmeans

    # Call form of print works identically on Python 2 and Python 3.
    print("Thread: %d, Process %d messages" % (threadid, number_messages))
    rows = []
    count = 0
    while count < number_messages:
        start = time.time()
        message = consumer.consume(block=True)
        end_kafka = time.time()
        data_np = np.array(ast.literal_eval(message.value))
        num_points = len(data_np)
        # Remembered globally so the driver can label its summary row.
        number_points_per_message = num_points
        end_parsing = time.time()
        try:
            kmeans = get_model()
        except Exception:
            # No usable model could be loaded: keep training the
            # module-level model created by the driver cell.
            pass

        end_model_get = time.time()
        kmeans = kmeans.partial_fit(data_np)
        end_kmeans = time.time()
        put_model(kmeans)
        end_model_put = time.time()
        rows.append("kmeans-kafka,   %d, %d, %d, %d, %.5f\n" % (num_points, number_dimensions, number_centroids, number_threads, end_kafka - start))
        rows.append("kmeans-parsing, %d, %d, %d, %d, %.5f\n" % (num_points, number_dimensions, number_centroids, number_threads, end_parsing - end_kafka))
        rows.append("kmeans-model-get,   %d, %d, %d, %d, %.5f\n" % (num_points, number_dimensions, number_centroids, number_threads, end_model_get - end_parsing))
        rows.append("kmeans-model,   %d, %d, %d, %d, %.5f\n" % (num_points, number_dimensions, number_centroids, number_threads, end_kmeans - end_model_get))
        rows.append("kmeans-model-put,   %d, %d, %d, %d, %.5f\n" % (num_points, number_dimensions, number_centroids, number_threads, end_model_put - end_kmeans))
        if count % 100 == 0:
            print("Number Points/Message: %d, Messages processed: %d" % (num_points, count))
        count += 1

    # Publish this thread's rows atomically with respect to the other workers.
    with _result_lock:
        result += "".join(rows)

In [8]:
# `os` is not imported by the notebook's import cell; without it the
# directory creation below raised NameError, which the old bare `except`
# swallowed, so "results/" was never created.
import os

# Open a timestamped CSV for this run and write the header row. The handle
# stays open on purpose: the benchmark cells append to `output_file`.
run_timestamp = datetime.datetime.now()
RESULT_FILE = "results/kafka-throughput-kmeans-multithread-" + run_timestamp.strftime("%Y%m%d-%H%M%S") + ".csv"
# Create the output directory only when it is missing; any real failure
# (e.g. permissions) is now reported instead of being silently ignored.
if not os.path.isdir("results"):
    os.makedirs("results")
output_file = open(RESULT_FILE, "w")
output_file.write("Type, Number_Points, Dimensions, Number_Centroids, Number_Threads, Time\n")

In [None]:
%%time

# Benchmark driver for the Redis-backed variant: for every repeat, start from
# a fresh model, let `number_threads` workers consume `number_messages`
# messages in total, then append all timing rows to the results CSV.

# configuration
number_centroids = 10
number_dimensions = 3
number_messages = 1000
number_threads = 1
repeats = 3

for repeat in range(repeats):
    # Generate initial centroids
    centroids = np.random.randn(number_centroids, number_dimensions)
    kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=len(centroids), init=centroids, n_init=1)
    # Seed Redis with the fresh model. The workers always prefer the model
    # found in Redis, so without this a model left over from an earlier
    # repeat or run (possibly with a different number of centroids) would be
    # trained instead of the one configured above.
    put_model(kmeans)
    consumer = topic.get_simple_consumer(reset_offset_on_start=True)
    result = ""
    global_start = time.time()
    # Integer split that also works on Python 3; the remainder is handed out
    # one message at a time so that exactly `number_messages` are consumed.
    per_thread_messages, leftover = divmod(number_messages, number_threads)
    threads = []
    for thread_id in range(number_threads):
        share = per_thread_messages + (1 if thread_id < leftover else 0)
        t = Thread(target=process_messages_kmeans_redis, kwargs={"number_messages": share,
                                                                 "threadid": thread_id})
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    global_end = time.time()

    # Summary row: wall-clock time of the whole repeat.
    result += "kmeans-run,   %d, %d, %d, %d, %.5f\n"%(number_points_per_message, number_dimensions, number_centroids, number_threads, global_end-global_start)

    output_file.write(result)
    output_file.flush()

Thread: 0, Process 1000 messages
Number Points/Message: 5000, Messages processed: 0
Number Points/Message: 5000, Messages processed: 100


In [4]:
# Load the model currently stored in Redis.
m=get_model()

In [8]:
# Number of cluster centres in the loaded model.
m.cluster_centers_.shape[0]

10