# Streaming Consumer for Measuring Latency

In [1]:
from pykafka import KafkaClient
import numpy as np
import time
import datetime
import dateutil.parser
import ast
import sklearn.cluster
import threading
from threading import Thread
import datetime
import pickle

# ZooKeeper endpoint handed to KafkaClient to reach the Kafka cluster.
zkKafka='c251-122.wrangler.tacc.utexas.edu:2181'
client = KafkaClient(zookeeper_hosts=zkKafka)
# Alternative kept for reference: connect to a broker host directly.
#client = KafkaClient(hosts='c251-142.wrangler.tacc.utexas.edu:9092')
# Module-level handles for the 'Throughput' topic; the cells below read
# messages through `consumer`.
topic = client.topics['Throughput']
producer = topic.get_sync_producer()
consumer = topic.get_simple_consumer()

No handlers could be found for logger "kazoo.client"


NoBrokersAvailableError: Unable to connect to a broker to fetch metadata. See logs.

# KMeans
## Deserialization

In [None]:
# Fetch a single message from the topic, blocking until one is available.
message = consumer.consume(block=True)

In [None]:
%%time
# Parse the message payload as a Python literal and convert it to a NumPy array.
data_np = np.array(ast.literal_eval(message.value))

In [None]:
# Display the dimensions of the parsed batch.
data_np.shape

In [None]:
# Generate initial centroids: a (number_centroids x number_dimensions) array
# of samples drawn with np.random.randn.
number_centroids = 16
number_dimensions = 3
centroids = np.random.randn(number_centroids, number_dimensions)

In [None]:
# Display the number of initial centroids (rows of the array).
len(centroids)

## Scikit

In [None]:
%%time
# Fit a full KMeans model on the parsed batch, starting from the explicit
# initial centroids with a single initialization run (n_init=1).
kmeans = sklearn.cluster.KMeans(n_clusters=len(centroids), init=centroids, n_init=1).fit(data_np)

In [None]:
# Display the randomly generated initial centroids.
centroids

In [None]:
# Display the cluster centers of the fitted model.
kmeans.cluster_centers_

In [None]:
# Display the shape of the label array produced by the fit.
kmeans.labels_.shape

## Mini Batch KMeans

In [None]:
%%time
# Build a MiniBatchKMeans model seeded with the same initial centroids and
# feed it the parsed batch through a single partial_fit call.
kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=len(centroids), init=centroids, n_init=1).partial_fit(data_np)

In [None]:
# Display the cluster centers after the mini-batch update.
kmeans.cluster_centers_

# Benchmark

Streaming Data from Kafka

In [None]:
# Guards appends to the shared ``result`` string. ``result += ...`` is a
# read-modify-write on a global, so worker threads running concurrently could
# otherwise overwrite each other's rows and silently drop measurements.
_result_lock = threading.Lock()


def process_messages(number_messages=1, threadid=0):
    """Consume Kafka messages, update the shared model and record timings.

    For every message the function measures three phases and appends one CSV
    row per phase to the global ``result`` string:

    * ``kmeans-kafka``   -- time spent waiting for ``consumer.consume``
    * ``kmeans-parsing`` -- time spent turning the payload into a NumPy array
    * ``kmeans-model``   -- time spent in ``kmeans.partial_fit``

    Each row has the form
    ``<type>, <points>, <dimensions>, <centroids>, <threads>, <seconds>``.

    Args:
        number_messages: How many messages this worker consumes.
        threadid: Identifier of the worker, used only for the log line.

    Reads the globals ``consumer``, ``number_dimensions``, ``number_centroids``
    and ``number_threads``; rebinds ``kmeans``, ``result`` and
    ``number_points_per_message``.
    """
    global kmeans
    global result
    global number_points_per_message

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Thread: %d, Process %d messages" % (threadid, number_messages))
    count = 0
    while count < number_messages:
        start = time.time()
        message = consumer.consume(block=True)
        end_kafka = time.time()
        data_np = np.array(ast.literal_eval(message.value))
        num_points = len(data_np)
        number_points_per_message = num_points
        end_parsing = time.time()
        # NOTE(review): the shared model is updated without synchronization,
        # so the measured time reflects concurrent partial_fit calls --
        # confirm this is the intended benchmark setup.
        kmeans = kmeans.partial_fit(data_np)
        end_kmeans = time.time()

        # Format the rows outside the lock so the critical section stays tiny.
        config = (num_points, number_dimensions, number_centroids, number_threads)
        rows = (
            "kmeans-kafka,   %d, %d, %d, %d, %.5f\n" % (config + (end_kafka - start,))
            + "kmeans-parsing, %d, %d, %d, %d, %.5f\n" % (config + (end_parsing - end_kafka,))
            + "kmeans-model,   %d, %d, %d, %d, %.5f\n" % (config + (end_kmeans - end_parsing,))
        )
        with _result_lock:
            result += rows

        if count % 100 == 0:
            print("Messages processed: %d" % count)
        count += 1

In [3]:
import redis
# Module-level Redis connection (database 0) used by put_model/get_model.
r = redis.StrictRedis(host='c251-123', port=6379, db=0)

def put_model(model):
    """Pickle ``model`` and store the bytes in Redis under the key 'kmeans'."""
    serialized = pickle.dumps(model)
    r.set('kmeans', serialized)
    
def get_model():
    """Load and return the model stored in Redis under the key "kmeans".

    Returns:
        The unpickled object previously stored by ``put_model``.

    Raises:
        KeyError: If nothing is stored under "kmeans" yet. Without this check
            the ``None`` returned by Redis would reach ``pickle.loads`` and
            fail with an unrelated-looking TypeError.
    """
    payload = r.get("kmeans")
    if payload is None:
        raise KeyError("no model stored in Redis under key 'kmeans'; call put_model() first")
    # SECURITY: unpickling executes code embedded in the payload. Only use this
    # against a Redis instance whose contents are trusted.
    return pickle.loads(payload)

In [None]:
# Guards appends to the shared ``result`` string, which several worker threads
# extend; an unsynchronized ``result += ...`` can lose rows.
_result_lock = threading.Lock()


def process_messages_kmeans_redis(number_messages=1, threadid=0):
    """Consume Kafka messages and update a model that lives in Redis.

    For every message the model is fetched with ``get_model``, updated with
    ``partial_fit`` and written back with ``put_model``. Five CSV rows per
    message are recorded (``kmeans-kafka``, ``kmeans-parsing``,
    ``kmeans-model-get``, ``kmeans-model``, ``kmeans-model-put``), each of the
    form ``<type>, <points>, <dimensions>, <centroids>, <threads>, <seconds>``.
    The rows of all messages are appended to the global ``result`` once the
    worker has finished.

    Args:
        number_messages: How many messages this worker consumes.
        threadid: Identifier of the worker, used only for the log line.

    Reads the globals ``consumer``, ``number_dimensions``, ``number_centroids``
    and ``number_threads``; rebinds ``result`` and
    ``number_points_per_message``.
    """
    global result
    global number_points_per_message

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Thread: %d, Process %d messages" % (threadid, number_messages))
    # Collect the rows of every message; previously only the rows of the last
    # message survived, and zero messages raised a NameError.
    rows = []
    count = 0
    while count < number_messages:
        start = time.time()
        message = consumer.consume(block=True)
        end_kafka = time.time()
        data_np = np.array(ast.literal_eval(message.value))
        num_points = len(data_np)
        number_points_per_message = num_points
        end_parsing = time.time()
        # NOTE(review): get_model/partial_fit/put_model is not atomic across
        # workers, so concurrent threads may overwrite each other's update --
        # confirm this is acceptable for the benchmark.
        kmeans = get_model()
        end_model_get = time.time()
        kmeans = kmeans.partial_fit(data_np)
        end_kmeans = time.time()
        put_model(kmeans)
        end_model_put = time.time()

        config = (num_points, number_dimensions, number_centroids, number_threads)
        rows.append("kmeans-kafka,   %d, %d, %d, %d, %.5f\n" % (config + (end_kafka - start,)))
        rows.append("kmeans-parsing, %d, %d, %d, %d, %.5f\n" % (config + (end_parsing - end_kafka,)))
        rows.append("kmeans-model-get,   %d, %d, %d, %d, %.5f\n" % (config + (end_model_get - end_parsing,)))
        rows.append("kmeans-model,   %d, %d, %d, %d, %.5f\n" % (config + (end_kmeans - end_model_get,)))
        rows.append("kmeans-model-put,   %d, %d, %d, %d, %.5f\n" % (config + (end_model_put - end_kmeans,)))
        if count % 100 == 0:
            print("Messages processed: %d" % count)
        count += 1

    with _result_lock:
        result += "".join(rows)

In [None]:
# `os` is needed for the directory handling below and was never imported.
import os

# One CSV file per notebook run, named after the start time of the run.
run_timestamp = datetime.datetime.now()
RESULT_FILE = "results/kafka-throughput-kmeans-multithread-" + run_timestamp.strftime("%Y%m%d-%H%M%S") + ".csv"
# Create the output directory only when it is missing. Any real failure
# (e.g. missing permissions) now surfaces instead of being swallowed.
if not os.path.isdir("results"):
    os.makedirs("results")
# Left open on purpose: the benchmark cell writes its rows to this handle.
output_file = open(RESULT_FILE, "w")
output_file.write("Type, Number_Points, Dimensions, Number_Centroids, Number_Threads, Time\n")

In [None]:
%%time

# Benchmark driver: for each repeat, start from a fresh model and consumer,
# let `number_threads` workers process the messages, then write the collected
# timing rows plus one overall "kmeans-run" row to the result file.
# kmeans, result, number_points_per_message and number_threads are plain
# module-level names here; `global` statements are not needed at this level.

# configuration
number_centroids = 10
number_dimensions = 3
number_messages = 1000
number_threads = 24
repeats = 3

for repeat in range(repeats):
    # Generate initial centroids
    centroids = np.random.randn(number_centroids, number_dimensions)
    kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=len(centroids), init=centroids, n_init=1)
    # NOTE(review): the consumer of the previous repeat is replaced without
    # being stopped -- confirm whether it should be shut down explicitly.
    consumer = topic.get_simple_consumer(reset_offset_on_start=True)
    result = ""
    # Defined up front so the summary row can be written even if no worker
    # processed a message.
    number_points_per_message = 0
    # Split the messages evenly; the first `leftover_messages` workers take
    # one extra so that exactly `number_messages` are processed (plain
    # division silently dropped the remainder).
    per_thread_messages, leftover_messages = divmod(number_messages, number_threads)
    global_start = time.time()
    threads = []
    for threadid in range(number_threads):
        share = per_thread_messages + (1 if threadid < leftover_messages else 0)
        t = Thread(target=process_messages, kwargs={"number_messages": share,
                                                    "threadid": threadid})
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    global_end = time.time()

    # All workers have finished, so appending to `result` is safe here.
    result += "kmeans-run,   %d, %d, %d, %d, %.5f\n"%(number_points_per_message, number_dimensions, number_centroids, number_threads, global_end-global_start)

    output_file.write(result)
    output_file.flush()

In [4]:
# Fetch the model currently stored in Redis.
m=get_model()

In [8]:
# Display the number of cluster centers in the fetched model.
m.cluster_centers_.shape[0]

10