# Part 1: Kafka Producer

In [2]:
import datetime, time, random, string

def one_station(name):
    """Yield an endless stream of simulated daily weather for one station.

    Each item is a tuple ``(date, name, temp, raining)``: ``date`` is a
    "YYYY-MM-DD" string that starts at 2000-01-01 and advances one day per
    item, ``name`` is passed through unchanged, ``temp`` is a float and
    ``raining`` is a bool.  The station's climate (monthly temperature
    targets and rain probabilities) is randomized once, when the generator
    is first advanced.
    """
    # Monthly temperature targets: a fixed baseline moved by one station-wide
    # offset (up to +/-15) plus a small independent wobble per month (+/-2.5).
    baseline_temps = [27,31,44,58,70,79,83,81,74,61,46,32]
    station_offset = (random.random()-0.5) * 30
    month_avg = [base + station_offset + (random.random()-0.5) * 5
                 for base in baseline_temps]

    # Per-month chance that a dry day turns rainy (perturbed the same way),
    # and one station-wide chance that a rainy day turns dry.
    baseline_rain = [0.1,0.1,0.3,0.5,0.4,0.2,0.2,0.1,0.2,0.2,0.2,0.1]
    rain_offset = (random.random()-0.5) * 0.1
    start_rain = [prob + rain_offset + (random.random() - 0.5) * 0.2
                  for prob in baseline_rain]
    stop_rain = 0.2 + random.random() * 0.2

    # State carried from one simulated day to the next.
    day = datetime.date(2000, 1, 1)
    one_day = datetime.timedelta(days=1)
    temp = month_avg[0]
    raining = False

    while True:
        month_idx = day.month - 1

        # Blend yesterday's temperature (80%) with the month's target (20%)
        # and add up to +/-10 of noise.
        noise = (random.random()-0.5) * 20
        temp = temp * 0.8 + month_avg[month_idx] * 0.2 + noise

        if temp < 32:
            # Below 32 the rain flag is always cleared.
            raining = False
        elif raining:
            # Rain may stop; otherwise it carries over to the next day.
            if random.random() < stop_rain:
                raining = False
        elif random.random() < start_rain[month_idx]:
            raining = True

        yield (day.strftime("%Y-%m-%d"), name, temp, raining)

        day += one_day
        
def all_stations(count=10, sleep_sec=1):
    """Interleave the daily reports of `count` simulated stations.

    Stations are named with the first `count` uppercase ASCII letters.  The
    generator yields one report per station (in name order), sleeps
    `sleep_sec` seconds, then repeats with the next day -- forever, unless
    `count` is 0, in which case it finishes immediately without yielding.

    Raises AssertionError (when first advanced) if `count` is not in 0..26.
    """
    # A negative count would slice from the END of the alphabet (e.g. -1 gives
    # 25 stations), so reject it along with anything past 'Z'.
    assert 0 <= count <= 26, "count must be between 0 and 26"
    stations = [one_station(name) for name in string.ascii_uppercase[:count]]
    if not stations:
        # With no stations the loop below would sleep forever and never yield.
        return
    while True:
        for station in stations:
            yield next(station)
        time.sleep(sleep_sec)

In [3]:
# The generator never finishes (the weather never ends...), so preview the
# first 10 rows and stop.
for count, row in enumerate(all_stations(3), start=1):
    print(row) # date, station, temp, raining
    if count == 10:
        break

('2000-01-01', 'A', 23.106934846763195, False)
('2000-01-01', 'B', 26.868381336544704, False)
('2000-01-01', 'C', 30.00862060013644, False)
('2000-01-02', 'A', 22.701053120504632, False)
('2000-01-02', 'B', 18.15590022054152, False)
('2000-01-02', 'C', 29.33548732825825, False)
('2000-01-03', 'A', 28.510150537314907, False)
('2000-01-03', 'B', 12.677082076290707, False)
('2000-01-03', 'C', 41.19896433804881, False)
('2000-01-04', 'A', 31.41734889560798, False)


In [4]:
from kafka import KafkaAdminClient, KafkaProducer, KafkaConsumer, TopicPartition
from kafka.admin import NewTopic
from kafka.errors import TopicAlreadyExistsError, UnknownTopicOrPartitionError

# Reset both topics so each run of the notebook starts from empty streams.
admin = KafkaAdminClient(bootstrap_servers=["kafka:9092"])
try:
    admin.delete_topics(["stations", "stations-json"])
    print("deleted")
except UnknownTopicOrPartitionError:
    print("cannot delete (may not exist yet)")

# NOTE(review): the delete is assumed not to be visible on the broker
# immediately -- confirm.  Give it a moment, then retry creation a few times
# instead of failing the whole cell on the first TopicAlreadyExistsError.
time.sleep(1)
for topic_name in ("stations", "stations-json"):
    for attempts_left in range(4, -1, -1):
        try:
            # positional args: name, number of partitions, replication factor
            admin.create_topics([NewTopic(topic_name, 6, 1)])
            break
        except TopicAlreadyExistsError:
            if attempts_left == 0:
                raise  # still there after every retry: surface the error
            time.sleep(1)
admin.list_topics()

cannot delete (may not exist yet)


['stations-json', 'stations']

In [5]:
import os
if "report_pb2.py" in os.listdir():
    ! rm report_pb2.py
! python3 -m grpc_tools.protoc -I=. --python_out=. report.proto

In [6]:
from report_pb2 import *
import json, threading

def produce():
    """Publish every generated weather report to Kafka in two encodings.

    Iterates over ``all_stations(15)``, which never ends, so this function
    does not return.  Each report is sent to topic "stations" as a serialized
    ``Report`` protobuf and to topic "stations-json" as UTF-8 encoded JSON;
    both records use the station name as the message key.
    """
    producer = KafkaProducer(bootstrap_servers=["kafka:9092"], acks="all", retries=10)

    for date, station, degrees, raining in all_stations(15):
        key = station.encode("utf-8")

        # protobuf copy -> "stations"
        report = Report(date=date, station=station, degrees=degrees, raining=raining)
        producer.send("stations", value=report.SerializeToString(), key=key)

        # JSON copy -> "stations-json"; the rain flag is written as 0/1
        payload = {
            "date": date,
            "station": station,
            "degrees": degrees,
            "raining": int(raining),
        }
        producer.send("stations-json", value=json.dumps(payload).encode("utf-8"), key=key)

# Run produce() on a background thread so the remaining cells can execute
# while it keeps publishing.  The thread is deliberately never joined:
# produce() loops forever, so join() would block the notebook indefinitely.
threading.Thread(target=produce).start()

# Part 2: Kafka Consumer

In [7]:
# Start the consumer section from a clean slate: delete any snapshot files
# (partition-0.json .. partition-5.json) left over from an earlier run.
for path in [f"partition-{partition}.json" for partition in range(6)]:
    if os.path.exists(path):
        os.remove(path)

In [8]:
def load_partition(partition_num):
    """Return the saved snapshot dict for a partition, or a fresh one.

    The snapshot is read from ``partition-<partition_num>.json`` in the
    current directory.  If that file does not exist, a new snapshot
    ``{"partition": partition_num, "offset": 0}`` is returned instead.
    """
    snapshot_path = f"partition-{partition_num}.json"
    if not os.path.exists(snapshot_path):
        return {"partition":partition_num, "offset":0}
    with open(snapshot_path, "r") as snapshot_file:
        return json.load(snapshot_file)

def save_partition(partition):
    """Atomically write a partition snapshot to ``partition-<N>.json``.

    `partition` is the snapshot dict; its "partition" entry selects the file
    name.  The JSON is first written to ``<path>.tmp`` and then moved over
    the real file, so an error part-way through the dump cannot leave a
    truncated snapshot behind: the previous file stays intact until the new
    one is complete.

    Any exception from opening, serializing or renaming is re-raised after
    the temporary file has been cleaned up.
    """
    path = f"partition-{partition['partition']}.json"
    tmp_path = path + ".tmp"
    try:
        with open(tmp_path, "w") as file:
            json.dump(partition, file)
    except BaseException:
        # Don't leave a half-written temp file lying around.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
    # Swap the finished file into place in one step.
    os.replace(tmp_path, path)

In [9]:
def station_update(prev_json, messages):
    """Fold a batch of messages into a partition's per-station statistics.

    `prev_json` is the partition snapshot dict; it is updated in place and
    also returned.  Each message's ``value`` is decoded with
    ``Report.FromString``.  For every station the snapshot holds a dict with
    "sum", "count" and "avg" of the reported degrees plus the "start" and
    "end" dates ("YYYY-MM-DD" strings) of the reports counted so far.

    A report whose date is not later than the station's current "end" is
    skipped, so replaying already-counted messages does not change the
    statistics.  Because of that rule "start" is set when the station is
    first seen and never changes afterwards.
    """
    date_format = '%Y-%m-%d'

    for msg in messages:
        report = Report.FromString(msg.value)

        if report.station not in prev_json:
            # First report for this station: it is both the start and the end.
            stats = {"sum": report.degrees, "count": 1}
            stats["avg"] = stats["sum"] / stats["count"]
            stats["start"] = report.date
            stats["end"] = report.date
            prev_json[report.station] = stats
            continue

        stats = prev_json[report.station]
        last_counted = datetime.datetime.strptime(stats["end"], date_format)
        reported = datetime.datetime.strptime(report.date, date_format)

        # Already covered by the snapshot (duplicate or older day): ignore.
        if reported <= last_counted:
            continue

        stats["sum"] += report.degrees
        stats["count"] += 1
        stats["avg"] = stats["sum"] / stats["count"]
        # The date is strictly newer than "end" here, so it becomes the new end.
        stats["end"] = report.date

    return prev_json

In [10]:
def consume(part_nums=[], iterations=10):
    """Consume some partitions of topic "stations" and keep snapshots current.

    part_nums:  partition numbers this consumer is manually assigned to.
    iterations: number of poll() calls to make; each waits up to 1 second.

    For every assigned partition the saved snapshot is loaded (or a fresh one
    created) and the consumer seeks to the snapshot's "offset".  After each
    polled batch the partition's statistics are updated, the consumer's new
    position is recorded as "offset", and the snapshot is written back to
    disk.  The consumer is always closed before returning or raising, so
    repeated calls do not accumulate open clients.  Prints "exiting" when
    all iterations completed.
    """
    consumer = KafkaConsumer(bootstrap_servers=["kafka:9092"])
    try:
        # create list of TopicPartition objects
        consumer.assign([TopicPartition("stations", part_num) for part_num in part_nums])

        # PART 1: initialization
        partitions = {} # key=partition num, value=snapshot dict
        for part_num in part_nums:
            # load the snapshot from its JSON file (if it exists) or start fresh
            snapshot = load_partition(part_num)
            partitions[part_num] = snapshot
            # resume where the snapshot left off; a fresh snapshot has offset 0
            consumer.seek(TopicPartition("stations", part_num), snapshot["offset"])

        # PART 2: process batches
        for _ in range(iterations):
            batch = consumer.poll(1000) # 1s timeout
            # the batch is keyed by TopicPartition, one message list per partition
            for topic_partition, messages in batch.items():
                snapshot = station_update(partitions[topic_partition.partition], messages)
                snapshot["offset"] = consumer.position(topic_partition)

                # persist stats and offset together so a later run can resume
                save_partition(snapshot)
    finally:
        consumer.close()
    print("exiting")

# Two back-to-back rounds of three consumer threads, each thread owning two
# partitions and polling 30 times.  Every round waits for all three threads
# to finish before the next one starts.
for i in range(2):
    print("ROUND", i)
    workers = [
        threading.Thread(target=consume, args=(part_nums, 30))
        for part_nums in ([0,1], [2,3], [4,5])
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

ROUND 0
exiting
exiting
exiting
ROUND 1
exiting
exiting
exiting


In [11]:
# Dump the per-partition snapshot files (partition-<N>.json) written by the consumers.
! cat partition*.json

{"partition": 0, "offset": 41, "N": {"sum": 1979.369445279752, "count": 41, "avg": 48.277303543408586, "start": "2000-01-01", "end": "2000-02-10"}}{"partition": 1, "offset": 82, "E": {"sum": 1380.0503764560815, "count": 41, "avg": 33.659765279416625, "start": "2000-01-01", "end": "2000-02-10"}, "O": {"sum": 1182.5191599373866, "count": 41, "avg": 28.84193073018016, "start": "2000-01-01", "end": "2000-02-10"}}{"partition": 2, "offset": 126, "F": {"sum": 1588.2846551420698, "count": 42, "avg": 37.816301312906425, "start": "2000-01-01", "end": "2000-02-11"}, "I": {"sum": 788.9024095308246, "count": 42, "avg": 18.783390703114872, "start": "2000-01-01", "end": "2000-02-11"}, "J": {"sum": 966.4667346622, "count": 42, "avg": 23.011112730052382, "start": "2000-01-01", "end": "2000-02-11"}}{"partition": 3, "offset": 126, "D": {"sum": 947.1333179027599, "count": 42, "avg": 22.550793283399045, "start": "2000-01-01", "end": "2000-02-11"}, "G": {"sum": 855.1238244642853, "count": 42, "avg": 20.3600