In [1]:
from kafka import KafkaProducer
from kafka.admin import KafkaAdminClient, NewTopic

import boto3
import time
import os
import json
import struct

In [2]:
KAFKA_BOOTSTRAP_SERVERS = ['10.67.22.8:9092']

In [3]:
# connect to the cluster to run admin functions
kafka_admin = KafkaAdminClient(
    bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS,
)

Let us setup the S3 Client, necessary to obtain the data files.

In [4]:
# set s3 client
s3_client = boto3.client('s3',
                         endpoint_url='https://cloud-areapd.pd.infn.it:5210',
                         aws_access_key_id='ec71c86cfc994f95b5a3a3a6d173bccc',     # DO NOT WRITE KEYS
                         aws_secret_access_key='--------------------------------', # DIRECTLY ON FILE!!!
                         verify=False)

In [16]:
download_path = "/home/lupi/Project/LocalData/"

# fake fucntion to mimick arrival of new data
# actually it downloads it from S3 bucket
def get_new_data(i):
    
    test_names = []
    for s in ["q", "i"]:
        file_name     = "duck_" + s + "_" + '{:0>5}'.format(i) + ".dat" # ex. duck_q_00001.dat
        test_name     = file_name.replace("duck", "temp")               #     temp_q_00001.dat
        download_name = download_path + test_name                       #     /home/lupi/Project/LocalData/temp_q_00001.dat
    
        # download file to mimick arrival of new data
        s3_client.download_file('quax', file_name, download_name)
        
        test_names.append(test_name)
        
    return test_names

## Send Large Files Directly (Not Recommended)

In this first case we will use Kafka to directly stream the raw data files for the consumer and Spark Stream. This means that we will create a topic for the raw data files and a producer to publish them.

Please note that the total size of the transmitted data will be ~64 MB, while the default maximum size of messages handled by Kafka is 1 MB: we will thus need to set up the Kafka broker, producer and later consumer accordingly. In any case, the performances will not be ideal. 

In [None]:
# create a new topic with the following parameters:
#    number of partitions = 2
#    replication factor   = 1 (i.e. no replication)
rawdata_topic = NewTopic(name='raw_data',
                         num_partitions=4, 
                         replication_factor=1)

kafka_admin.create_topics(new_topics=[rawdata_topic])

In [None]:
# list the available topics
kafka_admin.list_topics()

Now we will create the producer that publishes data to this topic. 

In [None]:
# Create a Kafka producer instance
raw_producer = KafkaProducer(bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS,
                             max_request_size=73400320)

Every ~10 seconds we will download two files from the amazon bucket and to mimick the arrival of new data from the detector. We will then publish a message with their content and remove the files from our local system so as not to crowd it.

In [None]:
def send_rawdata(i, cleanup=True):
    
    # get new data
    test_names = get_raw_data(i) 
    
    # create single byte array of 64 MB
    msg = bytearray()
    for name in test_names:
        fh = open(download_path+name, 'rb')
        msg.extend(bytearray(fh.read()))
        
    # print some content of msg just for checking purpose
    var = struct.unpack('f'*(len(msg)//4), msg)
    print(var[0], var[10], var[8388608], var[8388618])
    
    raw_producer.send(topic = "raw_data",
                      #key   = i.to_bytes(1, "big"),
                      value = msg)
    raw_producer.flush()  # Flush the producer buffer
    
    # delete downloaded files
    if cleanup:
        for name in test_names:
            os.remove(download_path + name)
    
    time.sleep(5)  # Sleep for a short duration before sending the next message
                   # to mimick waiting time for new data

In [None]:
for i in range(2, 4):
    send_rawdata(i, True)

## Send Files into Chunks

Le us first define the characteristics sizes of the dataset.

In [8]:
n_samples = 8192 * 2**10 # = 2**13 * 2**10 = len(file) / 4
n_bins = 4 * 2**10
n_fft = n_samples // n_bins
S_R = 2 * 10**6 # sample rate
delta_nu = S_R / n_bins

print("N. samples:", n_samples,
      "\nN. bins in frequency spectrum:", n_bins,
      "\nN. of FFT computed:", n_fft,
      "\ndelta_nu:", delta_nu)

N. samples: 8388608 
N. bins in frequency spectrum: 4096 
N. of FFT computed: 2048 
delta_nu: 488.28125


In [9]:
# create a new topic with the following parameters:
#    number of partitions = 2
#    replication factor   = 1 (i.e. no replication)
chunk_topic = NewTopic(name='chunk_data',
                       num_partitions=4, 
                       replication_factor=1)

kafka_admin.create_topics(new_topics=[chunk_topic])

TopicAlreadyExistsError: [Error 36] TopicAlreadyExistsError: Request 'CreateTopicsRequest_v3(create_topic_requests=[(topic='chunk_data', num_partitions=4, replication_factor=1, replica_assignment=[], configs=[])], timeout=30000, validate_only=False)' failed with response 'CreateTopicsResponse_v3(throttle_time_ms=0, topic_errors=[(topic='chunk_data', error_code=36, error_message="Topic 'chunk_data' already exists.")])'.

In [10]:
# Create a Kafka producer instance
chunk_producer = KafkaProducer(bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS)

In [18]:
def send_chunks(i, cleanup=True):
    
    # get new data
    test_names = get_new_data(i) 
    
    # create byte arrays fro each file
    fr = open(download_path+test_names[0], 'rb')
    real = bytearray(fr.read())
    fi = open(download_path+test_names[1], 'rb')
    imag = bytearray(fi.read())
    
    for f in range(0, n_fft):
        # divide bytearrays into n_fft (2048) chunks
        # of size n_bins (4096)
        r_bin = real[f*n_bins:f*n_bins+n_bins]
        i_bin = imag[f*n_bins:f*n_bins+n_bins]
        
        msg = r_bin + i_bin
        
        # key = file + bin number
        key = i.to_bytes(2, "big") + f.to_bytes(2, "big")
        
        chunk_producer.send(topic = "chunk_data",
                            key   = key,
                            value = msg)
        chunk_producer.flush()  # Flush the producer buffer
    
    # delete downloaded files
    if cleanup:
        for name in test_names:
            os.remove(download_path + name)
    
    time.sleep(5)  # Sleep for a short duration before sending the next message
                   # to mimick waiting time for new data

In [24]:
for i in range(0, 10):
    send_chunks(i, True)









## Send Files over S3

Let us connect to the cluster and create a new topic for the raw data. Due to their large size (~32 MB) we will not use Kafka directly to distribute them, but we will load them on an Amazon S3 bucket and send with Kafka the locaton of this bucket and the file name.

In [None]:
# create a new topic with the following parameters:
#    number of partitions = 2
#    replication factor   = 1 (i.e. no replication)
datafile_topic = NewTopic(name='data_file',
                          num_partitions=3, 
                          replication_factor=1)

kafka_admin.create_topics(new_topics=[datafile_topic])

In [None]:
# list the available topics
kafka_admin.list_topics()

Now we will create the producer that publishes data to this topic. 

In [None]:
# Create a Kafka producer instance
producer = KafkaProducer(bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS,
                         max_request_size=70000000)

Every ~10 seconds we will download two files from the amazon bucket and re-upload them with a "fake" name to mimick the arrival of new data from the detector and the subsequent upload on the bucket. We will then publish a message with the name of the new files and remove the files from our local system so as not to crowd it.

In [None]:
download_path = "/home/lupi/Project/LocalData/"

# fake fucntion to mimick arrival of new data
# actually it downloads it from S3 bucket
def get_new_data(i):
    
    test_names = []
    for s in ["q", "i"]:
        file_name     = "duck_" + s + "_" + '{:0>5}'.format(i) + ".dat" # ex. duck_q_00001.dat
        test_name     = file_name.replace("duck", "temp")               #     temp_q_00001.dat
        download_name = download_path + test_name                       #     /home/lupi/Project/LocalData/temp_q_00001.dat
    
        # download file to mimick arrival of new data
        #s3_client.download_file('quax', file_name, download_name)
        
        test_names.append(test_name)
        
    return test_names

In [None]:
def send_message(i, cleanup=True):
    
    # get new data
    test_names = get_new_data(i) 
    
    # upload file to S3 bucket to make it available to consumer
    for name in test_names:
        download_name = download_path + name
        #s3_client.upload_file(download_name, 'quax', name)
    
    # encode message in a json file containing bucket name
    # and file names for real and imaginary parts
    msg = {
           "bucket_name" : "quax",
           "real_file"   : test_names[0],
           "imag_file"   : test_names[1]
          }
    
    producer.send(topic = "data_file",
                  value = json.dumps(msg).encode("utf-8"))
    producer.flush()  # Flush the producer buffer
    
    # delete downloaded files
    if cleanup:
        for name in test_names:
            os.remove(download_path + name)
    
    time.sleep(5)  # Sleep for a short duration before sending the next message
                   # to mimick waiting time for new data

In [None]:
for i in range(2, 4):
    send_message(i, False)

In [None]:
 s3_client.upload_file(test_name, 'quax', test_name)

In [None]:
s3_client.download_file('quax', '/home/lupi/Project/LocalData/temp_q_00000.dat', "ciao.dat")

In [None]:
s3_client.delete_object(Bucket='quax', Key='temp_q_00002.dat')

In [None]:
bucket='quax'
result = s3_client.list_objects(Bucket = bucket, Prefix = 'duck_i_00000.dat')
for o in result.get('Contents'):
    data = s3_client.get_object(Bucket=bucket, Key=o.get('Key'))
    contents = data['Body'].read()
    print(type(contents))

In [None]:
from pathlib import Path

def serialize_bin_file(file, path="/home/lupi/Project/LocalData/"):
    fileName = path + file
    data = Path(fileName).read_bytes()  
    
    return data