In [22]:
from kafka import KafkaConsumer

import json
import boto3
import struct
import numpy as np

In [2]:
# Kafka brokers ("host:port" strings) used to bootstrap every consumer in this notebook
KAFKA_BOOTSTRAP_SERVERS = ["10.67.22.8:9092"]

## Receive Large Files Directly (Not Recommended)

In this first case we receive the pair of 32 MB raw data files directly, as the payload of the Kafka messages.

In [None]:
# Kafka consumer dedicated to the large raw-data messages.
# Both fetch limits are raised to 70 MiB (73400320 bytes)
# -- presumably so that a whole raw-data message fits in a single fetch.
raw_consumer = KafkaConsumer(
    bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS,      # list of Kafka brokers
    max_partition_fetch_bytes=70 * 1024 * 1024,
    fetch_max_bytes=70 * 1024 * 1024,
    # maximum time (ms) to wait for a new message before stopping the consumer
    consumer_timeout_ms=1_000_000,
)

In [None]:
# list all available topics on the kafka brokers
# (the value returned by topics() is shown as the cell output)
raw_consumer.topics()

In [None]:
# subscribe to the topic that carries the raw data files
raw_consumer.subscribe(topics=['raw_data'])

# display the active subscriptions to confirm the subscription was registered
raw_consumer.subscription()

In [None]:
# Run one non-blocking poll before entering the consuming loop:
#   timeout_ms=0         -> return immediately, do not wait for data
#   max_records=None     -> do not limit the number of records returned at once
#   update_offsets=True  -> advance the reading offsets past the returned records
# NOTE(review): poll() returns the records it fetched (an empty dict when there are
# none) and the result is only displayed here, so any record returned by this call
# would not be seen by the consuming loop -- confirm this is intended.
raw_consumer.poll(timeout_ms=0, max_records=None, update_offsets=True)

In [None]:
def process_rawmessage(msg, sample_indices=(0, 10, 8388608, 8388618)):
    """Print a few sample float values decoded from a raw-data message.

    ``msg.value`` is read as a flat sequence of 4-byte floats using the
    ``'f'`` struct format (native byte order). Only the requested samples are
    decoded, so the whole payload is never expanded into a Python tuple.

    Parameters
    ----------
    msg : object
        Any object exposing the payload as a bytes-like ``value`` attribute.
    sample_indices : iterable of int, optional
        Float positions (not byte offsets) to print. The default reproduces
        the positions that were originally hard-coded.

    Notes
    -----
    * Positions beyond the end of the payload are skipped instead of raising
      ``IndexError``, so one short message does not stop the consuming loop.
    * Trailing bytes that do not form a complete float are ignored.

    Returns
    -------
    None
        The samples are printed on one line, separated by spaces.
    """
    payload = msg.value
    n_floats = len(payload) // 4   # number of complete 4-byte floats in the payload

    samples = [
        struct.unpack_from('f', payload, 4 * idx)[0]   # byte offset = 4 * float index
        for idx in sample_indices
        if 0 <= idx < n_floats
    ]
    print(*samples)
    

In [None]:
# Iterate over the consumer and hand every received message to process_rawmessage().
# This consumer will keep polling for messages
# until stopped by the user
# (or until it reaches the consumer_timeout_ms, if specified)
for message in raw_consumer:
    process_rawmessage(message)


In [None]:
# close the raw-data consumer now that the consuming loop has ended
raw_consumer.close()

## Receive Data Chunks

In [17]:
# Kafka consumer for the chunked data stream (no fetch-size overrides are set here)
chunk_consumer = KafkaConsumer(
    bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS,  # list of Kafka brokers
    # maximum time (ms) to wait for a new message before stopping the consumer
    consumer_timeout_ms=10_000,
)

In [4]:
# list all available topics on the kafka brokers
# (the returned set of topic names is shown as the cell output)
chunk_consumer.topics()

{'chunk_data', 'data_file', 'my_awesome_topic', 'raw_data'}

In [18]:
# subscribe to the topic that carries the data chunks
chunk_consumer.subscribe(topics=['chunk_data'])

# display the active subscriptions to confirm the subscription was registered
chunk_consumer.subscription()

{'chunk_data'}

In [19]:
# Run one non-blocking poll before entering the consuming loop:
#   timeout_ms=0         -> return immediately, do not wait for data
#   max_records=None     -> do not limit the number of records returned at once
#   update_offsets=True  -> advance the reading offsets past the returned records
# NOTE(review): poll() returns the records it fetched (an empty dict when there are
# none) and the result is only displayed here, so any record returned by this call
# would not be seen by the consuming loop -- confirm this is intended.
chunk_consumer.poll(timeout_ms=0, max_records=None, update_offsets=True)

{}

In [20]:
def process_chunkmessage(msg, verbose=True):
    """Decode the (file number, bin number) pair stored in a chunk message key.

    The key is read as two big-endian signed 16-bit integers (``">hh"``).
    The message value is not inspected.

    Parameters
    ----------
    msg : object
        Any object exposing the packed key as a bytes-like ``key`` attribute.
    verbose : bool, optional
        When true, print the two decoded numbers separated by a space.

    Returns
    -------
    tuple of (int, int)
        ``(file_number, bin_number)``.
    """
    header = struct.unpack(">hh", msg.key)   # -> (file_number, bin_number)

    if verbose:
        print(*header)
    return header

In [29]:
# Record which bins have been received for each file.
# The consumer keeps polling for messages until stopped by the user
# (or until it reaches the consumer_timeout_ms, if specified).
N_FILES = 10            # number of rows (files) tracked in n_bin
N_BINS = 2048           # number of bins (chunks) tracked per file
FIRST_FILE_NUMBER = 2   # file number that is stored in row 0 of n_bin

# -1 marks "not received yet". With a zero-filled array a missing bin 0 could never
# be told apart from a received one, because bin n is recorded by storing the value n.
n_bin = np.full((N_FILES, N_BINS), -1.0)

key_prev = 0
for message in chunk_consumer:
    key, n = process_chunkmessage(message, verbose=False)

    if key_prev > key:
        print("File sent not in order")

    row = key - FIRST_FILE_NUMBER
    if 0 <= row < N_FILES and 0 <= n < N_BINS:
        n_bin[row, n] = n
    else:
        # A negative index would silently wrap around to another row/bin and a too
        # large one would abort the loop, so report the unexpected key instead.
        print(f"Unexpected chunk key: file {key}, bin {n}")
    key_prev = key

In [30]:
# For every file (one row of n_bin), print the sorted list of bin numbers in 0..2047
# whose value does not appear anywhere in that row.
for row in n_bin:
    missing_elements = sorted(set(range(2048)).difference(row.tolist()))
    print(missing_elements)

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


In [None]:
# close the chunk consumer now that the consuming loop has ended
chunk_consumer.close()

### Spark Structured Streaming

In [51]:
import pyspark
from pyspark.sql import SparkSession

# The Kafka source for Structured Streaming is not bundled with Spark and its version
# must match the Spark version in use. Derive the connector coordinate from the
# installed pyspark instead of hard-coding it: the previous configuration mixed a
# 3.4.1 connector jar, a 3.3.2 connector package and a kafka-clients 2.1.1 jar.
# Resolving it through spark.jars.packages also pulls in the connector's own
# dependencies (e.g. the matching kafka-clients).
# NOTE(review): assumes a Scala 2.12 build of Spark and that the driver can reach a
# Maven repository -- confirm for this cluster.
SPARK_KAFKA_PACKAGE = f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark.__version__}"

# NOTE: when a Spark session already exists in the kernel, getOrCreate() returns it and
# only runtime SQL configurations take effect, so the package setting below is applied
# only if this cell is the one that creates the session (run it on a fresh kernel).
spark = (
    SparkSession.builder
    .master("spark://10.67.22.8:7077")
    .appName("Spark structured streaming application")
    .config("spark.executor.memory", "512m")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.sql.adaptive.enabled", "false")
    .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "false")
    .config("spark.sql.shuffle.partitions", 4)
    .config("spark.jars.packages", SPARK_KAFKA_PACKAGE)
    .getOrCreate()
)

23/07/14 10:36:48 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [52]:
# keep a handle on the session's SparkContext and display it as the cell output
sc = spark.sparkContext
sc

In [53]:
import os

import pyspark

# Extra arguments PySpark passes to spark-submit when it launches the JVM.
# - The connector version must match the Spark version, so it is taken from the
#   installed pyspark rather than hard-coded.
# - The value has to end with `pyspark-shell`, otherwise the launch fails.
# NOTE(review): this variable is only read when the first SparkContext/SparkSession of
# the kernel is created. In this notebook the session is created in an earlier cell, so
# this assignment has no effect unless it is executed before that cell on a fresh kernel.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    f'--packages org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark.__version__} pyspark-shell'
)

In [54]:
# Streaming DataFrame that reads the "chunk_data" Kafka topic.
# The Kafka source expects the brokers as a single comma-separated "host:port,host:port"
# string; passing the Python list directly would send its repr ("['host:port']").
inputDF = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", ",".join(KAFKA_BOOTSTRAP_SERVERS))
    .option("kafkaConsumer.pollTimeoutMs", 5000)
    .option("startingOffsets", "latest")
    .option("maxOffsetsPerTrigger", 2048)
    .option("subscribe", "chunk_data")
    .load()
)

AnalysisException: Failed to find data source: kafka. Please deploy the application as per the deployment section of Structured Streaming + Kafka Integration Guide.

## Receive S3 File Names

Let us create a new Kafka consumer that will listen to the "data_file" topic: it will thus receive information on the S3 buckets where the data files are located.

In [None]:
# Kafka consumer for the messages that announce new data files stored on S3
consumer = KafkaConsumer(
    bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS,  # list of Kafka brokers
    # maximum time (ms) to wait for a new message before stopping the consumer
    consumer_timeout_ms=10_000,
)

In [None]:
# list all available topics on the kafka brokers
# (the value returned by topics() is shown as the cell output)
consumer.topics()

In [None]:
# subscribe to the topic that announces the data files
consumer.subscribe(topics=['data_file'])

# display the active subscriptions to confirm the subscription was registered
consumer.subscription()

In [None]:
# Run one non-blocking poll before entering the consuming loop:
#   timeout_ms=0         -> return immediately, do not wait for data
#   max_records=None     -> do not limit the number of records returned at once
#   update_offsets=True  -> advance the reading offsets past the returned records
# NOTE(review): poll() returns the records it fetched (an empty dict when there are
# none) and the result is only displayed here, so any record returned by this call
# would not be seen by the consuming loop -- confirm this is intended.
consumer.poll(timeout_ms=0, max_records=None, update_offsets=True)

Each time a new message arrives, its value is converted to a dictionary containing the bucket name and the names of the data files. Then *RDDs containing the datasets will be created* and, optionally, the data files in the bucket will be deleted.

In [None]:
# set s3 client
# Credentials are deliberately NOT written in the notebook: when no keys are passed,
# boto3 resolves them from its standard credential chain (for example the
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables or the shared
# ~/.aws/credentials file).
# NOTE(review): the access key id that used to be hard-coded in this cell has been
# committed with the notebook and should be treated as exposed (rotate it).
# NOTE(review): verify=False disables TLS certificate verification for this endpoint;
# prefer passing the path of the proper CA bundle if one is available.
s3_client = boto3.client('s3',
                         endpoint_url='https://cloud-areapd.pd.infn.it:5210',
                         verify=False)

In [None]:
# local folder intended for downloaded data files (not used yet: the download step
# inside process_message is still to be implemented)
download_path = "/home/lupi/Project/LocalData/"

def process_message(message, cleanup=True):
    """Handle one message announcing a pair of data files stored in an S3 bucket.

    ``message.value`` must be a JSON document with the keys ``"bucket_name"``,
    ``"real_file"`` and ``"imag_file"``. The two file names are printed; when
    ``cleanup`` is true both objects are then deleted from the bucket through
    the module-level ``s3_client``.

    Parameters
    ----------
    message : object
        Any object exposing the JSON payload as its ``value`` attribute.
    cleanup : bool, optional
        Delete the two objects from the bucket after processing (default: True).

    Returns
    -------
    None
    """
    info = json.loads(message.value)
    bucket, real_file, imag_file = (
        info[field] for field in ("bucket_name", "real_file", "imag_file")
    )

    # TODO: fetch the two files from the bucket (e.g. into download_path)
    print(real_file, imag_file)

    if not cleanup:
        return
    for object_key in (real_file, imag_file):
        s3_client.delete_object(Bucket=bucket, Key=object_key)
    

In [None]:
# Hand every received message to process_message(), keeping the files in the bucket
# (cleanup disabled). The loop runs until stopped by the user
# (or until the consumer reaches consumer_timeout_ms, if specified).
for message in consumer:
    process_message(message, cleanup=False)
