In [1]:
import os
import sys
from time import time, sleep

from confluent_kafka import Consumer, KafkaError, KafkaException
# Put the parent of the current working directory on sys.path (once) so the
# `conf` module imported just below can be resolved from there.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

from conf import conf

import pandas as pd
import numpy as np
import json
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

In [None]:
# Create the Spark session through Spark NLP's start() helper; `spark` is used
# further down to build the DataFrames fed to the pipeline.
spark = sparknlp.start()

In [None]:
# Name of the pretrained classifier passed to ClassifierDLModel.pretrained()
# when the pipeline stages are built.
MODEL_NAME='classifierdl_use_emotion'

In [2]:
# Kafka consumer settings: broker address comes from the project `conf` module,
# the consumer group id is fixed, and the offset-reset policy is 'smallest'.
consumerConf = {
    'bootstrap.servers': conf.BOOTSTRAP_SERVER,
    'group.id': "AfekaFinalProj",
    'auto.offset.reset': 'smallest',
}

# Consumer instance shared by the consume loop defined later in the notebook.
consumer = Consumer(consumerConf)

In [None]:
# Stage 1: read the "text" column and emit it as the "document" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained Universal Sentence Encoder ("tfhub_use", English) that
# turns "document" into "sentence_embeddings".
use = (
    UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Stage 3: pretrained ClassifierDL model selected by MODEL_NAME; its
# predictions are written to the "sentiment" column.
sentimentdl = (
    ClassifierDLModel.pretrained(name=MODEL_NAME)
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

# Chain the three stages in order: assemble -> embed -> classify.
nlpPipeline = Pipeline(stages=[documentAssembler, use, sentimentdl])

In [None]:
# Single-row placeholder frame (one empty string in a "text" column) used only
# as the input to fit().
# NOTE(review): presumably fit() needs no real data because the stages are
# pretrained — confirm.
empty_df = spark.createDataFrame([['']]).toDF("text")
# Fitted pipeline; its transform() is applied to every batch in the consume loop.
pipelineModel = nlpPipeline.fit(empty_df)

In [3]:
# Module-level flag polled by basic_consume_loop(); set it to False to stop.
running = True


def _flush_batch(batch, index):
    """Run the NLP pipeline on one batch, show the result and save the raw text.

    Args:
        batch: non-empty list of decoded message strings.
        index: 1-based sequence number used in the output file name.
    """
    df = spark.createDataFrame(pd.DataFrame({"text": batch}))
    result = pipelineModel.transform(df)

    # Pair each document text with its predicted label and print the table.
    zipped = F.explode(F.arrays_zip('document.result', 'sentiment.result')).alias("cols")
    result.select(zipped).select(
        F.expr("cols['0']").alias("document"),
        F.expr("cols['1']").alias("sentiment"),
    ).show(truncate=False)
    print('----------------')

    # Create the folder if it does not exist.
    path = 'AMAZON_FASHION'
    os.makedirs(path, exist_ok=True)

    file_name = 'AMAZON_FASHION_' + str(index) + '.txt'
    complete_name = os.path.join(path, file_name)
    # Messages were decoded as UTF-8, so write them back with the same
    # encoding instead of relying on the platform default.
    with open(complete_name, 'w', encoding='utf-8') as f:
        f.writelines("%s\n" % item for item in batch)

    # Report only after the file has actually been written.
    print("File {} has been created. File size: {} lines".format(file_name, len(batch)))


def basic_consume_loop(consumer, topics, flush_interval=2):
    """Consume Kafka messages and process them in time-based batches.

    Subscribes ``consumer`` to ``topics`` and polls while the module-level
    ``running`` flag is true. Message payloads are decoded as UTF-8 and
    buffered; whenever more than ``flush_interval`` seconds have passed since
    the last flush check, a non-empty buffer is classified, displayed and
    written to ``AMAZON_FASHION/AMAZON_FASHION_<n>.txt``. Any messages still
    buffered when the loop stops normally are flushed as a final batch.

    Args:
        consumer: object exposing ``subscribe``, ``poll`` and ``close``
            (a ``confluent_kafka.Consumer`` in this notebook).
        topics: list of topic names to subscribe to.
        flush_interval: seconds between flush checks (default 2, the value
            previously hard-coded).

    Raises:
        KafkaException: for any message error other than end-of-partition.
    """
    try:
        consumer.subscribe(topics)

        start_time = time()
        index = 1
        batch = []

        while running:
            msg = consumer.poll(timeout=3.0)

            # Time-based flush: checked on every poll, including empty polls.
            if time() - start_time > flush_interval:
                if batch:
                    _flush_batch(batch, index)
                    batch = []
                    index += 1
                start_time = time()

            if msg is None:
                continue

            error = msg.error()
            if error:
                if error.code() == KafkaError._PARTITION_EOF:
                    # End of partition event
                    sys.stderr.write('%% %s [%d] reached end at offset %d\n' %
                                     (msg.topic(), msg.partition(), msg.offset()))
                else:
                    raise KafkaException(error)
                continue

            value = msg.value()
            # A message may carry no payload; skip it instead of crashing on
            # None.decode().
            if value is not None:
                batch.append(value.decode('utf-8'))

        # Normal stop (running became False): do not drop buffered messages.
        if batch:
            _flush_batch(batch, index)

    finally:
        # Close down consumer to commit final offsets.
        consumer.close()

def shutdown():
    """Ask basic_consume_loop() to stop by clearing the module-level flag.

    The loop re-reads ``running`` at the top of each iteration, so it exits
    after the poll in progress returns.
    """
    # Without `global`, the assignment would create a function-local variable
    # and the flag read by the loop would never change.
    global running
    running = False

In [None]:
# Start consuming; the call keeps looping while the module-level `running`
# flag stays True.
# NOTE(review): conf.KAFKA_TOKEN is passed as the single topic name — confirm
# that is what it holds.
basic_consume_loop(consumer, [conf.KAFKA_TOKEN])

File AMAZON_FASHION_1.txt has been created. File size: 100 lines
File AMAZON_FASHION_2.txt has been created. File size: 100 lines
File AMAZON_FASHION_3.txt has been created. File size: 100 lines
File AMAZON_FASHION_4.txt has been created. File size: 100 lines
File AMAZON_FASHION_5.txt has been created. File size: 100 lines
File AMAZON_FASHION_6.txt has been created. File size: 100 lines
File AMAZON_FASHION_7.txt has been created. File size: 100 lines
File AMAZON_FASHION_8.txt has been created. File size: 100 lines
File AMAZON_FASHION_9.txt has been created. File size: 100 lines
File AMAZON_FASHION_10.txt has been created. File size: 100 lines
