In [None]:
import json

from kafka import KafkaConsumer
from pyspark.ml.feature import VectorAssembler, MinMaxScalerModel
from pyspark.ml.regression import LinearRegressionModel
from pyspark.sql import SparkSession
from pyspark.sql.functions import lag
from pyspark.sql.functions import to_date
from pyspark.sql.window import Window

In [None]:
# Base HDFS location under which the saved models and scalers live,
# and the ticker symbols that have a saved model/scaler pair.
HDFS_PATH = 'hdfs://10.84.129.52:9000/trab/g05'
TICKERS = ['AAPL', 'MSFT', 'GOOG', 'AMZN', 'V']

# Reuse the active SparkSession if there is one; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()


def _decode_json_payload(raw):
    """Decode a UTF-8 byte payload and parse it as JSON."""
    return json.loads(raw.decode('utf-8'))


# Kafka consumer subscribed to the 'g05in' topic; each message value is
# passed through the JSON decoder above before it reaches the caller.
consumer = KafkaConsumer(
    'g05in',
    bootstrap_servers='10.204.131.11:9092',
    value_deserializer=_decode_json_payload,
)

# Persisted per-ticker artifacts, keyed by ticker symbol.
models = {
    ticker: LinearRegressionModel.load(f'{HDFS_PATH}/models/{ticker}/{ticker}_model')
    for ticker in TICKERS
}
scalers = {
    ticker: MinMaxScalerModel.load(f'{HDFS_PATH}/models/{ticker}/{ticker}_scaler')
    for ticker in TICKERS
}

In [None]:
def preprocess_real_time_data(df, ticker):
    """Build the scaled feature frame for one ticker.

    Steps: parse 'Date' (when present), add 'Close_Open_Diff', make sure a
    'Prev_Close' column exists, drop rows that contain nulls, assemble the
    feature vector and apply the ticker's saved MinMaxScaler.

    Args:
        df: Spark DataFrame with 'Low', 'Open', 'Volume', 'High' and 'Close'
            columns. It must also carry either a 'Prev_Close' column (e.g. a
            single real-time tick) or a 'Date' column in 'dd-MM-yyyy' form
            from which 'Prev_Close' is derived with a lag.
        ticker: Key into the module-level ``scalers`` dict.

    Returns:
        The DataFrame returned by the scaler's ``transform``. Rows with a null
        in any column are removed before the features are assembled.

    Raises:
        KeyError: if no scaler is loaded for ``ticker``.
        ValueError: if ``df`` has neither a 'Prev_Close' nor a 'Date' column.
    """
    # Use the saved MinMaxScaler model
    scaler_model = scalers[ticker]

    available = set(df.columns)
    has_date = "Date" in available
    has_prev_close = "Prev_Close" in available

    # Fail early with an actionable message instead of an analysis error on a
    # missing column further down.
    if not has_prev_close and not has_date:
        raise ValueError(
            "preprocess_real_time_data needs a 'Prev_Close' column, or a 'Date' "
            "column to derive it from; got columns: " + ", ".join(sorted(available))
        )

    # Real-time frames may not carry a date at all, so only parse it when present.
    if has_date:
        df = df.withColumn("Date", to_date(df["Date"], 'dd-MM-yyyy'))

    # Create a new feature: difference between 'Close' and 'Open'
    df = df.withColumn('Close_Open_Diff', df['Close'] - df['Open'])

    # Keep a caller-supplied 'Prev_Close': lagging a one-row frame yields null,
    # and that row would then be removed by the null drop below.
    if not has_prev_close:
        window_spec = Window.orderBy("Date")
        df = df.withColumn("Prev_Close", lag("Close").over(window_spec))
    df = df.na.drop()

    # Assemble the features into a feature vector
    assembler = VectorAssembler(
        inputCols=["Low", "Open", "Volume", "High", "Close_Open_Diff", "Prev_Close"],
        outputCol="features"
    )
    df = assembler.transform(df)

    # Normalize the features with MinMaxScaler
    scaled_df = scaler_model.transform(df)

    return scaled_df

In [None]:
# Last 'Close' seen for each ticker. A one-row frame has no earlier row from
# which a previous close could be lagged, so the value is carried between
# messages here and passed along as the 'Prev_Close' column.
# NOTE(review): assumes consecutive messages for a ticker are consecutive
# observations -- confirm against the producer.
last_close_by_ticker = {}
tick_columns = ["Low", "Open", "Volume", "High", "Close", "Prev_Close"]

try:
    # Consume messages until the cell is interrupted
    for message in consumer:
        payload = message.value
        # The consumer's value_deserializer already runs json.loads; decode a
        # second time only if the producer double-encoded and this is still text.
        if isinstance(payload, (str, bytes, bytearray)):
            payload = json.loads(payload)
        if not isinstance(payload, dict):
            print(f"Skipping message with unexpected payload type {type(payload).__name__}")
            continue

        # Each key is a ticker, each value holds that ticker's latest figures
        for ticker, data in payload.items():
            if ticker not in models:
                print(f"Skipping {ticker}: no model is loaded for it")
                continue

            real_time_value = data['Close']
            prev_close = last_close_by_ticker.get(ticker)
            last_close_by_ticker[ticker] = real_time_value
            if prev_close is None:
                print(f"For {ticker}, no previous close is known yet; skipping this prediction")
                continue

            # Transform the data into the format expected by the model
            tick = [data['Low'], data['Open'], data['Volume'], data['High'], real_time_value, prev_close]
            features_df = spark.createDataFrame([tick], tick_columns)

            # Preprocess the real-time data
            preprocessed_features_df = preprocess_real_time_data(features_df, ticker)

            # first() returns None when preprocessing left no rows to score
            predicted_row = models[ticker].transform(preprocessed_features_df).select("prediction").first()
            if predicted_row is None:
                print(f"Skipping {ticker}: preprocessing left no rows to score")
                continue
            prediction = predicted_row[0]

            # Compare the predicted value with the real-time value
            difference = real_time_value - prediction

            # Print the result
            print(
                f"For {ticker}, the real-time value is {real_time_value}, the predicted value is {prediction}, and the difference is {difference}")

except KeyboardInterrupt:
    # Interrupting the cell is the normal way to stop consuming.
    pass
finally:
    # Release the Kafka connection and the Spark session on every exit path,
    # not only on a manual interrupt.
    consumer.close()
    spark.stop()